diff --git a/src/arch/helperavx.h b/src/arch/helperavx.h index 4e4f9960..4d2b9788 100644 --- a/src/arch/helperavx.h +++ b/src/arch/helperavx.h @@ -345,9 +345,11 @@ static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, #if CONFIG == 1 static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } +static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } #else static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); } +static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); } static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); } diff --git a/src/arch/helperavx512f.h b/src/arch/helperavx512f.h index 3577b1df..7778d6b6 100644 --- a/src/arch/helperavx512f.h +++ b/src/arch/helperavx512f.h @@ -369,6 +369,7 @@ static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _ #else static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } +static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } #endif static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); } diff --git a/src/arch/helperpurec_scalar.h b/src/arch/helperpurec_scalar.h index 948fdaf7..4612396d 100644 --- a/src/arch/helperpurec_scalar.h +++ b/src/arch/helperpurec_scalar.h @@ -303,6 +303,7 @@ static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return x < y ? x : y; } #ifndef ENABLE_FMA_SP static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return x * y + z; } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return - x * y + z; } +static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return x * y - z; } #else static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); } static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, -z); } diff --git a/src/arch/helpersse2.h b/src/arch/helpersse2.h index 95a27c76..2fd78511 100644 --- a/src/arch/helpersse2.h +++ b/src/arch/helpersse2.h @@ -308,6 +308,7 @@ static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); } static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); } static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); } static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } +static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); } static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); } diff --git a/src/libm-tester/iut.c b/src/libm-tester/iut.c index cbb9c3d1..1f9f94c2 100644 --- a/src/libm-tester/iut.c +++ b/src/libm-tester/iut.c @@ -569,6 +569,16 @@ int main(int argc, char **argv) { sscanf(buf, "cospif_u05 %x", &u); u = f2u(xcospif_u05(u2f(u))); printf("%x\n", u); + } else if (startsWith(buf, "fastsinf_u100000 ")) { + uint32_t u; + sscanf(buf, "fastsinf_u100000 %x", &u); + u = f2u(xfastsinf_u100000(u2f(u))); + printf("%x\n", u); + } else if (startsWith(buf, "fastcosf_u100000 ")) { + uint32_t u; + sscanf(buf, "fastcosf_u100000 %x", &u); + u = f2u(xfastcosf_u100000(u2f(u))); + printf("%x\n", u); } else if (startsWith(buf, "tanf_u1 ")) { uint32_t u; sscanf(buf, "tanf_u1 %x", &u); diff --git a/src/libm-tester/iutsimd.c b/src/libm-tester/iutsimd.c index 41a7cc90..a1f73e1a 100644 --- a/src/libm-tester/iutsimd.c +++ b/src/libm-tester/iutsimd.c @@ -570,6 +570,11 @@ int do_test(int argc, char **argv) { func_f_f("atanf_u1", xatanf_u1); func_f_f("logf_u1", xlogf_u1); + func_f_f("fastsinf_u100000", xfastsinf_u100000); + func_f_f("fastcosf_u100000", xfastcosf_u100000); + func_f_f("fastsinf_u35", xfastsinf_u35); + func_f_f("fastcosf_u35", xfastcosf_u35); + func_f_f("exp2f", xexp2f); func_f_f("exp10f", xexp10f); func_f_f("expm1f", xexpm1f); diff --git a/src/libm-tester/tester.c b/src/libm-tester/tester.c index dcc2b170..821fd955 100644 --- a/src/libm-tester/tester.c +++ b/src/libm-tester/tester.c @@ -289,6 +289,11 @@ float child_cbrtf_u1(float x) { child_f_f("cbrtf_u1", x); } float child_atan2f_u1(float y, float x) { child_f_f_f("atan2f_u1", y, x); } Sleef_float2 child_sincosf_u1(float x) { child_f2_f("sincosf_u1", x); } +float child_fastsinf_u100000(float x) { child_f_f("fastsinf_u100000", x); } +float child_fastcosf_u100000(float x) { child_f_f("fastcosf_u100000", x); } +float child_fastsinf_u35(float x) { child_f_f("fastsinf_u35", x); } +float child_fastcosf_u35(float x) { child_f_f("fastcosf_u35", x); } + float child_powf(float x, float y) { child_f_f_f("powf", x, y); } float child_sqrtf(float x) { child_f_f("sqrtf", x); } float child_sqrtf_u05(float x) { child_f_f("sqrtf_u05", x); } @@ -4122,6 +4127,19 @@ void do_test() { break; \ } \ } while(0) + +#define checkAccuracy2_f(mpfrFunc, childFunc, argx, bound, abound) do { \ + mpfr_set_d(frx, (float)flushToZero(argx), GMP_RNDN); \ + mpfrFunc(frc, frx, GMP_RNDN); \ + double t = childFunc((float)flushToZero(argx)); \ + double ae = fabs(mpfr_get_d(frc, GMP_RNDN) - t); \ + if (countULPsp(t, frc) > bound && ae > abound) { \ + fprintf(stderr, "\narg = %.20g, test = %.20g, correct = %.20g, ULP = %lf, abserror = %g\n", \ + (float)flushToZero(argx), (double)childFunc((float)flushToZero(argx)), mpfr_get_d(frc, GMP_RNDN), countULPsp(childFunc((float)flushToZero(argx)), frc), ae); \ + success = 0; \ + break; \ + } \ + } while(0) // @@ -4469,6 +4487,16 @@ void do_test() { mpfr_set_default_prec(53); + // + + fprintf(stderr, "fastsinf_u100000 : "); + for(d = -9;d < 9 && success;d += 0.001) checkAccuracy2_f(mpfr_sin, child_fastsinf_u100000, d, 100000, 1e-6); + showResult(success); + + fprintf(stderr, "fastcosf_u100000 : "); + for(d = -9;d < 9 && success;d += 0.001) checkAccuracy2_f(mpfr_cos, child_fastcosf_u100000, d, 100000, 1e-6); + showResult(success); + // fprintf(stderr, "tanf : "); diff --git a/src/libm/funcproto.h b/src/libm/funcproto.h index 3639ba62..de9f5645 100644 --- a/src/libm/funcproto.h +++ b/src/libm/funcproto.h @@ -20,6 +20,7 @@ typedef struct { 2 : "_u05" 3 : "_u35" 4 : "_u15" + 5 : "_u100000" funcType: 0 : vdouble func(vdouble); @@ -34,6 +35,7 @@ typedef struct { flags: 1 : No GNUABI + 2 : No double func */ funcSpec funcList[] = { @@ -66,6 +68,11 @@ funcSpec funcList[] = { { "cosh", 35, 3, 0, 0 }, { "tanh", 35, 3, 0, 0 }, + { "fastsin", 100000, 5, 0, 2 }, + { "fastcos", 100000, 5, 0, 2 }, + { "fastsin", 35, 3, 0, 2 }, + { "fastcos", 35, 3, 0, 2 }, + { "asinh", 10, 0, 0, 0 }, { "acosh", 10, 0, 0, 0 }, { "atanh", 10, 0, 0, 0 }, diff --git a/src/libm/mkalias.c b/src/libm/mkalias.c index 97c12b23..4b40aa30 100644 --- a/src/libm/mkalias.c +++ b/src/libm/mkalias.c @@ -36,7 +36,7 @@ int main(int argc, char **argv) { }; static char *typeSpecS[] = { "", "f" }; static char *typeSpec[] = { "d", "f" }; - static char *ulpSuffixStr[] = { "", "_u1", "_u05", "_u35", "_u15" }; + static char *ulpSuffixStr[] = { "", "_u1", "_u05", "_u35", "_u15", "_u100000" }; static char *vparameterStr[7] = { "v", "vv", "", "vv", "v", "vvv", "" }; static char returnType[9][1000]; @@ -84,6 +84,7 @@ int main(int argc, char **argv) { if (argc == 6) { for(int i=0;funcList[i].name != NULL;i++) { + if ((funcList[i].flags & 2) != 0) continue; if (funcList[i].ulp >= 0) { printf("EXPORT CONST %s Sleef_%s%s%d_u%02d(%s) __attribute__((alias(\"Sleef_%s%s%d_u%02d%s\"))) %s;\n", returnType[funcList[i].funcType], diff --git a/src/libm/mkdisp.c b/src/libm/mkdisp.c index 131d0770..54cc65ca 100644 --- a/src/libm/mkdisp.c +++ b/src/libm/mkdisp.c @@ -30,13 +30,15 @@ int main(int argc, char **argv) { switch(funcList[i].funcType) { case 0: - printf("DISPATCH_vf_vf(%s, Sleef_%sd%d%s, pnt_%sd%d%s, disp_%sd%d%s", - vdoublename, - funcList[i].name, wdp, ulpSuffix0, - funcList[i].name, wdp, ulpSuffix0, - funcList[i].name, wdp, ulpSuffix0); - for(int j=0;j 9.0f)) u = xsinf(t); + + return u; +} + +EXPORT CONST float xfastcosf_u100000(float d) { + int q; + float u, s, t = d, dq; + + dq = rintfk(mlaf(d, (float)M_1_PI, - 0.5f)); + d = mlaf(d, (float)M_1_PI, -dq - 0.5f); + q = (int)mlaf(2, dq, 1); + + s = d * d; + + if ((q & 2) == 0) d = -d; + + u = +0.2324385881e+1; + u = mlaf(u, s, -0.5145016193e+1f); + u = mlaf(u, s, +0.3141218185e+1f); + + u *= d; + + return u; +} + +EXPORT CONST float xfastcosf_u35(float d) { + int q; + float u, s, t = d, dq; + + dq = rintfk(mlaf(d, (float)M_1_PI, - 0.5f)); + d = mlaf(d, (float)M_1_PI, -dq - 0.5f); + q = (int)mlaf(2, dq, 1); + + s = d * d; + + if ((q & 2) == 0) d = -d; + + u = +0.7737486064e-1; + u = mlaf(u, s, -0.5981476903e+0f); + u = mlaf(u, s, +0.2550054789e+1f); + u = mlaf(u, s, -0.5167709351e+1f); + u = mlaf(u, s, +0.3141592741e+1f); + + u *= d; + + if (UNLIKELY(fabsfk(t) > 9.0f)) u = xcosf(t); + + return u; +} + EXPORT CONST Sleef_float2 xsincosf(float d) { int q; float u, s, t;