diff --git a/backends/cadence/hifi/operators/op_clamp.cpp b/backends/cadence/hifi/operators/op_clamp.cpp index 4fa29c00dde..7a77fd8be8c 100644 --- a/backends/cadence/hifi/operators/op_clamp.cpp +++ b/backends/cadence/hifi/operators/op_clamp.cpp @@ -48,7 +48,7 @@ namespace impl { namespace HiFi { namespace native { -Tensor& clamp_tensor_out( +Tensor& clamp_Tensor_out( RuntimeContext& ctx, const Tensor& in, const executorch::aten::optional& min_opt, diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c index 6f95360ed99..65ff9f735ad 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c @@ -33,63 +33,65 @@ #include "xa_nnlib_common.h" -const union ufloat32uint32 xa_nnlib_plusInff ={0x7f800000}; -const union ufloat32uint32 xa_nnlib_qNaNf = { 0x7fc00000 }; -const union ufloat32uint32 pif ={0x40490fdb}; /* pi */ -const union ufloat32uint32 pi2f={0x3fc90fdb}; /* pi/2 */ - -const union ufloat32uint32 ALIGN(8) xa_nnlib_atanftbl1[8] = -{ - {0x3dbc14c0},/* 9.183645248413086e-002 */ - {0xbe30c39c},/*-1.726211905479431e-001 */ - {0x3b2791e4},/* 2.556913532316685e-003 */ - {0x3e4dac9d},/* 2.008537799119949e-001 */ - {0xb97d9a57},/*-2.418545627733693e-004 */ - {0xbeaaa7b5},/*-3.333107531070709e-001 */ - {0xb54f34c8},/*-7.719031600572635e-007 */ +const union ufloat32uint32 xa_nnlib_plusInff_local = {0x7f800000}; +const union ufloat32uint32 xa_nnlib_qNaNf = {0x7fc00000}; +const union ufloat32uint32 pif = {0x40490fdb}; /* pi */ +const union ufloat32uint32 pi2f = {0x3fc90fdb}; /* pi/2 */ + +const union ufloat32uint32 ALIGN(8) xa_nnlib_atanftbl1[8] = { + {0x3dbc14c0}, /* 9.183645248413086e-002 */ + {0xbe30c39c}, /*-1.726211905479431e-001 */ + {0x3b2791e4}, /* 2.556913532316685e-003 */ + {0x3e4dac9d}, /* 2.008537799119949e-001 */ + {0xb97d9a57}, /*-2.418545627733693e-004 */ + {0xbeaaa7b5}, /*-3.333107531070709e-001 */ + {0xb54f34c8}, /*-7.719031600572635e-007 */ {0x31cf3fa2} /* 6.031727117772334e-009 */ }; -const union ufloat32uint32 ALIGN(8) xa_nnlib_atanftbl2[8]= -{ - {0xbcccc037},/*-2.499399892985821e-002 */ - {0x3e217c35},/* 1.577003747224808e-001 */ - {0xbecf4163},/*-4.047957360744476e-001 */ - {0x3ef7b762},/* 4.838209748268127e-001 */ - {0xbdf35059},/*-1.188055947422981e-001 */ - {0xbe9b8b75},/*-3.037983477115631e-001 */ - {0xbb80ed5c},/*-3.934545442461968e-003 */ +const union ufloat32uint32 ALIGN(8) xa_nnlib_atanftbl2[8] = { + {0xbcccc037}, /*-2.499399892985821e-002 */ + {0x3e217c35}, /* 1.577003747224808e-001 */ + {0xbecf4163}, /*-4.047957360744476e-001 */ + {0x3ef7b762}, /* 4.838209748268127e-001 */ + {0xbdf35059}, /*-1.188055947422981e-001 */ + {0xbe9b8b75}, /*-3.037983477115631e-001 */ + {0xbb80ed5c}, /*-3.934545442461968e-003 */ {0x3956fc52} /* 2.050262701231986e-004 */ }; #if !HAVE_VFPU && !HAVE_FPU -DISCARD_FUN(void, xa_nn_elm_atan2_f32,( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, int N )) +DISCARD_FUN( + void, + xa_nn_elm_atan2_f32, + (FLOAT32 * z, const FLOAT32* y, const FLOAT32* x, int N)) #elif HAVE_VFPU -#define sz_f32 (int)sizeof(FLOAT32) +#define sz_f32 (int)sizeof(FLOAT32) /*=========================================================================== Vector matematics: - vec_atan2 full quadrant Arctangent + vec_atan2 full quadrant Arctangent ===========================================================================*/ /*------------------------------------------------------------------------- Full-Quadrant Arc Tangent The functions compute the arc tangent of the ratios y[N]/x[N] and store the - result to output vector z[N]. + result to output vector z[N]. Floating point functions output is in radians. Fixed point functions scale its output by pi. NOTE: - 1. Scalar floating point function is compatible with standard ANSI C routines and set - errno and exception flags accordingly - 2. Scalar floating point function assigns EDOM to errno whenever y==0 and x==0. + 1. Scalar floating point function is compatible with standard ANSI C routines +and set errno and exception flags accordingly + 2. Scalar floating point function assigns EDOM to errno whenever y==0 and +x==0. Accuracy: 24 bit version: 768 (3.57e-7) floating point: 2 ULP Special cases: - y | x | result | extra conditions + y | x | result | extra conditions --------|-------|-----------|--------------------- +/-0 | -0 | +/-pi | +/-0 | +0 | +/-0 | @@ -100,7 +102,7 @@ DISCARD_FUN(void, xa_nn_elm_atan2_f32,( FLOAT32 * z, const FLOAT32 * y, const FL +/-y | -inf | +/-pi | finite y>0 +/-y | +inf | +/-0 | finite y>0 +/-inf | x | +/-pi/2 | finite x - +/-inf | -inf | +/-3*pi/4 | + +/-inf | -inf | +/-3*pi/4 | +/-inf | +inf | +/-pi/4 | Input: @@ -112,8 +114,11 @@ DISCARD_FUN(void, xa_nn_elm_atan2_f32,( FLOAT32 * z, const FLOAT32 * y, const FL ---------------------------------------------------------------------------*/ -void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WORD32 N ) -{ +void xa_nn_elm_atan2_f32( + FLOAT32* z, + const FLOAT32* y, + const FLOAT32* x, + WORD32 N) { /* const union ufloat32uint32* p; int sx,sy,big; @@ -135,11 +140,11 @@ void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WO } else { - // compare x==y is necessary to support (+/-Inf, +/-Inf) cases + // compare x==y is necessary to support (+/-Inf, +/-Inf) cases x = (x == y) ? 1.0f : x / y; } p = (x<0.5f) ? atanftbl1 : atanftbl2; - // approximate atan(x)/x-1 + // approximate atan(x)/x-1 y = p[0].f; y = x*y + p[1].f; y = x*y + p[2].f; @@ -148,7 +153,7 @@ void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WO y = x*y + p[5].f; y = x*y + p[6].f; y = x*y + p[7].f; - // convert result to true atan(x) + // convert result to true atan(x) y = x*y + x; if (!big) y = pi2f.f - y; @@ -157,26 +162,28 @@ void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WO return y; */ - const xtfloatx2 * X; - const xtfloatx2 * Y; - xtfloatx2 * restrict Z; - const xtfloatx2 * S_rd; - xtfloatx2 * restrict S_wr; + const xtfloatx2* X; + const xtfloatx2* Y; + xtfloatx2* restrict Z; + const xtfloatx2* S_rd; + xtfloatx2* restrict S_wr; ae_valign X_va, Y_va, Z_va; - /* Current block index; overall number of blocks; number of values in the current block */ + /* Current block index; overall number of blocks; number of values in the + * current block */ int blkIx, blkNum, blkLen; /* Block size, blkLen <= blkSize */ - const int blkSize = MAX_ALLOCA_SZ/sz_f32; + const int blkSize = MAX_ALLOCA_SZ / sz_f32; /* Allocate a fixed-size scratch area on the stack. */ FLOAT32 ALIGN(8) scr[blkSize]; int n; - if ( N<=0 ) return; + if (N <= 0) + return; - NASSERT_ALIGN8( scr ); + NASSERT_ALIGN8(scr); /* * Data are processed in blocks of scratch area size. Further, the algorithm @@ -184,30 +191,29 @@ void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WO * few loops of managable size. */ - blkNum = ( N + blkSize-1 )/blkSize; + blkNum = (N + blkSize - 1) / blkSize; - for ( blkIx=0; blkIxy0 ) p0 = y0/x0; - * // Special case of x==y is necessary to support (+/-Inf, +/-Inf) cases. - * else p0 = ( x0==y0 ? 1.f : x0/y0 ); - * + * // Special case of x==y is necessary to support (+/-Inf, +/-Inf) + * cases. else p0 = ( x0==y0 ? 1.f : x0/y0 ); + * * scr[n] = p0; * } * } @@ -223,65 +229,65 @@ void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WO /* Is NaN; Inf/Inf; x/Inf; 0/0; x and y are subnormal */ xtbool2 b_nan, b_num_inf, b_den_inf, b_eqz, b_subn; - X = (xtfloatx2*)( (uintptr_t)x + blkIx*blkSize*sz_f32 ); - Y = (xtfloatx2*)( (uintptr_t)y + blkIx*blkSize*sz_f32 ); + X = (xtfloatx2*)((uintptr_t)x + blkIx * blkSize * sz_f32); + Y = (xtfloatx2*)((uintptr_t)y + blkIx * blkSize * sz_f32); S_wr = (xtfloatx2*)scr; - X_va = XT_LASX2PP( X ); - Y_va = XT_LASX2PP( Y ); + X_va = XT_LASX2PP(X); + Y_va = XT_LASX2PP(Y); - __Pragma( "loop_count min=1" ); - for ( n=0; n<(blkLen+1)/2; n++ ) - { - XT_LASX2IP( x0, X_va, X ); - XT_LASX2IP( y0, Y_va, Y ); + __Pragma("loop_count min=1"); + for (n = 0; n < (blkLen + 1) / 2; n++) { + XT_LASX2IP(x0, X_va, X); + XT_LASX2IP(y0, Y_va, Y); /* Replicate NaNs in both x and y to ensure NaN propagation. */ - b_nan = XT_UN_SX2( x0, y0 ); - XT_MOVT_SX2( x0, xa_nnlib_qNaNf.f, b_nan ); - XT_MOVT_SX2( y0, xa_nnlib_qNaNf.f, b_nan ); + b_nan = XT_UN_SX2(x0, y0); + XT_MOVT_SX2(x0, xa_nnlib_qNaNf.f, b_nan); + XT_MOVT_SX2(y0, xa_nnlib_qNaNf.f, b_nan); - x0 = XT_ABS_SX2( x0 ); - y0 = XT_ABS_SX2( y0 ); + x0 = XT_ABS_SX2(x0); + y0 = XT_ABS_SX2(y0); /* num <= den */ - num = XT_MIN_SX2( x0, y0 ); - den = XT_MAX_SX2( y0, x0 ); + num = XT_MIN_SX2(x0, y0); + den = XT_MAX_SX2(y0, x0); /* Scale up numerator and denominator if BOTH are subnormal. */ - b_subn = XT_OLT_SX2( num, FLT_MIN ); - scl = (xtfloatx2)8388608.f; XT_MOVF_SX2( scl, (xtfloatx2)1.0f, b_subn ); - num = XT_MUL_SX2( num, scl ); - den = XT_MUL_SX2( den, scl ); + b_subn = XT_OLT_SX2(num, FLT_MIN); + scl = (xtfloatx2)8388608.f; + XT_MOVF_SX2(scl, (xtfloatx2)1.0f, b_subn); + num = XT_MUL_SX2(num, scl); + den = XT_MUL_SX2(den, scl); /* Classify numerator and denominator. */ - b_num_inf = XT_OEQ_SX2( num, xa_nnlib_plusInff.f ); /* Inf/Inf */ - b_den_inf = XT_OEQ_SX2( den, xa_nnlib_plusInff.f ); /* x/Inf */ - b_eqz = XT_OEQ_SX2( den, (xtfloatx2)(xtfloatx2)(0.0f) ); /* 0/0 */ + b_num_inf = XT_OEQ_SX2(num, xa_nnlib_plusInff_local.f); /* Inf/Inf */ + b_den_inf = XT_OEQ_SX2(den, xa_nnlib_plusInff_local.f); /* x/Inf */ + b_eqz = XT_OEQ_SX2(den, (xtfloatx2)(xtfloatx2)(0.0f)); /* 0/0 */ /* Initial appromimation for 1/den. */ - rcp = XT_RECIP0_SX2( den ); + rcp = XT_RECIP0_SX2(den); /* Newton-Raphson iteration for 1/den. */ eps = (xtfloatx2)1.0f; - XT_MSUB_SX2( eps, rcp, den ); - XT_MADD_SX2( rcp, rcp, eps ); + XT_MSUB_SX2(eps, rcp, den); + XT_MADD_SX2(rcp, rcp, eps); /* Approximation for the quotient num/den. */ - quo = XT_MUL_SX2( num, rcp ); + quo = XT_MUL_SX2(num, rcp); /* Refine the quotient by a modified Newton-Raphson iteration. */ eps = num; - XT_MSUB_SX2( eps, quo, den ); - XT_MADD_SX2( quo, rcp, eps ); + XT_MSUB_SX2(eps, quo, den); + XT_MADD_SX2(quo, rcp, eps); /* Force conventional results for special cases. */ - XT_MOVT_SX2( quo, (xtfloatx2)(0.0f), b_den_inf ); /* x/Inf -> 0 */ - XT_MOVT_SX2( quo, (xtfloatx2)1.0f, b_num_inf ); /* Inf/Inf -> 1 */ - XT_MOVT_SX2( quo, (xtfloatx2)(0.0f), b_eqz ); /* 0/0 -> 0 */ + XT_MOVT_SX2(quo, (xtfloatx2)(0.0f), b_den_inf); /* x/Inf -> 0 */ + XT_MOVT_SX2(quo, (xtfloatx2)1.0f, b_num_inf); /* Inf/Inf -> 1 */ + XT_MOVT_SX2(quo, (xtfloatx2)(0.0f), b_eqz); /* 0/0 -> 0 */ - XT_SSX2IP( quo, S_wr, +2*sz_f32 ); + XT_SSX2IP(quo, S_wr, +2 * sz_f32); } } - __Pragma( "no_reorder" ); + __Pragma("no_reorder"); /* * Part II, polynomial approximation and full quadrant restoration. @@ -291,18 +297,18 @@ void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WO * const union ufloat32uint32 * ptbl; * float32_t x0, y0, z0, p0; * int sx, sy; - * + * * for ( n=0; n0 +/-y | +inf | +/-0 | finite y>0 +/-inf | x | +/-pi/2 | finite x - +/-inf | -inf | +/-3*pi/4 | + +/-inf | -inf | +/-3*pi/4 | +/-inf | +inf | +/-pi/4 | Input: @@ -538,65 +579,69 @@ Special cases: N length of vectors Output: z[N] result, Q15 or floating point - + Restrictions: x, y, z should not overlap ---------------------------------------------------------------------------*/ // Taken from Fusion -void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WORD32 N ) -{ +void xa_nn_elm_atan2_f32( + FLOAT32* z, + const FLOAT32* y, + const FLOAT32* x, + WORD32 N) { /* - * const union ufloat32uint32* p; - * int sx,sy,big; - * sx=takesignf(x); - * sy=takesignf(y); - * x=fabs(x); - * y=fabs(y); - * if(x==0.f && y==0.f) - * { - * // The actual result depends on input signs. - * x = 1.f; - * y = 0.f; - * } - * - * big=x>y; - * if(big) - * { - * x=y/x; - * } - * else - * { - * // compare x==y is necessary to support (+/-Inf, +/-Inf) cases - * x = (x == y) ? 1.0f : x / y; - * } - * p = (x<0.5f) ? atanftbl1 : atanftbl2; - * // approximate atan(x)/x-1 - * y = p[0].f; - * y = x*y + p[1].f; - * y = x*y + p[2].f; - * y = x*y + p[3].f; - * y = x*y + p[4].f; - * y = x*y + p[5].f; - * y = x*y + p[6].f; - * y = x*y + p[7].f; - * // convert result to true atan(x) - * y = x*y + x; - * - * if (!big) y = pi2f.f - y; - * if (sx) y = pif.f - y; - * if (sy) y = -y; - * return y; - */ - const xtfloat * restrict X; - const xtfloat * restrict Y; - int32_t * restrict Z; - const xtfloat * restrict S_rd; - xtfloat * restrict S_wr; - const xtfloat * restrict POLY_TBL1; - const xtfloat * restrict POLY_TBL2; - - /* Current block index; overall number of blocks; number of values in the current block */ + * const union ufloat32uint32* p; + * int sx,sy,big; + * sx=takesignf(x); + * sy=takesignf(y); + * x=fabs(x); + * y=fabs(y); + * if(x==0.f && y==0.f) + * { + * // The actual result depends on input signs. + * x = 1.f; + * y = 0.f; + * } + * + * big=x>y; + * if(big) + * { + * x=y/x; + * } + * else + * { + * // compare x==y is necessary to support (+/-Inf, +/-Inf) cases + * x = (x == y) ? 1.0f : x / y; + * } + * p = (x<0.5f) ? atanftbl1 : atanftbl2; + * // approximate atan(x)/x-1 + * y = p[0].f; + * y = x*y + p[1].f; + * y = x*y + p[2].f; + * y = x*y + p[3].f; + * y = x*y + p[4].f; + * y = x*y + p[5].f; + * y = x*y + p[6].f; + * y = x*y + p[7].f; + * // convert result to true atan(x) + * y = x*y + x; + * + * if (!big) y = pi2f.f - y; + * if (sx) y = pif.f - y; + * if (sy) y = -y; + * return y; + */ + const xtfloat* restrict X; + const xtfloat* restrict Y; + int32_t* restrict Z; + const xtfloat* restrict S_rd; + xtfloat* restrict S_wr; + const xtfloat* restrict POLY_TBL1; + const xtfloat* restrict POLY_TBL2; + + /* Current block index; overall number of blocks; number of values in the + * current block */ int blkIx, blkNum, blkLen; /* Block size, blkLen <= blkSize */ const int blkSize = MAX_ALLOCA_SZ / sz_f32; @@ -605,45 +650,45 @@ void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WOR int n; - if (N <= 0) return; + if (N <= 0) + return; NASSERT_ALIGN8(scr); /* - * Data are processed in blocks of scratch area size. Further, the algorithm - * implementation is splitted in order to feed the optimizing compiler with a - * few loops of managable size. - */ + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ blkNum = (N + blkSize - 1) / blkSize; POLY_TBL1 = (xtfloat*)xa_nnlib_atanftbl1; POLY_TBL2 = (xtfloat*)xa_nnlib_atanftbl2; - for (blkIx = 0; blkIxy0 ) p0 = y0/x0; - * // Special case of x==y is necessary to support (+/-Inf, +/-Inf) cases. - * else p0 = ( x0==y0 ? 1.f : x0/y0 ); - * - * scr[n] = p0; - * } - * } - */ + * Part I, reduction to [0,pi/4]. Reference C code: + * + * { + * float32_t x0, y0, p0; + * + * for ( n=0; ny0 ) p0 = y0/x0; + * // Special case of x==y is necessary to support (+/-Inf, +/-Inf) + * cases. else p0 = ( x0==y0 ? 1.f : x0/y0 ); + * + * scr[n] = p0; + * } + * } + */ { /* Input values */ @@ -654,26 +699,24 @@ void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WOR xtfloat s, eps; /* Is NaN; Inf/Inf; x/Inf; 0/0; x and y are subnormal */ xtbool b_nan, b_num_inf, b_den_inf, b_eqz, b_subn; - const xtfloat * pT; + const xtfloat* pT; - X = (xtfloat*)((uintptr_t)x + blkIx*blkSize*sz_f32); - Y = (xtfloat*)((uintptr_t)y + blkIx*blkSize*sz_f32); + X = (xtfloat*)((uintptr_t)x + blkIx * blkSize * sz_f32); + Y = (xtfloat*)((uintptr_t)y + blkIx * blkSize * sz_f32); S_wr = (xtfloat*)scr; - static const uint32_t TAB[4] = { 0x7fc00000, 0x00800000, - 0x4b000000, 0x7f800000 - }; - pT = (xtfloat *)TAB; + static const uint32_t TAB[4] = { + 0x7fc00000, 0x00800000, 0x4b000000, 0x7f800000}; + pT = (xtfloat*)TAB; __Pragma("loop_count min=1"); - for (n = 0; n 0 or x/Inf -> 0*/ + XT_MOVT_S(quo, XT_CONST_S(0), b_eqz); /* 0/0 -> 0 or x/Inf -> 0*/ XT_MOVT_S(quo, XT_CONST_S(1), b_num_inf); /* Inf/Inf -> 1 */ XT_SSIP(quo, S_wr, sz_f32); @@ -720,46 +765,46 @@ void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WOR __Pragma("no_reorder"); /* - * Part II, polynomial approximation and full quadrant restoration. - * Reference C code: - * - * { - * const union ufloat32uint32 * ptbl; - * float32_t x0, y0, z0, p0; - * int sx, sy; - * - * for ( n=0; n