[AArch64] Add alternative division and sqrt methods
With this patch, alternative division and sqrt implementations can be selected by passing -DENABLE_ALTDIV=TRUE and -DENABLE_ALTSQRT=TRUE as CMake options.
The alternative methods use combinations of FMA operations to compute division and sqrt.
They may be beneficial on micro-architectures where the corresponding instructions are not pipelined and have long latencies.

Co-authored-by: shibatch <shibatch.sf.net@gmail.com>
shibatch and shibatch committed Mar 21, 2020
1 parent 374fa97 commit 1ba1836
Showing 5 changed files with 260 additions and 40 deletions.
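
For orientation, the scheme the commit message describes can be sketched in scalar C: start from a crude reciprocal seed, refine it with FMA-based Newton-Raphson steps, then apply one FMA residual correction to the quotient. This is an illustrative sketch only — the function name, the seed constant, and the use of fmaf() from <math.h> are assumptions made here; the committed vector code instead seeds with vrecpe/svrecpe, refines with vrecps/svrecps, and rescales both operands by a shared power of two before iterating.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative scalar sketch of FMA-based division (positive, normal d only). */
static float altdiv_sketch(float n, float d) {
  uint32_t bits;
  memcpy(&bits, &d, sizeof bits);
  bits = 0x7f000000u - bits;              /* crude reciprocal seed: flips the exponent */
  float t;
  memcpy(&t, &bits, sizeof t);
  for (int i = 0; i < 3; i++)
    t = fmaf(fmaf(-d, t, 1.0f), t, t);    /* Newton-Raphson step: t += (1 - d*t) * t */
  float u = n * t;                        /* quotient estimate */
  u = fmaf(fmaf(-d, u, n), t, u);         /* residual correction: u += (n - d*u) * t */
  return u;
}

int main(void) {
  printf("%.9g\n", altdiv_sketch(1.0f, 3.0f));  /* prints a value very close to 1/3 */
  return 0;
}

The same idea carries over to sqrt: the diffs below seed with a reciprocal-square-root estimate and refine it with FMAs before a final correction and special-case selection.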
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -13,6 +13,9 @@ option(SLEEF_SHOW_ERROR_LOG "Show cmake error log." OFF)

option(ENFORCE_TESTER3 "Build fails if tester3 is not built" OFF)

option(ENABLE_ALTDIV "Enable alternative division method (aarch64 only)" OFF)
option(ENABLE_ALTSQRT "Enable alternative sqrt method (aarch64 only)" OFF)

# See doc/build-with-cmake.md for instructions on how to build Sleef.
cmake_minimum_required(VERSION 3.4.3)
# Set to NEW when updating cmake_minimum_required to VERSION >= 3.7.2
@@ -23,7 +26,7 @@ endif()
enable_testing()

set(SLEEF_VERSION_MAJOR 3)
set(SLEEF_VERSION_MINOR 4)
set(SLEEF_VERSION_MINOR 5)
set(SLEEF_VERSION_PATCHLEVEL 0)
set(SLEEF_VERSION ${SLEEF_VERSION_MAJOR}.${SLEEF_VERSION_MINOR}.${SLEEF_VERSION_PATCHLEVEL})
set(SLEEF_SOVERSION ${SLEEF_VERSION_MAJOR})
147 changes: 124 additions & 23 deletions src/arch/helperadvsimd.h
@@ -134,7 +134,7 @@ static INLINE VECTOR_CC vint2 vreinterpret_vi2_vd(vdouble vd) {
// Broadcast
static INLINE VECTOR_CC vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }

// Add, Sub, Mul, Reciprocal 1/x, Division, Square root
// Add, Sub, Mul
static INLINE VECTOR_CC vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {
return vaddq_f32(x, y);
}
@@ -144,13 +144,6 @@ static INLINE VECTOR_CC vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {
static INLINE VECTOR_CC vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {
return vmulq_f32(x, y);
}
static INLINE VECTOR_CC vfloat vrec_vf_vf(vfloat d) {
return vdivq_f32(vcast_vf_f(1.0f), d);
}
static INLINE VECTOR_CC vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
return vdivq_f32(n, d);
}
static INLINE VECTOR_CC vfloat vsqrt_vf_vf(vfloat d) { return vsqrtq_f32(d); }

// |x|, -x
static INLINE VECTOR_CC vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
@@ -175,6 +168,74 @@ static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#endif

static INLINE VECTOR_CC vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y
return vfmaq_f32(z, x, y);
}

static INLINE VECTOR_CC vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y
return vfmsq_f32(z, x, y);
}

static INLINE VECTOR_CC vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z
return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z));
}

// Reciprocal 1/x, Division, Square root
static INLINE VECTOR_CC vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
#ifndef ENABLE_ALTDIV
return vdivq_f32(n, d);
#else
// Finite numbers (including denormals) only; gives a correctly rounded result
float32x4_t t, u, x, y;
uint32x4_t i0, i1;
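// Rescale n and d by a shared power of two derived from their exponents,
// so the refinement below works on operands of moderate magnitude
// (the quotient x/y is the same as n/d).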
i0 = vandq_u32(vreinterpretq_u32_f32(n), vdupq_n_u32(0x7c000000));
i1 = vandq_u32(vreinterpretq_u32_f32(d), vdupq_n_u32(0x7c000000));
i0 = vsubq_u32(vdupq_n_u32(0x7d000000), vshrq_n_u32(vaddq_u32(i0, i1), 1));
t = vreinterpretq_f32_u32(i0);
y = vmulq_f32(d, t);
x = vmulq_f32(n, t);
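// Reciprocal estimate of the scaled divisor, refined with two
// Newton-Raphson steps (vrecps computes 2 - y*t).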
t = vrecpeq_f32(y);
t = vmulq_f32(t, vrecpsq_f32(y, t));
t = vmulq_f32(t, vrecpsq_f32(y, t));
u = vmulq_f32(x, t);
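// Final FMA residual correction: u += (x - y*u) * t.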
u = vfmaq_f32(u, vfmsq_f32(x, y, u), t);
return u;
#endif
}
static INLINE VECTOR_CC vfloat vrec_vf_vf(vfloat d) {
#ifndef ENABLE_ALTDIV
return vdiv_vf_vf_vf(vcast_vf_f(1.0f), d);
#else
return vbslq_f32(vceqq_f32(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)),
vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d));
#endif
}

static INLINE VECTOR_CC vfloat vsqrt_vf_vf(vfloat d) {
#ifndef ENABLE_ALTSQRT
return vsqrtq_f32(d);
#else
// Gives a correctly rounded result over the entire input range
vfloat w, x, y, z;
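// Start from a reciprocal-square-root estimate and refine with FMAs so that
// x converges to sqrt(d) and w to 0.5/sqrt(d).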

y = vrsqrteq_f32(d);
x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y);
y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5));
x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w);

y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w);
w = vmul_vf_vf_vf(w, y);
x = vmul_vf_vf_vf(w, d);
y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1));
z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x);
w = vfma_vf_vf_vf_vf(w, z, y);
w = vadd_vf_vf_vf(w, x);
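// The iteration does not handle d == 0 or d == infinity; for those lanes
// sqrt(d) == d, so select d directly.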

return vbslq_f32(vorrq_u32(vceqq_f32(d, vcast_vf_f(0)),
vceqq_f32(d, vcast_vf_f(SLEEF_INFINITYf))), d, w);
#endif
}

// max, min
static INLINE VECTOR_CC vfloat vmax_vf_vf_vf(vfloat x, vfloat y) {
return vmaxq_f32(x, y);
@@ -271,7 +332,7 @@ static INLINE VECTOR_CC vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) {
// Broadcast
static INLINE VECTOR_CC vdouble vcast_vd_d(double f) { return vdupq_n_f64(f); }

// Add, Sub, Mul, Reciprocal 1/x, Division, Square root
// Add, Sub, Mul
static INLINE VECTOR_CC vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {
return vaddq_f64(x, y);
}
@@ -281,13 +342,6 @@ static INLINE VECTOR_CC vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {
static INLINE VECTOR_CC vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
return vmulq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vrec_vd_vd(vdouble d) {
return vdivq_f64(vcast_vd_d(1.0f), d);
}
static INLINE VECTOR_CC vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
return vdivq_f64(n, d);
}
static INLINE VECTOR_CC vdouble vsqrt_vd_vd(vdouble d) { return vsqrtq_f64(d); }

// |x|, -x
static INLINE VECTOR_CC vdouble vabs_vd_vd(vdouble f) { return vabsq_f64(f); }
@@ -332,16 +386,63 @@ static INLINE VECTOR_CC vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble
return vneg_vd_vd(vfmanp_vd_vd_vd_vd(x, y, z));
}

static INLINE VECTOR_CC vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y
return vfmaq_f32(z, x, y);
// Reciprocal 1/x, Division, Square root
static INLINE VECTOR_CC vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
#ifndef ENABLE_ALTDIV
return vdivq_f64(n, d);
#else
// Finite numbers (including denormals) only; gives a correctly rounded result
float64x2_t t, u, x, y;
uint64x2_t i0, i1;
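// Same scheme as the single-precision version above, with one extra
// Newton-Raphson refinement step for double precision.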
i0 = vandq_u64(vreinterpretq_u64_f64(n), vdupq_n_u64(0x7fc0000000000000L));
i1 = vandq_u64(vreinterpretq_u64_f64(d), vdupq_n_u64(0x7fc0000000000000L));
i0 = vsubq_u64(vdupq_n_u64(0x7fd0000000000000L), vshrq_n_u64(vaddq_u64(i0, i1), 1));
t = vreinterpretq_f64_u64(i0);
y = vmulq_f64(d, t);
x = vmulq_f64(n, t);
t = vrecpeq_f64(y);
t = vmulq_f64(t, vrecpsq_f64(y, t));
t = vmulq_f64(t, vrecpsq_f64(y, t));
t = vmulq_f64(t, vrecpsq_f64(y, t));
u = vmulq_f64(x, t);
u = vfmaq_f64(u, vfmsq_f64(x, y, u), t);
return u;
#endif
}

static INLINE VECTOR_CC vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y
return vfmsq_f32(z, x, y);
static INLINE VECTOR_CC vdouble vrec_vd_vd(vdouble d) {
#ifndef ENABLE_ALTDIV
return vdiv_vd_vd_vd(vcast_vd_d(1.0f), d);
#else
return vbslq_f64(vceqq_f64(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),
vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d));
#endif
}

static INLINE VECTOR_CC vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z
return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z));
static INLINE VECTOR_CC vdouble vsqrt_vd_vd(vdouble d) {
#ifndef ENABLE_ALTSQRT
return vsqrtq_f64(d);
#else
// Gives a correctly rounded result over the entire input range
vdouble w, x, y, z;
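// Same refinement as the single-precision version, with one additional
// iteration for double precision.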

y = vrsqrteq_f64(d);
x = vmul_vd_vd_vd(d, y); w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);

y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5)); w = vadd_vd_vd_vd(w, w);
w = vmul_vd_vd_vd(w, y);
x = vmul_vd_vd_vd(w, d);
y = vfmapn_vd_vd_vd_vd(w, d, x); z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));
z = vfmanp_vd_vd_vd_vd(w, y, z); w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
w = vfma_vd_vd_vd_vd(w, z, y);
w = vadd_vd_vd_vd(w, x);

return vbslq_f64(vorrq_u64(vceqq_f64(d, vcast_vd_d(0)),
vceqq_f64(d, vcast_vd_d(SLEEF_INFINITY))), d, w);
#endif
}

/* Comparisons */
128 changes: 113 additions & 15 deletions src/arch/helpersve.h
@@ -184,7 +184,7 @@ static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) {
// Broadcast
static INLINE vfloat vcast_vf_f(float f) { return svdup_n_f32(f); }

// Add, Sub, Mul, Reciprocal 1/x, Division, Square root
// Add, Sub, Mul
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {
return svadd_f32_x(ptrue, x, y);
}
@@ -194,13 +194,6 @@ static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {
return svmul_f32_x(ptrue, x, y);
}
static INLINE vfloat vrec_vf_vf(vfloat d) {
return svdivr_n_f32_x(ptrue, d, 1.0f);
}
static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
return svdiv_f32_x(ptrue, n, d);
}
static INLINE vfloat vsqrt_vf_vf(vfloat d) { return svsqrt_f32_x(ptrue, d); }

// |x|, -x
static INLINE vfloat vabs_vf_vf(vfloat f) { return svabs_f32_x(ptrue, f); }
@@ -263,6 +256,60 @@ static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
return svsel_f32(mask, x, y);
}

// Reciprocal 1/x, Division, Square root
static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
#ifndef ENABLE_ALTDIV
return svdiv_f32_x(ptrue, n, d);
#else
// Finite numbers (including denormals) only; gives a correctly rounded result
vfloat t, u, x, y;
svuint32_t i0, i1;
i0 = svand_u32_x(ptrue, svreinterpret_u32_f32(n), svdup_n_u32(0x7c000000));
i1 = svand_u32_x(ptrue, svreinterpret_u32_f32(d), svdup_n_u32(0x7c000000));
i0 = svsub_u32_x(ptrue, svdup_n_u32(0x7d000000), svlsr_n_u32_x(ptrue, svadd_u32_x(ptrue, i0, i1), 1));
t = svreinterpret_f32_u32(i0);
y = svmul_f32_x(ptrue, d, t);
x = svmul_f32_x(ptrue, n, t);
t = svrecpe_f32(y);
t = svmul_f32_x(ptrue, t, svrecps_f32(y, t));
t = svmul_f32_x(ptrue, t, svrecps_f32(y, t));
u = svmul_f32_x(ptrue, x, t);
u = svmad_f32_x(ptrue, svmsb_f32_x(ptrue, y, u, x), t, u);
return u;
#endif
}
static INLINE vfloat vrec_vf_vf(vfloat d) {
#ifndef ENABLE_ALTDIV
return svdivr_n_f32_x(ptrue, d, 1.0f);
#else
return vsel_vf_vo_vf_vf(svcmpeq_f32(ptrue, vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)),
vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d));
#endif
}
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
#ifndef ENABLE_ALTSQRT
return svsqrt_f32_x(ptrue, d);
#else
// Gives a correctly rounded result over the entire input range
vfloat w, x, y, z;

y = svrsqrte_f32(d);
x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y);
y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5));
x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w);

y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w);
w = vmul_vf_vf_vf(w, y);
x = vmul_vf_vf_vf(w, d);
y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1));
z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x);
w = vfma_vf_vf_vf_vf(w, z, y);
w = vadd_vf_vf_vf(w, x);

return svsel_f32(svorr_b_z(ptrue, svcmpeq_f32(ptrue, d, vcast_vf_f(0)),
svcmpeq_f32(ptrue, d, vcast_vf_f(SLEEF_INFINITYf))), d, w);
#endif
}
//
//
//
@@ -526,13 +573,6 @@ static INLINE vdouble vneg_vd_vd(vdouble x) { return svneg_f64_x(ptrue, x); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
return svmul_f64_x(ptrue, x, y);
}
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) {
return svdiv_f64_x(ptrue, x, y);
}
static INLINE vdouble vrec_vd_vd(vdouble x) {
return svdivr_n_f64_x(ptrue, x, 1.0);
}
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return svsqrt_f64_x(ptrue, x); }
static INLINE vdouble vabs_vd_vd(vdouble x) { return svabs_f64_x(ptrue, x); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {
return svmax_f64_x(ptrue, x, y);
@@ -572,6 +612,64 @@ static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y,
return svnmsb_f64_x(ptrue, x, y, z);
}

// Reciprocal 1/x, Division, Square root
static INLINE vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
#ifndef ENABLE_ALTDIV
return svdiv_f64_x(ptrue, n, d);
#else
// Finite numbers (including denormals) only; gives a correctly rounded result
vdouble t, u, x, y;
svuint64_t i0, i1;
i0 = svand_u64_x(ptrue, svreinterpret_u64_f64(n), svdup_n_u64(0x7fc0000000000000L));
i1 = svand_u64_x(ptrue, svreinterpret_u64_f64(d), svdup_n_u64(0x7fc0000000000000L));
i0 = svsub_u64_x(ptrue, svdup_n_u64(0x7fd0000000000000L), svlsr_n_u64_x(ptrue, svadd_u64_x(ptrue, i0, i1), 1));
t = svreinterpret_f64_u64(i0);
y = svmul_f64_x(ptrue, d, t);
x = svmul_f64_x(ptrue, n, t);
t = svrecpe_f64(y);
t = svmul_f64_x(ptrue, t, svrecps_f64(y, t));
t = svmul_f64_x(ptrue, t, svrecps_f64(y, t));
t = svmul_f64_x(ptrue, t, svrecps_f64(y, t));
u = svmul_f64_x(ptrue, x, t);
u = svmad_f64_x(ptrue, svmsb_f64_x(ptrue, y, u, x), t, u);
return u;
#endif
}
static INLINE vdouble vrec_vd_vd(vdouble d) {
#ifndef ENABLE_ALTDIV
return svdivr_n_f64_x(ptrue, d, 1.0);
#else
return vsel_vd_vo_vd_vd(svcmpeq_f64(ptrue, vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),
vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d));
#endif
}
static INLINE vdouble vsqrt_vd_vd(vdouble d) {
#ifndef ENABLE_ALTSQRT
return svsqrt_f64_x(ptrue, d);
#else
// Gives a correctly rounded result over the entire input range
vdouble w, x, y, z;

y = svrsqrte_f64(d);
x = vmul_vd_vd_vd(d, y); w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);

y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5)); w = vadd_vd_vd_vd(w, w);
w = vmul_vd_vd_vd(w, y);
x = vmul_vd_vd_vd(w, d);
y = vfmapn_vd_vd_vd_vd(w, d, x); z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));
z = vfmanp_vd_vd_vd_vd(w, y, z); w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
w = vfma_vd_vd_vd_vd(w, z, y);
w = vadd_vd_vd_vd(w, x);

return svsel_f64(svorr_b_z(ptrue, svcmpeq_f64(ptrue, d, vcast_vd_d(0)),
svcmpeq_f64(ptrue, d, vcast_vd_d(SLEEF_INFINITY))), d, w);
#endif
}

// Float comparison
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {
return svcmplt_f64(ptrue, x, y);