[AArch64] Add alternative division and sqrt methods
With this patch, alternative division and sqrt implementations can be selected by passing -DENABLE_ALTDIV=TRUE and -DENABLE_ALTSQRT=TRUE as CMake options.
The alternative methods use combinations of FMA operations to compute division and sqrt.
They may be beneficial on micro-architectures where the corresponding instructions are not pipelined and have long latencies.

Co-authored-by: shibatch <shibatch.sf.net@gmail.com>
shibatch and shibatch committed Mar 21, 2020
1 parent 374fa97 commit 1ba1836
Showing 5 changed files with 260 additions and 40 deletions.
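
For orientation, the scheme the commit message describes can be sketched in scalar C: start from a crude reciprocal seed, refine it with FMA-based Newton-Raphson steps, then apply one FMA residual correction to the quotient. This is an illustrative sketch only — the function name, the seed constant, and the use of fmaf() from <math.h> are assumptions made here; the committed vector code instead seeds with vrecpe/svrecpe, refines with vrecps/svrecps, and rescales both operands by a shared power of two before iterating.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative scalar sketch of FMA-based division (positive, normal d only). */
static float altdiv_sketch(float n, float d) {
  uint32_t bits;
  memcpy(&bits, &d, sizeof bits);
  bits = 0x7f000000u - bits;              /* crude reciprocal seed: flips the exponent */
  float t;
  memcpy(&t, &bits, sizeof t);
  for (int i = 0; i < 3; i++)
    t = fmaf(fmaf(-d, t, 1.0f), t, t);    /* Newton-Raphson step: t += (1 - d*t) * t */
  float u = n * t;                        /* quotient estimate */
  u = fmaf(fmaf(-d, u, n), t, u);         /* residual correction: u += (n - d*u) * t */
  return u;
}

int main(void) {
  printf("%.9g\n", altdiv_sketch(1.0f, 3.0f));  /* prints a value very close to 1/3 */
  return 0;
}

The same idea carries over to sqrt: the diffs below seed with a reciprocal-square-root estimate and refine it with FMAs before a final correction and special-case selection.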
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -13,6 +13,9 @@ option(SLEEF_SHOW_ERROR_LOG "Show cmake error log." OFF)

option(ENFORCE_TESTER3 "Build fails if tester3 is not built" OFF)

option(ENABLE_ALTDIV "Enable alternative division method (aarch64 only)" OFF)
option(ENABLE_ALTSQRT "Enable alternative sqrt method (aarch64 only)" OFF)

# See doc/build-with-cmake.md for instructions on how to build Sleef.
cmake_minimum_required(VERSION 3.4.3)
# Set to NEW when updating cmake_minimum_required to VERSION >= 3.7.2
@@ -23,7 +26,7 @@ endif()
enable_testing()

set(SLEEF_VERSION_MAJOR 3)
set(SLEEF_VERSION_MINOR 4)
set(SLEEF_VERSION_MINOR 5)
set(SLEEF_VERSION_PATCHLEVEL 0)
set(SLEEF_VERSION ${SLEEF_VERSION_MAJOR}.${SLEEF_VERSION_MINOR}.${SLEEF_VERSION_PATCHLEVEL})
set(SLEEF_SOVERSION ${SLEEF_VERSION_MAJOR})
147 changes: 124 additions & 23 deletions src/arch/helperadvsimd.h
@@ -134,7 +134,7 @@ static INLINE VECTOR_CC vint2 vreinterpret_vi2_vd(vdouble vd) {
// Broadcast
static INLINE VECTOR_CC vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }

// Add, Sub, Mul, Reciprocal 1/x, Division, Square root
// Add, Sub, Mul
static INLINE VECTOR_CC vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {
return vaddq_f32(x, y);
}
@@ -144,13 +144,6 @@ static INLINE VECTOR_CC vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {
static INLINE VECTOR_CC vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {
return vmulq_f32(x, y);
}
static INLINE VECTOR_CC vfloat vrec_vf_vf(vfloat d) {
return vdivq_f32(vcast_vf_f(1.0f), d);
}
static INLINE VECTOR_CC vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
return vdivq_f32(n, d);
}
static INLINE VECTOR_CC vfloat vsqrt_vf_vf(vfloat d) { return vsqrtq_f32(d); }

// |x|, -x
static INLINE VECTOR_CC vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
@@ -175,6 +168,74 @@ static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#endif

static INLINE VECTOR_CC vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y
return vfmaq_f32(z, x, y);
}

static INLINE VECTOR_CC vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y
return vfmsq_f32(z, x, y);
}

static INLINE VECTOR_CC vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z
return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z));
}

// Reciprocal 1/x, Division, Square root
static INLINE VECTOR_CC vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
#ifndef ENABLE_ALTDIV
return vdivq_f32(n, d);
#else
// Finite numbers (including denormals) only; gives a correctly rounded result
float32x4_t t, u, x, y;
uint32x4_t i0, i1;
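// Rescale n and d by a shared power of two derived from their exponents,
// so the refinement below works on operands of moderate magnitude
// (the quotient x/y is the same as n/d).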
i0 = vandq_u32(vreinterpretq_u32_f32(n), vdupq_n_u32(0x7c000000));
i1 = vandq_u32(vreinterpretq_u32_f32(d), vdupq_n_u32(0x7c000000));
i0 = vsubq_u32(vdupq_n_u32(0x7d000000), vshrq_n_u32(vaddq_u32(i0, i1), 1));
t = vreinterpretq_f32_u32(i0);
y = vmulq_f32(d, t);
x = vmulq_f32(n, t);
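// Reciprocal estimate of the scaled divisor, refined with two
// Newton-Raphson steps (vrecps computes 2 - y*t).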
t = vrecpeq_f32(y);
t = vmulq_f32(t, vrecpsq_f32(y, t));
t = vmulq_f32(t, vrecpsq_f32(y, t));
u = vmulq_f32(x, t);
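// Final FMA residual correction: u += (x - y*u) * t.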
u = vfmaq_f32(u, vfmsq_f32(x, y, u), t);
return u;
#endif
}
static INLINE VECTOR_CC vfloat vrec_vf_vf(vfloat d) {
#ifndef ENABLE_ALTDIV
return vdiv_vf_vf_vf(vcast_vf_f(1.0f), d);
#else
return vbslq_f32(vceqq_f32(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)),
vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d));
#endif
}

static INLINE VECTOR_CC vfloat vsqrt_vf_vf(vfloat d) {
#ifndef ENABLE_ALTSQRT
return vsqrtq_f32(d);
#else
// Gives a correctly rounded result over the entire input range
vfloat w, x, y, z;
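// Start from a reciprocal-square-root estimate and refine with FMAs so that
// x converges to sqrt(d) and w to 0.5/sqrt(d).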

y = vrsqrteq_f32(d);
x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y);
y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5));
x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w);

y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w);
w = vmul_vf_vf_vf(w, y);
x = vmul_vf_vf_vf(w, d);
y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1));
z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x);
w = vfma_vf_vf_vf_vf(w, z, y);
w = vadd_vf_vf_vf(w, x);
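// The iteration does not handle d == 0 or d == infinity; for those lanes
// sqrt(d) == d, so select d directly.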

return vbslq_f32(vorrq_u32(vceqq_f32(d, vcast_vf_f(0)),
vceqq_f32(d, vcast_vf_f(SLEEF_INFINITYf))), d, w);
#endif
}

// max, min
static INLINE VECTOR_CC vfloat vmax_vf_vf_vf(vfloat x, vfloat y) {
return vmaxq_f32(x, y);
@@ -271,7 +332,7 @@ static INLINE VECTOR_CC vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) {
// Broadcast
static INLINE VECTOR_CC vdouble vcast_vd_d(double f) { return vdupq_n_f64(f); }

// Add, Sub, Mul, Reciprocal 1/x, Division, Square root
// Add, Sub, Mul
static INLINE VECTOR_CC vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {
return vaddq_f64(x, y);
}
@@ -281,13 +342,6 @@ static INLINE VECTOR_CC vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {
static INLINE VECTOR_CC vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
return vmulq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vrec_vd_vd(vdouble d) {
return vdivq_f64(vcast_vd_d(1.0f), d);
}
static INLINE VECTOR_CC vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
return vdivq_f64(n, d);
}
static INLINE VECTOR_CC vdouble vsqrt_vd_vd(vdouble d) { return vsqrtq_f64(d); }

// |x|, -x
static INLINE VECTOR_CC vdouble vabs_vd_vd(vdouble f) { return vabsq_f64(f); }
@@ -332,16 +386,63 @@ static INLINE VECTOR_CC vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble
return vneg_vd_vd(vfmanp_vd_vd_vd_vd(x, y, z));
}

static INLINE VECTOR_CC vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y
return vfmaq_f32(z, x, y);
// Reciprocal 1/x, Division, Square root
static INLINE VECTOR_CC vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
#ifndef ENABLE_ALTDIV
return vdivq_f64(n, d);
#else
// Finite numbers (including denormals) only; gives a correctly rounded result
float64x2_t t, u, x, y;
uint64x2_t i0, i1;
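// Same scheme as the single-precision version above, with one extra
// Newton-Raphson refinement step for double precision.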
i0 = vandq_u64(vreinterpretq_u64_f64(n), vdupq_n_u64(0x7fc0000000000000L));
i1 = vandq_u64(vreinterpretq_u64_f64(d), vdupq_n_u64(0x7fc0000000000000L));
i0 = vsubq_u64(vdupq_n_u64(0x7fd0000000000000L), vshrq_n_u64(vaddq_u64(i0, i1), 1));
t = vreinterpretq_f64_u64(i0);
y = vmulq_f64(d, t);
x = vmulq_f64(n, t);
t = vrecpeq_f64(y);
t = vmulq_f64(t, vrecpsq_f64(y, t));
t = vmulq_f64(t, vrecpsq_f64(y, t));
t = vmulq_f64(t, vrecpsq_f64(y, t));
u = vmulq_f64(x, t);
u = vfmaq_f64(u, vfmsq_f64(x, y, u), t);
return u;
#endif
}

static INLINE VECTOR_CC vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y
return vfmsq_f32(z, x, y);
static INLINE VECTOR_CC vdouble vrec_vd_vd(vdouble d) {
#ifndef ENABLE_ALTDIV
return vdiv_vd_vd_vd(vcast_vd_d(1.0f), d);
#else
return vbslq_f64(vceqq_f64(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),
vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d));
#endif
}

static INLINE VECTOR_CC vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z
return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z));
static INLINE VECTOR_CC vdouble vsqrt_vd_vd(vdouble d) {
#ifndef ENABLE_ALTSQRT
return vsqrtq_f64(d);
#else
// Gives a correctly rounded result over the entire input range
vdouble w, x, y, z;
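// Same refinement as the single-precision version, with one additional
// iteration for double precision.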

y = vrsqrteq_f64(d);
x = vmul_vd_vd_vd(d, y); w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);

y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5)); w = vadd_vd_vd_vd(w, w);
w = vmul_vd_vd_vd(w, y);
x = vmul_vd_vd_vd(w, d);
y = vfmapn_vd_vd_vd_vd(w, d, x); z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));
z = vfmanp_vd_vd_vd_vd(w, y, z); w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
w = vfma_vd_vd_vd_vd(w, z, y);
w = vadd_vd_vd_vd(w, x);

return vbslq_f64(vorrq_u64(vceqq_f64(d, vcast_vd_d(0)),
vceqq_f64(d, vcast_vd_d(SLEEF_INFINITY))), d, w);
#endif
}

/* Comparisons */
128 changes: 113 additions & 15 deletions src/arch/helpersve.h
@@ -184,7 +184,7 @@ static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) {
// Broadcast
static INLINE vfloat vcast_vf_f(float f) { return svdup_n_f32(f); }

// Add, Sub, Mul, Reciprocal 1/x, Division, Square root
// Add, Sub, Mul
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {
return svadd_f32_x(ptrue, x, y);
}
@@ -194,13 +194,6 @@ static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {
return svmul_f32_x(ptrue, x, y);
}
static INLINE vfloat vrec_vf_vf(vfloat d) {
return svdivr_n_f32_x(ptrue, d, 1.0f);
}
static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
return svdiv_f32_x(ptrue, n, d);
}
static INLINE vfloat vsqrt_vf_vf(vfloat d) { return svsqrt_f32_x(ptrue, d); }

// |x|, -x
static INLINE vfloat vabs_vf_vf(vfloat f) { return svabs_f32_x(ptrue, f); }
@@ -263,6 +256,60 @@ static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
return svsel_f32(mask, x, y);
}

// Reciprocal 1/x, Division, Square root
static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
#ifndef ENABLE_ALTDIV
return svdiv_f32_x(ptrue, n, d);
#else
// Finite numbers (including denormals) only; gives a correctly rounded result
vfloat t, u, x, y;
svuint32_t i0, i1;
i0 = svand_u32_x(ptrue, svreinterpret_u32_f32(n), svdup_n_u32(0x7c000000));
i1 = svand_u32_x(ptrue, svreinterpret_u32_f32(d), svdup_n_u32(0x7c000000));
i0 = svsub_u32_x(ptrue, svdup_n_u32(0x7d000000), svlsr_n_u32_x(ptrue, svadd_u32_x(ptrue, i0, i1), 1));
t = svreinterpret_f32_u32(i0);
y = svmul_f32_x(ptrue, d, t);
x = svmul_f32_x(ptrue, n, t);
t = svrecpe_f32(y);
t = svmul_f32_x(ptrue, t, svrecps_f32(y, t));
t = svmul_f32_x(ptrue, t, svrecps_f32(y, t));
u = svmul_f32_x(ptrue, x, t);
u = svmad_f32_x(ptrue, svmsb_f32_x(ptrue, y, u, x), t, u);
return u;
#endif
}
static INLINE vfloat vrec_vf_vf(vfloat d) {
#ifndef ENABLE_ALTDIV
return svdivr_n_f32_x(ptrue, d, 1.0f);
#else
return vsel_vf_vo_vf_vf(svcmpeq_f32(ptrue, vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)),
vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d));
#endif
}
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
#ifndef ENABLE_ALTSQRT
return svsqrt_f32_x(ptrue, d);
#else
// Gives a correctly rounded result over the entire input range
vfloat w, x, y, z;

y = svrsqrte_f32(d);
x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y);
y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5));
x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w);

y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w);
w = vmul_vf_vf_vf(w, y);
x = vmul_vf_vf_vf(w, d);
y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1));
z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x);
w = vfma_vf_vf_vf_vf(w, z, y);
w = vadd_vf_vf_vf(w, x);

return svsel_f32(svorr_b_z(ptrue, svcmpeq_f32(ptrue, d, vcast_vf_f(0)),
svcmpeq_f32(ptrue, d, vcast_vf_f(SLEEF_INFINITYf))), d, w);
#endif
}
//
//
//
@@ -526,13 +573,6 @@ static INLINE vdouble vneg_vd_vd(vdouble x) { return svneg_f64_x(ptrue, x); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
return svmul_f64_x(ptrue, x, y);
}
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) {
return svdiv_f64_x(ptrue, x, y);
}
static INLINE vdouble vrec_vd_vd(vdouble x) {
return svdivr_n_f64_x(ptrue, x, 1.0);
}
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return svsqrt_f64_x(ptrue, x); }
static INLINE vdouble vabs_vd_vd(vdouble x) { return svabs_f64_x(ptrue, x); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {
return svmax_f64_x(ptrue, x, y);
@@ -572,6 +612,64 @@ static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y,
return svnmsb_f64_x(ptrue, x, y, z);
}

// Reciprocal 1/x, Division, Square root
static INLINE vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
#ifndef ENABLE_ALTDIV
return svdiv_f64_x(ptrue, n, d);
#else
// Finite numbers (including denormals) only; gives a correctly rounded result
vdouble t, u, x, y;
svuint64_t i0, i1;
i0 = svand_u64_x(ptrue, svreinterpret_u64_f64(n), svdup_n_u64(0x7fc0000000000000L));
i1 = svand_u64_x(ptrue, svreinterpret_u64_f64(d), svdup_n_u64(0x7fc0000000000000L));
i0 = svsub_u64_x(ptrue, svdup_n_u64(0x7fd0000000000000L), svlsr_n_u64_x(ptrue, svadd_u64_x(ptrue, i0, i1), 1));
t = svreinterpret_f64_u64(i0);
y = svmul_f64_x(ptrue, d, t);
x = svmul_f64_x(ptrue, n, t);
t = svrecpe_f64(y);
t = svmul_f64_x(ptrue, t, svrecps_f64(y, t));
t = svmul_f64_x(ptrue, t, svrecps_f64(y, t));
t = svmul_f64_x(ptrue, t, svrecps_f64(y, t));
u = svmul_f64_x(ptrue, x, t);
u = svmad_f64_x(ptrue, svmsb_f64_x(ptrue, y, u, x), t, u);
return u;
#endif
}
static INLINE vdouble vrec_vd_vd(vdouble d) {
#ifndef ENABLE_ALTDIV
return svdivr_n_f64_x(ptrue, d, 1.0);
#else
return vsel_vd_vo_vd_vd(svcmpeq_f64(ptrue, vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),
vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d));
#endif
}
static INLINE vdouble vsqrt_vd_vd(vdouble d) {
#ifndef ENABLE_ALTSQRT
return svsqrt_f64_x(ptrue, d);
#else
// Gives a correctly rounded result over the entire input range
vdouble w, x, y, z;

y = svrsqrte_f64(d);
x = vmul_vd_vd_vd(d, y); w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);
y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);

y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5)); w = vadd_vd_vd_vd(w, w);
w = vmul_vd_vd_vd(w, y);
x = vmul_vd_vd_vd(w, d);
y = vfmapn_vd_vd_vd_vd(w, d, x); z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));
z = vfmanp_vd_vd_vd_vd(w, y, z); w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
w = vfma_vd_vd_vd_vd(w, z, y);
w = vadd_vd_vd_vd(w, x);

return svsel_f64(svorr_b_z(ptrue, svcmpeq_f64(ptrue, d, vcast_vd_d(0)),
svcmpeq_f64(ptrue, d, vcast_vd_d(SLEEF_INFINITY))), d, w);
#endif
}

// Float comparison
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {
return svcmplt_f64(ptrue, x, y);