more careful use of inline/template function in perfkernels (#15388)
Summary:
Pull Request resolved: #15388

This is another pass to make the perfkernels code safer against illegal-instruction errors.
Removed the dependency on c10/util/Logging.h.
We err on the safe side at the expense of some verbosity.

Reviewed By: dskhudia

Differential Revision: D13502902

fbshipit-source-id: 4f833115df885c5b4f8c1ca83b9badea1553f944
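
Note: the BASE_DO and AVX_F16C_DO macros used throughout this commit are defined in caffe2/perfkernels/common.h and are not part of this diff. A minimal sketch of the dispatch pattern they implement — with cpu_has_avx_f16c() as a hypothetical stand-in for the real runtime CPU-feature check — is shown below: each kernel is compiled twice, once without ISA-specific flags (the __base copy) and once with AVX/F16C flags (the __avx_f16c copy), and the generic entry point picks one at runtime so AVX instructions never execute on CPUs that lack them.

// Sketch of the perfkernels runtime-dispatch pattern (assumed, simplified;
// the real macros live in caffe2/perfkernels/common.h).
void adagrad_update__base(
    int N, const float* w, const float* g, const float* h,
    float* nw, float* nh, float epsilon, float decay, float lr);
void adagrad_update__avx_f16c(
    int N, const float* w, const float* g, const float* h,
    float* nw, float* nh, float epsilon, float decay, float lr);
bool cpu_has_avx_f16c(); // hypothetical runtime CPU-feature query

void adagrad_update(
    int N, const float* w, const float* g, const float* h,
    float* nw, float* nh, float epsilon, float decay, float lr) {
  if (cpu_has_avx_f16c()) {
    // roughly what AVX_F16C_DO(adagrad_update, ...) expands to
    adagrad_update__avx_f16c(N, w, g, h, nw, nh, epsilon, decay, lr);
    return;
  }
  // roughly what BASE_DO(adagrad_update, ...) expands to
  adagrad_update__base(N, w, g, h, nw, nh, epsilon, decay, lr);
}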
jspark1105 authored and facebook-github-bot committed Jan 31, 2019
1 parent 26200eb commit db12137
Showing 14 changed files with 1,262 additions and 1,255 deletions.
54 changes: 24 additions & 30 deletions caffe2/perfkernels/adagrad.cc
@@ -71,6 +71,22 @@ void rowwise_adagrad_update__base(
   internal::rowwise_adagrad_update_inlined(N, w, w_n, g, h, h_n, epsilon, lr);
 }
 
+// version without prefetching
+decltype(adagrad_update__base) adagrad_update__avx_f16c;
+void adagrad_update(
+    int N,
+    const float* w,
+    const float* g,
+    const float* h,
+    float* nw,
+    float* nh,
+    float epsilon,
+    float decay,
+    float lr) {
+  AVX_F16C_DO(adagrad_update, N, w, g, h, nw, nh, epsilon, decay, lr);
+  BASE_DO(adagrad_update, N, w, g, h, nw, nh, epsilon, decay, lr);
+}
+
 decltype(adagrad_update_prefetch__base) adagrad_update_prefetch__avx_f16c;
 void adagrad_update_prefetch(
     int N,
@@ -184,27 +200,11 @@ void rowwise_adagrad_update(
   BASE_DO(rowwise_adagrad_update, N, w, w_n, g, h, h_n, epsilon, lr);
 }
 
-// version without prefetching
-decltype(adagrad_update__base) adagrad_update__avx_f16c;
-void adagrad_update(
-    int N,
-    const float* w,
-    const float* g,
-    const float* h,
-    float* nw,
-    float* nh,
-    float epsilon,
-    float decay,
-    float lr) {
-  AVX_F16C_DO(adagrad_update, N, w, g, h, nw, nh, epsilon, decay, lr);
-  BASE_DO(adagrad_update, N, w, g, h, nw, nh, epsilon, decay, lr);
-}
-
 SPARSE_ADAGRAD_SPECIALIZATION(int32_t, base);
 
 decltype(sparse_adagrad_int32_t__base) sparse_adagrad_int32_t__avx_f16c;
 template <>
-void sparse_adagrad(
+int sparse_adagrad(
     int num_rows,
     int block_size,
     uint64_t param_size,
@@ -215,8 +215,7 @@ void sparse_adagrad(
     float* nw,
     float* nh,
     float epsilon,
-    float lr,
-    const std::string& param_name) {
+    float lr) {
   AVX_F16C_DO(
       sparse_adagrad_int32_t,
       num_rows,
@@ -229,8 +228,7 @@ void sparse_adagrad(
       nw,
       nh,
       epsilon,
-      lr,
-      param_name);
+      lr);
   BASE_DO(
       sparse_adagrad_int32_t,
       num_rows,
@@ -243,15 +241,14 @@ void sparse_adagrad(
       nw,
       nh,
       epsilon,
-      lr,
-      param_name);
+      lr);
 }
 
 SPARSE_ADAGRAD_SPECIALIZATION(int64_t, base);
 
 decltype(sparse_adagrad_int64_t__base) sparse_adagrad_int64_t__avx_f16c;
 template <>
-void sparse_adagrad(
+int sparse_adagrad(
     int num_rows,
     int block_size,
     uint64_t param_size,
@@ -262,8 +259,7 @@ void sparse_adagrad(
     float* nw,
     float* nh,
     float epsilon,
-    float lr,
-    const std::string& param_name) {
+    float lr) {
   AVX_F16C_DO(
       sparse_adagrad_int64_t,
       num_rows,
@@ -276,8 +272,7 @@ void sparse_adagrad(
       nw,
       nh,
       epsilon,
-      lr,
-      param_name);
+      lr);
   BASE_DO(
       sparse_adagrad_int64_t,
       num_rows,
@@ -290,8 +285,7 @@ void sparse_adagrad(
       nw,
       nh,
       epsilon,
-      lr,
-      param_name);
+      lr);
 }
 
 } // namespace caffe2
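
Note: with the CAFFE_ENFORCE_GE check gone from the kernel (and with it the Logging.h dependency), out-of-bound detection moves into the return value. A hedged sketch of a caller consuming the new contract follows; it assumes caffe2/perfkernels/adagrad.h is included, and the wrapper name and the exception are illustrative — a Caffe2 operator would raise the error via CAFFE_ENFORCE at the call site.

#include <cstdint>
#include <sstream>
#include <stdexcept>

// Hypothetical wrapper: sparse_adagrad now returns num_rows on success, or
// the first row whose index would read past param_size. Error reporting
// (and any logging dependency) stays outside the perfkernel.
template <typename SIndex>
void sparse_adagrad_or_throw(
    int num_rows, int block_size, std::uint64_t param_size,
    const float* w, const float* g, const float* h, const SIndex* indices,
    float* nw, float* nh, float epsilon, float lr) {
  const int last_row = caffe2::sparse_adagrad(
      num_rows, block_size, param_size, w, g, h, indices, nw, nh, epsilon, lr);
  if (last_row != num_rows) {
    std::ostringstream msg;
    msg << "out of bound, idx: " << indices[last_row]
        << " for input i: " << last_row << " and block size: " << block_size
        << " max size: " << param_size;
    throw std::runtime_error(msg.str()); // illustrative, not Caffe2's message
  }
}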
51 changes: 32 additions & 19 deletions caffe2/perfkernels/adagrad.h
@@ -6,12 +6,14 @@
 #include <immintrin.h>
 #endif
 #include <c10/util/Half.h>
-#include <c10/util/Logging.h>
 
 namespace caffe2 {
 
 namespace internal {
 
+// The following functions inside the internal namespace are inlined because
+// they are performance critical.
+
 template <typename T>
 static inline void adagrad_update_base_inlined(
     int N,
@@ -31,6 +33,23 @@ static inline void adagrad_update_base_inlined(
   }
 }
 
+// version with prefetching
+// TODO(msmelyan)
+// The crux of the computation is computing a / (sqrt(b) + epsilon),
+// where a and b are vectors and epsilon is very small (e.g., 10^-5) and does
+// not change. Today it's computed using two vector sqrt and vector divide SIMD
+// instructions. It is slow. We can take advantage of the existing fast vector
+// VRSQRTPS instruction that computes approximate reciprocals of square roots
+// of the vector. It is 6x faster than the vsqrt and vdiv combination. Since
+// the addition of epsilon is just done to avoid division by zero, we
+// approximate a / (sqrt(b) + epsilon) by a / (sqrt(b + sqrt(epsilon))). If we
+// do that, we can use VRSQRTPS instead. VRSQRTPS is not very accurate.
+// Specifically, for a test on random numbers between 0.1 and 1 the absolute
+// error was about 10^-3 compared to using the slower but more accurate
+// combination of vsqrt and vdiv. Extend Marat's function with more NR
+// iterations to get more accuracy for training.
+// TODO(msmelyan)
+// explore streaming stores, but need to have unique indices (deduplication)
 inline void adagrad_update_prefetch_inlined(
     int N,
     const float* w,
@@ -238,8 +257,12 @@ void adagrad_update(
     float decay,
     float lr);
 
+/**
+ * @return num_rows if the update succeeds; otherwise the row idx at which we
+ * pass the boundary of param_size
+ */
 template <typename SIndex>
-void sparse_adagrad(
+int sparse_adagrad(
     int num_rows, // number of rows being read
     int block_size, // number of parameters per row
     std::uint64_t param_size, // total number of parameters
@@ -250,11 +273,10 @@ void sparse_adagrad(
     float* nw, // output parameters
     float* nh, // output momentums
     float epsilon,
-    float lr,
-    const std::string& param_name); // name of parameters (for error reporting)
+    float lr);
 
 #define SPARSE_ADAGRAD_SPECIALIZATION(SIndex, ISA)  \
-  void sparse_adagrad_##SIndex##__##ISA(            \
+  int sparse_adagrad_##SIndex##__##ISA(             \
       int num_rows,                                 \
       int block_size,                               \
       std::uint64_t param_size,                     \
@@ -265,25 +287,15 @@ void sparse_adagrad(
       float* nw,                                    \
       float* nh,                                    \
       float epsilon,                                \
-      float lr,                                     \
-      const std::string& param_name) {              \
+      float lr) {                                   \
     for (int i = 0; i < num_rows; ++i) {            \
       auto idx = indices[i];                        \
       auto offsetI = i * block_size;                \
       auto offsetIdx = idx * block_size;            \
                                                     \
-      CAFFE_ENFORCE_GE(                             \
-          param_size,                               \
-          block_size + offsetIdx,                   \
-          param_name,                               \
-          ", out of bound, idx:",                   \
-          idx,                                      \
-          " for input i:",                          \
-          i,                                        \
-          " and block size:",                       \
-          block_size,                               \
-          " max size:",                             \
-          param_size);                              \
+      if (block_size + offsetIdx > param_size) {    \
+        return i;                                   \
+      }                                             \
                                                     \
       if (block_size == 1) {                        \
         float gi = g[i];                            \
@@ -309,6 +321,7 @@ void sparse_adagrad(
             lr);                                    \
       }                                             \
     }                                               \
+    return num_rows;                                \
   };
 
 } // namespace caffe2
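
Note: the body of adagrad_update_base_inlined is elided in the hunk above, but the Adagrad recurrence that all of these kernels implement can be read off the scalar tail loop of the AVX version in the next file. As a self-contained reference sketch:

#include <cmath>

// Scalar sketch of the Adagrad update these kernels vectorize:
//   nh[i] = decay * h[i] + g[i] * g[i]               (squared-gradient state)
//   nw[i] = w[i] + lr * g[i] / (sqrt(nh[i]) + epsilon)
void adagrad_update_scalar(int N, const float* w, const float* g,
                           const float* h, float* nw, float* nh,
                           float epsilon, float decay, float lr) {
  for (int i = 0; i < N; ++i) {
    float gi = g[i];
    float hi = nh[i] = decay * h[i] + gi * gi;
    nw[i] = w[i] + lr * gi / (std::sqrt(hi) + epsilon);
  }
}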
85 changes: 34 additions & 51 deletions caffe2/perfkernels/adagrad_avx.cc
@@ -6,23 +6,40 @@
 
 namespace caffe2 {
 
-// version with prefetching
-// TODO(msmelyan)
-// The crux of the computation is computing a / (sqrt(b) + epsilon),
-// where a and b are vectors and epsilon is very small (e.g., 10^-5) and does
-// not change. Today it's computed using two vector sqrt and vector divide SIMD
-// instructions. It is slow. We can take advantage of the existing fast vector
-// VRSQRTPS instruction that computes approximate reciprocals of square roots
-// of the vector. It is 6x faster than the vsqrt and vdiv combination. Since
-// the addition of epsilon is just done to avoid division by zero, we
-// approximate a / (sqrt(b) + epsilon) by a / (sqrt(b + sqrt(epsilon))). If we
-// do that, we can use VRSQRTPS instead. VRSQRTPS is not very accurate.
-// Specifically, for a test on random numbers between 0.1 and 1 the absolute
-// error was about 10^-3 compared to using the slower but more accurate
-// combination of vsqrt and vdiv. Extend Marat's function with more NR
-// iterations to get more accuracy for training.
-// TODO(msmelyan)
-// explore streaming stores, but need to have unique indices (deduplication)
+// version without prefetching
+void adagrad_update__avx_f16c(
+    int N,
+    const float* w,
+    const float* g,
+    const float* h,
+    float* nw,
+    float* nh,
+    float epsilon,
+    float decay,
+    float lr) {
+  constexpr size_t kSize = 8;
+  auto i = 0;
+  for (; i + kSize <= N; i += kSize) {
+    __m256 gi = _mm256_loadu_ps(g + i);
+    __m256 hi = _mm256_loadu_ps(h + i);
+    __m256 wi = _mm256_loadu_ps(w + i);
+
+    __m256 nhi = _mm256_add_ps(
+        _mm256_mul_ps(_mm256_set1_ps(decay), hi), _mm256_mul_ps(gi, gi));
+    _mm256_storeu_ps(nh + i, nhi);
+    __m256 vtmp = _mm256_div_ps(
+        gi, _mm256_add_ps(_mm256_sqrt_ps(nhi), _mm256_set1_ps(epsilon)));
+    _mm256_storeu_ps(
+        nw + i, _mm256_add_ps(wi, _mm256_mul_ps(_mm256_set1_ps(lr), vtmp)));
+  }
+
+  for (; i < N; ++i) {
+    float gi = g[i];
+    float hi = nh[i] = decay * h[i] + gi * gi;
+    nw[i] = w[i] + lr * gi / (std::sqrt(hi) + epsilon);
+  }
+}
+
 void adagrad_update_prefetch__avx_f16c(
     int N,
     const float* w,
@@ -108,40 +125,6 @@ void rowwise_adagrad_update__avx_f16c(
   internal::rowwise_adagrad_update_inlined(N, w, w_n, g, h, h_n, epsilon, lr);
 }
 
-// version without prefetching
-void adagrad_update__avx_f16c(
-    int N,
-    const float* w,
-    const float* g,
-    const float* h,
-    float* nw,
-    float* nh,
-    float epsilon,
-    float decay,
-    float lr) {
-  constexpr int kSize = 8;
-  auto i = 0;
-  for (; i + kSize <= N; i += kSize) {
-    __m256 gi = _mm256_loadu_ps(g + i);
-    __m256 hi = _mm256_loadu_ps(h + i);
-    __m256 wi = _mm256_loadu_ps(w + i);
-
-    __m256 nhi = _mm256_add_ps(
-        _mm256_mul_ps(_mm256_set1_ps(decay), hi), _mm256_mul_ps(gi, gi));
-    _mm256_storeu_ps(nh + i, nhi);
-    __m256 vtmp = _mm256_div_ps(
-        gi, _mm256_add_ps(_mm256_sqrt_ps(nhi), _mm256_set1_ps(epsilon)));
-    _mm256_storeu_ps(
-        nw + i, _mm256_add_ps(wi, _mm256_mul_ps(_mm256_set1_ps(lr), vtmp)));
-  }
-
-  for (; i < N; ++i) {
-    float gi = g[i];
-    float hi = nh[i] = decay * h[i] + gi * gi;
-    nw[i] = w[i] + lr * gi / (std::sqrt(hi) + epsilon);
-  }
-}
-
 SPARSE_ADAGRAD_SPECIALIZATION(int32_t, avx_f16c);
 SPARSE_ADAGRAD_SPECIALIZATION(int64_t, avx_f16c);
 
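
Note: the TODO comment moved into adagrad.h above proposes replacing the vsqrt + vdiv pair with VRSQRTPS. A sketch of that proposed (not yet implemented) optimization follows, using one Newton-Raphson refinement step; the helper name is illustrative, and it must be compiled with AVX enabled.

#include <immintrin.h>

// Sketch (assumption, not part of this commit): approximate
// a / (sqrt(b) + epsilon) as a * rsqrt(b + adjusted_epsilon), folding the
// epsilon into the radicand as the TODO suggests (e.g. sqrt(epsilon)), so
// the slow _mm256_sqrt_ps / _mm256_div_ps pair is replaced by the fast but
// less accurate VRSQRTPS plus one Newton-Raphson step.
static inline __m256 approx_div_by_sqrt(
    __m256 a, __m256 b, float adjusted_epsilon) {
  __m256 x = _mm256_add_ps(b, _mm256_set1_ps(adjusted_epsilon));
  __m256 y = _mm256_rsqrt_ps(x); // ~11-bit approximation of 1/sqrt(x)
  // One Newton-Raphson refinement: y <- y * (1.5 - 0.5 * x * y * y)
  __m256 y2 = _mm256_mul_ps(y, y);
  __m256 half_x = _mm256_mul_ps(_mm256_set1_ps(0.5f), x);
  y = _mm256_mul_ps(
      y, _mm256_sub_ps(_mm256_set1_ps(1.5f), _mm256_mul_ps(half_x, y2)));
  return _mm256_mul_ps(a, y);
}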
