more careful use of inline/template function in perfkernels (#15388)
Summary:
Pull Request resolved: #15388

This is another pass to make the perfkernels code safer against illegal-instruction errors.
Removed the dependency on c10/util/Logging.h.
We err on the safe side at the expense of some verbosity.

Reviewed By: dskhudia

Differential Revision: D13502902

fbshipit-source-id: 4f833115df885c5b4f8c1ca83b9badea1553f944
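
Note: the BASE_DO and AVX_F16C_DO macros used throughout this commit are defined in caffe2/perfkernels/common.h and are not part of this diff. A minimal sketch of the dispatch pattern they implement — with cpu_has_avx_f16c() as a hypothetical stand-in for the real runtime CPU-feature check — is shown below: each kernel is compiled twice, once without ISA-specific flags (the __base copy) and once with AVX/F16C flags (the __avx_f16c copy), and the generic entry point picks one at runtime so AVX instructions never execute on CPUs that lack them.

// Sketch of the perfkernels runtime-dispatch pattern (assumed, simplified;
// the real macros live in caffe2/perfkernels/common.h).
void adagrad_update__base(
    int N, const float* w, const float* g, const float* h,
    float* nw, float* nh, float epsilon, float decay, float lr);
void adagrad_update__avx_f16c(
    int N, const float* w, const float* g, const float* h,
    float* nw, float* nh, float epsilon, float decay, float lr);
bool cpu_has_avx_f16c(); // hypothetical runtime CPU-feature query

void adagrad_update(
    int N, const float* w, const float* g, const float* h,
    float* nw, float* nh, float epsilon, float decay, float lr) {
  if (cpu_has_avx_f16c()) {
    // roughly what AVX_F16C_DO(adagrad_update, ...) expands to
    adagrad_update__avx_f16c(N, w, g, h, nw, nh, epsilon, decay, lr);
    return;
  }
  // roughly what BASE_DO(adagrad_update, ...) expands to
  adagrad_update__base(N, w, g, h, nw, nh, epsilon, decay, lr);
}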
jspark1105 authored and facebook-github-bot committed Jan 31, 2019
1 parent 26200eb commit db12137
Showing 14 changed files with 1,262 additions and 1,255 deletions.
54 changes: 24 additions & 30 deletions caffe2/perfkernels/adagrad.cc
@@ -71,6 +71,22 @@ void rowwise_adagrad_update__base(
   internal::rowwise_adagrad_update_inlined(N, w, w_n, g, h, h_n, epsilon, lr);
 }
 
+// version without prefetching
+decltype(adagrad_update__base) adagrad_update__avx_f16c;
+void adagrad_update(
+    int N,
+    const float* w,
+    const float* g,
+    const float* h,
+    float* nw,
+    float* nh,
+    float epsilon,
+    float decay,
+    float lr) {
+  AVX_F16C_DO(adagrad_update, N, w, g, h, nw, nh, epsilon, decay, lr);
+  BASE_DO(adagrad_update, N, w, g, h, nw, nh, epsilon, decay, lr);
+}
+
 decltype(adagrad_update_prefetch__base) adagrad_update_prefetch__avx_f16c;
 void adagrad_update_prefetch(
     int N,
@@ -184,27 +200,11 @@ void rowwise_adagrad_update(
   BASE_DO(rowwise_adagrad_update, N, w, w_n, g, h, h_n, epsilon, lr);
 }
 
-// version without prefetching
-decltype(adagrad_update__base) adagrad_update__avx_f16c;
-void adagrad_update(
-    int N,
-    const float* w,
-    const float* g,
-    const float* h,
-    float* nw,
-    float* nh,
-    float epsilon,
-    float decay,
-    float lr) {
-  AVX_F16C_DO(adagrad_update, N, w, g, h, nw, nh, epsilon, decay, lr);
-  BASE_DO(adagrad_update, N, w, g, h, nw, nh, epsilon, decay, lr);
-}
-
 SPARSE_ADAGRAD_SPECIALIZATION(int32_t, base);
 
 decltype(sparse_adagrad_int32_t__base) sparse_adagrad_int32_t__avx_f16c;
 template <>
-void sparse_adagrad(
+int sparse_adagrad(
     int num_rows,
     int block_size,
     uint64_t param_size,
@@ -215,8 +215,7 @@ void sparse_adagrad(
     float* nw,
     float* nh,
     float epsilon,
-    float lr,
-    const std::string& param_name) {
+    float lr) {
   AVX_F16C_DO(
       sparse_adagrad_int32_t,
       num_rows,
@@ -229,8 +228,7 @@ void sparse_adagrad(
       nw,
       nh,
       epsilon,
-      lr,
-      param_name);
+      lr);
   BASE_DO(
       sparse_adagrad_int32_t,
       num_rows,
@@ -243,15 +241,14 @@ void sparse_adagrad(
       nw,
       nh,
       epsilon,
-      lr,
-      param_name);
+      lr);
 }
 
 SPARSE_ADAGRAD_SPECIALIZATION(int64_t, base);
 
 decltype(sparse_adagrad_int64_t__base) sparse_adagrad_int64_t__avx_f16c;
 template <>
-void sparse_adagrad(
+int sparse_adagrad(
     int num_rows,
     int block_size,
     uint64_t param_size,
@@ -262,8 +259,7 @@ void sparse_adagrad(
     float* nw,
     float* nh,
     float epsilon,
-    float lr,
-    const std::string& param_name) {
+    float lr) {
   AVX_F16C_DO(
       sparse_adagrad_int64_t,
       num_rows,
@@ -276,8 +272,7 @@ void sparse_adagrad(
       nw,
       nh,
       epsilon,
-      lr,
-      param_name);
+      lr);
   BASE_DO(
       sparse_adagrad_int64_t,
       num_rows,
@@ -290,8 +285,7 @@ void sparse_adagrad(
       nw,
       nh,
       epsilon,
-      lr,
-      param_name);
+      lr);
 }
 
 } // namespace caffe2
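
Note: with the CAFFE_ENFORCE_GE check gone from the kernel (and with it the Logging.h dependency), out-of-bound detection moves into the return value. A hedged sketch of a caller consuming the new contract follows; it assumes caffe2/perfkernels/adagrad.h is included, and the wrapper name and the exception are illustrative — a Caffe2 operator would raise the error via CAFFE_ENFORCE at the call site.

#include <cstdint>
#include <sstream>
#include <stdexcept>

// Hypothetical wrapper: sparse_adagrad now returns num_rows on success, or
// the first row whose index would read past param_size. Error reporting
// (and any logging dependency) stays outside the perfkernel.
template <typename SIndex>
void sparse_adagrad_or_throw(
    int num_rows, int block_size, std::uint64_t param_size,
    const float* w, const float* g, const float* h, const SIndex* indices,
    float* nw, float* nh, float epsilon, float lr) {
  const int last_row = caffe2::sparse_adagrad(
      num_rows, block_size, param_size, w, g, h, indices, nw, nh, epsilon, lr);
  if (last_row != num_rows) {
    std::ostringstream msg;
    msg << "out of bound, idx: " << indices[last_row]
        << " for input i: " << last_row << " and block size: " << block_size
        << " max size: " << param_size;
    throw std::runtime_error(msg.str()); // illustrative, not Caffe2's message
  }
}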
51 changes: 32 additions & 19 deletions caffe2/perfkernels/adagrad.h
@@ -6,12 +6,14 @@
 #include <immintrin.h>
 #endif
 #include <c10/util/Half.h>
-#include <c10/util/Logging.h>
 
 namespace caffe2 {
 
 namespace internal {
 
+// The following functions inside the internal namespace are inlined because
+// they are performance critical.
+
 template <typename T>
 static inline void adagrad_update_base_inlined(
     int N,
@@ -31,6 +33,23 @@ static inline void adagrad_update_base_inlined(
   }
 }
 
+// version with prefetching
+// TODO(msmelyan)
+// The crux of the computation is computing a / (sqrt(b) + epsilon),
+// where a and b are vectors and epsilon is very small (e.g., 10^-5) and does
+// not change. Today it's computed using two vector sqrt and vector divide SIMD
+// instructions. It is slow. We can take advantage of the existing fast vector
+// VRSQRTPS instruction that computes approximate reciprocals of square roots
+// of the vector. It is 6x faster than the vsqrt and vdiv combination. Since
+// the addition of epsilon is just done to avoid division by zero, we
+// approximate a / (sqrt(b) + epsilon) by a / (sqrt(b + sqrt(epsilon))). If we
+// do that, we can use VRSQRTPS instead. VRSQRTPS is not very accurate.
+// Specifically, for a test on random numbers between 0.1 and 1 the absolute
+// error was about 10^-3 compared to using the slower but more accurate
+// combination of vsqrt and vdiv. Extend Marat's function with more NR
+// iterations to get more accuracy for training.
+// TODO(msmelyan)
+// explore streaming stores, but need to have unique indices (deduplication)
 inline void adagrad_update_prefetch_inlined(
     int N,
     const float* w,
@@ -238,8 +257,12 @@ void adagrad_update(
     float decay,
     float lr);
 
+/**
+ * @return num_rows if the update succeeds; otherwise the row idx at which we
+ * pass the boundary of param_size
+ */
 template <typename SIndex>
-void sparse_adagrad(
+int sparse_adagrad(
     int num_rows, // number of rows being read
     int block_size, // number of parameters per row
     std::uint64_t param_size, // total number of parameters
@@ -250,11 +273,10 @@ void sparse_adagrad(
     float* nw, // output parameters
     float* nh, // output momentums
     float epsilon,
-    float lr,
-    const std::string& param_name); // name of parameters (for error reporting)
+    float lr);
 
 #define SPARSE_ADAGRAD_SPECIALIZATION(SIndex, ISA)  \
-  void sparse_adagrad_##SIndex##__##ISA(            \
+  int sparse_adagrad_##SIndex##__##ISA(             \
       int num_rows,                                 \
       int block_size,                               \
       std::uint64_t param_size,                     \
@@ -265,25 +287,15 @@ void sparse_adagrad(
       float* nw,                                    \
       float* nh,                                    \
       float epsilon,                                \
-      float lr,                                     \
-      const std::string& param_name) {              \
+      float lr) {                                   \
     for (int i = 0; i < num_rows; ++i) {            \
       auto idx = indices[i];                        \
       auto offsetI = i * block_size;                \
       auto offsetIdx = idx * block_size;            \
                                                     \
-      CAFFE_ENFORCE_GE(                             \
-          param_size,                               \
-          block_size + offsetIdx,                   \
-          param_name,                               \
-          ", out of bound, idx:",                   \
-          idx,                                      \
-          " for input i:",                          \
-          i,                                        \
-          " and block size:",                       \
-          block_size,                               \
-          " max size:",                             \
-          param_size);                              \
+      if (block_size + offsetIdx > param_size) {    \
+        return i;                                   \
+      }                                             \
                                                     \
       if (block_size == 1) {                        \
         float gi = g[i];                            \
@@ -309,6 +321,7 @@ void sparse_adagrad(
             lr);                                    \
       }                                             \
     }                                               \
+    return num_rows;                                \
   };
 
 } // namespace caffe2
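
Note: the body of adagrad_update_base_inlined is elided in the hunk above, but the Adagrad recurrence that all of these kernels implement can be read off the scalar tail loop of the AVX version in the next file. As a self-contained reference sketch:

#include <cmath>

// Scalar sketch of the Adagrad update these kernels vectorize:
//   nh[i] = decay * h[i] + g[i] * g[i]               (squared-gradient state)
//   nw[i] = w[i] + lr * g[i] / (sqrt(nh[i]) + epsilon)
void adagrad_update_scalar(int N, const float* w, const float* g,
                           const float* h, float* nw, float* nh,
                           float epsilon, float decay, float lr) {
  for (int i = 0; i < N; ++i) {
    float gi = g[i];
    float hi = nh[i] = decay * h[i] + gi * gi;
    nw[i] = w[i] + lr * gi / (std::sqrt(hi) + epsilon);
  }
}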
85 changes: 34 additions & 51 deletions caffe2/perfkernels/adagrad_avx.cc
@@ -6,23 +6,40 @@
 
 namespace caffe2 {
 
-// version with prefetching
-// TODO(msmelyan)
-// The crux of the computation is computing a / (sqrt(b) + epsilon),
-// where a and b are vectors and epsilon is very small (e.g., 10^-5) and does
-// not change. Today it's computed using two vector sqrt and vector divide SIMD
-// instructions. It is slow. We can take advantage of the existing fast vector
-// VRSQRTPS instruction that computes approximate reciprocals of square roots
-// of the vector. It is 6x faster than the vsqrt and vdiv combination. Since
-// the addition of epsilon is just done to avoid division by zero, we
-// approximate a / (sqrt(b) + epsilon) by a / (sqrt(b + sqrt(epsilon))). If we
-// do that, we can use VRSQRTPS instead. VRSQRTPS is not very accurate.
-// Specifically, for a test on random numbers between 0.1 and 1 the absolute
-// error was about 10^-3 compared to using the slower but more accurate
-// combination of vsqrt and vdiv. Extend Marat's function with more NR
-// iterations to get more accuracy for training.
-// TODO(msmelyan)
-// explore streaming stores, but need to have unique indices (deduplication)
+// version without prefetching
+void adagrad_update__avx_f16c(
+    int N,
+    const float* w,
+    const float* g,
+    const float* h,
+    float* nw,
+    float* nh,
+    float epsilon,
+    float decay,
+    float lr) {
+  constexpr size_t kSize = 8;
+  auto i = 0;
+  for (; i + kSize <= N; i += kSize) {
+    __m256 gi = _mm256_loadu_ps(g + i);
+    __m256 hi = _mm256_loadu_ps(h + i);
+    __m256 wi = _mm256_loadu_ps(w + i);
+
+    __m256 nhi = _mm256_add_ps(
+        _mm256_mul_ps(_mm256_set1_ps(decay), hi), _mm256_mul_ps(gi, gi));
+    _mm256_storeu_ps(nh + i, nhi);
+    __m256 vtmp = _mm256_div_ps(
+        gi, _mm256_add_ps(_mm256_sqrt_ps(nhi), _mm256_set1_ps(epsilon)));
+    _mm256_storeu_ps(
+        nw + i, _mm256_add_ps(wi, _mm256_mul_ps(_mm256_set1_ps(lr), vtmp)));
+  }
+
+  for (; i < N; ++i) {
+    float gi = g[i];
+    float hi = nh[i] = decay * h[i] + gi * gi;
+    nw[i] = w[i] + lr * gi / (std::sqrt(hi) + epsilon);
+  }
+}
+
 void adagrad_update_prefetch__avx_f16c(
     int N,
     const float* w,
@@ -108,40 +125,6 @@ void rowwise_adagrad_update__avx_f16c(
   internal::rowwise_adagrad_update_inlined(N, w, w_n, g, h, h_n, epsilon, lr);
 }
 
-// version without prefetching
-void adagrad_update__avx_f16c(
-    int N,
-    const float* w,
-    const float* g,
-    const float* h,
-    float* nw,
-    float* nh,
-    float epsilon,
-    float decay,
-    float lr) {
-  constexpr int kSize = 8;
-  auto i = 0;
-  for (; i + kSize <= N; i += kSize) {
-    __m256 gi = _mm256_loadu_ps(g + i);
-    __m256 hi = _mm256_loadu_ps(h + i);
-    __m256 wi = _mm256_loadu_ps(w + i);
-
-    __m256 nhi = _mm256_add_ps(
-        _mm256_mul_ps(_mm256_set1_ps(decay), hi), _mm256_mul_ps(gi, gi));
-    _mm256_storeu_ps(nh + i, nhi);
-    __m256 vtmp = _mm256_div_ps(
-        gi, _mm256_add_ps(_mm256_sqrt_ps(nhi), _mm256_set1_ps(epsilon)));
-    _mm256_storeu_ps(
-        nw + i, _mm256_add_ps(wi, _mm256_mul_ps(_mm256_set1_ps(lr), vtmp)));
-  }
-
-  for (; i < N; ++i) {
-    float gi = g[i];
-    float hi = nh[i] = decay * h[i] + gi * gi;
-    nw[i] = w[i] + lr * gi / (std::sqrt(hi) + epsilon);
-  }
-}
-
 SPARSE_ADAGRAD_SPECIALIZATION(int32_t, avx_f16c);
 SPARSE_ADAGRAD_SPECIALIZATION(int64_t, avx_f16c);
 
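
Note: the TODO comment moved into adagrad.h above proposes replacing the vsqrt + vdiv pair with VRSQRTPS. A sketch of that proposed (not yet implemented) optimization follows, using one Newton-Raphson refinement step; the helper name is illustrative, and it must be compiled with AVX enabled.

#include <immintrin.h>

// Sketch (assumption, not part of this commit): approximate
// a / (sqrt(b) + epsilon) as a * rsqrt(b + adjusted_epsilon), folding the
// epsilon into the radicand as the TODO suggests (e.g. sqrt(epsilon)), so
// the slow _mm256_sqrt_ps / _mm256_div_ps pair is replaced by the fast but
// less accurate VRSQRTPS plus one Newton-Raphson step.
static inline __m256 approx_div_by_sqrt(
    __m256 a, __m256 b, float adjusted_epsilon) {
  __m256 x = _mm256_add_ps(b, _mm256_set1_ps(adjusted_epsilon));
  __m256 y = _mm256_rsqrt_ps(x); // ~11-bit approximation of 1/sqrt(x)
  // One Newton-Raphson refinement: y <- y * (1.5 - 0.5 * x * y * y)
  __m256 y2 = _mm256_mul_ps(y, y);
  __m256 half_x = _mm256_mul_ps(_mm256_set1_ps(0.5f), x);
  y = _mm256_mul_ps(
      y, _mm256_sub_ps(_mm256_set1_ps(1.5f), _mm256_mul_ps(half_x, y2)));
  return _mm256_mul_ps(a, y);
}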
