Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ cc_library(
includes = [
"src",
],
copts = [
"-mf16c",
],
deps = [
":fbgemm_headers",
"@cpuinfo",
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ if(MSVC)
target_compile_options(fbgemm_avx2 PRIVATE "/arch:AVX2")
target_compile_options(fbgemm_avx512 PRIVATE "/arch:AVX512")
else(MSVC)
target_compile_options(fbgemm_generic PRIVATE "-mf16c")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't that make fbgemm code incompatible with SandyBridge CPUs?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@malfet That would be true. Perhaps we need a pure C implementation of _cvtss_sh/_cvtsh_ss in the REF implementation (there is one already, but it is missing the misc. rounding handling). I can create a PR next week for this purpose if needed. @jspark1105

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR has been reverted due to an illegal instruction error in PyTorch tests. Let me check if it's related and if so commit with a C implementation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for introducing this issue. And thanks for the fast turnaround! @jspark1105

target_compile_options(fbgemm_avx2 PRIVATE
"-m64" "-mavx2" "-mf16c" "-mfma" "-masm=intel")
target_compile_options(fbgemm_avx512 PRIVATE
Expand Down
152 changes: 95 additions & 57 deletions bench/RowwiseAdagradFusedBenchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,14 @@ void run_benchmark(
int num_rows,
int embedding_dim,
int average_len,
bool use_fp16_weights,
bool use_32_bit_indices = false,
bool prefetch = false) {
vector<char> llc(64L * 1024L * 1024L, 1.0);
vector<float> g(batch_size * embedding_dim); // gradients
vector<float> h(num_rows); // input momentums
vector<float> w(num_rows * embedding_dim); // input params
vector<float> h_ref(h.size());
vector<float> w_ref(w.size());
vector<float16> w_fp16(w.size()); // input params

default_random_engine generator;
// normal_distribution<float> h_w_distribution;
Expand All @@ -62,10 +62,10 @@ void run_benchmark(
g[i] = 4 + i; // h_w_distribution(generator);
}
for (int i = 0; i < h.size(); ++i) {
h_ref[i] = h[i] = 2 + i; // h_w_distribution(generator);
h[i] = 2 + i; // h_w_distribution(generator);
}
for (int i = 0; i < w.size(); ++i) {
w_ref[i] = w[i] = 3 + i; // h_w_distribution(generator);
w[i] = 3 + i; // h_w_distribution(generator);
}

// Generate lengths
Expand Down Expand Up @@ -104,47 +104,81 @@ void run_benchmark(
constexpr int NUM_ITER = 10;
// Only counts the number of bytes for reading embedding table and ignore
// others. Should be good enough as long as embedding_dim is big enough.
double bytes = lengths_sum *
((embedding_dim + 1) * sizeof(float) * 2 +
(use_32_bit_indices ? 4 : 8)) +
batch_size * (embedding_dim * sizeof(float) + sizeof(int));
double bytes_padded = lengths_sum *
(((embedding_dim * sizeof(float) + 63) / 64 + 1) * 64 * 2 +
(use_32_bit_indices ? 4 : 8)) +
batch_size * (embedding_dim * sizeof(float) + sizeof(int));

auto kernel_i32 = GenerateRowWiseSparseAdaGradFused<int32_t>(
embedding_dim, prefetch ? 16 : 0);
auto kernel_i64 = GenerateRowWiseSparseAdaGradFused<int64_t>(
embedding_dim, prefetch ? 16 : 0);
double bytes =
lengths_sum * ((embedding_dim + 1) *
(use_fp16_weights ? sizeof(float16) : sizeof(float)) * 2 +
(use_32_bit_indices ? 4 : 8)) +
batch_size * (embedding_dim * sizeof(float) + sizeof(int));
double bytes_padded =
lengths_sum * (((embedding_dim * (use_fp16_weights ? sizeof(float16) :
sizeof(float)) + 63) / 64 + 1) * 64 * 2 +
(use_32_bit_indices ? 4 : 8)) +
batch_size * (embedding_dim * sizeof(float) + sizeof(int));

auto kernel_i32 = GenerateRowWiseSparseAdaGradFused
<int32_t, int32_t, float>(embedding_dim, prefetch ? 16 : 0);
auto kernel_i64 = GenerateRowWiseSparseAdaGradFused
<int64_t, int32_t, float>(embedding_dim, prefetch ? 16 : 0);
auto kernel_fp16_i32 = GenerateRowWiseSparseAdaGradFused
<int32_t, int32_t, float16>(embedding_dim, prefetch ? 16 : 0);
auto kernel_fp16_i64 = GenerateRowWiseSparseAdaGradFused
<int64_t, int32_t, float16>(embedding_dim, prefetch ? 16 : 0);

for (bool flush_cache : {false, true}) {
double t = measureWithWarmup(
[&]() {
if (use_32_bit_indices) {
kernel_i32(
batch_size,
lengths_sum,
num_rows,
w.data(),
g.data(),
h.data(),
indices_32.data(),
lengths.data(),
epsilon,
lr);
if (use_fp16_weights) {
if (use_32_bit_indices) {
kernel_fp16_i32(
batch_size,
lengths_sum,
num_rows,
w_fp16.data(),
g.data(),
h.data(),
indices_32.data(),
lengths.data(),
epsilon,
lr);
} else {
kernel_fp16_i64(
batch_size,
lengths_sum,
num_rows,
w_fp16.data(),
g.data(),
h.data(),
indices.data(),
lengths.data(),
epsilon,
lr);
}
} else {
kernel_i64(
batch_size,
lengths_sum,
num_rows,
w.data(),
g.data(),
h.data(),
indices.data(),
lengths.data(),
epsilon,
lr);
if (use_32_bit_indices) {
kernel_i32(
batch_size,
lengths_sum,
num_rows,
w.data(),
g.data(),
h.data(),
indices_32.data(),
lengths.data(),
epsilon,
lr);
} else {
kernel_i64(
batch_size,
lengths_sum,
num_rows,
w.data(),
g.data(),
h.data(),
indices.data(),
lengths.data(),
epsilon,
lr);
}
}
},
NUM_WARMUP,
Expand Down Expand Up @@ -183,23 +217,27 @@ int main() {
<< embedding_dim << setw(16) << "avg length" << setw(6) << average_len
<< endl;

for (bool use_32_bit_indices : {false, true}) {
for (bool prefetch : {false, true}) {
// args: batch sz, num rows, emb dim, avg len, use 32b, prefetch
cout << (use_32_bit_indices ? " 32" : " 64") << " bit indices";
if (prefetch) {
cout << " with prefetching";
}
cout << ", ";
run_benchmark(
batch_size,
num_rows,
embedding_dim,
average_len,
use_32_bit_indices,
prefetch);
} // prefetch
} // use_32_bit_indices
for (bool use_fp16_weights : {false, true}) {
for (bool use_32_bit_indices : {false, true}) {
for (bool prefetch : {false, true}) {
// args: batch sz, num rows, emb dim, avg len, use fp16, use 32b, prefetch
cout << (use_fp16_weights ? " float16" : " float32") << " weights";
cout << (use_32_bit_indices ? " 32" : " 64") << " bit indices";
if (prefetch) {
cout << " with prefetching";
}
cout << ", ";
run_benchmark(
batch_size,
num_rows,
embedding_dim,
average_len,
use_fp16_weights,
use_32_bit_indices,
prefetch);
} // prefetch
} // use_32_bit_indices
} // use_fp16_weights
} // for each input

return 0;
Expand Down
15 changes: 10 additions & 5 deletions include/fbgemm/FbgemmEmbedding.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,16 @@ GenerateSparseAdaGrad(
float weight_decay = 0.0f);

// RowWiseSparseAdaGrad fused with SLS gradient
template <typename IndexType, typename OffsetType = std::int32_t>
// Weights can be either float or float16
template <typename IndexType, typename OffsetType = std::int32_t,
typename DataType = float>
class RowWiseSparseAdaGradFusedSignature {
public:
using Type = std::function<bool(
std::int64_t output_size,
std::int64_t index_size,
std::int64_t data_size, // number of rows in w
float* w, // input/output parameters
DataType* w, // input/output parameters
const float* g, // input gradients
float* h, // input/output momentums
const IndexType* indices, // indices of each row
Expand All @@ -177,13 +179,16 @@ class RowWiseSparseAdaGradFusedSignature {
float lr)>;
};

template <typename IndexType, typename OffsetType = std::int32_t>
template <typename IndexType, typename OffsetType = std::int32_t,
typename DataType = float>
FBGEMM_API
typename RowWiseSparseAdaGradFusedSignature<IndexType, OffsetType>::Type
typename
RowWiseSparseAdaGradFusedSignature<IndexType, OffsetType, DataType>::Type
GenerateRowWiseSparseAdaGradFused(
int block_size, // number of parameters per row
int prefetch = 16,
bool use_offsets = true);
bool use_offsets = true,
bool use_stochastic_rounding = true);

namespace internal {
// Specialization for block size 1 internally called by GenerateEmbeddingSpMDM
Expand Down
Loading