diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc index db40ee0cf2..f5ce9ef4d0 100644 --- a/bench/BenchUtils.cc +++ b/bench/BenchUtils.cc @@ -5,30 +5,41 @@ * LICENSE file in the root directory of this source tree. */ #include "BenchUtils.h" + +#include #include +#include + +#include namespace fbgemm { std::default_random_engine eng; template -void randFill(aligned_vector& vec, const int low, const int high) { - std::random_device r; - std::uniform_int_distribution dis(low, high); - for (auto& v : vec) { - v = static_cast(dis(eng)); - } +void randFill(aligned_vector& vec, T low, T high, std::true_type) { + std::uniform_int_distribution dis(low, high); + std::generate(vec.begin(), vec.end(), [&] { return dis(eng); }); +} + +template +void randFill(aligned_vector& vec, T low, T high, std::false_type) { + std::uniform_real_distribution dis(low, high); + std::generate(vec.begin(), vec.end(), [&] { return dis(eng); }); +} + +template +void randFill(aligned_vector& vec, T low, T high) { + randFill(vec, low, high, std::is_integral()); } template void -randFill(aligned_vector& vec, const int low, const int high); -template void -randFill(aligned_vector& vec, const int low, const int high); +randFill(aligned_vector& vec, float low, float high); template void -randFill(aligned_vector& vec, const int low, const int high); - +randFill(aligned_vector& vec, uint8_t low, uint8_t high); template void -randFill(aligned_vector& vec, const int low, const int high); +randFill(aligned_vector& vec, int8_t low, int8_t high); +template void randFill(aligned_vector& vec, int low, int high); void llc_flush(std::vector& llc) { volatile char* data = llc.data(); @@ -37,4 +48,20 @@ void llc_flush(std::vector& llc) { } } +int fbgemm_get_num_threads() { +#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP) + return 1; +#else + return omp_get_num_threads(); +#endif +} + +int fbgemm_get_thread_num() { +#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP) + return 0; +#else + return omp_get_thread_num(); +#endif +} + } // namespace fbgemm diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h index 8ca99df1a8..da2ef2d017 100644 --- a/bench/BenchUtils.h +++ b/bench/BenchUtils.h @@ -11,8 +11,11 @@ namespace fbgemm { template -void randFill(aligned_vector& vec, const int low, const int high); +void randFill(aligned_vector& vec, T low, T high); void llc_flush(std::vector& llc); +int fbgemm_get_num_threads(); +int fbgemm_get_thread_num(); + } // namespace fbgemm diff --git a/bench/Depthwise3DBenchmark.cc b/bench/Depthwise3DBenchmark.cc index f53eeea9a7..c65839bb4b 100644 --- a/bench/Depthwise3DBenchmark.cc +++ b/bench/Depthwise3DBenchmark.cc @@ -62,10 +62,10 @@ int main() { aligned_vector C_ref(N * T_OUT * H_OUT * W_OUT * K), C(C_ref.size()); - randFill(A, 0, 86); + randFill(A, 0, 86); int32_t A_zero_point = 43; - randFill(B, -16, 16); + randFill(B, -16, 16); int32_t B_zero_point = 5; depthwise_3x3x3_pad_1_ref( @@ -129,13 +129,8 @@ int main() { t_begin = chrono::system_clock::now(); #pragma omp parallel { -#if _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); depthwise_3x3x3_pad_1( N, T, @@ -200,13 +195,8 @@ int main() { t_begin = chrono::system_clock::now(); #pragma omp parallel { -#if _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int 
num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); depthwise_3x3x3_pad_1( N, T, diff --git a/bench/DepthwiseBenchmark.cc b/bench/DepthwiseBenchmark.cc index 8e6d83d5ff..b922f90d82 100644 --- a/bench/DepthwiseBenchmark.cc +++ b/bench/DepthwiseBenchmark.cc @@ -161,10 +161,10 @@ int main() { aligned_vector B(G * R * S); aligned_vector C_ref(N * H_OUT * W_OUT * G), C(C_ref.size()); - randFill(A, 0, 86); + randFill(A, 0, 86); int32_t A_zero_point = 43; - randFill(B, -16, 16); + randFill(B, -16, 16); int32_t B_zero_point = 5; depthwise_3x3_pad_1_ref( @@ -221,13 +221,8 @@ int main() { t_begin = chrono::system_clock::now(); #pragma omp parallel { -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); depthwise_3x3_pad_1( N, H, @@ -279,13 +274,8 @@ int main() { t_begin = chrono::system_clock::now(); #pragma omp parallel { -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); depthwise_3x3_pad_1( N, H, diff --git a/bench/FP16Benchmark.cc b/bench/FP16Benchmark.cc index c03f18ab2f..fd9de5bc3c 100644 --- a/bench/FP16Benchmark.cc +++ b/bench/FP16Benchmark.cc @@ -73,20 +73,24 @@ void performance_test() { int n = s[1]; int k = s[2]; - aligned_vector A(m * k, 0.f); - aligned_vector B(k * n, 0.f); - aligned_vector Cg(m * n, 1.f); - aligned_vector Cp(m * n, NAN); + aligned_vector C_ref(m * n, 1.f); + aligned_vector C_fb(m * n, NAN); // initialize with small numbers - randFill(A, 0, 4); + aligned_vector Aint(m * k); + randFill(Aint, 0, 4); + aligned_vector A(Aint.begin(), Aint.end()); - randFill(B, 0, 4); + aligned_vector Bint(k * n); + randFill(Bint, 0, 4); + aligned_vector B(Bint.begin(), Bint.end()); PackedGemmMatrixFP16 Bp(btran, k, n, alpha, B.data()); if (beta != 0.0f) { - randFill(Cg, 0, 4); - Cp = Cg; + aligned_vector Cint(C_ref.size()); + randFill(Cint, 0, 4); + C_ref.assign(Cint.begin(), Cint.end()); + C_fb = C_ref; } double nflops = 2.0 * (double)m * (double)n * (double)k * (double)NITER; @@ -111,17 +115,17 @@ void performance_test() { B.data(), (btran == matrix_op_t::NoTranspose) ? n : k, beta, - Cg.data(), + C_ref.data(), n); #endif cblas_gemm_compute( - matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data()); + matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data()); #ifdef USE_MKL // Compare results - for (auto i = 0; i < Cg.size(); i++) { - // printf("%f %f\n", Cg[i], Cp[i]); - assert(std::abs(Cg[i] - Cp[i]) < 1e-3); + for (auto i = 0; i < C_ref.size(); i++) { + // printf("%f %f\n", C_ref[i], C_fb[i]); + assert(std::abs(C_ref[i] - C_fb[i]) < 1e-3); } #endif } @@ -151,7 +155,7 @@ void performance_test() { B.data(), (btran == matrix_op_t::NoTranspose) ? 
n : k, beta, - Cg.data(), + C_ref.data(), n); t_end = chrono::system_clock::now(); if (it >= 0) { @@ -184,7 +188,7 @@ void performance_test() { t_begin = chrono::system_clock::now(); cblas_gemm_compute( - matrix_op_t::NoTranspose, m, A.data(), Bp, beta, Cp.data()); + matrix_op_t::NoTranspose, m, A.data(), Bp, beta, C_fb.data()); t_end = chrono::system_clock::now(); if (it >= 0) { diff --git a/bench/I8SpmdmBenchmark.cc b/bench/I8SpmdmBenchmark.cc index 07b73dc08f..4223d0c534 100644 --- a/bench/I8SpmdmBenchmark.cc +++ b/bench/I8SpmdmBenchmark.cc @@ -77,7 +77,7 @@ int main() { cout << M << ", " << N << ", " << K << ", "; aligned_vector A(M * K); - randFill(A, 0, 255); + randFill(A, 0, 255); fbgemm::CompressedSparseColumn B_csc(K, N); vector C(M * N); @@ -156,13 +156,8 @@ int main() { #pragma omp parallel #endif { -#if defined(FBGEMM_MEASURE_TIME_BREAKDOWN) || !defined(_OPENMP) - int num_threads = 1; - int tid = 0; -#else - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); int i_per_thread = ((M + 31) / 32 + num_threads - 1) / num_threads * 32; int i_begin = std::min(tid * i_per_thread, M); diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc index e3c9da2c7c..cb2edf57c3 100644 --- a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc +++ b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc @@ -125,43 +125,29 @@ void performance_test() { chrono::time_point begin, end; for (auto conv_p : shapes) { - aligned_vector Afp32( - conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f); aligned_vector Aint8( - conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0); - + conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC); aligned_vector Aint8_out( conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] * - conv_p.K[1] * conv_p.IC, - 0); + conv_p.K[1] * conv_p.IC); - aligned_vector Bfp32( - conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f); aligned_vector Bint8( - conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0); + conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC); aligned_vector Cint32_ref( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); - - aligned_vector Cint32_fb( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); - - aligned_vector Cint32_fb2( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); + conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC); + aligned_vector Cint32_fb(Cint32_ref.size()); + aligned_vector Cint32_fb2(Cint32_ref.size()); // A matrix (input activations) - randFill(Afp32, 0, 5); + randFill(Aint8, 0, 5); int32_t Aint8_zero_point = 4; - for (auto i = 0; i < Afp32.size(); ++i) { - Aint8[i] = static_cast(Afp32[i]); - } + aligned_vector Afp32(Aint8.begin(), Aint8.end()); // B matrix (weights) - randFill(Bfp32, -4, 4); + randFill(Bint8, -4, 4); // int32_t Bint8_zero_point = -3; - for (auto i = 0; i < Bfp32.size(); ++i) { - Bint8[i] = static_cast(Bfp32[i]); - } + aligned_vector Bfp32(Bint8.begin(), Bint8.end()); // reference implementation conv_ref( @@ -184,8 +170,7 @@ void performance_test() { double ttot = 0.0; string runType; - vector row_offset_buf; - row_offset_buf.resize( + vector row_offset_buf( PackAWithIm2Col::rowOffsetBufferSize()); PackAWithIm2Col packA( @@ -307,7 +292,6 @@ void performance_test() { KDim, nullptr, 1, - Aint8_zero_point, row_offset_buf.data()); fbgemmPacked( diff --git 
a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc index 153dc3b617..8e112d83a2 100644 --- a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc +++ b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc @@ -125,45 +125,32 @@ void performance_test() { chrono::time_point begin, end; for (auto conv_p : shapes) { - aligned_vector Afp32( - conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0.0f); aligned_vector Aint8( - conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0); + conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC); aligned_vector Aint8_out( conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.K[0] * - conv_p.K[1] * conv_p.IC, - 0); + conv_p.K[1] * conv_p.IC); - aligned_vector Bfp32( - conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0.0f); aligned_vector Bint8( - conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0); + conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC); aligned_vector Cint32_ref( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); - - aligned_vector Cint32_fb( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); - - aligned_vector Cint32_fb2( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); + conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC); + aligned_vector Cint32_fb(Cint32_ref.size()); + aligned_vector Cint32_fb2(Cint32_ref.size()); // cout << conv_p.toString() << endl; // A matrix (input activations) - randFill(Afp32, 0, 5); + randFill(Aint8, 0, 5); int32_t Aint8_zero_point = 4; - for (auto i = 0; i < Afp32.size(); ++i) { - Aint8[i] = static_cast(Afp32[i]); - } + aligned_vector Apf32(Aint8.begin(), Aint8.end()); // B matrix (weights) - randFill(Bfp32, -4, 4); + randFill(Bint8, -4, 4); // int32_t Bint8_zero_point = -3; - for (auto i = 0; i < Bfp32.size(); ++i) { - Bint8[i] = static_cast(Bfp32[i]); - } + aligned_vector Bfp32(Bint8.begin(), Bint8.end()); // reference implementation conv_ref( @@ -186,8 +173,7 @@ void performance_test() { double ttot = 0.0; string runType; - vector row_offset_buf; - row_offset_buf.resize( + vector row_offset_buf( PackAWithIm2Col::rowOffsetBufferSize()); PackAWithIm2Col packA( @@ -307,7 +293,6 @@ void performance_test() { KDim, nullptr, 1, - Aint8_zero_point, row_offset_buf.data()); fbgemmPacked( diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc index dc9536e586..79a750eac1 100644 --- a/bench/PackedFloatInOutBenchmark.cc +++ b/bench/PackedFloatInOutBenchmark.cc @@ -86,27 +86,27 @@ void performance_test() { int k = shape[2]; float alpha = 1.f, beta = 0.f; - aligned_vector Afp32(m * k, 0.0f); - aligned_vector Aint8(m * k, 0); + aligned_vector Afp32(m * k); + aligned_vector Aint8(Afp32.size()); - aligned_vector Bfp32(k * n, 0.0f); - aligned_vector Bint8(k * n, 0); + aligned_vector Bfp32(k * n); + aligned_vector Bint8(Bfp32.size()); - aligned_vector Cfp32_mkl(m * n, 0.0f); - aligned_vector Cfp32_fb(m * n, 0.0f); + aligned_vector Cfp32_mkl(m * n); + aligned_vector Cfp32_fb(Cfp32_mkl.size()); - aligned_vector Cint8_fb(m * n, 0); - aligned_vector Cint32_buffer(m * n, 0); + aligned_vector Cint8_fb(Cfp32_mkl.size()); + aligned_vector Cint32_buffer(Cfp32_mkl.size()); // A matrix - randFill(Aint8, 0, 255); + randFill(Aint8, 0, 255); float Aint8_scale = 0.11; int32_t Aint8_zero_point = 43; for (auto i = 0; i < Afp32.size(); ++i) { Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point); } - randFill(Bint8, -128, 127); + randFill(Bint8, -128, 127); avoidOverflow(m, n, k, 
Aint8.data(), Bint8.data()); float Bint8_scale = 0.49; @@ -116,10 +116,9 @@ void performance_test() { } // computing column offset - vector col_offsets; - col_offsets.resize(n); + vector col_offsets(n); col_offsets_with_zero_pt_s8acc32_ref( - k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data()); + k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n); double ttot = 0; std::string type; @@ -172,8 +171,7 @@ void performance_test() { // printMatrix(matrix_op_t::NoTranspose, col_offsets.data(), 1, n, n, "col // offsets before"); - vector row_offset_buf; - row_offset_buf.resize( + vector row_offset_buf( PackAWithQuantRowOffset::rowOffsetBufferSize()); PackAWithQuantRowOffset packAN( @@ -195,19 +193,19 @@ void performance_test() { Bint8.data(), n, nullptr, - 1, - Bint8_zero_point); + 1); DoNothing doNothingObj{}; ReQuantizeForFloat outputProcObj( doNothingObj, Aint8_scale, - Bint8_scale, + &Bint8_scale, Aint8_zero_point, - Bint8_zero_point, + &Bint8_zero_point, packAN.getRowOffsetBuffer(), col_offsets.data(), - nullptr); + nullptr, + n); ttot = 0; type = "FBGEMM_i8_acc32"; diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc index de15ccecd4..f60332fc40 100644 --- a/bench/PackedRequantizeAcc16Benchmark.cc +++ b/bench/PackedRequantizeAcc16Benchmark.cc @@ -100,29 +100,26 @@ void performance_test() { int n = shape[1]; int k = shape[2]; - float alpha = 1.f, beta = 0.f; - aligned_vector Afp32(m * k, 0.0f); - aligned_vector Aint8(m * k, 0); + float alpha = 1.0f, beta = 0.0f; + aligned_vector Aint8(m * k); + aligned_vector Bint8(k * n); - aligned_vector Bfp32(k * n, 0.0f); - aligned_vector Bint8(k * n, 0); - - aligned_vector Cfp32_mkl(m * n, 0.0f); + aligned_vector Cfp32_mkl(m * n); // just used for result comparisons - aligned_vector Cint32_mkl(m * n, 0.0f); + aligned_vector Cint32_mkl(Cfp32_mkl.size()); // requantize results - aligned_vector Cint8_mkl(m * n, 0.0f); - aligned_vector Cint32_fb(m * n, 0.0f); - aligned_vector Cint8_fb(m * n, 0.0f); + aligned_vector Cint8_mkl(Cfp32_mkl.size()); + aligned_vector Cint32_fb(Cfp32_mkl.size()); + aligned_vector Cint8_fb(Cfp32_mkl.size()); // A matrix - randFill(Afp32, 0, 50); + randFill(Aint8, 0, 50); int32_t Aint8_zero_point = 43; - for (auto i = 0; i < Afp32.size(); ++i) { - Aint8[i] = static_cast(Afp32[i]); - } + aligned_vector Afp32(Aint8.begin(), Aint8.end()); - randFill(Bfp32, -8, 8); + randFill(Bint8, -8, 8); + aligned_vector Bint8_copy(Bint8); + aligned_vector Bfp32(Bint8.begin(), Bint8.end()); double nops = 2.0 * static_cast(NITER) * m * n * k; double ttot = 0.0; @@ -163,9 +160,7 @@ void performance_test() { cout << setw(16) << runType << ", " << fixed << setw(5) << setprecision(1) << nops / ttot << endl; - for (auto i = 0; i < Cfp32_mkl.size(); ++i) { - Cint32_mkl[i] = static_cast(Cfp32_mkl[i]); - } + Cint32_mkl.assign(Cfp32_mkl.begin(), Cfp32_mkl.end()); #endif for (BenchmarkType bench_type : @@ -179,23 +174,19 @@ void performance_test() { bench_type == BenchmarkType::REQUANTIZATION) ? 
0 : -30; - for (auto i = 0; i < Bfp32.size(); ++i) { - Bint8[i] = static_cast(Bfp32[i]); - } // computing column offset - vector col_offsets; - col_offsets.resize(n); + vector col_offsets(n); + Bint8 = Bint8_copy; col_offsets_with_zero_pt_s8acc32_ref( - k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data()); + k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n); - vector row_offsets; - row_offsets.resize(m); + vector row_offsets(m); row_offsets_u8acc32_ref(m, k, k, Aint8.data(), row_offsets.data()); float C_multiplier = - (bench_type == BenchmarkType::BARE_BONE) ? 1 : 0.1234; + (bench_type == BenchmarkType::BARE_BONE) ? 1.0f : 0.1234f; int32_t C_zero_pt = (bench_type == BenchmarkType::BARE_BONE) ? 0 : 5; // printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k, @@ -235,16 +226,14 @@ void performance_test() { n, Cint32_mkl.data(), Cint8_mkl.data(), - C_multiplier, + &C_multiplier, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + &Bint8_zero_point, row_offsets.data(), col_offsets.data(), - nullptr); // bias - - PackBMatrix packedB( - matrix_op_t::NoTranspose, k, n, Bint8.data(), n); + nullptr, // bias + n); // ncols per quant group CompressedSparseColumn B_csc(k, n); @@ -254,30 +243,35 @@ void performance_test() { default_random_engine eng; binomial_distribution<> per_col_nnz_dist(k, density); - vector row_indices(k); - - int total_nnz = 0; - for (int j = 0; j < n; ++j) { - B_csc.ColPtr()[j] = total_nnz; - - int nnz_of_j = per_col_nnz_dist(eng); - total_nnz += nnz_of_j; - - iota(row_indices.begin(), row_indices.end(), 0); - shuffle(row_indices.begin(), row_indices.end(), eng); - sort(row_indices.begin(), row_indices.begin() + nnz_of_j); - - for (int kidx = 0; kidx < nnz_of_j; ++kidx) { - B_csc.RowIdx().push_back(row_indices[kidx]); - // put the current B value - B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]); - // make current B value zero - Bint8[row_indices[kidx] * n + j] = 0; - // std::cout << "(" << row_indices[kidx] << ", " << j << ")" << - // endl; + if (bench_type == BenchmarkType::EVERYTHING) { + vector row_indices(k); + + int total_nnz = 0; + for (int j = 0; j < n; ++j) { + B_csc.ColPtr()[j] = total_nnz; + + int nnz_of_j = per_col_nnz_dist(eng); + total_nnz += nnz_of_j; + + iota(row_indices.begin(), row_indices.end(), 0); + shuffle(row_indices.begin(), row_indices.end(), eng); + sort(row_indices.begin(), row_indices.begin() + nnz_of_j); + + for (int kidx = 0; kidx < nnz_of_j; ++kidx) { + B_csc.RowIdx().push_back(row_indices[kidx]); + // put the current B value + B_csc.Values().push_back(Bint8[row_indices[kidx] * n + j]); + // make current B value zero + Bint8[row_indices[kidx] * n + j] = 0; + // std::cout << "(" << row_indices[kidx] << ", " << j << ")" << + // endl; + } } + B_csc.ColPtr()[n] = total_nnz; } - B_csc.ColPtr()[n] = total_nnz; + + PackBMatrix packedB( + matrix_op_t::NoTranspose, k, n, Bint8.data(), n); // printMatrix(matrix_op_t::NoTranspose, // Cint32_mkl.data(), m, n, n, "C mkl"); @@ -298,8 +292,7 @@ void performance_test() { #pragma omp parallel #endif { - vector row_offset_buf; - row_offset_buf.resize( + vector row_offset_buf( PackAWithRowOffset::rowOffsetBufferSize()); PackAMatrix packA( @@ -309,8 +302,7 @@ void performance_test() { Aint8.data(), k, nullptr, - 1, - Aint8_zero_point); + 1); PackAWithRowOffset packAWithRowOffset( matrix_op_t::NoTranspose, m, @@ -319,7 +311,6 @@ void performance_test() { k, nullptr, 1, - Aint8_zero_point, row_offset_buf.data()); // no-op output process objects @@ -335,15 +326,16 @@ void 
performance_test() { // Requantization back to int8 ReQuantizeOutput reqObj( doNothingObj, - C_multiplier, + &C_multiplier, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + &Bint8_zero_point, bench_type == BenchmarkType::REQUANTIZATION ? nullptr : packAWithRowOffset.getRowOffsetBuffer(), col_offsets.data(), - nullptr); + nullptr, + n); // the top most (first) operation in the output processing // pipeline is spmdm @@ -356,13 +348,8 @@ void performance_test() { ReQuantizeOutput> spmdmObj(reqObj, Aint8.data(), k, B_csc); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); // printf ( "tid: %d, num_threads: %d\n", tid, num_threads ); switch (bench_type) { case BenchmarkType::BARE_BONE: diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc index 0e98234869..b255b8cd7f 100644 --- a/bench/PackedRequantizeAcc32Benchmark.cc +++ b/bench/PackedRequantizeAcc32Benchmark.cc @@ -103,42 +103,35 @@ void performance_test() { int k = shape[2]; float alpha = 1.f, beta = 0.f; - aligned_vector Afp32(m * k, 0.0f); - aligned_vector Aint8(m * k, 0); + aligned_vector Aint8(m * k); - aligned_vector Bfp32(k * n, 0.0f); - aligned_vector Bint8(k * n, 0); + aligned_vector Bint8(k * n); - aligned_vector Cfp32_mkl(m * n, 0.0f); - aligned_vector Cint32_mkl(m * n, 0.0f); - aligned_vector Cint32_fb(m * n, 0); - aligned_vector Cint8_fb(m * n, 0); - aligned_vector Cint32_local(m * n, 0); - aligned_vector Cint32_buffer(m * n, 0); - aligned_vector Cint8_local(m * n, 0); + aligned_vector Cfp32_mkl(m * n); + aligned_vector Cint32_mkl(Cfp32_mkl.size()); + aligned_vector Cint32_fb(Cfp32_mkl.size()); + aligned_vector Cint8_fb(Cfp32_mkl.size()); + aligned_vector Cint32_local(Cfp32_mkl.size()); + aligned_vector Cint32_buffer(Cfp32_mkl.size()); + aligned_vector Cint8_local(Cfp32_mkl.size()); // A matrix - randFill(Aint8, 0, 255); + randFill(Aint8, 0, 255); // float Aint8_scale = 0.11; int32_t Aint8_zero_point = 43; - for (auto i = 0; i < Afp32.size(); ++i) { - Afp32[i] = (float)Aint8[i]; - } + aligned_vector Afp32(Aint8.begin(), Aint8.end()); - randFill(Bint8, -128, 127); + randFill(Bint8, -128, 127); avoidOverflow(m, n, k, Aint8.data(), Bint8.data()); // float Bint8_scale = 0.49; int32_t Bint8_zero_point = -30; - for (auto i = 0; i < Bfp32.size(); ++i) { - Bfp32[i] = (float)Bint8[i]; - } + aligned_vector Bfp32(Bint8.begin(), Bint8.end()); // computing column offset - vector col_offsets; - col_offsets.resize(n); + vector col_offsets(n); col_offsets_with_zero_pt_s8acc32_ref( - k, n, n, Bint8.data(), Bint8_zero_point, col_offsets.data()); + k, n, n, Bint8.data(), &Bint8_zero_point, col_offsets.data(), n); double nops = 2.0 * static_cast(NITER) * m * n * k; double ttot = 0.0; @@ -180,8 +173,7 @@ void performance_test() { } #endif - vector row_offsets; - row_offsets.resize(m); + vector row_offsets(m); float C_multiplier = 0.1234; int32_t C_zero_pt = 5; @@ -197,13 +189,14 @@ void performance_test() { n, Cint32_local.data(), Cint8_local.data(), - C_multiplier, + &C_multiplier, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + &Bint8_zero_point, row_offsets.data(), col_offsets.data(), - nullptr); // bias + nullptr, // bias + n); // ncols per quant group // printMatrix(matrix_op_t::NoTranspose, Bint8.data(), k, n, n, "B // unpacked"); // printMatrix(matrix_op_t::NoTranspose, Aint8.data(), m, k, k, @@ -222,8 +215,7 @@ void 
performance_test() { Bint8.data(), n, nullptr, - 1, - Bint8_zero_point); + 1); ttot = 0.0; runType = "FBGEMM_i8_acc32"; @@ -249,8 +241,7 @@ void performance_test() { #pragma omp parallel #endif { - vector row_offset_buf; - row_offset_buf.resize( + vector row_offset_buf( PackAWithRowOffset::rowOffsetBufferSize()); PackAWithRowOffset packAN( @@ -261,27 +252,22 @@ void performance_test() { k, nullptr, 1, - Aint8_zero_point, row_offset_buf.data()); DoNothing<> doNothingObj{}; ReQuantizeOutput outputProcObj( doNothingObj, - C_multiplier, + &C_multiplier, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + &Bint8_zero_point, packAN.getRowOffsetBuffer(), col_offsets.data(), - nullptr); + nullptr, + n); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); // printf ( "tid: %d, num_threads: %d\n", tid, num_threads ); fbgemmPacked( packAN, diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h index e779571f8b..63c75fa0dc 100644 --- a/include/fbgemm/Fbgemm.h +++ b/include/fbgemm/Fbgemm.h @@ -92,14 +92,12 @@ class PackMatrix { * dimension A.rows by B.cols*B.groups . * A.groups must be same as B.groups, A.groups must divide * A.cols, and B.groups must divide B.rows and C.cols. - * @param zero_pt the quantized value that maps to 0.0f floating-point number. */ PackMatrix( std::int32_t rows, std::int32_t cols, inpType* pmat, - int groups = 1, - std::int32_t zero_pt = 0); + int groups = 1); /** * @return true usually when the matrix is constant matrix (e.g., weight @@ -276,13 +274,6 @@ class PackMatrix { : (numPackedCols() % blockColSize()); } - /** - * @return the quantized value that maps to 0.0f floating-point number - */ - std::int32_t zeroPoint() const { - return zero_pt_; - } - inpType* buf_; std::int32_t brow_; ///< the number of rows in each block std::int32_t bcol_; ///< the number of columns in each block @@ -293,7 +284,6 @@ class PackMatrix { private: std::int32_t nrows_, ncols_; int G_; - std::int32_t zero_pt_; block_type_t packedBlock_; ///< The block in the source matrix just packed std::int32_t last_brow_, last_bcol_; }; @@ -320,8 +310,7 @@ class PackAMatrix final : public PackMatrix, T, accT> { const inpType* smat, std::int32_t ld, inpType* pmat = nullptr, - int groups = 1, - std::int32_t zero_pt = 0); + int groups = 1); /** * Activation matrices are not constant so cannot amortize the cost of @@ -384,6 +373,12 @@ class PackBMatrix final : public PackMatrix, T, accT> { PackBMatrix() = delete; // no default constructor + /** + * @params groups if > 1 and trans == NoTranspose, smat is nRow x nCol + * and each group is (nRow / groups) x nCol + * if > 1 and trans == Transpose, smat is (nCol * groups) x + * (nRow / groups). + */ PackBMatrix( matrix_op_t trans, std::int32_t nRow, @@ -391,8 +386,7 @@ class PackBMatrix final : public PackMatrix, T, accT> { const inpType* smat, std::int32_t ld, inpType* pmat = nullptr, - int groups = 1, - std::int32_t zero_pt = 0); + int groups = 1); /** * Weight matrices are usually constant so worth pre-packing. @@ -468,7 +462,7 @@ class PackAWithIm2Col PackAWithIm2Col() = delete; // no default constructor /** - * TODO: Currently only groups == 1 supported + * @param zero_pt the quantized value that maps to 0.0f floating-point number. 
*/ PackAWithIm2Col( const conv_param_t& conv_param, @@ -523,6 +517,7 @@ class PackAWithIm2Col private: const conv_param_t& conv_p_; const T* sdata_; + std::int32_t zero_pt_; std::int32_t* row_offset_; bool rowOffsetAllocatedHere; std::int32_t row_interleave_B_; @@ -551,7 +546,6 @@ class PackAWithRowOffset final std::uint32_t ld, inpType* pmat = nullptr, int groups = 1, - std::int32_t zero_pt = 0, std::int32_t* row_offset = nullptr); /** @@ -693,6 +687,7 @@ class PackAWithQuantRowOffset final const float* smat_; std::int32_t ld_; float scale_; + std::int32_t zero_pt_; std::int32_t* row_offset_; bool rowOffsetAllocatedHere; std::int32_t row_interleave_B_; @@ -848,13 +843,19 @@ class DoSpmdmOnInpBuffer { const int groups_; }; +enum class QuantizationGranularity { + TENSOR, + GROUP, + OUT_CHANNEL, +}; + /** * @brief Requantize values in inp buffer and write to out buffer. * pass the out buffer to next op for further processing. - * */ template < bool FUSE_RELU, + QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR, typename outT = std::uint8_t, typename inT = std::int32_t, typename nextOPType = DoNothing> @@ -864,13 +865,15 @@ class ReQuantizeOutput { using inpType = inT; ReQuantizeOutput( nextOPType& nextop, - float C_multiplier, + const float* C_multiplier, std::int32_t C_zero_point, std::int32_t Aq_zero_point, - std::int32_t Bq_zero_point, + const std::int32_t* Bq_zero_point, const std::int32_t* row_offsets, const std::int32_t* col_offsets, - const std::int32_t* bias) + const std::int32_t* bias, + std::uint32_t nCol, + int groups = 1) : nextop_(nextop), C_multiplier_(C_multiplier), C_zero_point_(C_zero_point), @@ -878,7 +881,9 @@ class ReQuantizeOutput { Bq_zero_point_(Bq_zero_point), q_row_offsets_(row_offsets), q_col_offsets_(col_offsets), - bias_(bias) {} + bias_(bias), + ncols_(nCol), + groups_(groups) {} template inline int f( @@ -898,13 +903,15 @@ class ReQuantizeOutput { int ld_in) const; nextOPType& nextop_; - float C_multiplier_; + const float* C_multiplier_; std::int32_t C_zero_point_; std::int32_t Aq_zero_point_; - std::int32_t Bq_zero_point_; + const std::int32_t* Bq_zero_point_; const std::int32_t* q_row_offsets_; const std::int32_t* q_col_offsets_; const std::int32_t* bias_; + std::uint32_t ncols_; + int groups_; }; /** @@ -913,6 +920,7 @@ class ReQuantizeOutput { */ template < bool FUSE_RELU, + QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR, typename outT = float, typename inT = std::int32_t, typename nextOPType = DoNothing> @@ -923,12 +931,14 @@ class ReQuantizeForFloat { ReQuantizeForFloat( nextOPType& nextop, float Aq_scale, - float Bq_scale, + const float* Bq_scale, std::int32_t Aq_zero_point, - std::int32_t Bq_zero_point, + const std::int32_t* Bq_zero_point, const std::int32_t* row_offsets, const std::int32_t* col_offsets, - const float* bias) + const float* bias, + std::uint32_t nCol, + int groups = 1) : nextop_(nextop), Aq_scale_(Aq_scale), Bq_scale_(Bq_scale), @@ -936,7 +946,9 @@ class ReQuantizeForFloat { Bq_zero_point_(Bq_zero_point), q_row_offsets_(row_offsets), q_col_offsets_(col_offsets), - bias_(bias) {} + bias_(bias), + ncols_(nCol), + groups_(groups) {} template inline int f( @@ -948,12 +960,15 @@ class ReQuantizeForFloat { private: nextOPType& nextop_; - float Aq_scale_, Bq_scale_; + float Aq_scale_; + const float* Bq_scale_; std::int32_t Aq_zero_point_; - std::int32_t Bq_zero_point_; + const std::int32_t* Bq_zero_point_; const std::int32_t* q_row_offsets_; const std::int32_t* q_col_offsets_; const float* bias_; + std::uint32_t 
ncols_; + int groups_; }; // type specialized implementation in an include file diff --git a/include/fbgemm/OutputProcessing-inl.h b/include/fbgemm/OutputProcessing-inl.h index 59a6e0e6a1..88a10bc3df 100644 --- a/include/fbgemm/OutputProcessing-inl.h +++ b/include/fbgemm/OutputProcessing-inl.h @@ -44,9 +44,14 @@ inline int DoSpmdmOnInpBuffer::f( return nextop_.template f(out, inp, block, ld_out, ld_in); } -template +template < + bool FUSE_RELU, + QuantizationGranularity Q_GRAN, + typename outT, + typename inT, + typename nextOPType> template -void ReQuantizeOutput::f_( +void ReQuantizeOutput::f_( outT* out, const inT* inp, const block_type_t& block, @@ -54,7 +59,13 @@ void ReQuantizeOutput::f_( int ld_in) const { // Adoption of implementation at QNNPACK/src/requantization/fp32-sse2.c // using AVX2 instructions - __m256 multiplier_v = _mm256_set1_ps(C_multiplier_); + int quant_param_idx = 0; + if (Q_GRAN == QuantizationGranularity::GROUP) { + int ncol_per_group = ncols_ / groups_; + int g = block.col_start / ncol_per_group; + quant_param_idx = g; + } + __m256 multiplier_v = _mm256_set1_ps(C_multiplier_[quant_param_idx]); __m256i min_v = _mm256_set1_epi8(std::numeric_limits::min()); __m256i max_v = _mm256_set1_epi8(std::numeric_limits::max()); @@ -63,7 +74,9 @@ void ReQuantizeOutput::f_( (A_SYMMETRIC == (Aq_zero_point_ == 0)) && "A_SYMMETRIC == true if and only if Aq_zero_point == 0"); assert( - (B_SYMMETRIC == (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr)) && + (B_SYMMETRIC == + ((Q_GRAN == QuantizationGranularity::TENSOR && Bq_zero_point_[0] == 0) || + q_row_offsets_ == nullptr)) && "B_SYMMETRIC == true if and only if Bq_zero_point == 0 " "or q_row_offsets_ == nullptr"); assert( @@ -79,10 +92,22 @@ void ReQuantizeOutput::f_( constexpr int VLEN = 8; for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { - std::int32_t row_offset = (q_row_offsets_ && !B_SYMMETRIC) - ? 
q_row_offsets_[i - block.row_start] * Bq_zero_point_ - : 0; + // Scale row_offset with Bq_zero_point + int32_t row_offset = 0; + if (B_SYMMETRIC) { + row_offset = 0; + } else if ( + Q_GRAN == QuantizationGranularity::TENSOR || + Q_GRAN == QuantizationGranularity::GROUP) { + row_offset = + q_row_offsets_[i - block.row_start] * Bq_zero_point_[quant_param_idx]; + } else { + assert( + Q_GRAN == QuantizationGranularity::OUT_CHANNEL && + "unknown quantization granularity"); + } __m256i row_offset_v = _mm256_set1_epi32(row_offset); + int j = block.col_start; for (; j < block.col_start + (block.col_size / (VLEN * 4) * (VLEN * 4)); j += (VLEN * 4)) { @@ -122,9 +147,33 @@ void ReQuantizeOutput::f_( } if (!B_SYMMETRIC) { + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset_v = _mm256_mullo_epi32( + _mm256_set1_epi32(q_row_offsets_[i - block.row_start]), + _mm256_loadu_si256( + reinterpret_cast(Bq_zero_point_ + j))); + } x_v = _mm256_sub_epi32(x_v, row_offset_v); + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset_v = _mm256_mullo_epi32( + _mm256_set1_epi32(q_row_offsets_[i - block.row_start]), + _mm256_loadu_si256( + reinterpret_cast(Bq_zero_point_ + j + VLEN))); + } y_v = _mm256_sub_epi32(y_v, row_offset_v); + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset_v = _mm256_mullo_epi32( + _mm256_set1_epi32(q_row_offsets_[i - block.row_start]), + _mm256_loadu_si256(reinterpret_cast( + Bq_zero_point_ + j + 2 * VLEN))); + } z_v = _mm256_sub_epi32(z_v, row_offset_v); + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset_v = _mm256_mullo_epi32( + _mm256_set1_epi32(q_row_offsets_[i - block.row_start]), + _mm256_loadu_si256(reinterpret_cast( + Bq_zero_point_ + j + 3 * VLEN))); + } w_v = _mm256_sub_epi32(w_v, row_offset_v); } if (HAS_BIAS) { @@ -157,10 +206,24 @@ void ReQuantizeOutput::f_( * representation as an FP32 value, and will be rounded to nearest * FP32 value with ties to even with default MXCSR rounding mode. */ - __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v); - __m256 y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(y_v), multiplier_v); - __m256 z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(z_v), multiplier_v); - __m256 w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(w_v), multiplier_v); + __m256 x_scaled_v, y_scaled_v, z_scaled_v, w_scaled_v; + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + x_scaled_v = _mm256_mul_ps( + _mm256_cvtepi32_ps(x_v), _mm256_loadu_ps(C_multiplier_ + j)); + y_scaled_v = _mm256_mul_ps( + _mm256_cvtepi32_ps(y_v), _mm256_loadu_ps(C_multiplier_ + j + VLEN)); + z_scaled_v = _mm256_mul_ps( + _mm256_cvtepi32_ps(z_v), + _mm256_loadu_ps(C_multiplier_ + j + 2 * VLEN)); + w_scaled_v = _mm256_mul_ps( + _mm256_cvtepi32_ps(w_v), + _mm256_loadu_ps(C_multiplier_ + j + 3 * VLEN)); + } else { + x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v); + y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(y_v), multiplier_v); + z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(z_v), multiplier_v); + w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(w_v), multiplier_v); + } /* * Convert scaled FP32 result to int32_t using CVTPS2DQ instruction. 
@@ -238,6 +301,12 @@ void ReQuantizeOutput::f_( } if (!B_SYMMETRIC) { + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset_v = _mm256_mullo_epi32( + _mm256_set1_epi32(q_row_offsets_[i - block.row_start]), + _mm256_loadu_si256( + reinterpret_cast(Bq_zero_point_ + j))); + } x_v = _mm256_sub_epi32(x_v, row_offset_v); } if (HAS_BIAS) { @@ -246,7 +315,13 @@ void ReQuantizeOutput::f_( _mm256_loadu_si256(reinterpret_cast(bias_ + j))); } - __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v); + __m256 x_scaled_v; + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + x_scaled_v = _mm256_mul_ps( + _mm256_cvtepi32_ps(x_v), _mm256_loadu_ps(C_multiplier_ + j)); + } else { + x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(x_v), multiplier_v); + } __m256i x_rounded_v = _mm256_cvtps_epi32(x_scaled_v); __m256i x_packed_v = _mm256_adds_epi16( @@ -281,19 +356,26 @@ void ReQuantizeOutput::f_( _mm256_castsi256_si128(x_clamped_v)); } // j loop vectorized + // TODO: vectorize remainder using masking for (; j < block.col_start + block.col_size; ++j) { int32_t raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)]; if (!A_SYMMETRIC) { raw -= Aq_zero_point_ * q_col_offsets_[j]; } if (!B_SYMMETRIC) { + if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + row_offset = q_row_offsets_[i - block.row_start] * Bq_zero_point_[j]; + } raw -= row_offset; } if (HAS_BIAS) { raw += bias_[j]; } - float ab = raw * C_multiplier_; + float ab = raw * + ((Q_GRAN == QuantizationGranularity::OUT_CHANNEL) + ? C_multiplier_[j] + : C_multiplier_[quant_param_idx]); long rounded = std::lrintf(ab) + C_zero_point_; out[i * ld_out + j] = std::max( @@ -303,9 +385,14 @@ void ReQuantizeOutput::f_( } // i loop } -template +template < + bool FUSE_RELU, + QuantizationGranularity Q_GRAN, + typename outT, + typename inT, + typename nextOPType> template -inline int ReQuantizeOutput::f( +inline int ReQuantizeOutput::f( outT* out, const inT* inp, const block_type_t& block, @@ -314,19 +401,35 @@ inline int ReQuantizeOutput::f( static_assert( std::is_same::value, "input data type must be of int32_t type"); + int ncol_per_group = ncols_ / groups_; + assert( + block.col_size <= ncol_per_group && + "ReQuantizeOutput should be called at most 1 group at a time."); + int g = block.col_start / ncol_per_group; if (instSet == inst_set_t::anyarch) { for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { inT raw = inp[(i - block.row_start) * ld_in + (j - block.col_start)]; raw -= Aq_zero_point_ * q_col_offsets_[j]; + int Bq_zero_point_idx; + if (Q_GRAN == QuantizationGranularity::TENSOR) { + Bq_zero_point_idx = 0; + } else if (Q_GRAN == QuantizationGranularity::GROUP) { + Bq_zero_point_idx = g; + } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + Bq_zero_point_idx = j; + } else { + assert(false && "unknown quantization granularity"); + } if (q_row_offsets_) { - raw -= q_row_offsets_[i - block.row_start] * Bq_zero_point_; + raw -= q_row_offsets_[i - block.row_start] * + Bq_zero_point_[Bq_zero_point_idx]; } if (bias_) { raw += bias_[j]; } - float ab = raw * C_multiplier_; + float ab = raw * C_multiplier_[Bq_zero_point_idx]; long rounded = std::lrintf(ab) + C_zero_point_; out[i * ld_out + j] = std::max( @@ -336,8 +439,11 @@ inline int ReQuantizeOutput::f( } } else if (instSet == inst_set_t::avx2 || instSet == inst_set_t::avx512) { if (std::is_same::value) { + bool b_symmetric = (Q_GRAN == 
QuantizationGranularity::TENSOR && + Bq_zero_point_[0] == 0) || + q_row_offsets_ == nullptr; if (Aq_zero_point_ == 0) { - if (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr) { + if (b_symmetric) { if (bias_ == nullptr) { f_(out, inp, block, ld_out, ld_in); } else { @@ -351,7 +457,7 @@ inline int ReQuantizeOutput::f( } } } else { - if (Bq_zero_point_ == 0 || q_row_offsets_ == nullptr) { + if (b_symmetric) { if (bias_ == nullptr) { f_(out, inp, block, ld_out, ld_in); } else { @@ -374,9 +480,14 @@ inline int ReQuantizeOutput::f( return nextop_.template f(out, out, block, ld_out, ld_out); } -template +template < + bool FUSE_RELU, + QuantizationGranularity Q_GRAN, + typename outT, + typename inT, + typename nextOPType> template -inline int ReQuantizeForFloat::f( +inline int ReQuantizeForFloat::f( outT* out, inT* inp, const block_type_t& block, @@ -388,12 +499,28 @@ inline int ReQuantizeForFloat::f( static_assert( std::is_same::value, "output data type is of not expected type"); + int ncol_per_group = ncols_ / groups_; + assert( + block.col_size <= ncol_per_group && + "ReQuantizeOutput should be called at most 1 group at a time."); + int g = block.col_start / ncol_per_group; for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { inT raw = inp[(i - block.row_start) * ld_in + j - block.col_start]; raw -= Aq_zero_point_ * q_col_offsets_[j]; - raw -= q_row_offsets_[i - block.row_start] * Bq_zero_point_; - float res = raw * Aq_scale_ * Bq_scale_; + int Bq_zero_point_idx; + if (Q_GRAN == QuantizationGranularity::TENSOR) { + Bq_zero_point_idx = 0; + } else if (Q_GRAN == QuantizationGranularity::GROUP) { + Bq_zero_point_idx = g; + } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + Bq_zero_point_idx = j; + } else { + assert(false && "unknown quantization granularity"); + } + raw -= q_row_offsets_[i - block.row_start] * + Bq_zero_point_[Bq_zero_point_idx]; + float res = raw * Aq_scale_ * Bq_scale_[Bq_zero_point_idx]; if (bias_) { res += bias_[j]; } diff --git a/include/fbgemm/QuantUtils.h b/include/fbgemm/QuantUtils.h index bfd63de69c..3abcb7a052 100644 --- a/include/fbgemm/QuantUtils.h +++ b/include/fbgemm/QuantUtils.h @@ -6,8 +6,6 @@ #include #include -#include - namespace fbgemm { // Copied from gemmlowp diff --git a/src/ExecuteKernelU8S8.cc b/src/ExecuteKernelU8S8.cc index 2e2035cee2..f1ec882a06 100644 --- a/src/ExecuteKernelU8S8.cc +++ b/src/ExecuteKernelU8S8.cc @@ -240,47 +240,60 @@ void ExecuteKernel< } // for each j block } -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; - -template class ExecuteKernel< - PackAWithQuantRowOffset, - PackBMatrix, - float, - ReQuantizeForFloat>; - -template class ExecuteKernel< - PackAWithQuantRowOffset, - PackBMatrix, - float, - ReQuantizeForFloat>; - -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - float, - ReQuantizeForFloat>; - -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - float, - ReQuantizeForFloat>; - -template class ExecuteKernel< - PackAMatrix, - PackBMatrix, - int32_t, - memCopy<>>; +//////////////////////////////////////////////////////////////////////////////// +// ReQuantizeOutput +#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithRowOffset, \ + PackBMatrix, \ + uint8_t, \ + ReQuantizeOutput>; + +#define 
INSTANTIATE_Q_GRANS(ACC_T, RELU) \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_Q_GRANS(ACC_T, false); \ + INSTANTIATE_Q_GRANS(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithIm2Col, \ + PackBMatrix, \ + uint8_t, \ + ReQuantizeOutput>; + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_SPATIAL_DIM(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template class ExecuteKernel< PackAMatrix, @@ -288,110 +301,127 @@ template class ExecuteKernel< uint8_t, ReQuantizeOutput>; -template class ExecuteKernel< - PackAMatrix, - PackBMatrix, - int32_t, - memCopy<>>; +//////////////////////////////////////////////////////////////////////////////// +// ReQuantizeForFloat +#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PACK_A, \ + PackBMatrix, \ + float, \ + ReQuantizeForFloat>; + +#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_RELU(PACK_A) \ + INSTANTIATE_Q_GRANS(PACK_A, false); \ + INSTANTIATE_Q_GRANS(PACK_A, true); + +INSTANTIATE_RELU(PackAWithRowOffset); +INSTANTIATE_RELU(PackAWithQuantRowOffset); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithIm2Col, \ + PackBMatrix, \ + float, \ + ReQuantizeForFloat>; + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_SPATIAL_DIM(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template class ExecuteKernel< PackAWithRowOffset, PackBMatrix, - uint8_t, - DoSpmdmOnInpBuffer< - ReQuantizeOutput::outType, - int32_t, - ReQuantizeOutput>>; + float, + ReQuantizeForFloat>; -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - 
uint8_t, - DoSpmdmOnInpBuffer< - ReQuantizeOutput::outType, - int32_t, - ReQuantizeOutput>>; +//////////////////////////////////////////////////////////////////////////////// +// DoSpmdmOnInpBuffer +#define INSTANTIATE_BASE(RELU, Q_GRAN) \ + template class ExecuteKernel< \ + PackAWithRowOffset, \ + PackBMatrix, \ + uint8_t, \ + DoSpmdmOnInpBuffer>>; -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - float, - DoSpmdmOnInpBuffer< - ReQuantizeForFloat::outType, - int32_t, - ReQuantizeForFloat>>; +#define INSTANTIATE_Q_GRANS(RELU) \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; +INSTANTIATE_Q_GRANS(false); +INSTANTIATE_Q_GRANS(true); -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template class ExecuteKernel< PackAWithRowOffset, PackBMatrix, - int32_t, - memCopy<>>; + float, + DoSpmdmOnInpBuffer>>; -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - int32_t, - memCopy<>>; +//////////////////////////////////////////////////////////////////////////////// +// memCopy +#define INSTANTIATE_BASE(PACK_A, ACC_T) \ + template class ExecuteKernel< \ + PACK_A, \ + PackBMatrix, \ + int32_t, \ + memCopy<>>; -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - int32_t, - memCopy<>>; +#define INSTANTIATE_ACC_T(PACK_A) \ + INSTANTIATE_BASE(PACK_A, int32_t) \ + INSTANTIATE_BASE(PACK_A, int16_t) -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; +INSTANTIATE_ACC_T(PackAMatrix); +INSTANTIATE_ACC_T(PackAWithRowOffset); -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; +#undef INSTANTIATE_ACC_T +#undef INSTANTIATE_BASE -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - int32_t, - memCopy<>>; - -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - int32_t, - memCopy<>>; +#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \ + template class ExecuteKernel< \ + PackAWithIm2Col, \ + PackBMatrix, \ + int32_t, \ + memCopy<>>; -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - int32_t, - memCopy<>>; +#define INSTANTIATE_SPATIAL_DIM(ACC_T) \ + INSTANTIATE_BASE(ACC_T, 2); \ + INSTANTIATE_BASE(ACC_T, 3); -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; +INSTANTIATE_SPATIAL_DIM(int32_t); +INSTANTIATE_SPATIAL_DIM(int16_t); -template class ExecuteKernel< - PackAWithIm2Col, - PackBMatrix, - uint8_t, - ReQuantizeOutput>; +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_BASE template class ExecuteKernel< PackAWithQuantRowOffset, @@ -399,12 +429,6 @@ template class ExecuteKernel< int32_t, memCopy<>>; -template class ExecuteKernel< - PackAWithRowOffset, - PackBMatrix, - float, - ReQuantizeForFloat>; - template class ExecuteKernel< PackAMatrix, PackBMatrix, diff --git a/src/Fbgemm.cc b/src/Fbgemm.cc index 0039dafe14..a8bf02f1e1 100644 --- a/src/Fbgemm.cc +++ b/src/Fbgemm.cc @@ -198,149 +198,70 @@ bool fbgemmSupportedCPU() { return (cpuinfo_initialize() && cpuinfo_has_x86_avx2()); } -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& packA, - PackMatrix, int8_t, int32_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput& 
outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& packA, - PackMatrix, int8_t, int32_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& - packA, - PackMatrix, int8_t, int32_t>& packB, - float* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeForFloat& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& - packA, - PackMatrix, int8_t, int32_t>& packB, - float* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeForFloat& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& packA, - PackMatrix, int8_t, int32_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& packA, - PackMatrix, int8_t, int32_t>& packB, - float* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeForFloat& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& packA, - PackMatrix, int8_t, int32_t>& packB, - float* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeForFloat& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& packA, - PackMatrix, int8_t, int32_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& packA, - PackMatrix, int8_t, int32_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& packA, - PackMatrix, int8_t, int32_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& packA, - PackMatrix, int8_t, int32_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& packA, - PackMatrix, int8_t, int32_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int32_t>& - packA, - PackMatrix, int8_t, int32_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); - -// 16 bit accumulation functions -template void fbgemmPacked( - PackMatrix, uint8_t, int16_t>& packA, - PackMatrix, int8_t, int16_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); +//////////////////////////////////////////////////////////////////////////////// +// ReQuantizeOutput +#define INSTANTIATE_BASE(ACC_T, RELU, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix, uint8_t, ACC_T>& packA, \ + PackMatrix, int8_t, ACC_T>& packB, \ + uint8_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const ReQuantizeOutput& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_Q_GRANS(ACC_T, 
RELU) \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(ACC_T, RELU, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_Q_GRANS(ACC_T, false); \ + INSTANTIATE_Q_GRANS(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix< \ + PackAWithIm2Col, \ + uint8_t, \ + ACC_T>& packA, \ + PackMatrix, int8_t, ACC_T>& packB, \ + uint8_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const ReQuantizeOutput& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_SPATIAL_DIM(ACC_T, RELU) \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_SPATIAL_DIM(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template void fbgemmPacked( PackMatrix, uint8_t, int16_t>& packA, @@ -352,28 +273,109 @@ template void fbgemmPacked( int thread_id, int num_threads); -template void fbgemmPacked( - PackMatrix, uint8_t, int16_t>& packA, - PackMatrix, int8_t, int16_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const DoSpmdmOnInpBuffer>& - outProcess, - int thread_id, - int num_threads); +//////////////////////////////////////////////////////////////////////////////// +// ReQuantizeForFloat +#define INSTANTIATE_BASE(PACK_A, RELU, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix, uint8_t, int32_t>& packA, \ + PackMatrix, int8_t, int32_t>& packB, \ + float* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const ReQuantizeForFloat& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_Q_GRANS(PACK_A, RELU) \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(PACK_A, RELU, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_RELU(PACK_A) \ + INSTANTIATE_Q_GRANS(PACK_A, false); \ + INSTANTIATE_Q_GRANS(PACK_A, true); + +INSTANTIATE_RELU(PackAWithRowOffset); +INSTANTIATE_RELU(PackAWithQuantRowOffset); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + +#define INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix< \ + PackAWithIm2Col, \ + uint8_t, \ + ACC_T>& packA, \ + PackMatrix, int8_t, ACC_T>& packB, \ + float* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const ReQuantizeForFloat& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_Q_GRANS(ACC_T, RELU, SPATIAL_DIM) \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE( \ + ACC_T, RELU, SPATIAL_DIM, QuantizationGranularity::OUT_CHANNEL); + +#define INSTANTIATE_SPATIAL_DIM(ACC_T, 
RELU) \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 2); \ + INSTANTIATE_Q_GRANS(ACC_T, RELU, 3); + +#define INSTANTIATE_RELU(ACC_T) \ + INSTANTIATE_SPATIAL_DIM(ACC_T, false); \ + INSTANTIATE_SPATIAL_DIM(ACC_T, true); + +INSTANTIATE_RELU(int32_t); +INSTANTIATE_RELU(int16_t); + +#undef INSTANTIATE_RELU +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE template void fbgemmPacked( PackMatrix, uint8_t, int16_t>& packA, PackMatrix, int8_t, int16_t>& packB, - uint8_t* C, + float* C, int32_t* C_buffer, uint32_t ldc, - const DoSpmdmOnInpBuffer>& - outProcess, + const ReQuantizeForFloat& outProcess, int thread_id, int num_threads); +//////////////////////////////////////////////////////////////////////////////// +// DoSpmdmOnInpBuffer +#define INSTANTIATE_BASE(RELU, Q_GRAN) \ + template void fbgemmPacked( \ + PackMatrix, uint8_t, int16_t>& \ + packA, \ + PackMatrix, int8_t, int16_t>& packB, \ + uint8_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const DoSpmdmOnInpBuffer< \ + uint8_t, \ + int32_t, \ + ReQuantizeOutput>& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_Q_GRANS(RELU) \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::TENSOR); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::GROUP); \ + INSTANTIATE_BASE(RELU, QuantizationGranularity::OUT_CHANNEL); + +INSTANTIATE_Q_GRANS(false); +INSTANTIATE_Q_GRANS(true); + +#undef INSTANTIATE_Q_GRANS +#undef INSTANTIATE_BASE + template void fbgemmPacked( PackMatrix, uint8_t, int16_t>& packA, PackMatrix, int8_t, int16_t>& packB, @@ -385,49 +387,57 @@ template void fbgemmPacked( int thread_id, int num_threads); -template void fbgemmPacked( - PackMatrix, uint8_t, int16_t>& packA, - PackMatrix, int8_t, int16_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int16_t>& packA, - PackMatrix, int8_t, int16_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int16_t>& packA, - PackMatrix, int8_t, int16_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); +//////////////////////////////////////////////////////////////////////////////// +// memCopy +#define INSTANTIATE_BASE(PACK_A, ACC_T) \ + template void fbgemmPacked( \ + PackMatrix, uint8_t, ACC_T>& packA, \ + PackMatrix, int8_t, ACC_T>& packB, \ + int32_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const memCopy<>& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_ACC_T(PACK_A) \ + INSTANTIATE_BASE(PACK_A, int32_t) \ + INSTANTIATE_BASE(PACK_A, int16_t) + +INSTANTIATE_ACC_T(PackAMatrix); +INSTANTIATE_ACC_T(PackAWithRowOffset); + +#undef INSTANTIATE_ACC_T +#undef INSTANTIATE_BASE + +#define INSTANTIATE_BASE(ACC_T, SPATIAL_DIM) \ + template void fbgemmPacked( \ + PackMatrix< \ + PackAWithIm2Col, \ + uint8_t, \ + ACC_T>& packA, \ + PackMatrix, int8_t, ACC_T>& packB, \ + int32_t* C, \ + int32_t* C_buffer, \ + uint32_t ldc, \ + const memCopy<>& outProcess, \ + int thread_id, \ + int num_threads); + +#define INSTANTIATE_SPATIAL_DIM(ACC_T) \ + INSTANTIATE_BASE(ACC_T, 2); \ + INSTANTIATE_BASE(ACC_T, 3); + +INSTANTIATE_SPATIAL_DIM(int32_t); +INSTANTIATE_SPATIAL_DIM(int16_t); + +#undef INSTANTIATE_SPATIAL_DIM +#undef INSTANTIATE_BASE template void fbgemmPacked( - PackMatrix, 
uint8_t, int16_t>& packA, - PackMatrix, int8_t, int16_t>& packB, - int32_t* C, - int32_t* C_buffer, - uint32_t ldc, - const memCopy<>& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int16_t>& packA, - PackMatrix, int8_t, int16_t>& packB, + PackMatrix, uint8_t, int32_t>& + packA, + PackMatrix, int8_t, int32_t>& packB, int32_t* C, int32_t* C_buffer, uint32_t ldc, @@ -435,26 +445,6 @@ template void fbgemmPacked( int thread_id, int num_threads); -template void fbgemmPacked( - PackMatrix, uint8_t, int16_t>& packA, - PackMatrix, int8_t, int16_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput& outProcess, - int thread_id, - int num_threads); - -template void fbgemmPacked( - PackMatrix, uint8_t, int16_t>& packA, - PackMatrix, int8_t, int16_t>& packB, - uint8_t* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeOutput& outProcess, - int thread_id, - int num_threads); - template void fbgemmPacked( PackMatrix, uint8_t, int16_t>& packA, PackMatrix, int8_t, int16_t>& packB, @@ -465,14 +455,4 @@ template void fbgemmPacked( int thread_id, int num_threads); -template void fbgemmPacked( - PackMatrix, uint8_t, int16_t>& packA, - PackMatrix, int8_t, int16_t>& packB, - float* C, - int32_t* C_buffer, - uint32_t ldc, - const ReQuantizeForFloat& outProcess, - int thread_id, - int num_threads); - } // namespace fbgemm diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc index 988a27be4b..9487974e07 100644 --- a/src/PackAMatrix.cc +++ b/src/PackAMatrix.cc @@ -20,14 +20,8 @@ PackAMatrix::PackAMatrix( const T* smat, int32_t ld, inpType* pmat, - int groups, - std::int32_t zero_pt) - : PackMatrix, T, accT>( - nRow, - nCol, - pmat, - groups, - zero_pt), + int groups) + : PackMatrix, T, accT>(nRow, nCol, pmat, groups), trans_(trans), smat_(smat), ld_(ld) { diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc index 9929fc1124..367a7902c7 100644 --- a/src/PackAWithIm2Col.cc +++ b/src/PackAWithIm2Col.cc @@ -36,10 +36,10 @@ PackAWithIm2Col::PackAWithIm2Col( std::multiplies()) * conv_p.IC, pmat, - conv_p.G, - zero_pt), + conv_p.G), conv_p_(conv_p), - sdata_(sdata) { + sdata_(sdata), + zero_pt_(zero_pt) { static_assert( SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "unsupported conv dimension "); if (cpuinfo_has_x86_avx512f()) { @@ -187,7 +187,7 @@ void PackAWithIm2Col::pack(const block_type_t& block) { std::memset( out + (i - block.row_start) * BaseType::blockColSize() + (j_blk_start - block.col_start), - BaseType::zeroPoint(), + zero_pt_, sizeof(T) * (j_blk_end - j_blk_start)); } else { std::memcpy( @@ -239,7 +239,7 @@ void PackAWithIm2Col::pack(const block_type_t& block) { &out [(i - block.row_start) * BaseType::blockColSize() + (j_blk_start - block.col_start)], - BaseType::zeroPoint(), + zero_pt_, sizeof(T) * (j_blk_end - j_blk_start)); } else { std::memcpy( diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc index 9eeee43f9f..c1e5b075da 100644 --- a/src/PackAWithQuantRowOffset.cc +++ b/src/PackAWithQuantRowOffset.cc @@ -31,12 +31,12 @@ PackAWithQuantRowOffset::PackAWithQuantRowOffset( nRow, nCol, pmat, - groups, - zero_pt), + groups), trans_(trans), smat_(smat), ld_(ld), scale_(scale), + zero_pt_(zero_pt), row_offset_(row_offset) { rowOffsetAllocatedHere = false; @@ -158,7 +158,7 @@ void PackAWithQuantRowOffset::pack(const block_type_t& block) { for (; j < block.col_size / VLEN * VLEN; j += VLEN) { __m256 val_v = _mm256_loadu_ps(smat_temp + i * ld_temp + j); __m256 transformed_v = _mm256_fmadd_ps( 
- val_v, inverse_scale_v, _mm256_set1_ps(BaseType::zeroPoint())); + val_v, inverse_scale_v, _mm256_set1_ps(zero_pt_)); __m256 clipped_v = _mm256_max_ps( _mm256_set1_ps(std::numeric_limits::min()), _mm256_min_ps( @@ -180,7 +180,7 @@ void PackAWithQuantRowOffset::pack(const block_type_t& block) { #endif for (; j < block.col_size; ++j) { float val = smat_temp[i * ld_temp + j]; - float transformed = val / scale_ + BaseType::zeroPoint(); + float transformed = val / scale_ + zero_pt_; float clipped = std::min( std::max(transformed, std::numeric_limits::min()), std::numeric_limits::max()); diff --git a/src/PackAWithRowOffset.cc b/src/PackAWithRowOffset.cc index 39985a750b..1af1fa5a68 100644 --- a/src/PackAWithRowOffset.cc +++ b/src/PackAWithRowOffset.cc @@ -23,14 +23,12 @@ PackAWithRowOffset::PackAWithRowOffset( uint32_t ld, inpType* pmat, int groups, - int32_t zero_pt, int32_t* row_offset) : PackMatrix, T, accT>( nRow, nCol, pmat, - groups, - zero_pt), + groups), trans_(trans), smat_(smat), ld_(ld), diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc index 1bb7d4bb80..1b3899771d 100644 --- a/src/PackBMatrix.cc +++ b/src/PackBMatrix.cc @@ -20,14 +20,8 @@ PackBMatrix::PackBMatrix( const T* smat, int32_t ld, inpType* pmat, - int groups, - std::int32_t zero_pt) - : PackMatrix, T, accT>( - nRow, - nCol, - pmat, - groups, - zero_pt), + int groups) + : PackMatrix, T, accT>(nRow, nCol, pmat, groups), trans_(trans), smat_(smat), ld_(ld) { @@ -75,7 +69,7 @@ void PackBMatrix::pack(const block_type_t& block) { g * this->packedBufferSize(block.row_size, block.col_size); for (int i = block.row_start; i < block.row_start + block.row_size; ++i) { for (int j = block.col_start; j < block.col_start + block.col_size; ++j) { - T val = tr ? smat_[g * block.row_size + i + ld_ * j] + T val = tr ? 
smat_[i + (g * block.col_size + j) * ld_] : smat_[(g * block.row_size + i) * ld_ + j]; out[addr(i, j)] = tconv(val, out[addr(i, j)]); } @@ -162,8 +156,7 @@ bool PackBMatrix::metaEquals(const PackBMatrix& that) const { BaseType::blockCols() != that.blockCols() || BaseType::numPackedRows() != that.numPackedRows() || BaseType::numPackedCols() != that.numPackedCols() || - BaseType::zeroPoint() != that.zeroPoint() || trans_ != that.trans_ || - BaseType::numGroups() != that.numGroups() || + trans_ != that.trans_ || BaseType::numGroups() != that.numGroups() || row_interleave_ != that.row_interleave_) { return false; } diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc index a57705776b..0177a070f6 100644 --- a/src/PackMatrix.cc +++ b/src/PackMatrix.cc @@ -18,9 +18,8 @@ PackMatrix::PackMatrix( int32_t rows, int32_t cols, inpType* buf, - int groups, - int32_t zero_pt) - : buf_(buf), nrows_(rows), ncols_(cols), G_(groups), zero_pt_(zero_pt) { + int groups) + : buf_(buf), nrows_(rows), ncols_(cols), G_(groups) { bufAllocatedHere_ = false; if (!cpuinfo_initialize()) { throw std::runtime_error("Failed to initialize cpuinfo!"); diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc index 6ec3c414e4..50f619a317 100644 --- a/src/QuantUtils.cc +++ b/src/QuantUtils.cc @@ -1,5 +1,6 @@ #include "fbgemm/QuantUtils.h" +#include #include #include "fbgemm/Fbgemm.h" @@ -435,13 +436,14 @@ void RequantizeAvx2( DoNothing<> doNothingObj{}; ReQuantizeOutput requantizeObj( doNothingObj, - params.real_multiplier, + ¶ms.real_multiplier, params.target_qparams.zero_point, 0, 0, nullptr, nullptr, - nullptr); + nullptr, + len); requantizeObj.f(dst, src, {0, 1, 0, len}, 0, 0); } #endif diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc index 097e3b5207..369aea3076 100644 --- a/src/RefImplementations.cc +++ b/src/RefImplementations.cc @@ -57,24 +57,25 @@ void requantize_u8acc32_ref( int ld, const int32_t* inp, uint8_t* out, - float C_multiplier, + const float* C_multiplier, int32_t C_zero_point, int32_t A_zero_point, - int32_t B_zero_point, + const int32_t* B_zero_point, const int32_t* row_offsets, const int32_t* col_offsets, const int32_t* bias, + int ncols_per_quant_group, bool fuse_relu) { for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { int32_t raw = inp[i * ld + j]; raw -= A_zero_point * col_offsets[j]; - raw -= B_zero_point * row_offsets[i]; + raw -= B_zero_point[j / ncols_per_quant_group] * row_offsets[i]; if (bias) { raw += bias[j]; } - float result = raw * C_multiplier; + float result = raw * C_multiplier[j / ncols_per_quant_group]; long rounded = lrintf(result) + C_zero_point; out[i * ld + j] = std::max( fuse_relu ? static_cast(C_zero_point) : 0l, @@ -180,14 +181,15 @@ void col_offsets_with_zero_pt_s8acc32_ref( int N, int ld, const int8_t* Bint8, - int32_t B_zero_point, - int32_t* col_offsets) { + const int32_t* B_zero_point, + int32_t* col_offsets, + int ncols_per_quant_group) { for (int j = 0; j < N; ++j) { int32_t sum = 0; for (int k = 0; k < K; ++k) { sum += Bint8[k * ld + j]; } - col_offsets[j] = sum - B_zero_point * K; + col_offsets[j] = sum - B_zero_point[j / ncols_per_quant_group] * K; } } @@ -578,13 +580,14 @@ void depthwise_3x3_pad_1_ref( 1, C_int32.data() + i * K + k, C + i * K + k, - C_multiplier, + &C_multiplier, C_zero_point, A_zero_point, - B_zero_point, + &B_zero_point, &row_offsets[i * K + k], col_offsets + k, - bias ? bias + k : nullptr); + bias ? 
bias + k : nullptr, + 1); } } }; diff --git a/src/RefImplementations.h b/src/RefImplementations.h index cec4bfffa8..6530eff6ad 100644 --- a/src/RefImplementations.h +++ b/src/RefImplementations.h @@ -39,6 +39,11 @@ void requantize_u8acc32_ref( * @brief Reference implementation of requantization step. * float multiplier * @params bias can be nullptr + * @params ncols_per_quant_group the number of columns that share the same + * quantization parameter. + * ncols_per_quant_group == N : per-tensor quantization + * ncols_per_quant_group == N / groups : per-group quantization + * ncols_per_quant_group == 1 : per-channel quantization */ void requantize_u8acc32_ref( int M, @@ -46,13 +51,14 @@ void requantize_u8acc32_ref( int ld, const std::int32_t* inp, std::uint8_t* out, - float C_multiplier, + const float* C_multiplier, std::int32_t C_zero_point, std::int32_t A_zero_point, - std::int32_t B_zero_point, + const std::int32_t* B_zero_point, const std::int32_t* row_offsets, const std::int32_t* col_offsets, const std::int32_t* bias, + int ncols_per_quant_group, bool fuse_relu = false); /** @@ -114,14 +120,18 @@ void row_offsets_u8acc32_ref( /** * @brief Reference implementation to compute adjusted col_offsets (sum of * columns of B and adjusted with B_zero_point) + * + * @params ncols_per_quant_group see ncols_per_quant_group in + * requantize_u8acc32_ref */ void col_offsets_with_zero_pt_s8acc32_ref( int K, int N, int ld, const std::int8_t* Bint8, - std::int32_t B_zero_point, - std::int32_t* col_offsets); + const std::int32_t* B_zero_point, + std::int32_t* col_offsets, + int ncols_per_quant_group); /** * @brief Reference implementation of SPMDM (sparse matrix times dense matrix). 
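The ncols_per_quant_group parameter added above is the single knob behind all three quantization granularities: output column j always reads its requantization parameters from entry j / ncols_per_quant_group of the C_multiplier and B_zero_point arrays. A minimal, compilable sketch of that indexing (names here are illustrative only; requantize_row_sketch is not an FBGEMM API, and the fuse_relu branch of the reference above is omitted):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Requantize one row of N int32 accumulators to uint8. Column j takes its
// multiplier and B zero point from quant group j / ncols_per_quant_group,
// mirroring the indexing in requantize_u8acc32_ref.
void requantize_row_sketch(
    int N,
    const int32_t* acc,
    uint8_t* out,
    const float* C_multiplier,
    int32_t C_zero_point,
    int32_t A_zero_point,
    const int32_t* B_zero_point,
    int32_t row_offset,
    const int32_t* col_offsets,
    int ncols_per_quant_group) {
  for (int j = 0; j < N; ++j) {
    int q = j / ncols_per_quant_group; // quant group of this output column
    int32_t raw = acc[j] - A_zero_point * col_offsets[j] -
        B_zero_point[q] * row_offset;
    long rounded = std::lrintf(raw * C_multiplier[q]) + C_zero_point;
    out[j] = static_cast<uint8_t>(std::max(0l, std::min(255l, rounded)));
  }
}

int main() {
  // N == 4 with ncols_per_quant_group == 2: columns {0,1} use group 0's
  // parameters, columns {2,3} use group 1's (the per-group case).
  std::vector<int32_t> acc{100, 200, 300, 400};
  std::vector<int32_t> col_offsets{1, 2, 3, 4};
  std::vector<float> mult{0.1f, 0.2f};
  std::vector<int32_t> B_zp{-1, -2};
  std::vector<uint8_t> out(4);
  requantize_row_sketch(
      4, acc.data(), out.data(), mult.data(), /*C_zero_point=*/5,
      /*A_zero_point=*/3, B_zp.data(), /*row_offset=*/10, col_offsets.data(),
      /*ncols_per_quant_group=*/2);
  for (uint8_t v : out) {
    std::printf("%d ", static_cast<int>(v));
  }
  std::printf("\n");
  return 0;
}

Setting ncols_per_quant_group to N or to 1 collapses the same loop to the per-tensor and per-channel cases listed in the comment above, which is why one reference routine can serve every granularity the tests below exercise.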
diff --git a/test/FP16Test.cc b/test/FP16Test.cc index b5e4f136b6..0edcc4b616 100644 --- a/test/FP16Test.cc +++ b/test/FP16Test.cc @@ -73,19 +73,17 @@ TEST_P(FBGemmFP16Test, Test) { } cerr << endl; - aligned_vector A(m * k, 0.f); - aligned_vector B(k * n, 0.f); + // initialize with small numbers + aligned_vector Aint(m * k); + aligned_vector Bint(k * n); + randFill(Aint, 0, 4); + randFill(Bint, 0, 4); + aligned_vector A(Aint.begin(), Aint.end()); + aligned_vector B(Bint.begin(), Bint.end()); + aligned_vector C(m * n, NAN); - // initialize with small numbers - randFill(A, 0, 4); - randFill(B, 0, 4); - randFill(C, 0, 4); - - aligned_vector A_ref, B_ref, C_ref; - A_ref = A; - B_ref = B; - C_ref = C; + aligned_vector A_ref(A), B_ref(B), C_ref(C); if (atrans == matrix_op_t::Transpose) { transpose_matrix(A_ref.data(), k, m); diff --git a/test/I8DepthwiseTest.cc b/test/I8DepthwiseTest.cc index 9a19f0fd12..f4827833c4 100644 --- a/test/I8DepthwiseTest.cc +++ b/test/I8DepthwiseTest.cc @@ -85,10 +85,10 @@ TEST(FBGemmDepthWiseTest, Test3x3) { aligned_vector B(K * R * S); aligned_vector C_ref(N * H_OUT * W_OUT * K), C(C_ref.size()); - randFill(A, 0, 86); + randFill(A, 0, 86); int32_t A_zero_point = 43; - randFill(B, -16, 16); + randFill(B, -16, 16); int32_t B_zero_point = 5; depthwise_3x3_pad_1_ref( @@ -211,10 +211,10 @@ TEST(FBGemmDepthWiseTest, Test3x3x3) { aligned_vector C_ref(N * T_OUT * H_OUT * W_OUT * K), C(C_ref.size()); - randFill(A, 0, 86); + randFill(A, 0, 86); int32_t A_zero_point = 43; - randFill(B, -16, 16); + randFill(B, -16, 16); int32_t B_zero_point = 5; depthwise_3x3x3_pad_1_ref( @@ -360,7 +360,7 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) { int32_t C_num_rows = N * H_OUT * W_OUT; aligned_vector C_ref(C_num_rows * K), C(C_ref.size()); - randFill(A, 0, 86); + randFill(A, 0, 86); int32_t A_zero_point = 43; // Each row of G has a different range to really test per-channel @@ -368,7 +368,7 @@ TEST(FBGemmDepthWiseTest, Test3x3PerChannelQuantization) { vector B_zero_point(K); for (auto k = 0; k < K; ++k) { aligned_vector Bk(R * S); - randFill(Bk, -16 + k, 16 + k); + randFill(Bk, -16 + k, 16 + k); copy(Bk.begin(), Bk.end(), B.begin() + k * R * S); B_zero_point[k] = 5 + k; diff --git a/test/I8SpmdmTest.cc b/test/I8SpmdmTest.cc index 5bb7703225..2090b63533 100644 --- a/test/I8SpmdmTest.cc +++ b/test/I8SpmdmTest.cc @@ -66,7 +66,7 @@ TEST_P(fbgemmSPMDMTest, TestsSpMDM) { } aligned_vector A(M * K); - randFill(A, 0, 255); + randFill(A, 0, 255); CompressedSparseColumn B_csc(K_adjusted, N_adjusted); vector C(M * N); @@ -127,13 +127,8 @@ TEST_P(fbgemmSPMDMTest, TestsSpMDM) { #pragma omp parallel #endif { -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); int i_per_thread = (M + num_threads - 1) / num_threads; int i_begin = std::min(tid * i_per_thread, M); int i_end = std::min(i_begin + i_per_thread, M); diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc index fdc9fe691f..d8f3f7a2a7 100644 --- a/test/Im2ColFusedRequantizeTest.cc +++ b/test/Im2ColFusedRequantizeTest.cc @@ -20,8 +20,22 @@ #include "src/RefImplementations.h" using namespace std; +using namespace fbgemm; -namespace fbgemm { +vector qGranularityVals{ + QuantizationGranularity::TENSOR, + QuantizationGranularity::GROUP, + QuantizationGranularity::OUT_CHANNEL}; + +namespace { +class fbgemmIm2colTest + : public 
testing::TestWithParam {}; +}; // namespace + +INSTANTIATE_TEST_CASE_P( + InstantiationName, + fbgemmIm2colTest, + ::testing::ValuesIn(qGranularityVals)); // From Faster-RCNN with ShuffleNet static vector> shapes = { @@ -71,7 +85,7 @@ static vector> shapes = { conv_param_t<>(1, 8, 8, {4, 4}, 1, {3, 3}, {1, 1}, {1, 1, 0, 0}), }; -template +template static void Im2colTest() { for (auto conv_p : shapes) { for (int groups : {1, 4}) { @@ -80,29 +94,38 @@ static void Im2colTest() { } conv_p.G = groups; aligned_vector Aint8( - conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC, 0); + conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IC); aligned_vector Bint8( - conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC, 0); + conv_p.K[0] * conv_p.K[1] * conv_p.IC * conv_p.OC); aligned_vector Cint32_ref( - conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC, 0); - aligned_vector Cint8_ref(Cint32_ref.size(), 0); - aligned_vector Cint32_fb(Cint32_ref.size(), 0); - aligned_vector Cint8_fb(Cint32_ref.size(), 0); - - int32_t Aint8_zero_point, Bint8_zero_point; + conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * conv_p.OC); + aligned_vector Cint8_ref(Cint32_ref.size()); + aligned_vector Cint32_fb(Cint32_ref.size()); + aligned_vector Cint8_fb(Cint32_ref.size()); + + int ncols_per_quant_group = conv_p.OC; + if (Q_GRAN == QuantizationGranularity::GROUP) { + ncols_per_quant_group = conv_p.OC / conv_p.G; + } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + int32_t Aint8_zero_point; + aligned_vector Bint8_zero_point( + conv_p.OC / ncols_per_quant_group); if (is_same::value) { - randFill(Aint8, 0, 80); + randFill(Aint8, 0, 80); Aint8_zero_point = 43; - randFill(Bint8, -16, 16); - Bint8_zero_point = -30; + randFill(Bint8, -16, 16); + randFill(Bint8_zero_point, -50, -10); } else { - randFill(Aint8, 0, 5); + randFill(Aint8, 0, 5); Aint8_zero_point = 4; - randFill(Bint8, -4, 4); - Bint8_zero_point = -2; + randFill(Bint8, -4, 4); + randFill(Bint8_zero_point, -3, -1); } - float C_multiplier = 0.1234; + aligned_vector C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_pt = 5; int MDim = conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1]; @@ -116,16 +139,16 @@ static void Im2colTest() { im2col_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data()); // computing column offset - vector col_offsets; - col_offsets.resize(groups * NDim); + vector col_offsets(groups * NDim); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( KDimPerGroup, NDim, NDim, Bint8.data() + g * KDimPerGroup * NDim, - Bint8_zero_point, - col_offsets.data() + g * NDim); + Bint8_zero_point.data() + g * NDim / ncols_per_quant_group, + col_offsets.data() + g * NDim, + ncols_per_quant_group); } conv_ref( @@ -149,13 +172,14 @@ static void Im2colTest() { conv_p.G * NDim, Cint32_ref.data() + g * NDim, Cint8_ref.data() + g * NDim, - C_multiplier, + C_multiplier.data() + g * NDim / ncols_per_quant_group, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data() + g * NDim / ncols_per_quant_group, row_offsets.data(), col_offsets.data() + g * NDim, - nullptr); + nullptr, + ncols_per_quant_group); } PackBMatrix packedB( @@ -171,8 +195,7 @@ static void Im2colTest() { #pragma omp parallel #endif { - vector row_offset_buf; - row_offset_buf.resize( + vector row_offset_buf( PackAWithIm2Col::rowOffsetBufferSize()); PackAWithIm2Col packA( @@ -183,23 +206,20 @@ static void Im2colTest() { 
row_offset_buf.data()); DoNothing<> doNothingObj{}; - ReQuantizeOutput outputProcObj( + ReQuantizeOutput outputProcObj( doNothingObj, - C_multiplier, + C_multiplier.data(), C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data(), packA.getRowOffsetBuffer(), col_offsets.data(), - nullptr); + nullptr, + conv_p.G * NDim, + conv_p.G); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); fbgemmPacked( packA, @@ -236,12 +256,26 @@ static void Im2colTest() { } // for each shape } -TEST(FBGemmIm2colTest, Acc32Test) { - Im2colTest(); +TEST_P(fbgemmIm2colTest, Acc32Test) { + QuantizationGranularity q_granularity = GetParam(); + if (q_granularity == QuantizationGranularity::TENSOR) { + Im2colTest(); + } else if (q_granularity == QuantizationGranularity::GROUP) { + Im2colTest(); + } else { + Im2colTest(); + } } -TEST(FBGemmIm2colTest, Acc16Test) { - Im2colTest(); +TEST_P(fbgemmIm2colTest, Acc16Test) { + QuantizationGranularity q_granularity = GetParam(); + if (q_granularity == QuantizationGranularity::TENSOR) { + Im2colTest(); + } else if (q_granularity == QuantizationGranularity::GROUP) { + Im2colTest(); + } else { + Im2colTest(); + } } static vector> shapes_3d = { @@ -319,7 +353,7 @@ static vector> shapes_3d = { 3>(1, 8, 16, {8, 14, 14}, 1, {1, 1, 1}, {2, 2, 2}, {0, 0, 0, 0, 0, 0}), }; -template +template static void Im2col3DTest() { for (auto conv_p : shapes_3d) { for (int groups : {1, 4}) { @@ -329,32 +363,39 @@ static void Im2col3DTest() { conv_p.G = groups; aligned_vector Aint8( conv_p.MB * conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * conv_p.IN_DIM[2] * - conv_p.IC, - 0); + conv_p.IC); aligned_vector Bint8( - conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC, 0); + conv_p.K[0] * conv_p.K[1] * conv_p.K[2] * conv_p.IC * conv_p.OC); aligned_vector Cint32_ref( conv_p.MB * conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * - conv_p.OUT_DIM[2] * conv_p.OC, - 0); - aligned_vector Cint8_ref(Cint32_ref.size(), 0); - aligned_vector Cint32_fb(Cint32_ref.size(), 0); - aligned_vector Cint8_fb(Cint32_ref.size(), 0); - - int32_t Aint8_zero_point, Bint8_zero_point; + conv_p.OUT_DIM[2] * conv_p.OC); + aligned_vector Cint8_ref(Cint32_ref.size()); + aligned_vector Cint32_fb(Cint32_ref.size()); + aligned_vector Cint8_fb(Cint32_ref.size()); + + int ncols_per_quant_group = conv_p.OC; + if (Q_GRAN == QuantizationGranularity::GROUP) { + ncols_per_quant_group = conv_p.OC / conv_p.G; + } else if (Q_GRAN == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + int32_t Aint8_zero_point; + aligned_vector Bint8_zero_point( + conv_p.OC / ncols_per_quant_group); if (is_same::value) { - randFill(Aint8, 0, 80); + randFill(Aint8, 0, 80); Aint8_zero_point = 43; - randFill(Bint8, -16, 16); - Bint8_zero_point = -30; + randFill(Bint8, -16, 16); + randFill(Bint8_zero_point, -50, -10); } else { - randFill(Aint8, 0, 5); + randFill(Aint8, 0, 5); Aint8_zero_point = 4; - randFill(Bint8, -4, 4); - Bint8_zero_point = -2; + randFill(Bint8, -4, 4); + randFill(Bint8_zero_point, -3, -1); } - float C_multiplier = 0.1234; + aligned_vector C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_pt = 5; int MDim = @@ -369,16 +410,16 @@ static void Im2col3DTest() { im2col3d_ref(conv_p, Aint8.data(), Aint8_zero_point, Aint8_im2col.data()); // computing column offset - vector 
col_offsets; - col_offsets.resize(groups * NDim); + vector col_offsets(groups * NDim); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( KDimPerGroup, NDim, NDim, Bint8.data() + g * KDimPerGroup * NDim, - Bint8_zero_point, - col_offsets.data() + g * NDim); + Bint8_zero_point.data() + g * NDim / ncols_per_quant_group, + col_offsets.data() + g * NDim, + ncols_per_quant_group); } conv3d_ref( @@ -402,13 +443,14 @@ static void Im2col3DTest() { conv_p.G * NDim, Cint32_ref.data() + g * NDim, Cint8_ref.data() + g * NDim, - C_multiplier, + C_multiplier.data() + g * NDim / ncols_per_quant_group, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data() + g * NDim / ncols_per_quant_group, row_offsets.data(), col_offsets.data() + g * NDim, - nullptr); + nullptr, + ncols_per_quant_group); } PackBMatrix packedB( @@ -418,15 +460,13 @@ static void Im2col3DTest() { Bint8.data(), NDim, nullptr, - conv_p.G, - Bint8_zero_point); + conv_p.G); #ifdef _OPENMP #pragma omp parallel #endif { - vector row_offset_buf; - row_offset_buf.resize( + vector row_offset_buf( PackAWithIm2Col::rowOffsetBufferSize()); PackAWithIm2Col packA( @@ -437,23 +477,20 @@ static void Im2col3DTest() { row_offset_buf.data()); DoNothing<> doNothingObj{}; - ReQuantizeOutput outputProcObj( + ReQuantizeOutput outputProcObj( doNothingObj, - C_multiplier, + C_multiplier.data(), C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data(), packA.getRowOffsetBuffer(), col_offsets.data(), - nullptr); + nullptr, + conv_p.G * NDim, + conv_p.G); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); fbgemmPacked( packA, @@ -496,12 +533,24 @@ static void Im2col3DTest() { } // for each shape } -TEST(FBGemmIm2colTest, 3DAcc32Test) { - Im2col3DTest(); +TEST_P(fbgemmIm2colTest, 3DAcc32Test) { + QuantizationGranularity q_granularity = GetParam(); + if (q_granularity == QuantizationGranularity::TENSOR) { + Im2col3DTest(); + } else if (q_granularity == QuantizationGranularity::GROUP) { + Im2col3DTest(); + } else { + Im2col3DTest(); + } } -TEST(FBGemmIm2colTest, 3DAcc16Test) { - Im2col3DTest(); +TEST_P(fbgemmIm2colTest, 3DAcc16Test) { + QuantizationGranularity q_granularity = GetParam(); + if (q_granularity == QuantizationGranularity::TENSOR) { + Im2col3DTest(); + } else if (q_granularity == QuantizationGranularity::GROUP) { + Im2col3DTest(); + } else { + Im2col3DTest(); + } } - -} // namespace fbgemm diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc index 71cf8fe398..55f6e7fa6a 100644 --- a/test/PackedRequantizeAcc16Test.cc +++ b/test/PackedRequantizeAcc16Test.cc @@ -25,17 +25,34 @@ using namespace std; using namespace fbgemm; -std::vector transposeVals{matrix_op_t::NoTranspose, +vector transposeVals{matrix_op_t::NoTranspose, matrix_op_t::Transpose}; +vector qGranularityVals{ + QuantizationGranularity::TENSOR, + QuantizationGranularity::GROUP, + QuantizationGranularity::OUT_CHANNEL}; + namespace { -class fbgemmu8s8acc16test : public testing::TestWithParam< - std::tuple> {}; +class fbgemmu8s8acc16WithQuantGranularityTest + : public testing::TestWithParam< + tuple> {}; +class fbgemmu8s8acc16Test + : public testing::TestWithParam> {}; }; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, - fbgemmu8s8acc16test, + fbgemmu8s8acc16WithQuantGranularityTest, + ::testing::Combine( + 
::testing::Values(matrix_op_t::NoTranspose), + ::testing::ValuesIn(transposeVals), + ::testing::Bool(), + ::testing::ValuesIn(qGranularityVals))); + +INSTANTIATE_TEST_CASE_P( + InstantiationName, + fbgemmu8s8acc16Test, ::testing::Combine( ::testing::Values(matrix_op_t::NoTranspose), ::testing::ValuesIn(transposeVals), @@ -77,11 +94,12 @@ static vector> GetShapes_() { * @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit * accumulation. Output processing: requantization -> nothing */ -TEST_P(fbgemmu8s8acc16test, Test) { +TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, Test) { vector> shapes(GetShapes_()); matrix_op_t atrans, btrans; bool test_ld; - tie(atrans, btrans, test_ld) = GetParam(); + QuantizationGranularity q_granularity; + tie(atrans, btrans, test_ld, q_granularity) = GetParam(); for (auto shape : shapes) { for (int groups : {1, 3, 4}) { @@ -93,22 +111,21 @@ TEST_P(fbgemmu8s8acc16test, Test) { } int k_per_group = k / groups; - aligned_vector Aint8(m * k, 0); + aligned_vector Aint8(m * k); - aligned_vector Bint8(k * n, 0); - aligned_vector Bint8_ref(Bint8.size(), 0); + aligned_vector Bint8_ref(k * n); - aligned_vector Cint32_ref(m * n * groups, 0); - aligned_vector Cint8_ref(Cint32_ref.size(), 0); - aligned_vector Cint32_fb(Cint32_ref.size(), 0); - aligned_vector Cint8_fb(Cint32_ref.size(), 0); - aligned_vector Cint32_buffer(Cint32_ref.size(), 0); + aligned_vector Cint32_ref(m * n * groups); + aligned_vector Cint8_ref(Cint32_ref.size()); + aligned_vector Cint32_fb(Cint32_ref.size()); + aligned_vector Cint8_fb(Cint32_ref.size()); + aligned_vector Cint32_buffer(Cint32_ref.size()); - randFill(Aint8, 0, 255); + randFill(Aint8, 0, 255); int32_t Aint8_zero_point = 43; - randFill(Bint8_ref, -128, 127); - Bint8 = Bint8_ref; + randFill(Bint8_ref, -128, 127); + aligned_vector Bint8(Bint8_ref); if (btrans == matrix_op_t::Transpose) { aligned_vector Bint8_temp(Bint8.size()); @@ -118,13 +135,12 @@ TEST_P(fbgemmu8s8acc16test, Test) { n, Bint8.data() + g * k_per_group * n, n, - Bint8_temp.data() + g * k_per_group, - groups * k_per_group); + Bint8_temp.data() + g * k_per_group * n, + k_per_group); } Bint8 = Bint8_temp; } - int32_t Bint8_zero_point = -30; // To test lda != k , we just reduce k by half and use the original k // as lda. 
int n_adjusted = n; @@ -137,23 +153,33 @@ TEST_P(fbgemmu8s8acc16test, Test) { } } + int ncols_per_quant_group = groups * n_adjusted; + if (q_granularity == QuantizationGranularity::GROUP) { + ncols_per_quant_group = n_adjusted; + } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + aligned_vector Bint8_zero_point( + groups * n_adjusted / ncols_per_quant_group); + randFill(Bint8_zero_point, -60, 0); + // computing column offset - vector col_offsets; - col_offsets.resize(groups * n_adjusted); + vector col_offsets(groups * n_adjusted); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( k_per_group, n_adjusted, n, Bint8_ref.data() + g * k_per_group * n, - Bint8_zero_point, - col_offsets.data() + g * n_adjusted); + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, + col_offsets.data() + g * n_adjusted, + ncols_per_quant_group); } - vector row_offsets; - row_offsets.resize(m); + vector row_offsets(m); - float C_multiplier = 0.1234; + aligned_vector C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_pt = 5; int brow = 256; @@ -183,13 +209,14 @@ TEST_P(fbgemmu8s8acc16test, Test) { groups * n, Cint32_ref.data() + g * n_adjusted, Cint8_ref.data() + g * n_adjusted, - C_multiplier, + C_multiplier.data() + g * n_adjusted / ncols_per_quant_group, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, row_offsets.data(), col_offsets.data() + g * n_adjusted, - nullptr); + nullptr, + ncols_per_quant_group); } PackBMatrix packedBN( @@ -197,17 +224,15 @@ TEST_P(fbgemmu8s8acc16test, Test) { k, n_adjusted, Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, + (btrans == matrix_op_t::Transpose) ? 
k_per_group : n, nullptr, - groups, - Bint8_zero_point); + groups); #ifdef _OPENMP #pragma omp parallel #endif { - vector row_offset_buf; - row_offset_buf.resize( + vector row_offset_buf( PackAWithRowOffset::rowOffsetBufferSize()); PackAWithRowOffset packAN( @@ -218,37 +243,81 @@ TEST_P(fbgemmu8s8acc16test, Test) { k, nullptr, groups, - Aint8_zero_point, row_offset_buf.data()); + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); + DoNothing<> doNothingObj{}; - ReQuantizeOutput outputProcObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - nullptr); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + if (q_granularity == QuantizationGranularity::TENSOR) { + ReQuantizeOutput outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); - fbgemmPacked( - packAN, - packedBN, - Cint8_fb.data(), - Cint32_buffer.data(), - groups * n, - outputProcObj, - tid, - num_threads); + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + tid, + num_threads); + } else if (q_granularity == QuantizationGranularity::GROUP) { + ReQuantizeOutput outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); + + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + tid, + num_threads); + } else { + ReQuantizeOutput + outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); + + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + tid, + num_threads); + } } // omp parallel compare_validate_buffers( @@ -266,11 +335,12 @@ TEST_P(fbgemmu8s8acc16test, Test) { * @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit * accumulation. 
Output processing: spmdm -> requantization -> nothing */ -TEST_P(fbgemmu8s8acc16test, SpMDMTest) { +TEST_P(fbgemmu8s8acc16WithQuantGranularityTest, SpMDMTest) { vector> shapes(GetShapes_()); matrix_op_t atrans, btrans; bool test_ld; - tie(atrans, btrans, test_ld) = GetParam(); + QuantizationGranularity q_granularity; + tie(atrans, btrans, test_ld, q_granularity) = GetParam(); for (auto shape : shapes) { for (int groups : {1, 3, 4}) { @@ -285,21 +355,19 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { } int k_per_group = k / groups; - aligned_vector Aint8(m * k, 0); - - aligned_vector Bint8(k * n, 0); - aligned_vector Bint8_ref(Bint8.size(), 0); + aligned_vector Aint8(m * k); + aligned_vector Bint8(k * n); - aligned_vector Cint32_ref(m * n * groups, 0); - aligned_vector Cint8_ref(Cint32_ref.size(), 0); - aligned_vector Cint32_fb(Cint32_ref.size(), 0); - aligned_vector Cint8_fb(Cint32_ref.size(), 0); - aligned_vector Cint32_buffer(Cint32_ref.size(), 0); + aligned_vector Cint32_ref(m * n * groups); + aligned_vector Cint8_ref(Cint32_ref.size()); + aligned_vector Cint32_fb(Cint32_ref.size()); + aligned_vector Cint8_fb(Cint32_ref.size()); + aligned_vector Cint32_buffer(Cint32_ref.size()); - randFill(Aint8, 0, 255); + randFill(Aint8, 0, 255); int32_t Aint8_zero_point = 43; - randFill(Bint8, -128, 127); + randFill(Bint8, -128, 127); // To test lda != k , we just reduce k by half and use the original k // as lda. @@ -313,18 +381,27 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { } } - int32_t Bint8_zero_point = -30; + int ncols_per_quant_group = groups * n_adjusted; + if (q_granularity == QuantizationGranularity::GROUP) { + ncols_per_quant_group = n_adjusted; + } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + aligned_vector Bint8_zero_point( + groups * n_adjusted / ncols_per_quant_group); + randFill(Bint8_zero_point, -50, -10); + // computing column offset - vector col_offsets; - col_offsets.resize(groups * n_adjusted); + vector col_offsets(groups * n_adjusted); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( k_per_group, n_adjusted, n, - Bint8_ref.data() + g * k_per_group * n, - Bint8_zero_point, - col_offsets.data() + g * n_adjusted); + Bint8.data() + g * k_per_group * n, + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, + col_offsets.data() + g * n_adjusted, + ncols_per_quant_group); } CompressedSparseColumn B_csc(k_per_group, groups * n_adjusted); @@ -368,7 +445,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { } B_csc.ColPtr()[groups * n_adjusted] = total_nnz; - Bint8_ref = Bint8; + aligned_vector Bint8_ref(Bint8); if (btrans == matrix_op_t::Transpose) { aligned_vector Bint8_temp(Bint8.size()); @@ -378,16 +455,16 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { n, Bint8.data() + g * k_per_group * n, n, - Bint8_temp.data() + g * k_per_group, - groups * k_per_group); + Bint8_temp.data() + g * k_per_group * n, + k_per_group); } Bint8 = Bint8_temp; } - vector row_offsets; - row_offsets.resize(m); + vector row_offsets(m); - float C_multiplier = 0.1234; + aligned_vector C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_pt = 5; int brow = 256; @@ -430,13 +507,14 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { groups * n, Cint32_ref.data() + g * n_adjusted, Cint8_ref.data() + g * n_adjusted, - C_multiplier, + C_multiplier.data() + g * n_adjusted / ncols_per_quant_group, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data() + g * n_adjusted / 
ncols_per_quant_group, row_offsets.data(), col_offsets.data() + g * n_adjusted, - nullptr); + nullptr, + ncols_per_quant_group); } PackBMatrix packedB( @@ -444,17 +522,15 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { k, n_adjusted, Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, + (btrans == matrix_op_t::Transpose) ? k_per_group : n, nullptr, - groups, - Bint8_zero_point); + groups); #ifdef _OPENMP #pragma omp parallel #endif { - vector row_offset_buf; - row_offset_buf.resize( + vector row_offset_buf( PackAWithRowOffset::rowOffsetBufferSize()); PackAWithRowOffset packAN( @@ -465,54 +541,108 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { k, nullptr, groups, - Aint8_zero_point, row_offset_buf.data()); + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); + // spmdm -> requantization -> nothing // construct an output processing pipeline in reverse order // i.e. last output operation first // Last operation should always be DoNothing with // correct input and output type. DoNothing<> doNothingObj{}; - // The second last operation is requantization back - // to int8 - ReQuantizeOutput reqObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - nullptr); - // the top most (first) operation in the output processing - // pipeline is spmdm - // outType = final output type after fullly processing through - // pipeline inType = initial input type at the first call to the whole - // pipeline - DoSpmdmOnInpBuffer< - ReQuantizeOutput::outType, - int32_t, - ReQuantizeOutput> - spmdmObj(reqObj, Aint8.data(), k, B_csc, groups); - -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif - fbgemmPacked( - packAN, - packedB, - Cint8_fb.data(), - Cint32_fb.data(), - groups * n, - spmdmObj, - tid, - num_threads); + if (q_granularity == QuantizationGranularity::TENSOR) { + // The second last operation is requantization back + // to int8 + ReQuantizeOutput reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); + // the top most (first) operation in the output processing + // pipeline is spmdm + // outType = final output type after fullly processing through + // pipeline inType = initial input type at the first call to the + // whole pipeline + DoSpmdmOnInpBuffer< + ReQuantizeOutput::outType, + int32_t, + ReQuantizeOutput> + spmdmObj(reqObj, Aint8.data(), k, B_csc, groups); + + fbgemmPacked( + packAN, + packedB, + Cint8_fb.data(), + Cint32_fb.data(), + groups * n, + spmdmObj, + tid, + num_threads); + } else if (q_granularity == QuantizationGranularity::GROUP) { + ReQuantizeOutput reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, + groups); + DoSpmdmOnInpBuffer< + ReQuantizeOutput::outType, + int32_t, + ReQuantizeOutput> + spmdmObj(reqObj, Aint8.data(), k, B_csc, groups); + + fbgemmPacked( + packAN, + packedB, + Cint8_fb.data(), + Cint32_fb.data(), + groups * n, + spmdmObj, + tid, + num_threads); + } else { + ReQuantizeOutput + reqObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + nullptr, + groups * n_adjusted, 
+ groups); + DoSpmdmOnInpBuffer< + ReQuantizeOutput::outType, + int32_t, + ReQuantizeOutput> + spmdmObj(reqObj, Aint8.data(), k, B_csc, groups); + + fbgemmPacked( + packAN, + packedB, + Cint8_fb.data(), + Cint32_fb.data(), + groups * n, + spmdmObj, + tid, + num_threads); + } } compare_validate_buffers( @@ -531,7 +661,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) { * @brief Unit test for uint8 matrix A, int8 matrix B, and 16-bit * accumulation. Output processing: nothing */ -TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { +TEST_P(fbgemmu8s8acc16Test, NoRequantizeTest) { vector> shapes(GetShapes_()); matrix_op_t atrans, btrans; bool test_ld; @@ -547,20 +677,19 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { } int k_per_group = k / groups; - aligned_vector Aint8(m * k, 0); + aligned_vector Aint8(m * k); - aligned_vector Bint8(k * n, 0); - aligned_vector Bint8_ref(Bint8.size(), 0); + aligned_vector Bint8_ref(k * n); - aligned_vector Cint32_ref(m * n * groups, 0); - aligned_vector Cint32_fb(Cint32_ref.size(), 0); - aligned_vector Cint32_buffer(Cint32_ref.size(), 0); + aligned_vector Cint32_ref(m * n * groups); + aligned_vector Cint32_fb(Cint32_ref.size()); + aligned_vector Cint32_buffer(Cint32_ref.size()); - randFill(Aint8, 0, 255); + randFill(Aint8, 0, 255); int32_t Aint8_zero_point = 43; - randFill(Bint8_ref, -128, 127); - Bint8 = Bint8_ref; + randFill(Bint8_ref, -128, 127); + aligned_vector Bint8(Bint8_ref); if (btrans == matrix_op_t::Transpose) { aligned_vector Bint8_temp(Bint8.size()); @@ -570,8 +699,8 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { n, Bint8.data() + g * k_per_group * n, n, - Bint8_temp.data() + g * k_per_group, - groups * k_per_group); + Bint8_temp.data() + g * k_per_group * n, + k_per_group); } Bint8 = Bint8_temp; } @@ -590,20 +719,19 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { } // computing column offset - vector col_offsets; - col_offsets.resize(groups * n_adjusted); + vector col_offsets(groups * n_adjusted); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( k_per_group, n_adjusted, n, Bint8_ref.data() + g * k_per_group * n, - Bint8_zero_point, - col_offsets.data() + g * n_adjusted); + &Bint8_zero_point, + col_offsets.data() + g * n_adjusted, + n_adjusted); } - vector row_offsets; - row_offsets.resize(m); + vector row_offsets(m); int brow = 256; for (int g = 0; g < groups; ++g) { @@ -632,17 +760,15 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { k, n_adjusted, Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, + (btrans == matrix_op_t::Transpose) ? 
k_per_group : n, nullptr, - groups, - Bint8_zero_point); + groups); #ifdef _OPENMP #pragma omp parallel #endif { - vector row_offset_buf; - row_offset_buf.resize( + vector row_offset_buf( PackAWithRowOffset::rowOffsetBufferSize()); PackAWithRowOffset packAN( @@ -653,20 +779,14 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) { k, nullptr, groups, - Aint8_zero_point, row_offset_buf.data()); // DoNothing<> doNothingObj{}; DoNothing doNothingObj{}; memCopy<> outputProcObj(doNothingObj); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); fbgemmPacked( packAN, diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc index 43cecf8f88..9873e3fa93 100644 --- a/test/PackedRequantizeTest.cc +++ b/test/PackedRequantizeTest.cc @@ -25,17 +25,35 @@ using namespace std; using namespace fbgemm; -std::vector transposeVals{matrix_op_t::NoTranspose, - matrix_op_t::Transpose}; +vector transposeVals{matrix_op_t::NoTranspose, + matrix_op_t::Transpose}; + +vector qGranularityVals{ + QuantizationGranularity::TENSOR, + QuantizationGranularity::GROUP, + QuantizationGranularity::OUT_CHANNEL}; namespace { -class fbgemmu8s8acc32test : public testing::TestWithParam< - std::tuple> {}; +class fbgemmu8s8acc32WithQuantGranularityTest + : public testing::TestWithParam< + tuple> {}; +class fbgemmu8s8acc32Test + : public testing::TestWithParam< + tuple> {}; }; // namespace INSTANTIATE_TEST_CASE_P( InstantiationName, - fbgemmu8s8acc32test, + fbgemmu8s8acc32WithQuantGranularityTest, + ::testing::Combine( + ::testing::Values(matrix_op_t::NoTranspose), + ::testing::ValuesIn(transposeVals), + ::testing::Bool(), + ::testing::ValuesIn(qGranularityVals))); + +INSTANTIATE_TEST_CASE_P( + InstantiationName, + fbgemmu8s8acc32Test, ::testing::Combine( ::testing::Values(matrix_op_t::NoTranspose), ::testing::ValuesIn(transposeVals), @@ -77,11 +95,12 @@ static vector> GetShapes_() { * @brief Unit test for uint8 matrix A, int8 matrix B, and 32-bit * accumulation. 
Output processing: requantization -> nothing */ -TEST_P(fbgemmu8s8acc32test, Test) { +TEST_P(fbgemmu8s8acc32WithQuantGranularityTest, Test) { vector> shapes(GetShapes_()); matrix_op_t atrans, btrans; bool test_ld; - tie(atrans, btrans, test_ld) = GetParam(); + QuantizationGranularity q_granularity; + tie(atrans, btrans, test_ld, q_granularity) = GetParam(); for (auto shape : shapes) { for (int groups : {1, 3, 4}) { @@ -95,22 +114,21 @@ TEST_P(fbgemmu8s8acc32test, Test) { int k_per_group = k / groups; // mxk matrix - aligned_vector Aint8(m * k, 0); + aligned_vector Aint8(m * k); // kxn matrix - aligned_vector Bint8(k * n, 0); - aligned_vector Bint8_ref(Bint8.size(), 0); + aligned_vector Bint8_ref(k * n); - aligned_vector Cint32_ref(m * n * groups, 0); - aligned_vector Cint8_ref(Cint32_ref.size(), 0); - aligned_vector Cint32_fb(Cint32_ref.size(), 0); - aligned_vector Cint8_fb(Cint32_ref.size(), 0); - aligned_vector Cint32_buffer(Cint32_ref.size(), 0); + aligned_vector Cint32_ref(m * n * groups); + aligned_vector Cint8_ref(Cint32_ref.size()); + aligned_vector Cint32_fb(Cint32_ref.size()); + aligned_vector Cint8_fb(Cint32_ref.size()); + aligned_vector Cint32_buffer(Cint32_ref.size()); - randFill(Aint8, 0, 255); + randFill(Aint8, 0, 255); int32_t Aint8_zero_point = 43; - randFill(Bint8_ref, -128, 127); + randFill(Bint8_ref, -128, 127); for (int g = 0; g < groups; ++g) { avoidOverflow( m, @@ -122,7 +140,7 @@ TEST_P(fbgemmu8s8acc32test, Test) { n); } - Bint8 = Bint8_ref; + aligned_vector Bint8(Bint8_ref); // initialize bias aligned_vector bias_int32(groups * n); @@ -140,13 +158,12 @@ TEST_P(fbgemmu8s8acc32test, Test) { n, Bint8.data() + g * k_per_group * n, n, - Bint8_temp.data() + g * k_per_group, - groups * k_per_group); + Bint8_temp.data() + g * k_per_group * n, + k_per_group); } Bint8 = Bint8_temp; } - int32_t Bint8_zero_point = -30; // To test lda != k , we just reduce k by half and use the original k // as lda. int n_adjusted = n; @@ -159,23 +176,33 @@ TEST_P(fbgemmu8s8acc32test, Test) { } } + int ncols_per_quant_group = groups * n_adjusted; + if (q_granularity == QuantizationGranularity::GROUP) { + ncols_per_quant_group = n_adjusted; + } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + aligned_vector Bint8_zero_point( + groups * n_adjusted / ncols_per_quant_group); + randFill(Bint8_zero_point, -50, -10); + // computing column offset - vector col_offsets; - col_offsets.resize(groups * n_adjusted); + vector col_offsets(groups * n_adjusted); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( k_per_group, n_adjusted, n, Bint8_ref.data() + g * k_per_group * n, - Bint8_zero_point, - col_offsets.data() + g * n_adjusted); + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, + col_offsets.data() + g * n_adjusted, + ncols_per_quant_group); } - vector row_offsets; - row_offsets.resize(m); + vector row_offsets(m); - float C_multiplier = 0.001234; + aligned_vector C_multiplier(Bint8_zero_point.size()); + randFill(C_multiplier, 0.001234f / 2, 0.001234f * 3 / 2); int32_t C_zero_pt = 5; for (int g = 0; g < groups; ++g) { @@ -203,13 +230,14 @@ TEST_P(fbgemmu8s8acc32test, Test) { groups * n, Cint32_ref.data() + g * n_adjusted, Cint8_ref.data() + g * n_adjusted, - C_multiplier, + C_multiplier.data() + g * n_adjusted / ncols_per_quant_group, C_zero_pt, Aint8_zero_point, - Bint8_zero_point, + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, row_offsets.data(), col_offsets.data() + g * n_adjusted, - bias ? 
(bias + g * n_adjusted) : nullptr); + bias ? (bias + g * n_adjusted) : nullptr, + ncols_per_quant_group); } PackBMatrix packedBN( @@ -217,17 +245,15 @@ TEST_P(fbgemmu8s8acc32test, Test) { k, n_adjusted, Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, + (btrans == matrix_op_t::Transpose) ? k_per_group : n, nullptr, - groups, - Bint8_zero_point); + groups); #ifdef _OPENMP #pragma omp parallel #endif { - vector row_offset_buf; - row_offset_buf.resize( + vector row_offset_buf( PackAWithRowOffset::rowOffsetBufferSize()); PackAWithRowOffset packAN( @@ -238,37 +264,82 @@ TEST_P(fbgemmu8s8acc32test, Test) { k, nullptr, groups, - Aint8_zero_point, row_offset_buf.data()); - DoNothing<> doNothingObj{}; - ReQuantizeOutput outputProcObj( - doNothingObj, - C_multiplier, - C_zero_pt, - Aint8_zero_point, - Bint8_zero_point, - packAN.getRowOffsetBuffer(), - col_offsets.data(), - bias); + int num_threads = fbgemm_get_num_threads(); + int tid = fbgemm_get_thread_num(); -#ifdef _OPENMP - int num_threads = omp_get_num_threads(); - int tid = omp_get_thread_num(); -#else - int num_threads = 1; - int tid = 0; -#endif + DoNothing<> doNothingObj{}; - fbgemmPacked( - packAN, - packedBN, - Cint8_fb.data(), - Cint32_buffer.data(), - groups * n, - outputProcObj, - tid, - num_threads); + if (q_granularity == QuantizationGranularity::TENSOR) { + ReQuantizeOutput outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + bias, + groups * n_adjusted, + groups); + + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + tid, + num_threads); + } else if (q_granularity == QuantizationGranularity::GROUP) { + ReQuantizeOutput + outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + bias, + groups * n_adjusted, + groups); + + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + tid, + num_threads); + } else { + ReQuantizeOutput + outputProcObj( + doNothingObj, + C_multiplier.data(), + C_zero_pt, + Aint8_zero_point, + Bint8_zero_point.data(), + packAN.getRowOffsetBuffer(), + col_offsets.data(), + bias, + groups * n_adjusted, + groups); + + fbgemmPacked( + packAN, + packedBN, + Cint8_fb.data(), + Cint32_buffer.data(), + groups * n, + outputProcObj, + tid, + num_threads); + } } // printMatrix(matrix_op_t::NoTranspose, Cint32_local.data(), // m, n_adjusted, n, "C local"); @@ -289,11 +360,12 @@ TEST_P(fbgemmu8s8acc32test, Test) { * accumulation. Directly output fp32 matrix C. 
Output processing: * requantization -> nothing */ -TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { +TEST_P(fbgemmu8s8acc32WithQuantGranularityTest, TestFloatInputOutput) { vector> shapes(GetShapes_()); matrix_op_t atrans, btrans; bool test_ld; - tie(atrans, btrans, test_ld) = GetParam(); + QuantizationGranularity q_granularity; + tie(atrans, btrans, test_ld, q_granularity) = GetParam(); for (auto shape : shapes) { for (int groups : {1, 3, 4}) { @@ -305,26 +377,26 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { } int k_per_group = k / groups; - aligned_vector Afp32(m * k, 0.0f); - aligned_vector Aint8(Afp32.size(), 0); + aligned_vector Afp32(m * k); + aligned_vector Aint8(Afp32.size()); - aligned_vector Bfp32(k * n, 0.0f); - aligned_vector Bint8(Bfp32.size(), 0); + aligned_vector Bfp32(k * n); + aligned_vector Bint8(Bfp32.size()); - aligned_vector Cfp32_ref(m * n * groups, 0.0f); - aligned_vector Cfp32_fb(Cfp32_ref.size(), 0.0f); + aligned_vector Cfp32_ref(m * n * groups); + aligned_vector Cfp32_fb(Cfp32_ref.size()); - aligned_vector Cint8_fb(Cfp32_ref.size(), 0); - aligned_vector Cint32_buffer(Cfp32_ref.size(), 0); + aligned_vector Cint8_fb(Cfp32_ref.size()); + aligned_vector Cint32_buffer(Cfp32_ref.size()); - randFill(Aint8, 0, 255); + randFill(Aint8, 0, 255); int32_t Aint8_zero_point = 43; float Aint8_scale = 0.11; for (auto i = 0; i < Afp32.size(); ++i) { Afp32[i] = Aint8_scale * (Aint8[i] - Aint8_zero_point); } - randFill(Bint8, -128, 127); + randFill(Bint8, -128, 127); for (int g = 0; g < groups; ++g) { avoidOverflow( m, @@ -335,11 +407,6 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { Bint8.data() + g * k_per_group * n, n); } - int32_t Bint8_zero_point = -30; - float Bint8_scale = 0.49; - for (auto i = 0; i < Bfp32.size(); ++i) { - Bfp32[i] = Bint8_scale * (Bint8[i] - Bint8_zero_point); - } // To test lda != k , we just reduce k by half and use the original k // as lda. 
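Before each run, every parameterized test in this patch repeats the same three-way mapping from QuantizationGranularity to ncols_per_quant_group and then sizes its Bint8_zero_point, Bint8_scale, and C_multiplier vectors as groups * n_adjusted / ncols_per_quant_group. A compilable sketch of just that mapping (local names only; Granularity stands in for fbgemm::QuantizationGranularity):

#include <cstdio>
#include <initializer_list>

enum class Granularity { TENSOR, GROUP, OUT_CHANNEL };

// With groups * n total output columns, each quant group spans this many
// adjacent columns, so the number of (scale, zero_point) pairs to allocate
// is groups * n / cols_per_quant_group(...).
int cols_per_quant_group(Granularity g, int groups, int n) {
  switch (g) {
    case Granularity::TENSOR:
      return groups * n; // one pair for the whole tensor
    case Granularity::GROUP:
      return n; // one pair per group
    case Granularity::OUT_CHANNEL:
      return 1; // one pair per output channel
  }
  return groups * n; // unreachable; silences -Wreturn-type
}

int main() {
  const int groups = 4, n = 8;
  for (Granularity g :
       {Granularity::TENSOR, Granularity::GROUP, Granularity::OUT_CHANNEL}) {
    int per = cols_per_quant_group(g, groups, n);
    std::printf(
        "%d columns per group -> %d quant groups\n", per, groups * n / per);
  }
  return 0;
}

The tensor case yields a single quant group, the group case one per group, and the out-channel case one per output column, matching how the three ReQuantizeOutput branches in this file size their parameter arrays.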
@@ -353,17 +420,37 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { } } + int ncols_per_quant_group = groups * n_adjusted; + if (q_granularity == QuantizationGranularity::GROUP) { + ncols_per_quant_group = n_adjusted; + } else if (q_granularity == QuantizationGranularity::OUT_CHANNEL) { + ncols_per_quant_group = 1; + } + aligned_vector Bint8_zero_point( + groups * n_adjusted / ncols_per_quant_group); + randFill(Bint8_zero_point, -50, -10); + aligned_vector Bint8_scale(Bint8_zero_point.size()); + randFill(Bint8_scale, 0.49f / 2, 0.49f * 3 / 2); + for (int i = 0; i < k; ++i) { + int g = i / k_per_group; + for (int j = 0; j < n_adjusted; ++j) { + int quant_group = (g * n_adjusted + j) / ncols_per_quant_group; + Bfp32[i * n + j] = Bint8_scale[quant_group] * + (Bint8[i * n + j] - Bint8_zero_point[quant_group]); + } + } + // computing column offset - vector col_offsets; - col_offsets.resize(groups * n_adjusted); + vector col_offsets(groups * n_adjusted); for (int g = 0; g < groups; ++g) { col_offsets_with_zero_pt_s8acc32_ref( k_per_group, n_adjusted, n, Bint8.data() + g * k_per_group * n, - Bint8_zero_point, - col_offsets.data() + g * n_adjusted); + Bint8_zero_point.data() + g * n_adjusted / ncols_per_quant_group, + col_offsets.data() + g * n_adjusted, + ncols_per_quant_group); } if (btrans == matrix_op_t::Transpose) { @@ -374,8 +461,8 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { n, Bint8.data() + g * k_per_group * n, n, - Bint8_temp.data() + g * k_per_group, - groups * k_per_group); + Bint8_temp.data() + g * k_per_group * n, + k_per_group); } Bint8 = Bint8_temp; } @@ -398,17 +485,15 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) { k, n_adjusted, Bint8.data(), - (btrans == matrix_op_t::Transpose) ? k : n, + (btrans == matrix_op_t::Transpose) ? 
 
 #ifdef _OPENMP
 #pragma omp parallel
 #endif
       {
-        vector<int32_t> row_offset_buf;
-        row_offset_buf.resize(
+        vector<int32_t> row_offset_buf(
             PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
 
         PackAWithQuantRowOffset<uint8_t> packAN(
@@ -423,47 +508,96 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
             groups,
             row_offset_buf.data());
 
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
+
         DoNothing<float, float> doNothingObj{};
-        ReQuantizeForFloat<false> outputProcObj(
-            doNothingObj,
-            Aint8_scale,
-            Bint8_scale,
-            Aint8_zero_point,
-            Bint8_zero_point,
-            packAN.getRowOffsetBuffer(),
-            col_offsets.data(),
-            nullptr);
-#ifdef _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        if (q_granularity == QuantizationGranularity::TENSOR) {
+          ReQuantizeForFloat<false> outputProcObj(
+              doNothingObj,
+              Aint8_scale,
+              Bint8_scale.data(),
+              Aint8_zero_point,
+              Bint8_zero_point.data(),
+              packAN.getRowOffsetBuffer(),
+              col_offsets.data(),
+              nullptr,
+              groups * n_adjusted,
+              groups);
 
-        fbgemmPacked(
-            packAN,
-            packedBN,
-            Cfp32_fb.data(),
-            reinterpret_cast<int32_t*>(Cfp32_fb.data()),
-            groups * n,
-            outputProcObj,
-            tid,
-            num_threads);
-      }
+          fbgemmPacked(
+              packAN,
+              packedBN,
+              Cfp32_fb.data(),
+              reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+              groups * n,
+              outputProcObj,
+              tid,
+              num_threads);
+        } else if (q_granularity == QuantizationGranularity::GROUP) {
+          ReQuantizeForFloat<false, QuantizationGranularity::GROUP>
+              outputProcObj(
+                  doNothingObj,
+                  Aint8_scale,
+                  Bint8_scale.data(),
+                  Aint8_zero_point,
+                  Bint8_zero_point.data(),
+                  packAN.getRowOffsetBuffer(),
+                  col_offsets.data(),
+                  nullptr,
+                  groups * n_adjusted,
+                  groups);
 
-      float maximum = *max_element(Cfp32_ref.begin(), Cfp32_ref.end());
-      float minimum = *min_element(Cfp32_ref.begin(), Cfp32_ref.end());
-      float atol = (maximum - minimum) / 255 / 1.9;
+          fbgemmPacked(
+              packAN,
+              packedBN,
+              Cfp32_fb.data(),
+              reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+              groups * n,
+              outputProcObj,
+              tid,
+              num_threads);
+        } else {
+          ReQuantizeForFloat<false, QuantizationGranularity::OUT_CHANNEL>
+              outputProcObj(
+                  doNothingObj,
+                  Aint8_scale,
+                  Bint8_scale.data(),
+                  Aint8_zero_point,
+                  Bint8_zero_point.data(),
+                  packAN.getRowOffsetBuffer(),
+                  col_offsets.data(),
+                  nullptr,
+                  groups * n_adjusted,
+                  groups);
+
+          fbgemmPacked(
+              packAN,
+              packedBN,
+              Cfp32_fb.data(),
+              reinterpret_cast<int32_t*>(Cfp32_fb.data()),
+              groups * n,
+              outputProcObj,
+              tid,
+              num_threads);
+        }
+      }
+
+      float maximum = 0;
+      for (int i = 0; i < m; ++i) {
+        for (int j = 0; j < groups * n_adjusted; ++j) {
+          float c = Cfp32_ref[i * groups * n + j];
+          maximum = std::max(maximum, std::abs(c));
+        }
+      }
 
       compare_validate_buffers(
           Cfp32_ref.data(),
           Cfp32_fb.data(),
           m,
          groups * n_adjusted,
          groups * n,
-          atol);
+          maximum * 1e-5f);
     } // for each groups
   } // for each shape
 }
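The three `ReQuantizeForFloat` instantiations above differ only in the granularity template argument, which decides how `Bint8_scale` and `Bint8_zero_point` are indexed per output column; the arithmetic is otherwise identical. Per element it presumably reduces to the sketch below (simplified to a single entry; `col_offset` already folds in the B zero point, as in the column-offset reference):

    #include <cstdint>

    // Simplified per-element sketch (illustration, not the FBGEMM API):
    // remove the zero-point cross terms, then rescale the accumulator.
    float requantize_for_float_sketch(
        int32_t acc,                 // int32 dot product for C[i][j]
        float A_scale,
        int32_t A_zero_point,
        const float* B_scale,        // one entry per quantization group
        const int32_t* B_zero_point, // one entry per quantization group
        int32_t row_offset,          // sum over k of Aint8[i][k]
        int32_t col_offset,          // sum over k of Bint8[k][j], minus K * B_zp
        int quant_group) {           // j / ncols_per_quant_group
      acc -= A_zero_point * col_offset;
      acc -= B_zero_point[quant_group] * row_offset;
      return A_scale * B_scale[quant_group] * acc;
    }

The new tolerance fits the same picture: `maximum * 1e-5f` is a relative bound suited to a result that stays in fp32, replacing the old absolute bound derived from an assumed 8-bit output range.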
@@ -473,7 +607,7 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
 * accumulation. Output processing: requantization -> nothing. Symmetric: the
 * zero point is 0.
 */
-TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
+TEST_P(fbgemmu8s8acc32Test, TestSymmetricQuantizedInputOutput) {
   vector<vector<int>> shapes(GetShapes_());
   matrix_op_t atrans, btrans;
   bool test_ld;
@@ -489,22 +623,17 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
       }
       int k_per_group = k / groups;
 
-      aligned_vector<float> Afp32(m * k, 0.0f);
-      aligned_vector<uint8_t> Aint8(Afp32.size(), 0);
-
-      aligned_vector<float> Bfp32(k * n, 0.0f);
-      aligned_vector<int8_t> Bint8(Bfp32.size(), 0);
+      aligned_vector<uint8_t> Aint8(m * k);
+      aligned_vector<int8_t> Bint8(k * n);
 
-      aligned_vector<float> Cfp32_ref(m * n * groups, 0.0f);
-      aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size(), 0);
+      aligned_vector<float> Cfp32_ref(m * n * groups);
+      aligned_vector<int32_t> Cint32_fb(Cfp32_ref.size());
 
-      randFill(Afp32, 0, 255);
-      for (auto i = 0; i < Afp32.size(); i++) {
-        Aint8[i] = (uint8_t)Afp32[i];
-      }
+      randFill<uint8_t>(Aint8, 0, 255);
+      aligned_vector<float> Afp32(Aint8.begin(), Aint8.end());
 
       // initialize B matrix
-      randFill(Bfp32, -128, 127);
+      randFill<int8_t>(Bint8, -128, 127);
       for (int g = 0; g < groups; ++g) {
         avoidOverflow(
             m,
@@ -512,13 +641,11 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
             k_per_group,
             Aint8.data() + g * k_per_group,
             k,
-            Bfp32.data() + g * k_per_group * n,
+            Bint8.data() + g * k_per_group * n,
             n);
       }
 
-      for (auto i = 0; i < Bfp32.size(); ++i) {
-        Bint8[i] = (int8_t)Bfp32[i];
-      }
+      aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
 
       // To test lda != k , we just reduce k by half and use the original k
       // as lda.
@@ -540,8 +667,8 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
             n,
             Bint8.data() + g * k_per_group * n,
             n,
-            Bint8_temp.data() + g * k_per_group,
-            groups * k_per_group);
+            Bint8_temp.data() + g * k_per_group * n,
+            k_per_group);
         }
         Bint8 = Bint8_temp;
       }
@@ -565,7 +692,7 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
           k,
          n_adjusted,
          Bint8.data(),
-          (btrans == matrix_op_t::Transpose) ? k : n,
+          (btrans == matrix_op_t::Transpose) ? k_per_group : n,
           nullptr,
           groups);
 
@@ -580,13 +707,8 @@ TEST_P(fbgemmu8s8acc32test, TestSymmetricQuantizedInputOutput) {
 
         DoNothing<int32_t, int32_t> doNothingObj{};
         memCopy<> outputProcObj(doNothingObj);
-#ifdef _OPENMP
-        int num_threads = omp_get_num_threads();
-        int tid = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int tid = 0;
-#endif
+        int num_threads = fbgemm_get_num_threads();
+        int tid = fbgemm_get_thread_num();
 
         fbgemmPacked(
             packAN,