pytorch · jspark1105 · Nov 26, 2018 · Nov 26, 2018
diff --git a/bench/Im2ColFusedRequantizeAcc16Benchmark.cc b/bench/Im2ColFusedRequantizeAcc16Benchmark.cc
@@ -307,7 +307,6 @@ void performance_test() {
           KDim,
           nullptr,
           1,
-          Aint8_zero_point,
           row_offset_buf.data());
 
       fbgemmPacked(

diff --git a/bench/Im2ColFusedRequantizeAcc32Benchmark.cc b/bench/Im2ColFusedRequantizeAcc32Benchmark.cc
@@ -307,7 +307,6 @@ void performance_test() {
           KDim,
           nullptr,
           1,
-          Aint8_zero_point,
           row_offset_buf.data());
 
       fbgemmPacked(

diff --git a/bench/PackedFloatInOutBenchmark.cc b/bench/PackedFloatInOutBenchmark.cc
@@ -195,8 +195,7 @@ void performance_test() {
         Bint8.data(),
         n,
         nullptr,
-        1,
-        Bint8_zero_point);
+        1);
 
     DoNothing<float, float> doNothingObj{};
     ReQuantizeForFloat<false> outputProcObj(

diff --git a/bench/PackedRequantizeAcc16Benchmark.cc b/bench/PackedRequantizeAcc16Benchmark.cc
@@ -309,8 +309,7 @@ void performance_test() {
               Aint8.data(),
               k,
               nullptr,
-              1,
-              Aint8_zero_point);
+              1);
           PackAWithRowOffset<uint8_t, int16_t> packAWithRowOffset(
               matrix_op_t::NoTranspose,
               m,
@@ -319,7 +318,6 @@ void performance_test() {
               k,
               nullptr,
               1,
-              Aint8_zero_point,
               row_offset_buf.data());
 
           // no-op output process objects

diff --git a/bench/PackedRequantizeAcc32Benchmark.cc b/bench/PackedRequantizeAcc32Benchmark.cc
@@ -222,8 +222,7 @@ void performance_test() {
         Bint8.data(),
         n,
         nullptr,
-        1,
-        Bint8_zero_point);
+        1);
 
     ttot = 0.0;
     runType = "FBGEMM_i8_acc32";
@@ -261,7 +260,6 @@ void performance_test() {
             k,
             nullptr,
             1,
-            Aint8_zero_point,
             row_offset_buf.data());
 
         DoNothing<> doNothingObj{};

diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
@@ -92,14 +92,12 @@ class PackMatrix {
    *               dimension A.rows by B.cols*B.groups .
    *               A.groups must be same as B.groups, A.groups must divide
    *               A.cols, and B.groups must divide B.rows and C.cols.
-   * @param zero_pt the quantized value that maps to 0.0f floating-point number.
    */
   PackMatrix(
       std::int32_t rows,
       std::int32_t cols,
       inpType* pmat,
-      int groups = 1,
-      std::int32_t zero_pt = 0);
+      int groups = 1);
 
   /**
    * @return true usually when the matrix is constant matrix (e.g., weight
@@ -276,13 +274,6 @@ class PackMatrix {
         : (numPackedCols() % blockColSize());
   }
 
-  /**
-   * @return the quantized value that maps to 0.0f floating-point number
-   */
-  std::int32_t zeroPoint() const {
-    return zero_pt_;
-  }
-
   inpType* buf_;
   std::int32_t brow_; ///< the number of rows in each block
   std::int32_t bcol_; ///< the number of columns in each block
@@ -293,7 +284,6 @@ class PackMatrix {
  private:
   std::int32_t nrows_, ncols_;
   int G_;
-  std::int32_t zero_pt_;
   block_type_t packedBlock_; ///< The block in the source matrix just packed
   std::int32_t last_brow_, last_bcol_;
 };
@@ -320,8 +310,7 @@ class PackAMatrix final : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
       const inpType* smat,
       std::int32_t ld,
       inpType* pmat = nullptr,
-      int groups = 1,
-      std::int32_t zero_pt = 0);
+      int groups = 1);
 
   /**
    * Activation matrices are not constant so cannot amortize the cost of
@@ -391,8 +380,7 @@ class PackBMatrix final : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
       const inpType* smat,
       std::int32_t ld,
       inpType* pmat = nullptr,
-      int groups = 1,
-      std::int32_t zero_pt = 0);
+      int groups = 1);
 
   /**
    * Weight matrices are usually constant so worth pre-packing.
@@ -468,7 +456,7 @@ class PackAWithIm2Col
 
   PackAWithIm2Col() = delete; // no default constructor
   /**
-   * TODO: Currently only groups == 1 supported
+   * @param zero_pt the quantized value that maps to 0.0f floating-point number.
    */
   PackAWithIm2Col(
       const conv_param_t<SPATIAL_DIM>& conv_param,
@@ -523,6 +511,7 @@ class PackAWithIm2Col
  private:
   const conv_param_t<SPATIAL_DIM>& conv_p_;
   const T* sdata_;
+  std::int32_t zero_pt_;
   std::int32_t* row_offset_;
   bool rowOffsetAllocatedHere;
   std::int32_t row_interleave_B_;
@@ -551,7 +540,6 @@ class PackAWithRowOffset final
       std::uint32_t ld,
       inpType* pmat = nullptr,
       int groups = 1,
-      std::int32_t zero_pt = 0,
       std::int32_t* row_offset = nullptr);
 
   /**
@@ -693,6 +681,7 @@ class PackAWithQuantRowOffset final
   const float* smat_;
   std::int32_t ld_;
   float scale_;
+  std::int32_t zero_pt_;
   std::int32_t* row_offset_;
   bool rowOffsetAllocatedHere;
   std::int32_t row_interleave_B_;

diff --git a/include/fbgemm/QuantUtils.h b/include/fbgemm/QuantUtils.h
@@ -6,8 +6,6 @@
 #include <cstdint>
 #include <limits>
 
-#include <cpuinfo.h>
-
 namespace fbgemm {
 
 // Copied from gemmlowp

diff --git a/src/PackAMatrix.cc b/src/PackAMatrix.cc
@@ -20,14 +20,8 @@ PackAMatrix<T, accT>::PackAMatrix(
     const T* smat,
     int32_t ld,
     inpType* pmat,
-    int groups,
-    std::int32_t zero_pt)
-    : PackMatrix<PackAMatrix<T, accT>, T, accT>(
-          nRow,
-          nCol,
-          pmat,
-          groups,
-          zero_pt),
+    int groups)
+    : PackMatrix<PackAMatrix<T, accT>, T, accT>(nRow, nCol, pmat, groups),
       trans_(trans),
       smat_(smat),
       ld_(ld) {

diff --git a/src/PackAWithIm2Col.cc b/src/PackAWithIm2Col.cc
@@ -36,10 +36,10 @@ PackAWithIm2Col<T, accT, SPATIAL_DIM>::PackAWithIm2Col(
               std::multiplies<int>()) *
               conv_p.IC,
           pmat,
-          conv_p.G,
-          zero_pt),
+          conv_p.G),
       conv_p_(conv_p),
-      sdata_(sdata) {
+      sdata_(sdata),
+      zero_pt_(zero_pt) {
   static_assert(
       SPATIAL_DIM == 2 || SPATIAL_DIM == 3, "unsupported conv dimension ");
   if (cpuinfo_has_x86_avx512f()) {
@@ -187,7 +187,7 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
           std::memset(
               out + (i - block.row_start) * BaseType::blockColSize() +
                   (j_blk_start - block.col_start),
-              BaseType::zeroPoint(),
+              zero_pt_,
               sizeof(T) * (j_blk_end - j_blk_start));
         } else {
           std::memcpy(
@@ -239,7 +239,7 @@ void PackAWithIm2Col<T, accT, SPATIAL_DIM>::pack(const block_type_t& block) {
               &out
                   [(i - block.row_start) * BaseType::blockColSize() +
                    (j_blk_start - block.col_start)],
-              BaseType::zeroPoint(),
+              zero_pt_,
               sizeof(T) * (j_blk_end - j_blk_start));
         } else {
           std::memcpy(

diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc
@@ -31,12 +31,12 @@ PackAWithQuantRowOffset<T, accT>::PackAWithQuantRowOffset(
           nRow,
           nCol,
           pmat,
-          groups,
-          zero_pt),
+          groups),
       trans_(trans),
       smat_(smat),
       ld_(ld),
       scale_(scale),
+      zero_pt_(zero_pt),
       row_offset_(row_offset) {
   rowOffsetAllocatedHere = false;
 
@@ -158,7 +158,7 @@ void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) {
     for (; j < block.col_size / VLEN * VLEN; j += VLEN) {
       __m256 val_v = _mm256_loadu_ps(smat_temp + i * ld_temp + j);
       __m256 transformed_v = _mm256_fmadd_ps(
-          val_v, inverse_scale_v, _mm256_set1_ps(BaseType::zeroPoint()));
+          val_v, inverse_scale_v, _mm256_set1_ps(zero_pt_));
       __m256 clipped_v = _mm256_max_ps(
           _mm256_set1_ps(std::numeric_limits<uint8_t>::min()),
           _mm256_min_ps(
@@ -180,7 +180,7 @@ void PackAWithQuantRowOffset<T, accT>::pack(const block_type_t& block) {
 #endif
     for (; j < block.col_size; ++j) {
       float val = smat_temp[i * ld_temp + j];
-      float transformed = val / scale_ + BaseType::zeroPoint();
+      float transformed = val / scale_ + zero_pt_;
       float clipped = std::min<float>(
           std::max<float>(transformed, std::numeric_limits<uint8_t>::min()),
           std::numeric_limits<uint8_t>::max());

diff --git a/src/PackAWithRowOffset.cc b/src/PackAWithRowOffset.cc
@@ -23,14 +23,12 @@ PackAWithRowOffset<T, accT>::PackAWithRowOffset(
     uint32_t ld,
     inpType* pmat,
     int groups,
-    int32_t zero_pt,
     int32_t* row_offset)
     : PackMatrix<PackAWithRowOffset<T, accT>, T, accT>(
           nRow,
           nCol,
           pmat,
-          groups,
-          zero_pt),
+          groups),
       trans_(trans),
       smat_(smat),
       ld_(ld),

diff --git a/src/PackBMatrix.cc b/src/PackBMatrix.cc
@@ -20,14 +20,8 @@ PackBMatrix<T, accT>::PackBMatrix(
     const T* smat,
     int32_t ld,
     inpType* pmat,
-    int groups,
-    std::int32_t zero_pt)
-    : PackMatrix<PackBMatrix<T, accT>, T, accT>(
-          nRow,
-          nCol,
-          pmat,
-          groups,
-          zero_pt),
+    int groups)
+    : PackMatrix<PackBMatrix<T, accT>, T, accT>(nRow, nCol, pmat, groups),
       trans_(trans),
       smat_(smat),
       ld_(ld) {
@@ -162,8 +156,7 @@ bool PackBMatrix<T, accT>::metaEquals(const PackBMatrix<T, accT>& that) const {
       BaseType::blockCols() != that.blockCols() ||
       BaseType::numPackedRows() != that.numPackedRows() ||
       BaseType::numPackedCols() != that.numPackedCols() ||
-      BaseType::zeroPoint() != that.zeroPoint() || trans_ != that.trans_ ||
-      BaseType::numGroups() != that.numGroups() ||
+      trans_ != that.trans_ || BaseType::numGroups() != that.numGroups() ||
       row_interleave_ != that.row_interleave_) {
     return false;
   }

diff --git a/src/PackMatrix.cc b/src/PackMatrix.cc
@@ -18,9 +18,8 @@ PackMatrix<PT, inpType, accType>::PackMatrix(
     int32_t rows,
     int32_t cols,
     inpType* buf,
-    int groups,
-    int32_t zero_pt)
-    : buf_(buf), nrows_(rows), ncols_(cols), G_(groups), zero_pt_(zero_pt) {
+    int groups)
+    : buf_(buf), nrows_(rows), ncols_(cols), G_(groups) {
   bufAllocatedHere_ = false;
   if (!cpuinfo_initialize()) {
     throw std::runtime_error("Failed to initialize cpuinfo!");

diff --git a/src/QuantUtils.cc b/src/QuantUtils.cc
@@ -1,5 +1,6 @@
 #include "fbgemm/QuantUtils.h"
 
+#include <cpuinfo.h>
 #include <immintrin.h>
 
 #include "fbgemm/Fbgemm.h"

diff --git a/test/Im2ColFusedRequantizeTest.cc b/test/Im2ColFusedRequantizeTest.cc
@@ -418,8 +418,7 @@ static void Im2col3DTest() {
           Bint8.data(),
           NDim,
           nullptr,
-          conv_p.G,
-          Bint8_zero_point);
+          conv_p.G);
 
 #ifdef _OPENMP
 #pragma omp parallel

diff --git a/test/PackedRequantizeAcc16Test.cc b/test/PackedRequantizeAcc16Test.cc
@@ -199,8 +199,7 @@ TEST_P(fbgemmu8s8acc16test, Test) {
           Bint8.data(),
           (btrans == matrix_op_t::Transpose) ? k : n,
           nullptr,
-          groups,
-          Bint8_zero_point);
+          groups);
 
 #ifdef _OPENMP
 #pragma omp parallel
@@ -218,7 +217,6 @@ TEST_P(fbgemmu8s8acc16test, Test) {
             k,
             nullptr,
             groups,
-            Aint8_zero_point,
             row_offset_buf.data());
 
         DoNothing<> doNothingObj{};
@@ -446,8 +444,7 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
             Bint8.data(),
             (btrans == matrix_op_t::Transpose) ? k : n,
             nullptr,
-            groups,
-            Bint8_zero_point);
+            groups);
 
 #ifdef _OPENMP
 #pragma omp parallel
@@ -465,7 +462,6 @@ TEST_P(fbgemmu8s8acc16test, SpMDMTest) {
               k,
               nullptr,
               groups,
-              Aint8_zero_point,
               row_offset_buf.data());
 
           // spmdm -> requantization -> nothing
@@ -634,8 +630,7 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
           Bint8.data(),
           (btrans == matrix_op_t::Transpose) ? k : n,
           nullptr,
-          groups,
-          Bint8_zero_point);
+          groups);
 
 #ifdef _OPENMP
 #pragma omp parallel
@@ -653,7 +648,6 @@ TEST_P(fbgemmu8s8acc16test, NoRequantizeTest) {
             k,
             nullptr,
             groups,
-            Aint8_zero_point,
             row_offset_buf.data());
 
         // DoNothing<> doNothingObj{};

diff --git a/test/PackedRequantizeTest.cc b/test/PackedRequantizeTest.cc
@@ -219,8 +219,7 @@ TEST_P(fbgemmu8s8acc32test, Test) {
             Bint8.data(),
             (btrans == matrix_op_t::Transpose) ? k : n,
             nullptr,
-            groups,
-            Bint8_zero_point);
+            groups);
 
 #ifdef _OPENMP
 #pragma omp parallel
@@ -238,7 +237,6 @@ TEST_P(fbgemmu8s8acc32test, Test) {
               k,
               nullptr,
               groups,
-              Aint8_zero_point,
               row_offset_buf.data());
 
           DoNothing<> doNothingObj{};
@@ -400,8 +398,7 @@ TEST_P(fbgemmu8s8acc32test, TestFloatInputOutput) {
           Bint8.data(),
           (btrans == matrix_op_t::Transpose) ? k : n,
           nullptr,
-          groups,
-          Bint8_zero_point);
+          groups);
 
 #ifdef _OPENMP
 #pragma omp parallel