Commit 518d8a1

remove template parameter from PackedDepthWiseConvMatrix (#128)
Summary:
Pull Request resolved: #128

We don't really need KERNEL_PROD as a compile-time constant template parameter in PackedDepthWiseConvMatrix for performance. Removing the template parameter makes it easier to generalize depth-wise convolution to non-3x3 cases.
This diff only changes fbgemm while keeping the old interface. A follow-up diff will migrate the Caffe2 code that uses the old interface and then remove that interface.
This diff also splits FbgemmI8DepthwiseAvx2.cc into FbgemmI8Depthwise3DAvx2.cc and PackDepthwiseConvMatrixAvx2.cc to avoid compilation timeouts in OSS build tests.

Reviewed By: dskhudia

Differential Revision: D17514003

fbshipit-source-id: 2214637ac0762a585f619f0035d3449cc4f7669e
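
For reference, a minimal sketch of what the interface change means for callers; this example is not part of the diff, and the channel count and weight buffer are placeholders. The old, template-based form is shown as a comment; it still builds in this diff through the compatibility wrappers kept in FbgemmI8DepthwiseAvx2.h.

#include <cstdint>
#include <vector>
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"

int main() {
  int K = 32;                                // number of channels (= groups), placeholder
  std::vector<std::int8_t> B(K * 3 * 3, 1);  // unpacked weights in GRS layout, placeholder values

  // Before this diff, the kernel product was a compile-time template argument:
  //   fbgemm::PackedDepthWiseConvMatrix<3 * 3> Bp(K, B.data());

  // After this diff, it is a runtime constructor argument, so the same class
  // covers 3x3, 3x3x3, and other kernel shapes:
  fbgemm::PackedDepthWiseConvMatrix Bp(K, 3 * 3, B.data());

  return Bp.GetKernelProduct() == 3 * 3 ? 0 : 1;
}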
jspark1105 authored and facebook-github-bot committed Sep 24, 2019
1 parent 53f0c0d commit 518d8a1
Showing 13 changed files with 2,792 additions and 2,768 deletions.
CMakeLists.txt (2 changes: 2 additions & 0 deletions)
@@ -75,8 +75,10 @@ endif()
 #All the source files that either use avx2 instructions statically
 set(FBGEMM_AVX2_SRCS
     src/FbgemmFP16UKernelsAvx2.cc
+    src/FbgemmI8Depthwise3DAvx2.cc
     src/FbgemmI8DepthwiseAvx2.cc
     src/OptimizedKernelsAvx2.cc
+    src/PackDepthwiseConvMatrixAvx2.cc
     src/QuantUtilsAvx2.cc
     src/UtilsAvx2.cc)

bench/Depthwise3DBenchmark.cc (2 changes: 1 addition & 1 deletion)
@@ -159,7 +159,7 @@ int main() {
         K);
   }

-  Packed3x3x3ConvMatrix Bp(K, B.data());
+  PackedDepthWiseConvMatrix Bp(K, 3 * 3 * 3, B.data());

   double ttot = 0;
   double bytes = double(NITER) *
bench/DepthwiseBenchmark.cc (2 changes: 1 addition & 1 deletion)
@@ -235,7 +235,7 @@ int main() {
         K);
   }

-  Packed3x3ConvMatrix Bp(K, B.data());
+  PackedDepthWiseConvMatrix Bp(K, 3 * 3, B.data());

   double ttot = 0;
   double bytes = double(NITER) *
include/fbgemm/Fbgemm.h (18 changes: 10 additions & 8 deletions)
@@ -588,12 +588,16 @@ class FBGEMM_API PackWeightsForConv {
     return W_im2col_packed_;
   }

-  std::shared_ptr<Packed3x3ConvMatrix> getPackedWFor2DDW() {
-    return W_dw_2D_packed_;
+  std::shared_ptr<PackedDepthWiseConvMatrix> getPackedWForDepthwise() {
+    return W_dw_packed_;
   }

-  std::shared_ptr<Packed3x3x3ConvMatrix> getPackedWFor3DDW() {
-    return W_dw_3D_packed_;
+  std::shared_ptr<PackedDepthWiseConvMatrix> getPackedWFor2DDW() {
+    return W_dw_packed_;
+  }
+
+  std::shared_ptr<PackedDepthWiseConvMatrix> getPackedWFor3DDW() {
+    return W_dw_packed_;
   }

   std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
@@ -642,10 +646,8 @@ class FBGEMM_API PackWeightsForConv {
   const conv_param_t<SPATIAL_DIM> conv_param_;
   // Packed weights if we use im2col based convolution implementation
   std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_;
-  // Packed weights if we use 2D depthwise convolution implementation
-  std::shared_ptr<Packed3x3ConvMatrix> W_dw_2D_packed_;
-  // Packed weights if we use 3D depthwise convolution implementation
-  std::shared_ptr<Packed3x3x3ConvMatrix> W_dw_3D_packed_;
+  // Packed weights if we use depthwise convolution implementation
+  std::shared_ptr<PackedDepthWiseConvMatrix> W_dw_packed_;
   // Packed weights if we use groupwise (small channels per group) convolution
   // implementation
   std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
include/fbgemm/FbgemmI8DepthwiseAvx2.h (60 changes: 35 additions & 25 deletions)
@@ -11,19 +11,26 @@

 namespace fbgemm {

-// KERNEL_PROD is the product of all kernels.
-// For example, KERNEL_PROD = 9 for 3x3, and 27 for 3x3x3.
-template <int KERNEL_PROD>
 class FBGEMM_API PackedDepthWiseConvMatrix {
  public:
-  // smat in GRS layout
-  PackedDepthWiseConvMatrix(int K, const std::int8_t* smat);
+  /**
+   * @params K the number of channels (same as the number of groups because
+   *           depth-wise convolution has one input/output channel per group)
+   * @params kernel_prod the product of all kernels. For example, kernel_prod =
+   *           9 for 3x3 conv, and 27 for 3x3x3 conv.
+   * @param smat the source unpacked weight in GRS layout
+   */
+  PackedDepthWiseConvMatrix(int K, int kernel_prod, const std::int8_t* smat);
   virtual ~PackedDepthWiseConvMatrix();

   const std::int8_t* PackedMat() const {
     return pmat_;
   }

+  int GetKernelProduct() const {
+    return kernel_prod_;
+  }
+
   /**
    * @brief Unpacks pmat_ into unpack_data.
    *        Used for recovering the weight matrix into the original format
@@ -36,19 +43,22 @@ class FBGEMM_API PackedDepthWiseConvMatrix {
   int addr(int r, int c);

  private:
-  int K_;
-  std::int8_t* pmat_;
-}; // Packed3x3ConvMatrix
+  const int K_; /**< the number of channels */
+  const int kernel_prod_; /** the product of all kernel dims */
+  std::int8_t* pmat_; /** packed weight */
+}; // PackedDepthWiseConvMatrix

-using Packed3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3>;
-using Packed3x3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3 * 3>;
-using Packed1ConvMatrix = PackedDepthWiseConvMatrix<1>;
-using Packed2ConvMatrix = PackedDepthWiseConvMatrix<2>;
-using Packed3ConvMatrix = PackedDepthWiseConvMatrix<3>;
-using Packed4ConvMatrix = PackedDepthWiseConvMatrix<4>;
-using Packed5ConvMatrix = PackedDepthWiseConvMatrix<5>;
-using Packed10ConvMatrix = PackedDepthWiseConvMatrix<10>;
-using Packed11ConvMatrix = PackedDepthWiseConvMatrix<11>;
+class FBGEMM_API Packed3x3ConvMatrix : public PackedDepthWiseConvMatrix {
+ public:
+  Packed3x3ConvMatrix(int K, const std::int8_t* smat)
+      : PackedDepthWiseConvMatrix(K, 3 * 3, smat) {}
+};
+
+class FBGEMM_API Packed3x3x3ConvMatrix : public PackedDepthWiseConvMatrix {
+ public:
+  Packed3x3x3ConvMatrix(int K, const std::int8_t* smat)
+      : PackedDepthWiseConvMatrix(K, 3 * 3 * 3, smat) {}
+};

 /** To be removed. Keeping it just to make sure we don't change C2 files and
  * fbgemm files in a single diff
@@ -64,7 +74,7 @@ FBGEMM_API void depthwise_3x3_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     std::int32_t B_zero_point,
-    const Packed3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     float C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -93,7 +103,7 @@ FBGEMM_API void depthwise_3x3_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     std::int32_t B_zero_point,
-    const Packed3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     float C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -121,7 +131,7 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     const std::int32_t* B_zero_point,
-    const Packed3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     const float* C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -145,7 +155,7 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     const std::int32_t* B_zero_point,
-    const Packed3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     const float* C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -171,7 +181,7 @@ FBGEMM_API void depthwise_3x3x3_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     std::int32_t B_zero_point,
-    const Packed3x3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     float C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -196,7 +206,7 @@ FBGEMM_API void depthwise_3x3x3_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     std::int32_t B_zero_point,
-    const Packed3x3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     float C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -223,7 +233,7 @@ FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     const std::int32_t* B_zero_point,
-    const Packed3x3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     const float* C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -249,7 +259,7 @@ FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     const std::int32_t* B_zero_point,
-    const Packed3x3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     const float* C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
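
One consequence of the header changes above: Packed3x3ConvMatrix and Packed3x3x3ConvMatrix are now thin subclasses that forward a fixed kernel_prod to the base class, so new code can be written against PackedDepthWiseConvMatrix regardless of kernel shape. A rough sketch under that assumption; the helper names below are illustrative and not part of this diff.

#include <cstdint>
#include <cstdio>
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"

// Accepts any packed depth-wise weight matrix; the kernel product is now a
// runtime property of the object rather than part of its type.
void print_kernel_prod(const fbgemm::PackedDepthWiseConvMatrix& Bp) {
  std::printf("kernel product: %d\n", Bp.GetKernelProduct());
}

void demo(int K, const std::int8_t* smat_2d, const std::int8_t* smat_3d) {
  fbgemm::Packed3x3ConvMatrix Bp2d(K, smat_2d);    // forwards kernel_prod = 9
  fbgemm::Packed3x3x3ConvMatrix Bp3d(K, smat_3d);  // forwards kernel_prod = 27
  print_kernel_prod(Bp2d);
  print_kernel_prod(Bp3d);
}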
src/FbgemmConv.cc (8 changes: 4 additions & 4 deletions)
@@ -109,7 +109,7 @@ int fbgemmConv(
           outProcess.getAZeroPoint(),
           activations,
           B_zero_point[0],
-          *(packed_weights.getPackedWFor3DDW()),
+          *(packed_weights.getPackedWForDepthwise()),
           C_multiplier[0],
           outProcess.getCZeroPoint(),
           out,
@@ -135,7 +135,7 @@ int fbgemmConv(
           outProcess.getAZeroPoint(),
           activations,
           B_zero_point,
-          *(packed_weights.getPackedWFor3DDW()),
+          *(packed_weights.getPackedWForDepthwise()),
           C_multiplier,
           outProcess.getCZeroPoint(),
           out,
@@ -163,7 +163,7 @@ int fbgemmConv(
           outProcess.getAZeroPoint(),
           activations,
           B_zero_point[0],
-          *(packed_weights.getPackedWFor2DDW()),
+          *(packed_weights.getPackedWForDepthwise()),
           C_multiplier[0],
           outProcess.getCZeroPoint(),
           out,
@@ -188,7 +188,7 @@ int fbgemmConv(
           outProcess.getAZeroPoint(),
           activations,
           B_zero_point,
-          *(packed_weights.getPackedWFor2DDW()),
+          *(packed_weights.getPackedWForDepthwise()),
           C_multiplier,
           outProcess.getCZeroPoint(),
           out,
