From bee229da7d7e8ab5909f5ac039b3d13b9d055dc7 Mon Sep 17 00:00:00 2001
From: Daya Khudia
Date: Mon, 15 Jul 2019 14:22:12 -0700
Subject: [PATCH 1/2] Assume input weights to be in transposed format for
 convUnified

Differential Revision: D16186932

fbshipit-source-id: 244fc2d614aeb4d768201a553efeb3058fe9efeb
---
 bench/ConvUnifiedBenchmark.cc | 15 ++++++----
 src/PackWeightsForConv.cc     |  6 ++--
 src/RefImplementations.cc     | 54 ++++++++++++++++++++++++++---------
 3 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/bench/ConvUnifiedBenchmark.cc b/bench/ConvUnifiedBenchmark.cc
index 59079c7b66..88f40f537a 100644
--- a/bench/ConvUnifiedBenchmark.cc
+++ b/bench/ConvUnifiedBenchmark.cc
@@ -42,9 +42,9 @@ vector<conv_param_t<3>> shapes_3d = {
   // MB, IC, OC, {IT, IH, IW}, G, {KT, KH, KW}, {stride_t, stride_h, stride_w},
   // {pad_prev, pad_h_top, pad_w_left, pad_next, pad_h_bottom, pad_w_right}
   // Regular
-  conv_param_t<3>(1, 64, 64, {32, 56, 56}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}),
+  conv_param_t<3>(1, 64, 64, {8, 14, 14}, 1, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1}),
   // Depthwise
-  conv_param_t<3>(1, 64, 64, {32, 56, 56}, 64, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1})
+  conv_param_t<3>(1, 64, 64, {8, 14, 14}, 64, {3, 3, 3}, {1, 1, 1}, {1, 1, 1, 1, 1, 1})
 };
 
 template <int SPATIAL_DIM>
 void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
@@ -109,6 +109,9 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
   aligned_vector<int8_t> Bint8(
       kernel_dim * conv_p.IC * (conv_p.OC / conv_p.G));
 
+  aligned_vector<int8_t> Bint8_tr(
+      kernel_dim * conv_p.IC * (conv_p.OC / conv_p.G));
+
   int im_out_dim = accumulate(
       conv_p.OUT_DIM.begin(), conv_p.OUT_DIM.end(), 1, multiplies<int>());
   aligned_vector<int32_t> Cint32_ref(conv_p.MB * im_out_dim * conv_p.OC);
@@ -131,14 +134,14 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
   randFill(C_multiplier, 0.1234f / 2, 0.1234f * 3 / 2);
   int32_t C_zero_point = 5;
 
-  aligned_vector<float> Bfp32(Bint8.begin(), Bint8.end());
-
   // reference implementation
+  // conv_ref expects weights to be in G (R S C/G) K/G
+  transposeConvWeights(conv_p, Bint8.data(), Bint8_tr.data());
   conv_ref(
       conv_p,
       Aint8.data(),
       Aint8_zero_point,
-      Bint8.data(),
+      Bint8_tr.data(),
       Cint32_ref.data());
 
   // matrix dimensions after im2col
@@ -161,7 +164,7 @@ void performance_test(const vector<conv_param_t<SPATIAL_DIM>>& shapes) {
         KDimPerGroup,
         OC_per_G,
         OC_per_G,
-        Bint8.data() + g * KDimPerGroup * OC_per_G,
+        Bint8_tr.data() + g * KDimPerGroup * OC_per_G,
         Bint8_zero_point.data(),
         col_offsets.data() + g * OC_per_G,
         conv_p.OC);
diff --git a/src/PackWeightsForConv.cc b/src/PackWeightsForConv.cc
index c81114494d..78379af520 100644
--- a/src/PackWeightsForConv.cc
+++ b/src/PackWeightsForConv.cc
@@ -42,18 +42,18 @@ PackWeightsForConv<SPATIAL_DIM, T, accT>::PackWeightsForConv(
       W_dw_3D_packed_ = nullptr;
       W_gconv_packed_ =
           std::make_shared<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>(
-              matrix_op_t::NoTranspose, conv_p, sdata, nullptr);
+              matrix_op_t::Transpose, conv_p, sdata, nullptr);
       break;
     }
     case optimized_conv_t::im2col: {
       int NDim = conv_p.OC / conv_p.G;
       int KDim = conv_p.K[0] * conv_p.K[1] * conv_p.IC;
       W_im2col_packed_ = std::make_shared<PackBMatrix<T, accT>>(
-          matrix_op_t::NoTranspose,
+          matrix_op_t::Transpose,
           KDim,
           NDim,
           sdata,
-          NDim,
+          KDim / conv_p.G,
           nullptr,
           conv_p.G,
           blocking_params);
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc
index b4b0c2b5e2..e3c0eac7c9 100644
--- a/src/RefImplementations.cc
+++ b/src/RefImplementations.cc
@@ -181,8 +181,7 @@ void cblas_sgemm_ref(
     int ldb,
     float beta,
     float* Cfp32,
-    int ldc
-    ) {
+    int ldc) {
   for (int i = 0; i < m; ++i) {
     for (int j = 0; j < n; ++j) {
       float sum = 0;
@@ -204,7 +203,6 @@ void cblas_sgemm_ref(
     }
   }
 }
-
 void row_offsets_u8acc32_ref(
     int M,
     int K,
@@ -542,21 +540,49 @@ void transposeConvWeights(
     const conv_param_t<SPATIAL_DIM>& conv_p,
     const std::int8_t* src,
     std::int8_t* dest) {
-  assert(SPATIAL_DIM == 2 && "Only 2D supported currently");
-  int R = conv_p.K[0];
-  int S = conv_p.K[1];
   int G = conv_p.G;
   int IC_per_G = conv_p.IC / conv_p.G;
   int OC_per_G = conv_p.OC / conv_p.G;
-  // Transforms weights from G K/G (R S C/G) to G (R S C/G) K/G format.
-  for (int r = 0; r < R; ++r) {
-    for (int s = 0; s < S; ++s) {
-      for (int k = 0; k < OC_per_G; ++k) {
-        for (int g = 0; g < G; ++g) {
-          for (int c = 0; c < IC_per_G; ++c) {
-            dest[(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k] =
-                src[(((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c];
+  assert(
+      (SPATIAL_DIM == 3 || SPATIAL_DIM == 2) &&
+      "Only 2D and 3D convolutions are supported");
+  if (SPATIAL_DIM == 2) {
+    int R = conv_p.K[0];
+    int S = conv_p.K[1];
+    // Transforms weights from G K/G (R S C/G) to G (R S C/G) K/G format.
+    for (int r = 0; r < R; ++r) {
+      for (int s = 0; s < S; ++s) {
+        for (int k = 0; k < OC_per_G; ++k) {
+          for (int g = 0; g < G; ++g) {
+            for (int c = 0; c < IC_per_G; ++c) {
+              dest[(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k] =
+                  src[(((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c];
+            }
+          }
+        }
+      }
+    }
+  } else {
+    // Transforms weights from G K/G (T R S C/G) to G (T R S C/G) K/G format.
+    int T = conv_p.K[0];
+    int R = conv_p.K[1];
+    int S = conv_p.K[2];
+    for (int t = 0; t < T; ++t) {
+      for (int r = 0; r < R; ++r) {
+        for (int s = 0; s < S; ++s) {
+          for (int k = 0; k < OC_per_G; ++k) {
+            for (int g = 0; g < G; ++g) {
+              for (int c = 0; c < IC_per_G; ++c) {
+                dest
+                    [((((g * T + t) * R + r) * S + s) * IC_per_G + c) *
+                         OC_per_G +
+                     k] =
+                    src[((((g * OC_per_G + k) * T + t) * R + r) * S + s) *
+                            IC_per_G +
+                        c];
+              }
+            }
+          }
         }
       }
     }
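
[Editor's note] To make the layout change above concrete: after this patch the unified interface takes weights in G K/G (R S C/G) order, and transposeConvWeights produces the G (R S C/G) K/G order that conv_ref expects. The standalone sketch below replays the same 2D index arithmetic from the patch on a tiny made-up shape; the dimension values and the spot-check are illustrative and are not part of the patch.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Made-up dimensions; only the index math mirrors the 2D branch above.
  const int G = 2, OC_per_G = 2, IC_per_G = 3, R = 3, S = 3;
  std::vector<std::int8_t> src(G * OC_per_G * R * S * IC_per_G);
  for (int i = 0; i < static_cast<int>(src.size()); ++i) {
    src[i] = static_cast<std::int8_t>(i); // 108 elements, fits in int8_t
  }
  std::vector<std::int8_t> dest(src.size());

  // Transforms weights from G K/G (R S C/G) to G (R S C/G) K/G format.
  for (int r = 0; r < R; ++r) {
    for (int s = 0; s < S; ++s) {
      for (int k = 0; k < OC_per_G; ++k) {
        for (int g = 0; g < G; ++g) {
          for (int c = 0; c < IC_per_G; ++c) {
            dest[(((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k] =
                src[(((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c];
          }
        }
      }
    }
  }

  // Spot-check one element: the value stored at the source offset for
  // (g=1, k=1, r=2, s=0, c=1) must land at the transposed destination offset.
  const int g = 1, k = 1, r = 2, s = 0, c = 1;
  const int src_idx = (((g * OC_per_G + k) * R + r) * S + s) * IC_per_G + c;
  const int dst_idx = (((g * R + r) * S + s) * IC_per_G + c) * OC_per_G + k;
  assert(dest[dst_idx] == static_cast<std::int8_t>(src_idx));
  return 0;
}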
From 2346ccb9b6456cb9a0a35fc14b4248302674d07e Mon Sep 17 00:00:00 2001
From: Daya Khudia
Date: Mon, 15 Jul 2019 14:22:54 -0700
Subject: [PATCH 2/2] unpack through unified convolution interface (#105)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/105

Support for calling unpack through the unified interface for packed
convolution weights.

Reviewed By: jianyuh

Differential Revision: D16190534

fbshipit-source-id: 28e1b95c7642c1cf9ed3d8935f56c740f9b44bcd
---
 include/fbgemm/Fbgemm.h                       |  6 ++
 src/PackWeightsForConv.cc                     | 15 +++++
 .../{UniConvPackingTest.cc => UniConvTest.cc} | 63 ++++++++++++++++++-
 3 files changed, 81 insertions(+), 3 deletions(-)
 rename test/{UniConvPackingTest.cc => UniConvTest.cc} (75%)

diff --git a/include/fbgemm/Fbgemm.h b/include/fbgemm/Fbgemm.h
index 9ee25b5824..302af516f2 100644
--- a/include/fbgemm/Fbgemm.h
+++ b/include/fbgemm/Fbgemm.h
@@ -597,6 +597,12 @@ class FBGEMM_API PackWeightsForConv {
     return W_gconv_packed_;
   }
 
+  /**
+   * @brief Unpack the packed weight matrix into origin_buf (used for
+   * serialization to recover the original weight matrix).
+   */
+  void unpack(T* origin_buf);
+
  private:
   // Packed weights if we use im2col based convolution implementation
   std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_;
diff --git a/src/PackWeightsForConv.cc b/src/PackWeightsForConv.cc
index 78379af520..e16843c89a 100644
--- a/src/PackWeightsForConv.cc
+++ b/src/PackWeightsForConv.cc
@@ -65,6 +65,21 @@ PackWeightsForConv<SPATIAL_DIM, T, accT>::PackWeightsForConv(
   } // switch
 }
 
+template <int SPATIAL_DIM, typename T, typename accT>
+void PackWeightsForConv<SPATIAL_DIM, T, accT>::unpack(T* origin_buf) {
+  if (W_dw_2D_packed_) {
+    W_dw_2D_packed_->unpack(origin_buf);
+  } else if (W_dw_3D_packed_) {
+    W_dw_3D_packed_->unpack(origin_buf);
+  } else if (W_gconv_packed_) {
+    W_gconv_packed_->unpack(origin_buf);
+  } else if (W_im2col_packed_) {
+    W_im2col_packed_->unpack(origin_buf);
+  } else {
+    assert(false && "At least one packed weights object should exist");
+  }
+}
+
 template class PackWeightsForConv<2, int8_t, int32_t>;
 template class PackWeightsForConv<3, int8_t, int32_t>;
diff --git a/test/UniConvPackingTest.cc b/test/UniConvTest.cc
similarity index 75%
rename from test/UniConvPackingTest.cc
rename to test/UniConvTest.cc
index 77552af0df..2b110dde73 100644
--- a/test/UniConvPackingTest.cc
+++ b/test/UniConvTest.cc
@@ -23,7 +23,7 @@ using namespace fbgemm;
 
 namespace {
 
 // tuple represents MB, IC, OC, IT, IH, IW, G, KH/KW, stride, pad
-class convPackingTest
+class uniConvTest
     : public testing::TestWithParam<
           tuple<int, int, int, int, int, int, int, int, int, int>> {};
 
 INSTANTIATE_TEST_CASE_P(
     InstantiationName,
-    convPackingTest,
+    uniConvTest,
     ::testing::Combine(
         ::testing::ValuesIn({1, 2}), // MB
         ::testing::ValuesIn({16, 32}), // IC
@@ -47,7 +47,7 @@ INSTANTIATE_TEST_CASE_P(
 /**
  * Test for conv packing
  */
-TEST_P(convPackingTest, packingTest) {
+TEST_P(uniConvTest, packingTest) {
   int MB, IC, OC, IT, IH, IW, G, kernel, stride, pad;
   tie(MB, IC, OC, IT, IH, IW, G, kernel, stride, pad) = GetParam();
 
@@ -146,3 +146,60 @@ TEST_P(convPackingTest, packingTest) {
     }
   }
 }
+
+/**
+ * Test for packing/unpacking
+ */
+TEST_P(uniConvTest, packUnpackTest) {
+  int MB, IC, OC, IT, IH, IW, G, kernel, stride, pad;
+  tie(MB, IC, OC, IT, IH, IW, G, kernel, stride, pad) = GetParam();
+
+  conv_param_t<2> conv_p_2d(
+      MB,
+      IC,
+      OC,
+      {IH, IW},
+      G,
+      {kernel, kernel},
+      {stride, stride},
+      {pad, pad, pad, pad});
+
+  int kernel_dim_2d = kernel * kernel;
+
+  aligned_vector<int8_t> Bint8_2d(
+      kernel_dim_2d * conv_p_2d.IC * (conv_p_2d.OC / conv_p_2d.G));
+  aligned_vector<int8_t> Bint8_2d_unpacked(
+      kernel_dim_2d * conv_p_2d.IC * (conv_p_2d.OC / conv_p_2d.G));
+
+  PackWeightsForConv<2> packedB_2D(conv_p_2d, Bint8_2d.data());
+
+  packedB_2D.unpack(Bint8_2d_unpacked.data());
+
+  ASSERT_EQ(Bint8_2d, Bint8_2d_unpacked)
+      << "Original and unpacked data elements are not the same [2D]";
+
+  conv_param_t<3> conv_p_3d(
+      MB,
+      IC,
+      OC,
+      {IT, IH, IW},
+      G,
+      {kernel, kernel, kernel},
+      {stride, stride, stride},
+      {pad, pad, pad, pad, pad, pad});
+
+  int kernel_dim_3d = kernel * kernel * kernel;
+
+  aligned_vector<int8_t> Bint8_3d(
+      kernel_dim_3d * conv_p_3d.IC * (conv_p_3d.OC / conv_p_3d.G));
+
+  aligned_vector<int8_t> Bint8_3d_unpacked(
+      kernel_dim_3d * conv_p_3d.IC * (conv_p_3d.OC / conv_p_3d.G));
+
+  PackWeightsForConv<3> packedB_3D(conv_p_3d, Bint8_3d.data());
+
+  packedB_3D.unpack(Bint8_3d_unpacked.data());
+
+  ASSERT_EQ(Bint8_3d, Bint8_3d_unpacked)
+      << "Original and unpacked data elements are not the same [3D]";
+}
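
[Editor's note] The two patches combine as follows for a caller: patch 1 makes PackWeightsForConv consume weights in the transposed G K/G (R S C/G) layout, and patch 2 adds the unpack entry point to recover them. Below is a minimal caller-side sketch mirroring packUnpackTest above. It assumes FBGEMM is built and on the include path; the shape values and the use of plain std::vector (the tests use fbgemm's aligned_vector) are illustrative choices, not taken from the patches.

#include <cstdint>
#include <vector>

#include "fbgemm/Fbgemm.h"

using namespace fbgemm;

int main() {
  // Arbitrary example shape: MB=1, IC=16, OC=16, 14x14 input, G=1,
  // 3x3 kernel, stride 1, pad 1.
  conv_param_t<2> conv_p(
      1, 16, 16, {14, 14}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1});

  // Weights, supplied in the transposed G K/G (R S C/G) layout that the
  // unified interface assumes after patch 1.
  std::vector<std::int8_t> weights(
      conv_p.K[0] * conv_p.K[1] * conv_p.IC * (conv_p.OC / conv_p.G), 1);
  std::vector<std::int8_t> unpacked(weights.size());

  // Pack through the unified interface; the constructor selects the
  // internal packing (im2col, groupwise, or depthwise) for this shape.
  PackWeightsForConv<2> packedB(conv_p, weights.data());

  // New in patch 2: recover the original weight matrix, e.g. before
  // serializing a model.
  packedB.unpack(unpacked.data());
  return 0;
}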