Commit 518d8a1

remove template parameter from PackedDepthWiseConvMatrix (#128)
Summary:
Pull Request resolved: #128

We don't really need KERNEL_PROD as a compile-time constant template parameter in PackedDepthWiseConvMatrix for performance. Removing the template parameter makes it easier to generalize depth-wise convolution to non-3x3 cases.
This diff only changes fbgemm while keeping the old interface. A follow-up diff will migrate the Caffe2 code that uses the old interface and then remove that interface.
This diff also splits FbgemmI8DepthwiseAvx2.cc into FbgemmI8Depthwise3DAvx2.cc and PackDepthwiseConvMatrixAvx2.cc to avoid compilation timeouts in OSS build tests.

Reviewed By: dskhudia

Differential Revision: D17514003

fbshipit-source-id: 2214637ac0762a585f619f0035d3449cc4f7669e
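
For reference, a minimal sketch of what the interface change means for callers; this example is not part of the diff, and the channel count and weight buffer are placeholders. The old, template-based form is shown as a comment; it still builds in this diff through the compatibility wrappers kept in FbgemmI8DepthwiseAvx2.h.

#include <cstdint>
#include <vector>
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"

int main() {
  int K = 32;                                // number of channels (= groups), placeholder
  std::vector<std::int8_t> B(K * 3 * 3, 1);  // unpacked weights in GRS layout, placeholder values

  // Before this diff, the kernel product was a compile-time template argument:
  //   fbgemm::PackedDepthWiseConvMatrix<3 * 3> Bp(K, B.data());

  // After this diff, it is a runtime constructor argument, so the same class
  // covers 3x3, 3x3x3, and other kernel shapes:
  fbgemm::PackedDepthWiseConvMatrix Bp(K, 3 * 3, B.data());

  return Bp.GetKernelProduct() == 3 * 3 ? 0 : 1;
}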
jspark1105 authored and facebook-github-bot committed Sep 24, 2019
1 parent 53f0c0d commit 518d8a1
Showing 13 changed files with 2,792 additions and 2,768 deletions.
CMakeLists.txt (2 changes: 2 additions & 0 deletions)
@@ -75,8 +75,10 @@ endif()
 #All the source files that either use avx2 instructions statically
 set(FBGEMM_AVX2_SRCS
     src/FbgemmFP16UKernelsAvx2.cc
+    src/FbgemmI8Depthwise3DAvx2.cc
     src/FbgemmI8DepthwiseAvx2.cc
     src/OptimizedKernelsAvx2.cc
+    src/PackDepthwiseConvMatrixAvx2.cc
     src/QuantUtilsAvx2.cc
     src/UtilsAvx2.cc)

bench/Depthwise3DBenchmark.cc (2 changes: 1 addition & 1 deletion)
@@ -159,7 +159,7 @@ int main() {
         K);
   }

-  Packed3x3x3ConvMatrix Bp(K, B.data());
+  PackedDepthWiseConvMatrix Bp(K, 3 * 3 * 3, B.data());

   double ttot = 0;
   double bytes = double(NITER) *
bench/DepthwiseBenchmark.cc (2 changes: 1 addition & 1 deletion)
@@ -235,7 +235,7 @@ int main() {
         K);
   }

-  Packed3x3ConvMatrix Bp(K, B.data());
+  PackedDepthWiseConvMatrix Bp(K, 3 * 3, B.data());

   double ttot = 0;
   double bytes = double(NITER) *
include/fbgemm/Fbgemm.h (18 changes: 10 additions & 8 deletions)
@@ -588,12 +588,16 @@ class FBGEMM_API PackWeightsForConv {
     return W_im2col_packed_;
   }

-  std::shared_ptr<Packed3x3ConvMatrix> getPackedWFor2DDW() {
-    return W_dw_2D_packed_;
+  std::shared_ptr<PackedDepthWiseConvMatrix> getPackedWForDepthwise() {
+    return W_dw_packed_;
   }

-  std::shared_ptr<Packed3x3x3ConvMatrix> getPackedWFor3DDW() {
-    return W_dw_3D_packed_;
+  std::shared_ptr<PackedDepthWiseConvMatrix> getPackedWFor2DDW() {
+    return W_dw_packed_;
+  }
+
+  std::shared_ptr<PackedDepthWiseConvMatrix> getPackedWFor3DDW() {
+    return W_dw_packed_;
   }

   std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
@@ -642,10 +646,8 @@ class FBGEMM_API PackWeightsForConv {
   const conv_param_t<SPATIAL_DIM> conv_param_;
   // Packed weights if we use im2col based convolution implementation
   std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_;
-  // Packed weights if we use 2D depthwise convolution implementation
-  std::shared_ptr<Packed3x3ConvMatrix> W_dw_2D_packed_;
-  // Packed weights if we use 3D depthwise convolution implementation
-  std::shared_ptr<Packed3x3x3ConvMatrix> W_dw_3D_packed_;
+  // Packed weights if we use depthwise convolution implementation
+  std::shared_ptr<PackedDepthWiseConvMatrix> W_dw_packed_;
   // Packed weights if we use groupwise (small channels per group) convolution
   // implementation
   std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
include/fbgemm/FbgemmI8DepthwiseAvx2.h (60 changes: 35 additions & 25 deletions)
@@ -11,19 +11,26 @@

 namespace fbgemm {

-// KERNEL_PROD is the product of all kernels.
-// For example, KERNEL_PROD = 9 for 3x3, and 27 for 3x3x3.
-template <int KERNEL_PROD>
 class FBGEMM_API PackedDepthWiseConvMatrix {
  public:
-  // smat in GRS layout
-  PackedDepthWiseConvMatrix(int K, const std::int8_t* smat);
+  /**
+   * @params K the number of channels (same as the number of groups because
+   *           depth-wise convolution has one input/output channel per group)
+   * @params kernel_prod the product of all kernels. For example, kernel_prod =
+   *           9 for 3x3 conv, and 27 for 3x3x3 conv.
+   * @param smat the source unpacked weight in GRS layout
+   */
+  PackedDepthWiseConvMatrix(int K, int kernel_prod, const std::int8_t* smat);
   virtual ~PackedDepthWiseConvMatrix();

   const std::int8_t* PackedMat() const {
     return pmat_;
   }

+  int GetKernelProduct() const {
+    return kernel_prod_;
+  }
+
   /**
    * @brief Unpacks pmat_ into unpack_data.
    *        Used for recovering the weight matrix into the original format
@@ -36,19 +43,22 @@ class FBGEMM_API PackedDepthWiseConvMatrix {
   int addr(int r, int c);

  private:
-  int K_;
-  std::int8_t* pmat_;
-}; // Packed3x3ConvMatrix
+  const int K_; /**< the number of channels */
+  const int kernel_prod_; /** the product of all kernel dims */
+  std::int8_t* pmat_; /** packed weight */
+}; // PackedDepthWiseConvMatrix

-using Packed3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3>;
-using Packed3x3x3ConvMatrix = PackedDepthWiseConvMatrix<3 * 3 * 3>;
-using Packed1ConvMatrix = PackedDepthWiseConvMatrix<1>;
-using Packed2ConvMatrix = PackedDepthWiseConvMatrix<2>;
-using Packed3ConvMatrix = PackedDepthWiseConvMatrix<3>;
-using Packed4ConvMatrix = PackedDepthWiseConvMatrix<4>;
-using Packed5ConvMatrix = PackedDepthWiseConvMatrix<5>;
-using Packed10ConvMatrix = PackedDepthWiseConvMatrix<10>;
-using Packed11ConvMatrix = PackedDepthWiseConvMatrix<11>;
+class FBGEMM_API Packed3x3ConvMatrix : public PackedDepthWiseConvMatrix {
+ public:
+  Packed3x3ConvMatrix(int K, const std::int8_t* smat)
+      : PackedDepthWiseConvMatrix(K, 3 * 3, smat) {}
+};
+
+class FBGEMM_API Packed3x3x3ConvMatrix : public PackedDepthWiseConvMatrix {
+ public:
+  Packed3x3x3ConvMatrix(int K, const std::int8_t* smat)
+      : PackedDepthWiseConvMatrix(K, 3 * 3 * 3, smat) {}
+};

 /** To be removed. Keeping it just to make sure we don't change C2 files and
  * fbgemm files in a single diff
@@ -64,7 +74,7 @@ FBGEMM_API void depthwise_3x3_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     std::int32_t B_zero_point,
-    const Packed3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     float C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -93,7 +103,7 @@ FBGEMM_API void depthwise_3x3_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     std::int32_t B_zero_point,
-    const Packed3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     float C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -121,7 +131,7 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     const std::int32_t* B_zero_point,
-    const Packed3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     const float* C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -145,7 +155,7 @@ FBGEMM_API void depthwise_3x3_per_channel_quantization_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     const std::int32_t* B_zero_point,
-    const Packed3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     const float* C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -171,7 +181,7 @@ FBGEMM_API void depthwise_3x3x3_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     std::int32_t B_zero_point,
-    const Packed3x3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     float C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -196,7 +206,7 @@ FBGEMM_API void depthwise_3x3x3_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     std::int32_t B_zero_point,
-    const Packed3x3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     float C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -223,7 +233,7 @@ FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     const std::int32_t* B_zero_point,
-    const Packed3x3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     const float* C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
@@ -249,7 +259,7 @@ FBGEMM_API void depthwise_3x3x3_per_channel_quantization_pad_1(
     std::int32_t A_zero_point,
     const std::uint8_t* A,
     const std::int32_t* B_zero_point,
-    const Packed3x3x3ConvMatrix& Bp,
+    const PackedDepthWiseConvMatrix& Bp,
     const float* C_multiplier,
     std::int32_t C_zero_point,
     std::uint8_t* C,
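
One consequence of the header changes above: Packed3x3ConvMatrix and Packed3x3x3ConvMatrix are now thin subclasses that forward a fixed kernel_prod to the base class, so new code can be written against PackedDepthWiseConvMatrix regardless of kernel shape. A rough sketch under that assumption; the helper names below are illustrative and not part of this diff.

#include <cstdint>
#include <cstdio>
#include "fbgemm/FbgemmI8DepthwiseAvx2.h"

// Accepts any packed depth-wise weight matrix; the kernel product is now a
// runtime property of the object rather than part of its type.
void print_kernel_prod(const fbgemm::PackedDepthWiseConvMatrix& Bp) {
  std::printf("kernel product: %d\n", Bp.GetKernelProduct());
}

void demo(int K, const std::int8_t* smat_2d, const std::int8_t* smat_3d) {
  fbgemm::Packed3x3ConvMatrix Bp2d(K, smat_2d);    // forwards kernel_prod = 9
  fbgemm::Packed3x3x3ConvMatrix Bp3d(K, smat_3d);  // forwards kernel_prod = 27
  print_kernel_prod(Bp2d);
  print_kernel_prod(Bp3d);
}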
src/FbgemmConv.cc (8 changes: 4 additions & 4 deletions)
@@ -109,7 +109,7 @@ int fbgemmConv(
           outProcess.getAZeroPoint(),
           activations,
           B_zero_point[0],
-          *(packed_weights.getPackedWFor3DDW()),
+          *(packed_weights.getPackedWForDepthwise()),
           C_multiplier[0],
           outProcess.getCZeroPoint(),
           out,
@@ -135,7 +135,7 @@ int fbgemmConv(
           outProcess.getAZeroPoint(),
           activations,
           B_zero_point,
-          *(packed_weights.getPackedWFor3DDW()),
+          *(packed_weights.getPackedWForDepthwise()),
           C_multiplier,
           outProcess.getCZeroPoint(),
           out,
@@ -163,7 +163,7 @@ int fbgemmConv(
           outProcess.getAZeroPoint(),
           activations,
           B_zero_point[0],
-          *(packed_weights.getPackedWFor2DDW()),
+          *(packed_weights.getPackedWForDepthwise()),
           C_multiplier[0],
           outProcess.getCZeroPoint(),
           out,
@@ -188,7 +188,7 @@ int fbgemmConv(
           outProcess.getAZeroPoint(),
           activations,
           B_zero_point,
-          *(packed_weights.getPackedWFor2DDW()),
+          *(packed_weights.getPackedWForDepthwise()),
           C_multiplier,
           outProcess.getCZeroPoint(),
           out,
