Enabled per channel quantized static linear/conv #37622

Closed. Wants to merge 25 commits.

Commits (changes from 17 of 25 commits shown):
979fb96  Enabled per channel quantized static linear/conv (kimishpatel, Apr 30, 2020)
b257fe6  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 1, 2020)
c69e8e6  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 1, 2020)
1233eee  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 1, 2020)
270a9d2  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 4, 2020)
90127d8  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 5, 2020)
a6b5c5f  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 5, 2020)
4adaa3e  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 5, 2020)
7811cc0  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 6, 2020)
cabc5db  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 7, 2020)
f78692d  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 7, 2020)
b264c8c  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 7, 2020)
65237d1  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 9, 2020)
3df561f  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 11, 2020)
a7831ae  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 12, 2020)
d6e5509  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 12, 2020)
4310888  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 12, 2020)
776f69c  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 14, 2020)
64609df  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 15, 2020)
1236628  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 15, 2020)
786e626  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 15, 2020)
0c3b806  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 19, 2020)
25acda6  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 19, 2020)
7ff077c  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 20, 2020)
8725f83  Update on "Enabled per channel quantized static linear/conv" (kimishpatel, May 20, 2020)
aten/src/ATen/native/quantized/cpu/qconv.cpp (16 changes: 14 additions & 2 deletions)
@@ -537,9 +537,21 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
for (int i = 0; i < wt_numel; ++i) {
qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
}
+ at::Tensor qbias;
// Original bias was float, so we requantize it here.
- auto qbias = at::quantize_per_tensor(
-     bias_fp32, weight_scales_data[0] * act_input_scale, 0, c10::kQInt32);
+ if (is_per_channel) {
+   at::Tensor bias_quant_scales =
+       weight_contig.q_per_channel_scales() * act_input_scale;
+   at::Tensor bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt);
+   qbias = at::native::quantize_per_channel_cpu(
+       bias_fp32, bias_quant_scales, bias_zp, 0, c10::kQInt32);
+ } else {
+   qbias = at::native::quantize_per_tensor(
+       bias_fp32,
+       weight_contig.q_scale() * act_input_scale,
+       0,
+       c10::kQInt32);
+ }

conv_p = qnnpack::conv_param_t(
{kernel_w, kernel_h},
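The hunk above requantizes the float bias with one scale per output channel (weight_scale[i] * activation_scale, zero point 0, int32) when the weight is per-channel quantized, instead of a single per-tensor scale. Below is a minimal Python sketch of the same arithmetic using the public quantization API; the shapes, scales, and the act_input_scale value are illustrative assumptions, not values from the PR.

import torch

out_ch, in_ch = 8, 4
weight = torch.randn(out_ch, in_ch)
w_scales = torch.rand(out_ch) * 0.1 + 0.01          # one scale per output channel
w_zps = torch.zeros(out_ch, dtype=torch.int64)
w_q = torch.quantize_per_channel(weight, w_scales, w_zps, axis=0, dtype=torch.qint8)

bias_fp32 = torch.randn(out_ch)
act_input_scale = 0.05                               # assumed activation input scale

# Per-channel path: bias scale_i = weight_scale_i * activation scale, zero point 0, qint32.
bias_scales = w_q.q_per_channel_scales() * act_input_scale
bias_zps = torch.zeros(out_ch, dtype=torch.int64)
qbias_per_channel = torch.quantize_per_channel(
    bias_fp32, bias_scales, bias_zps, axis=0, dtype=torch.qint32)

# Per-tensor path (the previous behavior): one scale for the whole bias.
qbias_per_tensor = torch.quantize_per_tensor(
    bias_fp32, w_q.q_per_channel_scales()[0].item() * act_input_scale, 0, torch.qint32)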
aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp (4 changes: 0 additions & 4 deletions)
@@ -168,10 +168,6 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<kSpa
weight.ndimension() == 4,
"quantized::conv2d_prepack (qnnpack): Weights are expected to have 4 "
"dimensions");
- TORCH_CHECK(
-     weight.qscheme() == c10::kPerTensorAffine,
-     "quantized::conv2d_prepack (qnnpack): only supports Per Tensor "
-     "Quantization Scheme")
TORCH_CHECK(
stride.size() == 2,
"quantized::conv2d_prepack (qnnpack): 2D convolution only");
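With the per-tensor-only check removed, conv2d_prepack under the QNNPACK engine should accept a per-channel quantized weight. A hedged sketch of that call path; the shapes, scales, and conv parameters below are assumptions made for illustration.

import torch

if 'qnnpack' in torch.backends.quantized.supported_engines:
    torch.backends.quantized.engine = 'qnnpack'

    out_ch, in_ch, kh, kw = 4, 3, 3, 3
    weight = torch.randn(out_ch, in_ch, kh, kw)
    scales = torch.rand(out_ch) * 0.1 + 0.01
    zero_points = torch.zeros(out_ch, dtype=torch.int64)
    w_q = torch.quantize_per_channel(weight, scales, zero_points, axis=0, dtype=torch.qint8)
    bias = torch.randn(out_ch)

    # Arguments: weight, bias, stride, padding, dilation, groups.
    packed = torch.ops.quantized.conv2d_prepack(w_q, bias, [1, 1], [1, 1], [1, 1], 1)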
aten/src/ATen/native/quantized/cpu/qlinear.cpp (19 changes: 15 additions & 4 deletions)
@@ -266,17 +266,28 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl(
qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
}
// Original bias was float, so we requantize it here.
- auto qbias = at::quantize_per_tensor(
-     bias_fp32, weight_scales_data[0] * input_scale, 0, c10::kQInt32);
+ const bool is_per_channel = orig_weight.qscheme() == at::kPerChannelAffine;
+ at::Tensor qbias;
+ // Original bias was float, so we requantize it here.
+ if (is_per_channel) {
+   at::Tensor bias_quant_scales =
+       weight_contig.q_per_channel_scales() * input_scale;
+   at::Tensor bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt);
+   qbias = at::native::quantize_per_channel_cpu(
+       bias_fp32, bias_quant_scales, bias_zp, 0, c10::kQInt32);
+ } else {
+   qbias = at::native::quantize_per_tensor(
+       bias_fp32, weight_contig.q_scale() * input_scale, 0, c10::kQInt32);
+ }

// Update the input scale to not pack again.
this->input_scale = input_scale;
w.reset();
w = std::make_unique<qnnpack::PackBMatrix>(
cols_w /* input_channels */,
rows_w /* output_channels */,
- weight_zp_data[0],
- requantization_scale.data()[0],
+ weight_zp_data,
+ requantization_scale.data(),
reinterpret_cast<uint8_t*>(qnnp_w_data),
reinterpret_cast<int32_t*>(qbias.data_ptr<c10::qint32>()));
packB = w.get();
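For context, this is roughly what the qlinear change enables at the op level: quantized::linear running with a per-channel quantized weight while the QNNPACK engine is selected. The shapes, activation scale, and output quantization parameters below are assumptions for illustration, not values taken from the PR or its tests.

import torch

if 'qnnpack' in torch.backends.quantized.supported_engines:
    torch.backends.quantized.engine = 'qnnpack'

    out_ch, in_ch = 8, 16
    weight = torch.randn(out_ch, in_ch)
    w_scales = torch.rand(out_ch) * 0.1 + 0.01
    w_zps = torch.zeros(out_ch, dtype=torch.int64)
    w_q = torch.quantize_per_channel(weight, w_scales, w_zps, axis=0, dtype=torch.qint8)
    bias = torch.randn(out_ch)

    packed = torch.ops.quantized.linear_prepack(w_q, bias)

    x = torch.randn(2, in_ch)
    x_q = torch.quantize_per_tensor(x, scale=0.05, zero_point=128, dtype=torch.quint8)

    # Output scale and zero point are arbitrary choices here.
    y_q = torch.ops.quantized.linear(x_q, packed, 0.1, 64)
    y = y_q.dequantize()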
aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp (5 changes: 5 additions & 0 deletions)
@@ -280,6 +280,11 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
(uint8_t*)qnnp_w_data,
nullptr);
packB = w.get();
+ // Need to move the check here since we are releasing the weights.
+ TORCH_CHECK(
+     orig_weight.qscheme() == at::kPerTensorAffine,
+     "quantized::linear_dynamic (qnnpack) only supports "
+     "Per Tensor Quantization Scheme");
Comment on lines +281 to +285

Contributor:

I don't get this. Can the scheme change between when you call prepack and here? Also, I think I understand "move" in the context of this diff, but I don't think someone looking at the new version of the code in the future will understand what's moving here.

Contributor Author:
This will be removed by a later PR in the stack. I just split the PRs for ease of review and to make any future failures easier to dissect. I understand, though, that the comment is cryptic.
Regarding the qscheme changing: I don't think that is possible. Do you have a reason to believe it can?

if (at::globalContext().releaseWeightsWhenPrepacking()) {
// On mobile, we release the original weight by resetting the intrusive_ptr.
// Calling unpack after this will throw an assertion.
aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp (3 changes: 0 additions & 3 deletions)
@@ -119,9 +119,6 @@ c10::intrusive_ptr<LinearPackedParamsBase> PackedLinearWeightsQnnp::prepack(
TORCH_CHECK(
weight.dim() == 2,
"quantized::linear_prepack (qnnpack): Weight tensor rank should be == 2");
- TORCH_CHECK(
-     weight.qscheme() == c10::kPerTensorAffine,
-     "quantized::linear_prepack (qnnpack) only supports Per Tensor Quantization Scheme")

int64_t rows_w = weight.size(0);
at::Tensor bias_fp32;
test/quantization/test_quantized_functional.py (1 change: 0 additions & 1 deletion)
@@ -101,7 +101,6 @@ def test_conv2d_api(
if qengine == 'qnnpack':
if IS_PPC or TEST_WITH_UBSAN:
return
- use_channelwise = False

input_feature_map_size = (H, W)
kernel_size = (kernel_h, kernel_w)
test/quantization/test_quantized_module.py (2 changes: 0 additions & 2 deletions)
@@ -378,8 +378,6 @@ def test_conv2d_api(
stride = (stride_h, stride_w)
padding = (pad_h, pad_w)
dilation = (dilation, dilation)
- if torch.backends.quantized.engine == 'qnnpack':
-     use_channelwise = False
if use_fused:
module_name = "QuantizedConvReLU2d"
qconv_module = nnq_fused.ConvReLU2d(
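The module-level test above now keeps use_channelwise enabled for QNNPACK. As a rough end-to-end illustration (not the test's code), here is eager-mode static quantization with a per-channel weight observer under the QNNPACK engine; the toy model and observer choices are assumptions.

import torch
import torch.nn as nn
import torch.quantization as tq

if 'qnnpack' in torch.backends.quantized.supported_engines:
    torch.backends.quantized.engine = 'qnnpack'

    model = nn.Sequential(tq.QuantStub(), nn.Conv2d(3, 8, 3, padding=1), tq.DeQuantStub()).eval()
    model.qconfig = tq.QConfig(
        activation=tq.default_observer,
        weight=tq.default_per_channel_weight_observer)  # per-channel weight quantization
    tq.prepare(model, inplace=True)
    model(torch.randn(1, 3, 16, 16))   # calibration pass
    tq.convert(model, inplace=True)
    out = model(torch.randn(1, 3, 16, 16))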
test/quantization/test_quantized_op.py (8 changes: 0 additions & 8 deletions)
@@ -2064,7 +2064,6 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias,
use_relu, use_multi_dim_input, use_channelwise):
decimal_val = 4
if torch.backends.quantized.engine == 'qnnpack':
- use_channelwise = False
use_multi_dim_input = False
# QNNPACK supports uint8 in the kernels. In the op we shift the int8
# weight values to uint8 to be on par with fbgemm. However, this causes
@@ -2180,8 +2179,6 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias,
use_channelwise=st.booleans())
@override_qengines
def test_qlinear_unpack(self, W, use_channelwise):
- if torch.backends.quantized.engine == 'qnnpack':
-     use_channelwise = False

W, (W_scale, W_zp, torch_type) = W
if use_channelwise:
@@ -2435,9 +2432,6 @@ def test_qconv2d(
use_relu,
use_channelwise,
):
- if torch.backends.quantized.engine == 'qnnpack':
-     use_channelwise = False

input_channels = input_channels_per_group * groups
output_channels = output_channels_per_group * groups
kernels = (kernel_h, kernel_w)
@@ -2488,8 +2482,6 @@ def test_qconv_unpack(
def test_qconv_unpack(
self, inputs, stride_h, stride_w, pad_h, pad_w, channelwise
):
- if torch.backends.quantized.engine == 'qnnpack':
-     channelwise = False

qconv_prepack = torch.ops.quantized.conv2d_prepack
qconv_unpack = torch.ops.quantized.conv2d_unpack
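These tests also cover unpacking. A small sketch of the round trip they exercise: prepack a per-channel quantized conv weight, unpack it, and check that the per-channel scales survive. All shapes and values below are made up for illustration, and passing bias=None is an assumption about the prepack schema.

import torch

if 'qnnpack' in torch.backends.quantized.supported_engines:
    torch.backends.quantized.engine = 'qnnpack'

    weight = torch.randn(4, 3, 3, 3)
    scales = torch.rand(4) * 0.1 + 0.01
    zero_points = torch.zeros(4, dtype=torch.int64)
    w_q = torch.quantize_per_channel(weight, scales, zero_points, axis=0, dtype=torch.qint8)

    packed = torch.ops.quantized.conv2d_prepack(w_q, None, [1, 1], [0, 0], [1, 1], 1)
    w_unpacked, bias_unpacked = torch.ops.quantized.conv2d_unpack(packed)

    # The unpacked weight should carry the same per-channel scales as the input weight.
    assert torch.allclose(w_unpacked.q_per_channel_scales(), w_q.q_per_channel_scales())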