
Commit e79700f

Author: ilia-cher (committed)

Update on "Add HIP to the memory profiler device list"

Summary: Add HIP alongside CUDA
Test Plan: rocm CI
Differential Revision: [D21665627](https://our.internmc.facebook.com/intern/diff/D21665627)
[ghstack-poisoned]

2 parents 7461432 + 5b1814e; commit e79700f

74 files changed (+8300, -867 lines). Large commit: only a subset of the file diffs is shown below.

aten/src/ATen/native/quantized/cpu/qconv.cpp

Lines changed: 39 additions & 33 deletions
@@ -474,12 +474,7 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
       act.ndimension(), stride_.size(), padding_.size(), dilation_.size());

   auto* pack_w = w.get();
-  // Adjust weight zero point, similar to weight data.
-  auto kernel_zp = w_zp + 128;
-  const auto& kernel_scale = w_scale;

-  const uint32_t kernel_h = kernel[0];
-  const uint32_t kernel_w = kernel[1];
   // TODO Can be replaced with packB->getOutputChannels() when update pre-pack
   // to actually do the packing.
   const auto out_ch = bias.size(0);
@@ -490,14 +485,12 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
   const int W = act.size(3);
   const int M = out_ch; // output channels

-  const at::Tensor act_nhwc = act.contiguous(c10::MemoryFormat::ChannelsLast);
+  TORCH_CHECK( M == orig_weight.size(0),
+      "Output channel size of weight and bias must match.");
+  TORCH_CHECK( C == groups_ * orig_weight.size(1),
+      "Output channel size of weight and bias must match.");

-  const uint32_t stride_h = stride_[0];
-  const uint32_t stride_w = stride_[1];
-  const uint32_t pad_h = padding_[0];
-  const uint32_t pad_w = padding_[1];
-  const uint32_t dilation_h = dilation_[0];
-  const uint32_t dilation_w = dilation_[1];
+  const at::Tensor act_nhwc = act.contiguous(c10::MemoryFormat::ChannelsLast);

   auto output_min = kReluFused
       ? activationLimits(output_scale, output_zero_point, Activation::RELU)
@@ -507,20 +500,6 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
       ? activationLimits(output_scale, output_zero_point, Activation::RELU)
            .second
       : std::numeric_limits<uint8_t>::max();
-  qnnpack::conv_param_t conv_p(
-      {kernel_w, kernel_h},
-      {stride_w, stride_h},
-      {dilation_w, dilation_h},
-      {pad_h, pad_w, pad_h, pad_w},
-      /*adjustment=*/{0, 0},
-      groups_,
-      C,
-      M,
-      kernel_zp,
-      kernel_scale,
-      output_min,
-      output_max,
-      /*transpose=*/false);

   double act_input_scale = act_nhwc.q_scale();

@@ -532,27 +511,51 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
     auto bias_fp32 = bias;
     int8_t* w_data =
         reinterpret_cast<int8_t*>(weight_contig.template data_ptr<c10::qint8>());
+
+    float* weight_scales_data = w_scales.data_ptr<float>();
+    // We calculate requant scale here as the vector holding the requant scale
+    // is owned by this module. The pointer is then passed to qnnpack backend.
+    generate_requantization_scales(
+        w_scales, act_input_scale, output_scale, requantization_scales);
+
+    // TODO Kimish, we are allocating affine_quantized regardless of per channel or not.
+    // This allocation is actually used only for packing weight and thus will be freed.
+    // Still we should be consistent. Fix this.
     at::Tensor qnnp_weight = at::_empty_affine_quantized(
         weight_contig.sizes(),
         at::device(c10::kCPU)
             .dtype(c10::kQUInt8)
             .memory_format(c10::MemoryFormat::ChannelsLast),
-        kernel_scale,
-        kernel_zp,
+        weight_scales_data[0],
+        w_zero_points[0],
         c10::nullopt);
     auto* qnnp_w_data = qnnp_weight.template data_ptr<c10::quint8>();
     auto wt_numel = weight_contig.numel();
     for (int i = 0; i < wt_numel; ++i) {
       qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
     }
+    at::Tensor qbias;
     // Original bias was float, so we requantize it here.
-    auto qbias = at::quantize_per_tensor(
-        bias_fp32, kernel_scale * act_input_scale, 0, c10::kQInt32);
+    if (conv_p.per_channel) {
+      at::Tensor bias_quant_scales =
+          weight_contig.q_per_channel_scales() * act_input_scale;
+      at::Tensor bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt);
+      qbias = at::native::quantize_per_channel_cpu(
+          bias_fp32, bias_quant_scales, bias_zp, 0, c10::kQInt32);
+    } else {
+      qbias = at::native::quantize_per_tensor(
+          bias_fp32,
+          weight_contig.q_scale() * act_input_scale,
+          0,
+          c10::kQInt32);
+    }
+
     // Update the input scale to not pack again.
     input_scale = act_input_scale;
     w.reset();
     w = std::make_unique<qnnpack::PrePackConvWeights>(
         conv_p,
+        w_zero_points.data(),
         reinterpret_cast<uint8_t*>(qnnp_w_data),
         reinterpret_cast<int32_t*>(qbias.template data_ptr<c10::qint32>()));
     pack_w = w.get();
@@ -562,9 +565,10 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
       orig_weight.reset();
     }
   }
+
   TORCH_INTERNAL_ASSERT(pack_w != nullptr, "Packed Weights are NULL");
   const auto output_shape = MakeConvOutputShape<kSpatialDim>(
-      N, M, {H, W}, kernel, stride_, padding_, dilation_);
+      N, M, {H, W}, kernel_, stride_, padding_, dilation_);
   if (act_nhwc.numel() > 0) {
     TORCH_CHECK(
         std::all_of(
@@ -591,11 +595,13 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
       N,
       H,
       W,
-      act_nhwc.q_scale(),
       act_nhwc.q_zero_point(),
       reinterpret_cast<uint8_t*>(act_nhwc.template data_ptr<c10::quint8>()),
-      output.q_scale(),
+      w_zero_points.data(),
+      requantization_scales.data(),
       output.q_zero_point(),
+      output_min,
+      output_max,
       reinterpret_cast<uint8_t*>(output.template data_ptr<c10::quint8>()),
       caffe2::mobile_pthreadpool());
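
The conv path now fills a vector of per-output-channel requantization scales via generate_requantization_scales(w_scales, act_input_scale, output_scale, requantization_scales) and hands the raw pointer to QNNPACK instead of a single kernel_scale. As a rough illustration of the arithmetic such a helper is conventionally expected to perform (requant_scale[c] = w_scale[c] * act_scale / out_scale), here is a minimal standalone C++ sketch; the helper name and values below are illustrative assumptions, not the actual ATen implementation.

#include <cstdio>
#include <vector>

// Hypothetical standalone helper mirroring a conventional per-channel
// requantization-scale computation: the int32 accumulator is in units of
// w_scale[c] * act_scale, so dividing by the output scale maps it into the
// quantized output domain.
static void sketch_requant_scales(
    const std::vector<float>& weight_scales, // one scale per output channel
    float act_scale,                         // activation (input) scale
    float out_scale,                         // output tensor scale
    std::vector<float>& requant_scales) {    // buffer owned by the packed-weights module
  requant_scales.resize(weight_scales.size());
  for (size_t c = 0; c < weight_scales.size(); ++c) {
    requant_scales[c] = weight_scales[c] * act_scale / out_scale;
  }
}

int main() {
  std::vector<float> w_scales{0.02f, 0.05f, 0.01f};
  std::vector<float> requant;
  sketch_requant_scales(w_scales, /*act_scale=*/0.1f, /*out_scale=*/0.25f, requant);
  for (float s : requant) std::printf("%f\n", s); // 0.008000, 0.020000, 0.004000
  return 0;
}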

aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp

Lines changed: 9 additions & 30 deletions
@@ -172,11 +172,6 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<
       weight.ndimension() == 4,
       "quantized::conv2d_prepack (qnnpack): Weights are expected to have 4 "
       "dimensions");
-  const auto qtype = weight.qscheme();
-  TORCH_CHECK(
-      weight.qscheme() == c10::kPerTensorAffine,
-      "quantized::conv2d_prepack (qnnpack): only supports Per Tensor "
-      "Quantization Scheme")
   TORCH_CHECK(
       stride.size() == 2,
       "quantized::conv2d_prepack (qnnpack): 2D convolution only");
@@ -193,7 +188,6 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<
   // QNNPACK expects weights to be of the format {out_c, kH, kW, in_c/groups},
   // but PyTorch lays them out as {out_c, in_c/groups, kH, kW}
   const size_t out_ch = weight.size(0);
-  const size_t in_ch = weight.size(1) * groups;
   const uint32_t kernel_h = weight.size(2);
   const uint32_t kernel_w = weight.size(3);

@@ -203,6 +197,7 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<
   } else {
     bias_fp32 = at::zeros(out_ch, weight.options().dtype(at::kFloat));
   }
+
   TORCH_CHECK(
       !bias_fp32.defined() ||
           (bias_fp32.ndimension() == 1 && bias_fp32.size(0) == out_ch),
@@ -214,30 +209,13 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<
       bias_fp32.sizes(),
       " instead");

-  uint32_t stride_h = stride[0];
-  uint32_t stride_w = stride[1];
-  uint32_t pad_t = padding[0];
-  uint32_t pad_l = padding[1];
-  uint32_t dilation_h = dilation[0];
-  uint32_t dilation_w = dilation[1];
-
-  qnnpack::conv_param_t conv_p(
-      {kernel_w, kernel_h},
-      {stride_w, stride_h},
-      {dilation_w, dilation_h},
-      {pad_t, pad_l, pad_t, pad_l},
-      /*adjustment=*/{0, 0},
-      groups,
-      in_ch,
-      out_ch,
-      weight.q_zero_point(),
-      weight.q_scale(),
-      std::numeric_limits<uint8_t>::min(),
-      std::numeric_limits<uint8_t>::max(),
-      /*transpose=*/false);
-
   auto weight_contig = weight.contiguous(c10::MemoryFormat::ChannelsLast);
+  const bool is_per_channel = weight_contig.qscheme() == at::kPerChannelAffine;

+  std::vector<uint8_t> w_zero_points;
+  at::Tensor w_scales;
+  std::tie(w_zero_points, w_scales) =
+      make_zero_points_and_scales_tensor(weight_contig);
   // We set the pre-packed conv weights to nullptr below as we call pre-pack
   // during the first invocation of operator run. Refer to qconv.cpp for more
   // details. TODO Update to actually call pre-pack here once bias is removed
@@ -254,8 +232,9 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<
       groups,
       c10::nullopt, /* input_scale */
       {kernel_h, kernel_w},
-      static_cast<float>(weight.q_scale()),
-      static_cast<int32_t>(weight.q_zero_point())});
+      w_scales,
+      std::move(w_zero_points),
+      is_per_channel});

   return ret_ptr;
 }
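
Prepack no longer rejects per-channel weights; it records is_per_channel and collects one zero point and one scale per output channel through make_zero_points_and_scales_tensor(weight_contig). Below is a hedged, standalone sketch of what such a gathering step typically amounts to, assuming the same int8-to-uint8 zero-point shift of +128 that the apply path applies to the weight values; the function name and data layout are illustrative only, not the ATen helper.

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Illustrative stand-in: collect per-output-channel zero points and scales,
// shifting int8 zero points into the uint8 domain that QNNPACK consumes.
static std::pair<std::vector<uint8_t>, std::vector<float>>
sketch_zero_points_and_scales(
    const std::vector<int8_t>& zp_int8,  // per-channel int8 zero points
    const std::vector<float>& scales) {  // per-channel scales
  std::vector<uint8_t> zp_uint8(zp_int8.size());
  for (size_t c = 0; c < zp_int8.size(); ++c) {
    zp_uint8[c] = static_cast<uint8_t>(zp_int8[c] + 128); // same +128 shift as the weight data
  }
  return {std::move(zp_uint8), scales};
}

int main() {
  auto packed = sketch_zero_points_and_scales({-3, 0, 5}, {0.02f, 0.05f, 0.01f});
  for (size_t c = 0; c < packed.first.size(); ++c) {
    std::printf("channel %zu: zp=%u scale=%f\n",
                c, static_cast<unsigned>(packed.first[c]), packed.second[c]);
  }
  return 0;
}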

aten/src/ATen/native/quantized/cpu/qlinear.cpp

Lines changed: 29 additions & 15 deletions
@@ -236,9 +236,6 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl(
   auto input_contig = input.contiguous();

   auto packB = w.get();
-  // Adjust weight zero point, similar to weight data.
-  auto kernel_zp = w_zp + 128;
-  auto kernel_scale = w_scale;
   size_t rows_w = bias_.size(0);
   size_t cols_w = input_contig.size(input_contig.dim() - 1);
   auto input_scale = input_contig.q_scale();
@@ -249,29 +246,48 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl(
     auto weight_contig = orig_weight;
     auto bias_fp32 = bias_;
     int8_t* w_data = (int8_t*)weight_contig.data_ptr<c10::qint8>();
+
+    float* weight_scales_data = w_scales.data_ptr<float>();
+    // We calculate requant scale here as the vector holding the requant scale
+    // is owned by this module. The pointer is then passed to qnnpack backend.
+    generate_requantization_scales(
+        w_scales, input_scale, output_scale, requantization_scales);
+
     at::Tensor qnnp_weight = at::_empty_affine_quantized(
         weight_contig.sizes(),
         at::device(c10::kCPU).dtype(c10::kQUInt8),
-        kernel_scale,
-        kernel_zp);
+        weight_scales_data[0],
+        w_zero_points[0]);
     auto* qnnp_w_data = qnnp_weight.data_ptr<c10::quint8>();
     auto wt_numel = weight_contig.numel();
     for (int i = 0; i < wt_numel; ++i) {
       qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
     }
     // Original bias was float, so we requantize it here.
-    auto qbias = at::quantize_per_tensor(
-        bias_fp32, kernel_scale * input_scale, 0, c10::kQInt32);
+    const bool is_per_channel = orig_weight.qscheme() == at::kPerChannelAffine;
+    at::Tensor qbias;
+    // Original bias was float, so we requantize it here.
+    if (is_per_channel) {
+      at::Tensor bias_quant_scales =
+          weight_contig.q_per_channel_scales() * input_scale;
+      at::Tensor bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt);
+      qbias = at::native::quantize_per_channel_cpu(
+          bias_fp32, bias_quant_scales, bias_zp, 0, c10::kQInt32);
+    } else {
+      qbias = at::native::quantize_per_tensor(
+          bias_fp32, weight_contig.q_scale() * input_scale, 0, c10::kQInt32);
+    }
+
     // Update the input scale to not pack again.
     this->input_scale = input_scale;
     w.reset();
     w = std::make_unique<qnnpack::PackBMatrix>(
         cols_w /* input_channels */,
         rows_w /* output_channels */,
-        kernel_zp,
-        kernel_scale,
-        (uint8_t*)qnnp_w_data,
-        (int32_t*)qbias.data_ptr<c10::qint32>());
+        w_zero_points.data(),
+        requantization_scales.data(),
+        reinterpret_cast<uint8_t*>(qnnp_w_data),
+        reinterpret_cast<int32_t*>(qbias.data_ptr<c10::qint32>()));
     packB = w.get();
     if (at::globalContext().releaseWeightsWhenPrepacking()) {
       // On mobile, we release the original weight by resetting the intrusive_ptr.
@@ -315,11 +331,9 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl(
       cols_input /* input_channels */,
       rows_w /* output_channels */,
       input_contig.q_zero_point(),
-      input_contig.q_scale(),
-      kernel_zp,
-      kernel_scale,
+      w_zero_points.data(),
+      requantization_scales.data(),
       output_zero_point,
-      output_scale,
       output_min,
       output_max,
       (uint8_t*)input_contig.data_ptr<c10::quint8>(),
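
The linear path mirrors the conv change: when the weight is per-channel affine, the fp32 bias is quantized per channel with scale w_scale[c] * input_scale and zero point 0, so the resulting int32 values live in the same units as the int32 GEMM accumulator. A minimal sketch of that arithmetic on plain arrays follows; names are illustrative and this is not the ATen quantize_per_channel_cpu implementation.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch of per-channel bias requantization: qbias[c] is bias_fp32[c]
// expressed in accumulator units (w_scale[c] * input_scale), zero point 0.
static std::vector<int32_t> sketch_quantize_bias(
    const std::vector<float>& bias_fp32,
    const std::vector<float>& w_scales,
    float input_scale) {
  std::vector<int32_t> qbias(bias_fp32.size());
  for (size_t c = 0; c < bias_fp32.size(); ++c) {
    qbias[c] = static_cast<int32_t>(
        std::lround(bias_fp32[c] / (w_scales[c] * input_scale)));
  }
  return qbias;
}

int main() {
  auto qbias = sketch_quantize_bias({0.5f, -0.25f}, {0.02f, 0.05f}, /*input_scale=*/0.1f);
  for (int32_t b : qbias) std::printf("%d\n", b); // 250, -50
  return 0;
}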

aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp

Lines changed: 22 additions & 13 deletions
@@ -227,9 +227,6 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
   // matrices, respectively.

   auto packB = w.get();
-  // Adjust weight zero point, similar to weight data.
-  auto kernel_zp = w_zp + 128;
-  auto kernel_scale = w_scale;
   size_t rows_w = bias_.size(0);
   size_t cols_w = input_contig.size(input_contig.dim() - 1);

@@ -250,30 +247,38 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
       /*max=*/x_max,
       /*qmin=*/0,
       /*qmax=*/255);
+  float* weight_scales_data = w_scales.data_ptr<float>();
+  if (!input_scale.has_value() || input_scale.value() != q_params.scale) {
+    generate_requantization_scales(
+        w_scales, q_params.scale, 1.f, requantization_scales);
+  }
+
   if (!input_scale.has_value()) {
     // Get the original weight and adjust it to uint8 from int8
     auto weight_contig = orig_weight;
-    int8_t* w_data = (int8_t*)weight_contig.data_ptr<c10::qint8>();
+
+    // TODO(kimishpatel), we are allocating affine_quantized regardless of per channel or not.
+    // This allocation is actually used only for packing weight and thus will be freed.
+    // Still we should be consistent. Fix this.
     Tensor qnnp_weight = at::_empty_affine_quantized(
         weight_contig.sizes(),
         at::device(c10::kCPU).dtype(c10::kQUInt8),
-        kernel_scale,
-        kernel_zp);
+        weight_scales_data[0],
+        w_zero_points[0]);
     auto* qnnp_w_data = qnnp_weight.data_ptr<c10::quint8>();
+    int8_t* w_data = (int8_t*)weight_contig.data_ptr<c10::qint8>();
     auto wt_numel = weight_contig.numel();
     for (int i = 0; i < wt_numel; ++i) {
       qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
     }

-    // Update the input scale to not pack again.
     // Pass in nullptr for bias, as we pass FP32 bias to run function.
-    input_scale = q_params.scale;
     w.reset();
     w = std::make_unique<qnnpack::PackBMatrix>(
         cols_w /* input_channels */,
         rows_w /* output_channels */,
-        kernel_zp,
-        kernel_scale,
+        w_zero_points.data(),
+        requantization_scales.data(),
         (uint8_t*)qnnp_w_data,
         nullptr);
     packB = w.get();
@@ -284,6 +289,10 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
     }
   }

+  // Update the input scale to not pack weights again.
+  // as well as to avoid repopulating requant scale if scale has not changed.
+  input_scale = q_params.scale;
+
   // Quantize input
   Tensor q_input = at::quantize_per_tensor(
       input_contig, q_params.scale, q_params.zero_point, c10::kQUInt8);
@@ -307,9 +316,9 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
       cols_input /* input_channels */,
       rows_w /* output_channels */,
       q_input.q_zero_point(),
-      q_input.q_scale(),
-      kernel_zp,
-      kernel_scale,
+      w_zero_points.data(),
+      /* for dynamic should really be called dequant scale */
+      requantization_scales.data(),
       (uint8_t*)q_input.data_ptr<c10::quint8>(),
       cols_input /* input_stride */,
       packB->getPackedWeights(),
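
For dynamic linear the output stays fp32, so generate_requantization_scales is invoked with an output scale of 1.f; as the inline comment in the diff notes, the resulting values act as dequantization scales. Below is a small, hedged worked example with assumed scale values (not taken from the commit) showing how an int32 accumulator would then be mapped back to fp32, with the unpacked fp32 bias added afterwards.

#include <cstdint>
#include <cstdio>

int main() {
  // Assumed example values, illustrative only.
  const float w_scale = 0.02f;     // per-channel weight scale
  const float input_scale = 0.1f;  // runtime-chosen activation scale
  const float output_scale = 1.0f; // dynamic path: output stays fp32
  const float dequant = w_scale * input_scale / output_scale; // 0.002
  const int32_t acc = 1234;        // int32 accumulator from the GEMM
  const float bias = 0.5f;         // fp32 bias, passed separately (bias ptr is nullptr when packing)
  std::printf("%f\n", acc * dequant + bias); // 2.968000
  return 0;
}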
