From f7ebb5822a593d1126c0dd973b63a086c45eb557 Mon Sep 17 00:00:00 2001
From: Rahul Chandra
Date: Wed, 3 Dec 2025 15:12:26 -0800
Subject: [PATCH] Using generic implementation for 16-bit activations and
 8-bit weights for Conv2D in backends (#16007)

Summary:
# Context

We continue from D84284794 to add support for 16-bit activations. Note that although the operators already support 16-bit activations, they only do so when the weights are also 16-bit. To support 16-bit activations with 8-bit weights (W8A16), we need to change the way some of these functions are templated.

# Current Behavior

Right now, we compose two macros: the `ET_FORALL_JARVIS_QUANTIZED_TYPES_WITH_INT16` macro:

https://www.internalfb.com/code/fbsource/[9e8c6d8466107f58aa3de1b9e4ec71c49d670a8f]/fbcode/on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/operators.h?lines=22-25

and the per-operator function macro (`quantized_linear` chosen as an example):

https://www.internalfb.com/code/fbsource/[9e8c6d8466107f58aa3de1b9e4ec71c49d670a8f]/fbcode/on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/quantized_linear_out.cpp?lines=30-41

Together, they expand into a switch statement that calls `quantized_linear` with the matching template parameter. Note, however, that this assumes the input activations and the weights share the same dtype, which does not hold for W8A16; see the sketch below.
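To illustrate, here is a minimal, self-contained sketch of that composition. All names, types, and signatures below are simplified stand-ins for the internal definitions linked above, not the real code:

```
// Simplified stand-ins for Tensor/ScalarType; illustrative only.
#include <cstdint>
#include <cstdio>

enum class ScalarType { Byte, Char, Short };

struct Tensor {
  ScalarType dtype;
  ScalarType scalar_type() const { return dtype; }
};

// The kernel is templated on a single ctype, so activations and weights
// are forced to share it.
template <typename T>
void quantized_linear_(const Tensor& in, const Tensor& weight, Tensor& out) {
  std::printf("kernel with %zu-byte ctype\n", sizeof(T));
}

// One (ctype, dtype) pair per supported quantized type.
#define ET_FORALL_JARVIS_QUANTIZED_TYPES_WITH_INT16(_) \
  _(uint8_t, Byte)                                     \
  _(int8_t, Char)                                      \
  _(int16_t, Short)

// The per-operator macro expands to one switch case per pair, so the
// composition is just a switch over a single scalar type.
void quantized_linear(const Tensor& in, const Tensor& weight, Tensor& out) {
#define typed_quantized_linear(ctype, dtype)   \
  case ScalarType::dtype:                      \
    quantized_linear_<ctype>(in, weight, out); \
    break;

  switch (out.scalar_type()) {
    ET_FORALL_JARVIS_QUANTIZED_TYPES_WITH_INT16(typed_quantized_linear)
  }
#undef typed_quantized_linear
}

int main() {
  Tensor in{ScalarType::Short}, weight{ScalarType::Short}, out{ScalarType::Short};
  quantized_linear(in, weight, out); // selects the int16_t kernel for all three
}
```

Because the switch keys on a single dtype, there is no case that pairs int16 activations with int8 weights.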
# This Diff

We finish by using the generic implementation in all the backends, and we add e2e tests as well as unit tests. Concretely, each conv2d out-variant now routes the heterogeneous W8A16 case to the generic kernel before the existing homogeneous dispatch runs, as sketched below.
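Continuing the sketch above (reusing its illustrative `Tensor`/`ScalarType` stubs; this shows the shape of the change, not the exact code), each out-variant gains an early-return guard:

```
// Reuses the Tensor/ScalarType stubs from the previous sketch.
void quantized_conv2d_out(const Tensor& input, const Tensor& weight, Tensor& out) {
  // New: W8A16 (int16_t activations, int8_t weights) goes to the generic
  // implementation and returns early.
  if (out.scalar_type() == ScalarType::Short &&
      input.scalar_type() == ScalarType::Short &&
      weight.scalar_type() == ScalarType::Char) {
    // ::impl::generic::native::quantized_conv2d_..._out(...);
    return;
  }
  // The existing homogeneous dispatch runs unchanged for all other cases.
}
```

The actual hunks below apply this guard to the NCHW and NHWC variants, in both the per-channel and per-tensor overloads.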
Reviewed By: hsharma35

Differential Revision: D87993325
---
 backends/cadence/aot/quantizer/quantizer.py   |  14 +
 .../op_quantized_conv2d_nchw_out.cpp          |  49 +++
 .../op_quantized_conv2d_nhwc_out.cpp          |  53 ++-
 backends/cadence/hifi/operators/targets.bzl   |   6 +-
 .../tests/test_op_quantized_conv2d_out.cpp    | 304 ++++++++++++++++++
 5 files changed, 421 insertions(+), 5 deletions(-)
 create mode 100644 backends/cadence/hifi/operators/tests/test_op_quantized_conv2d_out.cpp

diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py
index 70b16b86fda..7dac4049feb 100644
--- a/backends/cadence/aot/quantizer/quantizer.py
+++ b/backends/cadence/aot/quantizer/quantizer.py
@@ -372,3 +372,17 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
         # Add 16-bit quantizers for LinearPattern
         quantizers.append(CadenceAtenQuantizer(LinearPattern(), qconfig_A16))
         super().__init__(quantizers)
+
+
+class CadenceWith16BitConvActivationsQuantizer(CadenceQuantizer):
+    """
+    Quantizer including A16 conv
+    """
+
+    def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
+        if quantizers is None:
+            quantizers = []
+        # Add 16-bit quantizers for Conv patterns
+        quantizers.append(CadenceAtenQuantizer(Conv1dPattern(), qconfig_A16))
+        quantizers.append(CadenceAtenQuantizer(Conv2dPattern(), qconfig_A16))
+        super().__init__(quantizers)

diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp
index 984747d9316..fdc2c9ad5dc 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include

 #define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1)))

@@ -532,6 +533,30 @@ void quantized_conv2d_nchw_out(
     __ET_UNUSED const Tensor& out_multiplier,
     __ET_UNUSED const Tensor& out_shift,
     Tensor& out) {
+  // Handle W8A16 heterogeneous type (int16_t activations, int8_t weights)
+  if (out.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      input.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      weight.scalar_type() == ::executorch::aten::ScalarType::Char) {
+    ::impl::generic::native::quantized_conv2d_nchw_out(
+        ctx,
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        dilation,
+        groups,
+        in_zero_point,
+        weight_zero_point,
+        bias_scale,
+        output_scale,
+        output_zero_point,
+        out_multiplier,
+        out_shift,
+        out);
+    return;
+  }
+
   const float bias_scale_float = bias_scale.const_data_ptr<float>()[0];
   const int32_t weight_zero_point_int =
       weight_zero_point.const_data_ptr<int32_t>()[0];

@@ -596,6 +621,30 @@ void quantized_conv2d_nchw_per_tensor_out(
     __ET_UNUSED int64_t out_multiplier,
     __ET_UNUSED int64_t out_shift,
     Tensor& out) {
+  // Handle W8A16 heterogeneous type (int16_t activations, int8_t weights)
+  if (out.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      input.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      weight.scalar_type() == ::executorch::aten::ScalarType::Char) {
+    ::impl::generic::native::quantized_conv2d_nchw_per_tensor_out(
+        ctx,
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        dilation,
+        groups,
+        in_zero_point,
+        weight_zero_point,
+        bias_scale,
+        output_scale,
+        output_zero_point,
+        out_multiplier,
+        out_shift,
+        out);
+    return;
+  }
+
   bool optimized = 0;

   if ((input.scalar_type() == ScalarType::Char) ||
diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
index a5d503853c4..1b4870d5f1b 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include

 #define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1)))

@@ -435,9 +436,32 @@ void quantized_conv2d_nhwc_out(
     const Tensor& bias_scale,
     double output_scale,
     int64_t output_zero_point,
-    __ET_UNUSED const Tensor& out_multiplier,
-    __ET_UNUSED const Tensor& out_shift,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
     Tensor& out) {
+  // Handle W8A16 heterogeneous type (int16_t activations, int8_t weights)
+  if (out.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      input.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      weight.scalar_type() == ::executorch::aten::ScalarType::Char) {
+    ::impl::generic::native::quantized_conv2d_nhwc_out(
+        ctx,
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        dilation,
+        groups,
+        in_zero_point,
+        weight_zero_point,
+        bias_scale,
+        output_scale,
+        output_zero_point,
+        out_multiplier,
+        out_shift,
+        out);
+    return;
+  }

   const float bias_scale_float = bias_scale.const_data_ptr<float>()[0];
   const int32_t weight_zero_point_int =
       weight_zero_point.const_data_ptr<int32_t>()[0];

@@ -502,8 +526,31 @@ void quantized_conv2d_nhwc_per_tensor_out(
     __ET_UNUSED int64_t out_multiplier,
     __ET_UNUSED int64_t out_shift,
     Tensor& out) {
-  bool optimized = 0;
+  // Handle W8A16 heterogeneous type (int16_t activations, int8_t weights)
+  if (out.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      input.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      weight.scalar_type() == ::executorch::aten::ScalarType::Char) {
+    ::impl::generic::native::quantized_conv2d_nhwc_per_tensor_out(
+        ctx,
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        dilation,
+        groups,
+        in_zero_point,
+        weight_zero_point,
+        bias_scale,
+        output_scale,
+        output_zero_point,
+        out_multiplier,
+        out_shift,
+        out);
+    return;
+  }
+
+  bool optimized = 0;

   if ((input.scalar_type() == ScalarType::Char) ||
       (input.scalar_type() == ScalarType::Byte))
     optimized = 1;

diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl
index ca36347da30..c993745c4c0 100644
--- a/backends/cadence/hifi/operators/targets.bzl
+++ b/backends/cadence/hifi/operators/targets.bzl
@@ -65,7 +65,6 @@ OPERATORS = [
     "ne",
     "permute_copy",
     "pow",
-    "quantized_conv2d_nchw_out",
     "quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out",
     "quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out",
     "quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out",
@@ -74,7 +73,6 @@ OPERATORS = [
     "quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out",
     "quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out",
     "quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out",
-    "quantized_conv2d_nhwc_out",
     "quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out",
     "quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out",
     "quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out",
@@ -125,3 +123,7 @@ def define_common_targets():
     # quantized_linear_out and quantized_linear_per_tensor_out needs additional dependency for int16 support
     define_operator("quantized_linear_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_linear"])
     define_operator("quantized_linear_per_tensor_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_linear"])
+
+    # quantized_conv2d_nchw_out and quantized_conv2d_nhwc_out need additional dependency for int16 support
+    define_operator("quantized_conv2d_nchw_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_conv2d"])
+    define_operator("quantized_conv2d_nhwc_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_conv2d"])

diff --git a/backends/cadence/hifi/operators/tests/test_op_quantized_conv2d_out.cpp b/backends/cadence/hifi/operators/tests/test_op_quantized_conv2d_out.cpp
new file mode 100644
index 00000000000..70afc030b4c
--- /dev/null
+++ b/backends/cadence/hifi/operators/tests/test_op_quantized_conv2d_out.cpp
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace impl {
+namespace HiFi {
+namespace native {
+namespace {
+
+using ::executorch::aten::Scalar;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::aten::TensorImpl;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::executorch::runtime::runtime_init;
+using ::executorch::runtime::testing::TensorFactory;
+
+class HiFiQuantizedConv2dTest : public OperatorTest {
+ protected:
+  void quantized_conv2d_nchw_out(
+      const Tensor& input,
+      const Tensor& weight,
+      const Tensor& bias,
+      ::executorch::aten::IntArrayRef stride,
+      ::executorch::aten::IntArrayRef padding,
+      ::executorch::aten::IntArrayRef dilation,
+      int64_t groups,
+      int64_t in_zero_point,
+      const Tensor& weight_zero_point,
+      const Tensor& bias_scale,
+      double output_scale,
+      int64_t output_zero_point,
+      const Tensor& out_multiplier,
+      const Tensor& out_shift,
+      Tensor& output) {
+    return ::impl::HiFi::native::quantized_conv2d_nchw_out(
+        context_,
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        dilation,
+        groups,
+        in_zero_point,
+        weight_zero_point,
+        bias_scale,
+        output_scale,
+        output_zero_point,
+        out_multiplier,
+        out_shift,
+        output);
+  }
+
+  void quantized_conv2d_nhwc_out(
+      const Tensor& input,
+      const Tensor& weight,
+      const Tensor& bias,
+      ::executorch::aten::IntArrayRef stride,
+      ::executorch::aten::IntArrayRef padding,
+      ::executorch::aten::IntArrayRef dilation,
+      int64_t groups,
+      int64_t in_zero_point,
+      const Tensor& weight_zero_point,
+      const Tensor& bias_scale,
+      double output_scale,
+      int64_t output_zero_point,
+      const Tensor& out_multiplier,
+      const Tensor& out_shift,
+      Tensor& output) {
+    return ::impl::HiFi::native::quantized_conv2d_nhwc_out(
+        context_,
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        dilation,
+        groups,
+        in_zero_point,
+        weight_zero_point,
+        bias_scale,
+        output_scale,
+        output_zero_point,
+        out_multiplier,
+        out_shift,
+        output);
+  }
+};
+// Test quantized_conv2d_nchw_out with int16 activations and int8 weights
+TEST_F(HiFiQuantizedConv2dTest, QuantizedConv2dNchwInt16Test) {
+  TensorFactory<ScalarType::Short> tf_int16;
+  TensorFactory<ScalarType::Int> tf_int32;
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+
+  // Minimal test case: input [1, 2, 3, 3], kernel [1, 2, 2, 2] ->
+  // output [1, 1, 2, 2]. Small enough to verify by hand calculation.
+  //
+  // Input Channel 0 (3x3):    Input Channel 1 (3x3):
+  //   1 2 3                     1 1 1
+  //   4 5 6                     1 1 1
+  //   7 8 9                     1 1 1
+  //
+  // Weight Out Ch 0, In Ch 0:  Weight Out Ch 0, In Ch 1:
+  //   1 0                        1 1
+  //   0 1                        1 1
+  //
+  // Hand calculation for each output position:
+  //   (0,0): Ch0: 1*1+2*0+4*0+5*1=6,  Ch1: 1*1+1*1+1*1+1*1=4 -> 10
+  //   (0,1): Ch0: 2*1+3*0+5*0+6*1=8,  Ch1: 1*1+1*1+1*1+1*1=4 -> 12
+  //   (1,0): Ch0: 4*1+5*0+7*0+8*1=12, Ch1: 1*1+1*1+1*1+1*1=4 -> 16
+  //   (1,1): Ch0: 5*1+6*0+8*0+9*1=14, Ch1: 1*1+1*1+1*1+1*1=4 -> 18
+  Tensor input = tf_int16.make(
+      {1, 2, 3, 3},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, // Channel 0
+       1, 1, 1, 1, 1, 1, 1, 1, 1}); // Channel 1
+  Tensor weight = tf_int8.make(
+      {1, 2, 2, 2},
+      {1, 0, 0, 1, // Out Ch 0, In Ch 0: diagonal pattern
+       1, 1, 1, 1}); // Out Ch 0, In Ch 1: all ones
+  Tensor bias = tf_int32.zeros({1});
+
+  // Output dimensions: (3-2)/1+1=2 for each spatial dimension
+  Tensor output = tf_int16.zeros({1, 1, 2, 2});
+
+  int64_t in_zero_point = 0;
+  Tensor weight_zero_point = tf_int32.make({1}, {0});
+  Tensor bias_scale = tf_float.make({1}, {1.0f});
+  double output_scale = 1.0;
+  int64_t output_zero_point = 0;
+  Tensor out_multiplier = tf_int32.make({1}, {1073741824}); // 0.5 * 2^31
+  Tensor out_shift = tf_int32.make({1}, {0});
+
+  std::array<int64_t, 2> stride_arr = {1, 1};
+  std::array<int64_t, 2> padding_arr = {0, 0};
+  std::array<int64_t, 2> dilation_arr = {1, 1};
+
+  ::executorch::aten::ArrayRef<int64_t> stride(stride_arr.data(), 2);
+  ::executorch::aten::ArrayRef<int64_t> padding(padding_arr.data(), 2);
+  ::executorch::aten::ArrayRef<int64_t> dilation(dilation_arr.data(), 2);
+
+  quantized_conv2d_nchw_out(
+      input,
+      weight,
+      bias,
+      stride,
+      padding,
+      dilation,
+      1, // groups
+      in_zero_point,
+      weight_zero_point,
+      bias_scale,
+      output_scale,
+      output_zero_point,
+      out_multiplier,
+      out_shift,
+      output);
+
+  Tensor expected = tf_int16.make({1, 1, 2, 2}, {10, 12, 16, 18});
+  EXPECT_TENSOR_EQ(output, expected);
+}
+
+// Test quantized_conv2d_nhwc_out with int16 activations and int8 weights
+TEST_F(HiFiQuantizedConv2dTest, QuantizedConv2dNhwcInt16Test) {
+  TensorFactory<ScalarType::Short> tf_int16;
+  TensorFactory<ScalarType::Int> tf_int32;
+  TensorFactory<ScalarType::Char> tf_int8;
+  TensorFactory<ScalarType::Float> tf_float;
+
+  // Minimal test case in NHWC format: input [1, 3, 3, 2], kernel
+  // [1, 2, 2, 2] -> output [1, 2, 2, 1]. Same values as the NCHW test,
+  // just a different layout.
+  //
+  // Input (H=3, W=3, C=2):
+  //   Position (h,w): [Ch0, Ch1]
+  //   (0,0): [1, 1]  (0,1): [2, 1]  (0,2): [3, 1]
+  //   (1,0): [4, 1]  (1,1): [5, 1]  (1,2): [6, 1]
+  //   (2,0): [7, 1]  (2,1): [8, 1]  (2,2): [9, 1]
+  //
+  // Weight (Out=1, H=2, W=2, In=2):
+  //   For output channel 0:
+  //   Position (h,w): [In0, In1]
+  //   (0,0): [1, 1]  (0,1): [0, 1]
+  //   (1,0): [0, 1]  (1,1): [1, 1]
+  //
+  // Hand calculation matches the NCHW test:
+  //   Output (0,0): 10, (0,1): 12, (1,0): 16, (1,1): 18
+  Tensor input = tf_int16.make(
+      {1, 3, 3, 2},
+      {1, 1, 2, 1, 3, 1, // Row 0: (Ch0, Ch1) pairs
+       4, 1, 5, 1, 6, 1, // Row 1
+       7, 1, 8, 1, 9, 1}); // Row 2
+  Tensor weight = tf_int8.make(
+      {1, 2, 2, 2},
+      {1, 1, 0, 1, // Row 0: (In0, In1) pairs
+       0, 1, 1, 1}); // Row 1
+  Tensor bias = tf_int32.zeros({1});
+
+  // Output dimensions: (3-2)/1+1=2 for each spatial dimension
+  Tensor output = tf_int16.zeros({1, 2, 2, 1});
+
+  int64_t in_zero_point = 0;
+  Tensor weight_zero_point = tf_int32.make({1}, {0});
+  Tensor bias_scale = tf_float.make({1}, {1.0f});
+  double output_scale = 1.0;
+  int64_t output_zero_point = 0;
+  Tensor out_multiplier = tf_int32.make({1}, {1073741824}); // 0.5 * 2^31
+  Tensor out_shift = tf_int32.make({1}, {0});
+
+  std::array<int64_t, 2> stride_arr = {1, 1};
+  std::array<int64_t, 2> padding_arr = {0, 0};
+  std::array<int64_t, 2> dilation_arr = {1, 1};
+
+  ::executorch::aten::ArrayRef<int64_t> stride(stride_arr.data(), 2);
+  ::executorch::aten::ArrayRef<int64_t> padding(padding_arr.data(), 2);
+  ::executorch::aten::ArrayRef<int64_t> dilation(dilation_arr.data(), 2);
+
+  quantized_conv2d_nhwc_out(
+      input,
+      weight,
+      bias,
+      stride,
+      padding,
+      dilation,
+      1, // groups
+      in_zero_point,
+      weight_zero_point,
+      bias_scale,
+      output_scale,
+      output_zero_point,
+      out_multiplier,
+      out_shift,
+      output);
+
+  Tensor expected = tf_int16.make({1, 2, 2, 1}, {10, 12, 16, 18});
+  EXPECT_TENSOR_EQ(output, expected);
+}
+
+} // namespace
+} // namespace native
+} // namespace HiFi
+} // namespace impl