From a687c43131a3c703031d51dc624f508bf3b4b3e7 Mon Sep 17 00:00:00 2001 From: Abdurrahman Akkas Date: Tue, 21 Apr 2026 13:51:07 -0700 Subject: [PATCH] Precompute T1 offset for quantized conv2d NHWC in TIE kernel (#18960) Summary: Move the zero-point correction term `t1[oc] = -input_zero_point * sum(weight[oc])` from runtime (malloc + compute_t1_..._DWH + free per inference) to compile time via a new PrecomputeForQuantizedConvPass, mirroring the existing linear pass. The precomputed offset is threaded through a new optional "offset" parameter on cadence::quantized_conv2d_nhwc.per_tensor (defaults to None for backwards compatibility). The now-dead compute_t1_..._DWH functions are removed. The TIE kernels assume the existence of the offset parameter similar to quantized_linear case. Differential Revision: D100690813 --- backends/cadence/aot/functions.yaml | 2 +- backends/cadence/aot/functions_hifi.yaml | 2 +- backends/cadence/aot/ops_registrations.py | 5 +++-- backends/cadence/aot/ref_implementations.py | 1 + backends/cadence/generic/operators/op_quantized_conv2d.cpp | 2 ++ backends/cadence/generic/operators/op_quantized_conv2d.h | 1 + .../cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp | 4 ++++ backends/cadence/hifi/operators/operators.h | 1 + backends/cadence/vision/operators/op_quantized_conv_out.cpp | 1 + 9 files changed, 15 insertions(+), 4 deletions(-) diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index 4f557f210a9..2eed2f4c486 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -384,7 +384,7 @@ - arg_meta: null kernel_name: impl::generic::quantized_conv2d_nchw_per_tensor_out -- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, 
Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null kernel_name: impl::generic::quantized_conv2d_nhwc_per_tensor_out diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index a4f3a7347bb..3b1932d01ec 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -395,7 +395,7 @@ - arg_meta: null kernel_name: impl::HiFi::quantized_conv2d_nchw_per_tensor_out -- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null kernel_name: impl::HiFi::quantized_conv2d_nhwc_per_tensor_out diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index 269d5b4236e..0effaf3e029 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -233,10 +233,10 @@ def register_fake( "quantized_conv2d_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define( - "quantized_conv2d_nhwc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None) -> (Tensor Z)" ) lib.define( - "quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( "quantized_conv1d_ncl(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" @@ -1444,6 +1444,7 @@ def quantized_conv2d_nhwc_per_tensor_meta( output_zero_point: int, out_multiplier: int, out_shift: int, + offset: Optional[torch.Tensor] = None, ) -> torch.Tensor: in_size = input.shape # Assert that the input tensor has at least 3 dimensions, and at most 6 diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py index f7020979612..32558166fbf 100644 --- a/backends/cadence/aot/ref_implementations.py +++ b/backends/cadence/aot/ref_implementations.py @@ -1449,6 +1449,7 @@ def quantized_conv2d_nhwc_per_tensor( output_zero_point: int, out_multiplier: int, out_shift: int, + offset: torch.Tensor | None = None, ) -> torch.Tensor: """ Quantized convolution operation. 
diff --git a/backends/cadence/generic/operators/op_quantized_conv2d.cpp b/backends/cadence/generic/operators/op_quantized_conv2d.cpp index d12df8a46b4..8cf24015893 100644 --- a/backends/cadence/generic/operators/op_quantized_conv2d.cpp +++ b/backends/cadence/generic/operators/op_quantized_conv2d.cpp @@ -16,6 +16,7 @@ namespace impl { namespace generic { namespace native { +using ::executorch::aten::optional; using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; @@ -935,6 +936,7 @@ Tensor& quantized_conv2d_nhwc_per_tensor_out( int64_t output_zero_point, ET_UNUSED int64_t out_multiplier, ET_UNUSED int64_t out_shift, + ET_UNUSED const ::executorch::aten::optional<Tensor>& offset, Tensor& out) { quantized_conv2d_nhwc( input, diff --git a/backends/cadence/generic/operators/op_quantized_conv2d.h b/backends/cadence/generic/operators/op_quantized_conv2d.h index 07678b0600c..00cf62eba70 100644 --- a/backends/cadence/generic/operators/op_quantized_conv2d.h +++ b/backends/cadence/generic/operators/op_quantized_conv2d.h @@ -205,6 +205,7 @@ ::executorch::aten::Tensor& quantized_conv2d_nhwc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, + const ::executorch::aten::optional<Tensor>& offset, Tensor& out); ::executorch::aten::Tensor& diff --git a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp index c5ebac73b4b..ea3a756f995 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp @@ -17,6 +17,7 @@ using Tensor = executorch::aten::Tensor; using KernelRuntimeContext = torch::executor::KernelRuntimeContext; using ScalarType = executorch::aten::ScalarType; using ::executorch::aten::IntArrayRef; +using ::executorch::aten::optional; namespace impl { namespace HiFi { @@ -378,6 +379,7 @@ void xa_opt_quantized_conv2d_nhwc( 
output_zero_point, 0, // out_multiplier (unused) 0, // out_shift (unused) + optional<Tensor>(), // offset (unused) out); } @@ -568,6 +570,7 @@ void quantized_conv2d_nhwc_per_tensor_out( int64_t output_zero_point, __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, + const optional<Tensor>& offset, Tensor& out) { // Handle W8A16 heterogeneous type (int16_t activations, int8_t weights) if (out.scalar_type() == ::executorch::aten::ScalarType::Short && @@ -589,6 +592,7 @@ void quantized_conv2d_nhwc_per_tensor_out( output_zero_point, out_multiplier, out_shift, + offset, out); return; } diff --git a/backends/cadence/hifi/operators/operators.h b/backends/cadence/hifi/operators/operators.h index 6adc027fed9..3ca505d40cb 100644 --- a/backends/cadence/hifi/operators/operators.h +++ b/backends/cadence/hifi/operators/operators.h @@ -158,6 +158,7 @@ void quantized_conv2d_nhwc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, + const ::executorch::aten::optional<::executorch::aten::Tensor>& offset, ::executorch::aten::Tensor& out); ::executorch::aten::Tensor& cat_out( diff --git a/backends/cadence/vision/operators/op_quantized_conv_out.cpp b/backends/cadence/vision/operators/op_quantized_conv_out.cpp index b632f0931c2..be4b34bff03 100644 --- a/backends/cadence/vision/operators/op_quantized_conv_out.cpp +++ b/backends/cadence/vision/operators/op_quantized_conv_out.cpp @@ -582,6 +582,7 @@ void quantized_conv2d_nhwc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, + ET_UNUSED const ::executorch::aten::optional<Tensor>& offset, Tensor& out) { quantized_conv_per_tensor_out( ctx,