From a3057697680f387be8508b3062bf18bef8b7876e Mon Sep 17 00:00:00 2001
From: RJ Ascani
Date: Fri, 20 Mar 2026 17:53:48 -0700
Subject: [PATCH 1/4] Cortex-M: Fuse relu activation into quantized_add

ResNet8 has skip connections of the form relu(add(conv(x), skip(x))).
The ActivationFusionPass only fused relu into conv/linear, leaving 3
unfused relu ops that fell through to the portable aten::relu.out
kernel, which incorrectly clamps int8 tensors at a literal 0 instead of
at the quantized zero_point, causing numerical mismatches on the FVP.

Add fused activation patterns (relu, hardtanh, clamp) for add/add_ to
quantizer_support.py BINARY_OP_PATTERNS so the quantizer produces
activation-aware quantization bounds. Add aten.add.Tensor to
ActivationFusionPass FUSE_OPS. Update QuantizedOpFusionPass to read the
activation bounds from output_qparams and pass them to quantized_add.
Update the quantized_add operator (schema, meta, impl, C++) to accept
activation_min/activation_max parameters.

Co-authored-by: Claude
---
 backends/cortex_m/ops/op_quantized_add.cpp           | 10 ++++++----
 backends/cortex_m/ops/operators.py                   | 11 ++++++++---
 backends/cortex_m/ops/operators.yaml                 |  2 +-
 backends/cortex_m/passes/activation_fusion_pass.py   |  3 ++-
 backends/cortex_m/passes/quantized_op_fusion_pass.py |  5 +++++
 backends/cortex_m/quantizer/quantizer_support.py     | 12 ++++++++++++
 6 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/backends/cortex_m/ops/op_quantized_add.cpp b/backends/cortex_m/ops/op_quantized_add.cpp
index b4bbfdaffce..f607977aa48 100644
--- a/backends/cortex_m/ops/op_quantized_add.cpp
+++ b/backends/cortex_m/ops/op_quantized_add.cpp
@@ -26,6 +26,8 @@ Tensor& quantized_add_out(
     const int64_t output_zero_point,
     const int64_t output_multiplier,
     const int64_t output_shift,
+    const int64_t activation_min,
+    const int64_t activation_max,
     Tensor& out) {
   // Validate tensor types and dim order
   bool channel_broadcast = is_channel_broadcast(input1_int8, input2_int8);
@@ -69,8 +71,8 @@
 
   // Left shift to maximize precision
   const int32_t left_shift = 20;
-  const int32_t activation_min = std::numeric_limits<int8_t>::min();
-  const int32_t activation_max = std::numeric_limits<int8_t>::max();
+  const int32_t act_min = static_cast<int32_t>(activation_min);
+  const int32_t act_max = static_cast<int32_t>(activation_max);
 
   ET_LOG(
       Debug,
@@ -121,8 +123,8 @@
         static_cast<int32_t>(out_zp),
         output_mult,
         output_shift_val,
-        activation_min,
-        activation_max,
+        act_min,
+        act_max,
         adds_per_loop);
 
     if (status != ARM_CMSIS_NN_SUCCESS) {
diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py
index 6e8e149130d..78cdec58c25 100644
--- a/backends/cortex_m/ops/operators.py
+++ b/backends/cortex_m/ops/operators.py
@@ -122,15 +122,16 @@ def dequantize_per_tensor_impl(
     "quantized_add("
     "Tensor self, int self_zero_point, int self_multiplier, int self_shift, "
     "Tensor other, int other_zero_point, int other_multiplier, int other_shift, "
-    "int output_zero_point, int output_multiplier, int output_shift) -> Tensor"
+    "int output_zero_point, int output_multiplier, int output_shift, "
+    "int activation_min, int activation_max) -> Tensor"
 )
 
-# Define the operator schema with multipliers and shifts (11 args + out tensor)
 lib.define(
     "quantized_add.out("
     "Tensor self, int self_zero_point, int self_multiplier, int self_shift, "
     "Tensor other, int other_zero_point, int other_multiplier, int other_shift, "
     "int output_zero_point, int output_multiplier, int output_shift, "
+    "int activation_min, int activation_max, "
     "*, Tensor(a!) out) -> Tensor(a!)"
 )
 
@@ -148,6 +149,8 @@ def quantized_add_meta(
     output_zero_point: int,
     output_multiplier: int,
     output_shift: int,
+    activation_min: int,
+    activation_max: int,
 ) -> torch.Tensor:
     assert self.shape == other.shape or is_channel_broadcast(self, other), (
         "Cortex-M quantized_add: broadcasting is not yet supported except for channel dim — "
@@ -173,6 +176,8 @@ def quantized_add_impl(
     output_zero_point: int,
     output_multiplier: int,
     output_shift: int,
+    activation_min: int,
+    activation_max: int,
 ) -> torch.Tensor:
     assert self.shape == other.shape or is_channel_broadcast(self, other), (
         "Cortex-M quantized_add: broadcasting is not yet supported except for channel dim — "
@@ -186,7 +191,7 @@ def quantized_add_impl(
 
     result_fp = self_fp + other_fp
     result_quantized = requantize_cmsis(result_fp, output_multiplier, output_shift)
-    result = torch.clamp(result_quantized + output_zero_point, -128, 127).to(torch.int8)
+    result = torch.clamp(result_quantized + output_zero_point, activation_min, activation_max).to(torch.int8)
 
     return result
diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml
index 0f8f764c1f3..e0ebbfab868 100644
--- a/backends/cortex_m/ops/operators.yaml
+++ b/backends/cortex_m/ops/operators.yaml
@@ -17,7 +17,7 @@
 - arg_meta: null
   kernel_name: cortex_m::dequantize_per_tensor_out
 
-- func: cortex_m::quantized_add.out(Tensor self, int self_zero_point, int self_multiplier, int self_shift, Tensor other, int other_zero_point, int other_multiplier, int other_shift, int output_zero_point, int output_multiplier, int output_shift, *, Tensor(a!) out) -> Tensor(a!)
+- func: cortex_m::quantized_add.out(Tensor self, int self_zero_point, int self_multiplier, int self_shift, Tensor other, int other_zero_point, int other_multiplier, int other_shift, int output_zero_point, int output_multiplier, int output_shift, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
   - arg_meta: null
diff --git a/backends/cortex_m/passes/activation_fusion_pass.py b/backends/cortex_m/passes/activation_fusion_pass.py
index 864f9e47ec8..a53c065aaa4 100644
--- a/backends/cortex_m/passes/activation_fusion_pass.py
+++ b/backends/cortex_m/passes/activation_fusion_pass.py
@@ -40,6 +40,7 @@ class ActivationFusionPass(ExportPass):
     FUSE_OPS = {
         exir_ops.edge.aten.linear.default,
         exir_ops.edge.aten.convolution.default,
+        exir_ops.edge.aten.add.Tensor,
     }
 
     def _get_validated_qparams(self, node, input_node):
@@ -85,7 +86,7 @@ def _get_validated_qparams(self, node, input_node):
                     else qmax
                 )
             case _:
-                raise RuntimeError("Unexpected target {node.target}.")
+                raise RuntimeError(f"Unexpected target {node.target}.")
 
         # If the minimal quantized value is larger than the qmin, it means that the quantized range contains
         # invalid values [qmin, ..., quantized_min_val-1], indicating bad quantization parameters.
diff --git a/backends/cortex_m/passes/quantized_op_fusion_pass.py b/backends/cortex_m/passes/quantized_op_fusion_pass.py
index eeb8e8fe6e7..ba772fc2461 100644
--- a/backends/cortex_m/passes/quantized_op_fusion_pass.py
+++ b/backends/cortex_m/passes/quantized_op_fusion_pass.py
@@ -62,6 +62,9 @@ def _get_add_replacement(self, args, meta):
             max_scale_2x / (output_scale * (1 << SHIFT_INT8))
         )
 
+        activation_min = meta["output_qparams"][0].qmin
+        activation_max = meta["output_qparams"][0].qmax
+
         args = (
             args[0],
             zero_point1,
@@ -74,6 +77,8 @@ def _get_add_replacement(self, args, meta):
             output_zero_point,
             output_mult,
             output_shift,
+            activation_min,
+            activation_max,
         )
 
         return exir_ops.edge.cortex_m.quantized_add.default, args
diff --git a/backends/cortex_m/quantizer/quantizer_support.py b/backends/cortex_m/quantizer/quantizer_support.py
index 348e7bf87f1..9b66df684c8 100644
--- a/backends/cortex_m/quantizer/quantizer_support.py
+++ b/backends/cortex_m/quantizer/quantizer_support.py
@@ -17,7 +17,19 @@
 
 BINARY_OP_PATTERNS = {
     (torch.ops.aten.add.Tensor,): CortexMAddMulCheck,
+    (torch.ops.aten.add.Tensor, torch.ops.aten.relu.default): CortexMAddMulCheck,
+    (torch.ops.aten.add.Tensor, torch.ops.aten.relu_.default): CortexMAddMulCheck,
+    (torch.ops.aten.add.Tensor, torch.ops.aten.hardtanh.default): CortexMAddMulCheck,
+    (torch.ops.aten.add.Tensor, torch.ops.aten.hardtanh_.default): CortexMAddMulCheck,
+    (torch.ops.aten.add.Tensor, torch.ops.aten.clamp.default): CortexMAddMulCheck,
+    (torch.ops.aten.add.Tensor, torch.ops.aten.clamp_.default): CortexMAddMulCheck,
     (torch.ops.aten.add_.Tensor,): CortexMAddMulCheck,
+    (torch.ops.aten.add_.Tensor, torch.ops.aten.relu.default): CortexMAddMulCheck,
+    (torch.ops.aten.add_.Tensor, torch.ops.aten.relu_.default): CortexMAddMulCheck,
+    (torch.ops.aten.add_.Tensor, torch.ops.aten.hardtanh.default): CortexMAddMulCheck,
+    (torch.ops.aten.add_.Tensor, torch.ops.aten.hardtanh_.default): CortexMAddMulCheck,
+    (torch.ops.aten.add_.Tensor, torch.ops.aten.clamp.default): CortexMAddMulCheck,
+    (torch.ops.aten.add_.Tensor, torch.ops.aten.clamp_.default): CortexMAddMulCheck,
     (torch.ops.aten.mul.Tensor,): CortexMAddMulCheck,
     (torch.ops.aten.mul_.Tensor,): CortexMAddMulCheck,
     (torch.ops.aten.hardswish.default,): CortexMAddMulCheck,  # lowers to mul
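
For intuition about what PATCH 1 threads through the stack: once an
activation is fused into an int8 op, its clamp can be expressed entirely
in the quantized domain, and the quantizer's activation-aware output
qparams already encode it. A minimal Python sketch of that bound
arithmetic, assuming per-tensor int8 quantization (the helper name and
signature are illustrative, not the actual pass code):

    def fused_int8_bounds(activation, scale, zero_point, qmin=-128, qmax=127,
                          min_val=None, max_val=None):
        # Map a float clamp boundary into the int8 domain.
        def quantize(v):
            return int(round(v / scale)) + zero_point

        if activation == "relu":
            # relu clamps at 0.0, which quantizes to exactly the zero_point.
            return max(qmin, zero_point), qmax
        if activation in ("hardtanh", "clamp"):
            return max(qmin, quantize(min_val)), min(qmax, quantize(max_val))
        # Plain add keeps the full int8 range.
        return qmin, qmax

    # relu fused into an add whose output has scale=0.05, zero_point=-10:
    print(fused_int8_bounds("relu", 0.05, -10))              # (-10, 127)
    # hardtanh(-0.5, 0.5) on the same output quantization:
    print(fused_int8_bounds("hardtanh", 0.05, -10,
                            min_val=-0.5, max_val=0.5))      # (-20, 0)

In the patch itself the quantizer has already folded the activation into
output_qparams, so QuantizedOpFusionPass only has to forward qmin/qmax
as activation_min/activation_max.
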
From 0169b210f95dba7151c9423344a2098b477a1ece Mon Sep 17 00:00:00 2001
From: RJ Ascani
Date: Tue, 24 Mar 2026 17:32:09 -0700
Subject: [PATCH 2/4] Cortex-M: Add tests for fused add+activation patterns

Add add_relu, add_relu_channels_last, add_hardtanh, and
add_hardtanh_channels_last test cases to test_add.py verifying that
relu/hardtanh activations are fused into quantized_add.

Remove the conv_add_relu xfail from test_nn_modules.py since the fusion
now works.

Co-authored-by: Claude
---
 .../cortex_m/test/models/test_nn_modules.py |  4 +-
 backends/cortex_m/test/ops/test_add.py      | 72 +++++++++++++++++++
 2 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/backends/cortex_m/test/models/test_nn_modules.py b/backends/cortex_m/test/models/test_nn_modules.py
index f016f94e7d0..2e10de3cc29 100644
--- a/backends/cortex_m/test/models/test_nn_modules.py
+++ b/backends/cortex_m/test/models/test_nn_modules.py
@@ -188,9 +188,7 @@ def forward(self, x):
         ),
 }
 
-xfails = {
-    "conv_add_relu": "Activation fusion does not support relu after add",
-}
+xfails = {}
 
 
 @parametrize("test_case", test_cases, xfails=xfails, strict=False)
diff --git a/backends/cortex_m/test/ops/test_add.py b/backends/cortex_m/test/ops/test_add.py
index b776ac91002..2a92e5ca433 100644
--- a/backends/cortex_m/test/ops/test_add.py
+++ b/backends/cortex_m/test/ops/test_add.py
@@ -73,6 +73,50 @@ class CortexMAlphaAdd(ModelAlpha):
     }
 
 
+class CortexMAddReLU(torch.nn.Module):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1,
+        "executorch_exir_dialects_edge__ops_aten_relu_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 3,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+    def __init__(self):
+        super().__init__()
+        self.relu = torch.nn.ReLU()
+
+    def forward(self, x, y):
+        return self.relu(x + y)
+
+
+class CortexMAddHardtanh(torch.nn.Module):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1,
+        "executorch_exir_dialects_edge__ops_aten_hardtanh_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 3,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
+    }
+
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+    def __init__(self, min_val=-0.5, max_val=0.5):
+        super().__init__()
+        self.act = torch.nn.Hardtanh(min_val=min_val, max_val=max_val)
+
+    def forward(self, x, y):
+        return self.act(x + y)
+
+
 test_cases = {
     "self_rank_1": McuTestCase(
         CortexMSelfAdd(),
@@ -149,6 +193,34 @@ class CortexMAlphaAdd(ModelAlpha):
             ramp_tensor(-20, 20, (4, 5)),
         ),
     ),
+    "add_relu": McuTestCase(
+        CortexMAddReLU(),
+        (
+            ramp_tensor(-5, 5, (2, 4)),
+            ramp_tensor(-3, 3, (2, 4)),
+        ),
+    ),
+    "add_relu_channels_last": McuTestCase(
+        CortexMAddReLU(),
+        (
+            ramp_tensor(-5, 5, (1, 4, 8, 8)).to(memory_format=torch.channels_last),
+            ramp_tensor(-3, 3, (1, 4, 8, 8)).to(memory_format=torch.channels_last),
+        ),
+    ),
+    "add_hardtanh": McuTestCase(
+        CortexMAddHardtanh(min_val=-0.5, max_val=0.5),
+        (
+            ramp_tensor(-2, 2, (2, 4)),
+            ramp_tensor(-1, 1, (2, 4)),
+        ),
+    ),
+    "add_hardtanh_channels_last": McuTestCase(
+        CortexMAddHardtanh(min_val=-1.0, max_val=1.0),
+        (
+            ramp_tensor(-3, 3, (1, 4, 8, 8)).to(memory_format=torch.channels_last),
+            ramp_tensor(-2, 2, (1, 4, 8, 8)).to(memory_format=torch.channels_last),
+        ),
+    ),
 }
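
As background for the clamp these tests exercise: the Python reference
in operators.py requantizes the int32 accumulator and then clamps to the
activation bounds instead of a hard [-128, 127]. A rough, self-contained
model of that last step; requantize_q31 below only approximates
CMSIS-NN's Q31 multiply-and-shift (the backend's requantize_cmsis and
CMSIS-NN's own rounding differ slightly):

    import torch

    def requantize_q31(acc: torch.Tensor, multiplier: int, shift: int) -> torch.Tensor:
        # Multiply by a Q31 fixed-point multiplier, then shift back down.
        result = (acc.to(torch.int64) * multiplier) >> 31
        if shift < 0:
            result = (result + (1 << (-shift - 1))) >> -shift  # rounding right shift
        else:
            result = result << shift
        return result

    # Add the output zero_point, then clamp to the fused activation bounds
    # (here relu bounds (-10, 127)) rather than the full int8 range:
    acc = torch.tensor([5000, -7000], dtype=torch.int64)
    out = torch.clamp(
        requantize_q31(acc, 1 << 30, -4) + (-10), -10, 127
    ).to(torch.int8)
    print(out)  # tensor([127, -10], dtype=torch.int8)
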
From ab93ef43efb81bca2ba1290a03859e8b23ce63f7 Mon Sep 17 00:00:00 2001
From: RJ Ascani
Date: Thu, 26 Mar 2026 20:02:24 -0700
Subject: [PATCH 3/4] Cortex-M: Remove redundant inplace add activation patterns

Remove add_.Tensor + activation fused patterns from BINARY_OP_PATTERNS.
Functionalization converts in-place ops to out-of-place before the
quantizer runs, so these patterns are never matched.

Co-authored-by: Claude
---
 backends/cortex_m/quantizer/quantizer_support.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/backends/cortex_m/quantizer/quantizer_support.py b/backends/cortex_m/quantizer/quantizer_support.py
index 9b66df684c8..2cf0483f74b 100644
--- a/backends/cortex_m/quantizer/quantizer_support.py
+++ b/backends/cortex_m/quantizer/quantizer_support.py
@@ -24,12 +24,6 @@
     (torch.ops.aten.add.Tensor, torch.ops.aten.clamp.default): CortexMAddMulCheck,
     (torch.ops.aten.add.Tensor, torch.ops.aten.clamp_.default): CortexMAddMulCheck,
     (torch.ops.aten.add_.Tensor,): CortexMAddMulCheck,
-    (torch.ops.aten.add_.Tensor, torch.ops.aten.relu.default): CortexMAddMulCheck,
-    (torch.ops.aten.add_.Tensor, torch.ops.aten.relu_.default): CortexMAddMulCheck,
-    (torch.ops.aten.add_.Tensor, torch.ops.aten.hardtanh.default): CortexMAddMulCheck,
-    (torch.ops.aten.add_.Tensor, torch.ops.aten.hardtanh_.default): CortexMAddMulCheck,
-    (torch.ops.aten.add_.Tensor, torch.ops.aten.clamp.default): CortexMAddMulCheck,
-    (torch.ops.aten.add_.Tensor, torch.ops.aten.clamp_.default): CortexMAddMulCheck,
     (torch.ops.aten.mul.Tensor,): CortexMAddMulCheck,
     (torch.ops.aten.mul_.Tensor,): CortexMAddMulCheck,
     (torch.ops.aten.hardswish.default,): CortexMAddMulCheck,  # lowers to mul

From e527a0404b9d07b756459b786fa67c8d15d91089 Mon Sep 17 00:00:00 2001
From: RJ Ascani
Date: Fri, 27 Mar 2026 10:14:27 -0700
Subject: [PATCH 4/4] Fix lint errors

---
 backends/cortex_m/ops/operators.py               | 4 +++-
 backends/cortex_m/test/models/test_nn_modules.py | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py
index bd24760cdc3..4de702b47a9 100644
--- a/backends/cortex_m/ops/operators.py
+++ b/backends/cortex_m/ops/operators.py
@@ -192,7 +192,9 @@ def quantized_add_impl(
 
     result_fp = self_fp + other_fp
     result_quantized = requantize_cmsis(result_fp, output_multiplier, output_shift)
-    result = torch.clamp(result_quantized + output_zero_point, activation_min, activation_max).to(torch.int8)
+    result = torch.clamp(
+        result_quantized + output_zero_point, activation_min, activation_max
+    ).to(torch.int8)
 
     return result
diff --git a/backends/cortex_m/test/models/test_nn_modules.py b/backends/cortex_m/test/models/test_nn_modules.py
index 1a255687ec8..4a92fd578ff 100644
--- a/backends/cortex_m/test/models/test_nn_modules.py
+++ b/backends/cortex_m/test/models/test_nn_modules.py
@@ -188,7 +188,7 @@ def forward(self, x):
         ),
 }
 
-xfails : dict[str, xfail_type] = {}
+xfails: dict[str, xfail_type] = {}
 
 
 @parametrize("test_case", test_cases, xfails=xfails, strict=False)
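
PATCH 3's reasoning can be confirmed independently of this backend:
torch.export functionalizes the program before any quantizer or pass
sees it, so an in-place add_ never reaches the pattern matcher. A small
standalone repro sketch:

    import torch

    class InplaceAdd(torch.nn.Module):
        def forward(self, x, y):
            z = x.clone()  # mutate a local copy, not the graph input
            z.add_(y)      # in-place aten::add_
            return torch.relu(z)

    ep = torch.export.export(InplaceAdd(), (torch.randn(2, 4), torch.randn(2, 4)))
    targets = [n.target for n in ep.graph.nodes if n.op == "call_function"]

    # Functionalization rewrote add_ into the out-of-place overload, so a
    # pattern keyed on (aten.add_.Tensor, activation) can never match.
    assert torch.ops.aten.add_.Tensor not in targets
    print(targets)  # includes aten.add.Tensor and aten.relu.default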