diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt index 627406c1935..f88a6306fed 100644 --- a/backends/cortex_m/CMakeLists.txt +++ b/backends/cortex_m/CMakeLists.txt @@ -81,6 +81,7 @@ set(_cortex_m_kernels__srcs ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_minimum.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_pad.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_activation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_avg_pool2d.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_batch_matmul.cpp diff --git a/backends/cortex_m/ops/op_quantized_activation.cpp b/backends/cortex_m/ops/op_quantized_activation.cpp new file mode 100644 index 00000000000..fb9b4768acf --- /dev/null +++ b/backends/cortex_m/ops/op_quantized_activation.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "cortex_m_ops_common.h" + +#include + +#if defined(__ARM_FEATURE_MVE) && (__ARM_FEATURE_MVE & 1) +#include +#define HAS_HELIUM_SIMD 1 +#endif + +#if defined(ARM_MATH_DSP) && !defined(HAS_HELIUM_SIMD) +#include +#define HAS_DSP_PACKED_LUT 1 +#endif + +namespace cortex_m { +namespace native { + +#if defined(HAS_DSP_PACKED_LUT) +// Local 4-byte read/write helpers. We deliberately don't include +// `arm_nnsupportfunctions.h` for the equivalent CMSIS-NN `arm_nn_read_s8x4_ia` +// / `arm_nn_write_s8x4_ia` -- the header is public but pulls in the entire +// CMSIS-NN support surface (~1500 lines) just for two memcpy wrappers. 
+static inline uint32_t read_u8x4_ia(const int8_t** in) { + uint32_t val; + std::memcpy(&val, *in, 4); + *in += 4; + return val; +} + +static inline void write_u8x4_ia(int8_t** out, uint32_t val) { + std::memcpy(*out, &val, 4); + *out += 4; +} +#endif + +// cppcheck-suppress unusedFunction +Tensor& quantized_activation_out( + KernelRuntimeContext& context, + const Tensor& input, + const Tensor& lut, + Tensor& out) { + ET_CHECK_MSG( + input.scalar_type() == ScalarType::Char, + "quantized_activation: input must be int8"); + ET_CHECK_MSG( + out.scalar_type() == ScalarType::Char, + "quantized_activation: output must be int8"); + ET_CHECK_MSG( + lut.scalar_type() == ScalarType::Char, + "quantized_activation: lut must be int8"); + ET_CHECK_MSG( + lut.numel() == 256, + "quantized_activation: lut must have 256 entries, got %" PRId64, + static_cast(lut.numel())); + ET_CHECK_MSG( + input.numel() == out.numel(), + "quantized_activation: input and output must have the same numel"); + + const int8_t* in_data = input.const_data_ptr(); + const int8_t* lut_data = lut.const_data_ptr(); + int8_t* out_data = out.mutable_data_ptr(); + + // The LUT is precomputed AoT from the input/output qparams and the + // activation function (sigmoid / tanh / silu / ...), so the kernel does not + // need to know which activation it is implementing. The signed int8 input + // is biased by 128 to use it as an unsigned [0, 255] table index. + const int64_t n = input.numel(); + int64_t i = 0; + +#if defined(HAS_HELIUM_SIMD) + // M55/M85: 16 lanes per iteration. Reinterpret the int8 input as uint8 + // (bit-identical load), add 128 mod 256 to produce a uint8 LUT index, then + // gather-load the int8 result from the LUT. 
+ for (; i + 15 < n; i += 16) { + uint8x16_t in_u8 = + vldrbq_u8(reinterpret_cast(in_data + i)); + uint8x16_t idx = vaddq_n_u8(in_u8, 128); + int8x16_t result = vldrbq_gather_offset_s8(lut_data, idx); + vstrbq_s8(out_data + i, result); + } +#elif defined(HAS_DSP_PACKED_LUT) + // M4/M7 (DSP, no MVE): process 4 bytes per iteration. The DSP win comes from + // (a) folding 4 byte-loads into one word-load, (b) batching the +128 bias + // with `__uadd8`, and (c) folding 4 byte-stores into one word-store. The + // LUT lookups themselves still hit memory four times per word -- no DSP + // gather instruction exists on M-class. + const int8_t* in_ptr = in_data; + int8_t* out_ptr = out_data; + const int64_t word_iters = n >> 2; + for (int64_t w = 0; w < word_iters; ++w) { + const uint32_t in_word = read_u8x4_ia(&in_ptr); + const uint32_t idx_word = __uadd8(in_word, 0x80808080u); + const uint32_t out_word = + static_cast(static_cast(lut_data[idx_word & 0xFFu])) | + (static_cast(static_cast(lut_data[(idx_word >> 8) & 0xFFu])) + << 8) | + (static_cast(static_cast(lut_data[(idx_word >> 16) & 0xFFu])) + << 16) | + (static_cast(static_cast(lut_data[(idx_word >> 24) & 0xFFu])) + << 24); + write_u8x4_ia(&out_ptr, out_word); + } + i = word_iters << 2; +#endif + + // 4x-unrolled scalar tail. On M-class cores without MVE or DSP the unroll + // lets the compiler issue independent LUT loads; on the MVE / DSP paths + // above this only runs for the < 16- (or < 4-) element remainder. 
+ for (; i + 3 < n; i += 4) { + out_data[i + 0] = lut_data[static_cast(in_data[i + 0] + 128)]; + out_data[i + 1] = lut_data[static_cast(in_data[i + 1] + 128)]; + out_data[i + 2] = lut_data[static_cast(in_data[i + 2] + 128)]; + out_data[i + 3] = lut_data[static_cast(in_data[i + 3] + 128)]; + } + for (; i < n; ++i) { + out_data[i] = lut_data[static_cast(in_data[i] + 128)]; + } + + return out; +} + +} // namespace native +} // namespace cortex_m diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py index d4393bc7ada..4c6fb44e89d 100644 --- a/backends/cortex_m/ops/operators.py +++ b/backends/cortex_m/ops/operators.py @@ -264,6 +264,35 @@ def quantized_mul_impl( return result +# =================================================================== +# QUANTIZED ACTIVATION (LUT) OPERATION DEFINITION +# =================================================================== +# Generic table-lookup activation. The 256-entry int8 LUT is precomputed AoT +# from the input/output qparams and the activation function (sigmoid, tanh, +# silu, ...), so the kernel is identical regardless of which activation it +# evaluates: out[i] = lut[input[i] + 128]. +lib.define("quantized_activation(Tensor input, Tensor lut) -> Tensor") +lib.define( + "quantized_activation.out(Tensor input, Tensor lut, *, Tensor(a!) 
# ===================================================================
# QUANTIZED ACTIVATION (LUT) OPERATION DEFINITION
# ===================================================================
# Generic table-lookup activation. The 256-entry int8 LUT is precomputed AoT
# from the input/output qparams and the activation function (sigmoid, tanh,
# silu, ...), so the kernel is identical regardless of which activation it
# evaluates: out[i] = lut[input[i] + 128].
lib.define("quantized_activation(Tensor input, Tensor lut) -> Tensor")
lib.define(
    "quantized_activation.out(Tensor input, Tensor lut, *, Tensor(a!) out) -> Tensor(a!)"
)


@register_fake("cortex_m::quantized_activation")  # type: ignore[misc]
def quantized_activation_meta(input: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
    """Fake-tensor implementation: validate dtypes and return an output shaped like `input`.

    Args:
        input: int8 tensor of any shape.
        lut: int8 tensor holding exactly 256 entries.

    Returns:
        An uninitialised tensor with the same shape and dtype as `input`.
    """
    assert input.dtype == torch.int8, "quantized_activation input must be int8"
    assert lut.dtype == torch.int8 and lut.numel() == 256, (
        "quantized_activation lut must be int8 with 256 entries; "
        f"got dtype={lut.dtype}, numel={lut.numel()}"
    )
    return torch.empty_like(input)


@impl(lib, "quantized_activation", "CompositeExplicitAutograd")  # type: ignore[misc]
def quantized_activation_impl(input: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
    """Reference implementation: `out[i] = lut[input[i] + 128]`, elementwise.

    Only the LUT's element count is validated (see `quantized_activation_meta`),
    so the table is flattened before indexing. That keeps the result shaped like
    `input` whatever layout the 256 entries arrive in.
    """
    table = lut.reshape(-1)
    # Widen before adding the bias: `input + 128` would overflow in int8.
    indices = input.to(torch.int64) + 128
    return table[indices].to(torch.int8)
def _get_activation_replacement(self, node):
    """Return the `(op, args)` pair that replaces a quantized activation node.

    The node's first input and output qparams, together with `node.target`,
    are handed to `build_activation_lut`; the resulting table is registered as
    a constant placeholder named `<node.name>_lut`. The replacement is
    `cortex_m.quantized_activation` applied to the node's original first
    argument and that placeholder.
    """
    in_q = node.meta["input_qparams"][0]
    out_q = node.meta["output_qparams"][0]
    graph = node.graph

    table = build_activation_lut(
        node.target,
        float(in_q.scale),
        int(in_q.zp),
        float(out_q.scale),
        int(out_q.zp),
    )

    # Constant placeholders must appear before user-input placeholders, so the
    # LUT is inserted ahead of the first placeholder already in the graph.
    anchor = next(filter(lambda candidate: candidate.op == "placeholder", graph.nodes))
    with graph.inserting_before(anchor):
        lut_placeholder = create_constant_placeholder(
            self.exported_program,
            graph,
            node.name + "_lut",
            InputKind.PARAMETER,
            table,
        )

    replacement_args = (node.args[0], lut_placeholder)
    return exir_ops.edge.cortex_m.quantized_activation.default, replacement_args
def _stable_sigmoid(x: float) -> float:
    """Logistic sigmoid, `1 / (1 + exp(-x))`, evaluated without overflow.

    `math.exp` is only ever called on a non-positive argument, so it cannot
    overflow for large `|x|` (e.g. wide-range input qparams).
    """
    if x >= 0:
        return 1.0 / (1.0 + math.exp(-x))
    e = math.exp(x)
    return e / (1.0 + e)


def _stable_silu(x: float) -> float:
    """SiLU: `x * sigmoid(x)`, using the overflow-safe sigmoid above."""
    return x * _stable_sigmoid(x)


# Edge-dialect activation op -> float reference function used to fill the LUT.
_ACTIVATION_FNS = {
    exir_ops.edge.aten.sigmoid.default: _stable_sigmoid,
    exir_ops.edge.aten.tanh.default: math.tanh,
    exir_ops.edge.aten.silu.default: _stable_silu,
}


def _round_half_away_from_zero(x: float) -> int:
    """Round to the nearest integer; exact .5 ties move away from zero.

    Python's built-in `round` uses banker's rounding and would disagree at
    exact half-integers. Intended to mirror the rounding `requantize_cmsis`
    applies elsewhere in this module -- TODO confirm.

    The decision uses the fractional part rather than `floor(abs(x) + 0.5)`:
    that addition itself rounds, so 0.49999999999999994 + 0.5 == 1.0 in
    binary64 and the value would wrongly round up.
    """
    magnitude = abs(x)
    whole = math.floor(magnitude)
    if magnitude - whole >= 0.5:
        whole += 1
    return -whole if x < 0 else whole


def build_activation_lut(
    target,
    input_scale: float,
    input_zp: int,
    output_scale: float,
    output_zp: int,
) -> torch.Tensor:
    """AoT-compute a 256-entry int8 lookup table for a quantized activation.

    The LUT is indexed by the input byte value biased by 128: for any int8
    input `q_in`, the kernel reads `lut[q_in + 128]` to get the int8 output.
    Each entry is computed in float as
    `round(f((q_in - input_zp) * input_scale) / output_scale + output_zp)`
    and then clamped to [-128, 127], so the runtime kernel needs no
    requantization math.

    Args:
        target: edge-dialect op being lowered (e.g.
            `exir_ops.edge.aten.sigmoid.default`); must be a key of
            `_ACTIVATION_FNS`.
        input_scale: scale of the input quantization.
        input_zp: zero point of the input quantization.
        output_scale: scale of the output quantization; must be > 0.
        output_zp: zero point of the output quantization.

    Returns:
        A 1-D `torch.int8` tensor with 256 entries.

    Raises:
        ValueError: if `target` is unsupported, either scale is not finite,
            or `output_scale` is not positive.
    """
    fn = _ACTIVATION_FNS.get(target)
    if fn is None:
        supported = sorted(getattr(t, "__name__", repr(t)) for t in _ACTIVATION_FNS)
        raise ValueError(
            f"build_activation_lut: unsupported activation target {target!r} "
            f"(supported: {supported})"
        )
    # Reject bad scales here; otherwise they fail later as a bare
    # ZeroDivisionError or inside math.floor.
    if not (math.isfinite(input_scale) and math.isfinite(output_scale)):
        raise ValueError(
            "build_activation_lut: scales must be finite, got "
            f"input_scale={input_scale!r}, output_scale={output_scale!r}"
        )
    if output_scale <= 0.0:
        raise ValueError(
            f"build_activation_lut: output_scale must be > 0, got {output_scale!r}"
        )

    entries = []
    for q in range(-128, 128):
        real_in = (q - input_zp) * input_scale
        q_out = _round_half_away_from_zero(fn(real_in) / output_scale + output_zp)
        entries.append(max(-128, min(127, q_out)))
    # `entries[q + 128]` holds the output for input `q`; build the tensor once.
    return torch.tensor(entries, dtype=torch.int8)
class CortexMActivationCheck(PatternCheck):
    """Pattern check for standalone elementwise activations (sigmoid / tanh /
    silu) lowered to the LUT-based cortex_m.quantized_activation op.

    The kernel is shape-agnostic and the LUT is computed AoT from per-tensor
    qparams, so the only thing to enforce is int8 per-tensor quantization.
    """

    @classmethod
    def check_quantization_config(
        cls, pattern: list[Node], quantization_config: QuantizationConfig
    ) -> bool:
        """Accept the config only if activations are int8 and both the input
        and output activation qspecs are per-tensor. `pattern` is not inspected.
        """
        int8_ok = cls.is_int8_activations(quantization_config)
        input_per_tensor = cls.is_per_tensor(quantization_config.get_input_act_qspec())
        per_tensor_ok = input_per_tensor and cls.is_per_tensor(
            quantization_config.get_output_act_qspec()
        )
        return int8_ok and per_tensor_ok


# From quantizer_support.py: each supported activation is a single-op pattern
# validated by CortexMActivationCheck.
ACTIVATION_OP_PATTERNS = {
    (activation_op,): CortexMActivationCheck
    for activation_op in (
        torch.ops.aten.sigmoid.default,
        torch.ops.aten.tanh.default,
        torch.ops.aten.silu.default,
    )
}
"executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 11, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 15, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 14, } +# The final `sigmoid(final_conv(x))` now lowers to cortex_m.quantized_activation. +# The 3 remaining sigmoids and 2 tanhs are LSTMCell gates: PyTorch export +# captures nn.LSTMCell as a single high-level op, so the quantizer never sees +# the gate activations and can't annotate them. They're decomposed only at +# to_edge -- which runs after the quantizer, so by then the gates have no +# qparams to fold and the lowering pass correctly skips them. The unblocker +# is a pre-annotation decompose pass that splits nn.LSTMCell into linear + +# split + sigmoid + tanh + add + mul *before* prepare_pt2e runs; tracked as +# the LSTMCell verification follow-up. ops_after_transforms: dict[str, int] = { "executorch_exir_dialects_edge__ops_aten_abs_default": 2, "executorch_exir_dialects_edge__ops_aten_add_Tensor": 2, @@ -52,7 +61,7 @@ "executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar": 2, "executorch_exir_dialects_edge__ops_aten_relu_default": 5, "executorch_exir_dialects_edge__ops_aten_select_copy_int": 2, - "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 4, + "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 3, "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 2, "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default": 1, "executorch_exir_dialects_edge__ops_aten_sqrt_default": 1, @@ -61,8 +70,9 @@ "executorch_exir_dialects_edge__ops_aten_tanh_default": 2, "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2, "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1, - "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 6, - "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 6, + 
"executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 7, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 7, + "executorch_exir_dialects_edge__ops_cortex_m_quantized_activation_default": 1, "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, } diff --git a/backends/cortex_m/test/ops/test_activation_quant.py b/backends/cortex_m/test/ops/test_activation_quant.py new file mode 100644 index 00000000000..6ae82e1e70c --- /dev/null +++ b/backends/cortex_m/test/ops/test_activation_quant.py @@ -0,0 +1,152 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import torch +from executorch.backends.arm.test.common import parametrize +from executorch.backends.cortex_m.test.tester import ( + CortexMTester, + McuTestCase, + ramp_tensor, +) + + +# A single per-op `ops_after_transforms` shape is enough: every supported +# activation lowers to exactly one cortex_m.quantized_activation, with the +# AoT LUT stored as a constant placeholder and a single quant/dequant pair +# at the graph boundary. 
_OPS_BEFORE = {
    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
}
_OPS_AFTER = {
    "executorch_exir_dialects_edge__ops_cortex_m_quantized_activation_default": 1,
    "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
    "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
}


class _Sigmoid(torch.nn.Module):
    """Module whose forward is a single `torch.sigmoid` call."""

    ops_before_transforms = {
        **_OPS_BEFORE,
        "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 1,
    }
    ops_after_transforms = _OPS_AFTER

    def forward(self, x):
        return torch.sigmoid(x)


class _Tanh(torch.nn.Module):
    """Module whose forward is a single `torch.tanh` call."""

    ops_before_transforms = {
        **_OPS_BEFORE,
        "executorch_exir_dialects_edge__ops_aten_tanh_default": 1,
    }
    ops_after_transforms = _OPS_AFTER

    def forward(self, x):
        return torch.tanh(x)


class _SiLU(torch.nn.Module):
    """Module whose forward is a single `torch.nn.functional.silu` call."""

    ops_before_transforms = {
        **_OPS_BEFORE,
        "executorch_exir_dialects_edge__ops_aten_silu_default": 1,
    }
    ops_after_transforms = _OPS_AFTER

    def forward(self, x):
        return torch.nn.functional.silu(x)


def _zero_input(shape):
    """Return an all-zero float32 tensor of the given shape."""
    return torch.zeros(shape, dtype=torch.float32)


# Wide-magnitude inputs exercise the `max(-128, min(127, q_out))` clamp inside
# build_activation_lut; shifted-ramp inputs push the quantizer to pick a
# non-zero `input_zp`, exercising the `(q - input_zp) * input_scale` term in
# the LUT formula; all-zero inputs pin down the lut entry at `input_zp + 128`.
test_cases = {
    "sigmoid_rank1": McuTestCase(
        model=_Sigmoid(),
        example_inputs=(ramp_tensor(-6, 6, (16,)),),
    ),
    "sigmoid_rank4": McuTestCase(
        model=_Sigmoid(),
        example_inputs=(ramp_tensor(-4, 4, (1, 8, 4, 4)),),
    ),
    "sigmoid_saturating": McuTestCase(
        model=_Sigmoid(),
        example_inputs=(ramp_tensor(-50, 50, (32,)),),
    ),
    "sigmoid_asymmetric_zp": McuTestCase(
        model=_Sigmoid(),
        example_inputs=(ramp_tensor(-1, 9, (16,)),),
    ),
    "sigmoid_zero": McuTestCase(
        model=_Sigmoid(),
        example_inputs=(_zero_input((16,)),),
    ),
    "tanh_rank1": McuTestCase(
        model=_Tanh(),
        example_inputs=(ramp_tensor(-3, 3, (16,)),),
    ),
    "tanh_rank3": McuTestCase(
        model=_Tanh(),
        example_inputs=(ramp_tensor(-2, 2, (1, 4, 16)),),
    ),
    "tanh_saturating": McuTestCase(
        model=_Tanh(),
        example_inputs=(ramp_tensor(-30, 30, (32,)),),
    ),
    "tanh_asymmetric_zp": McuTestCase(
        model=_Tanh(),
        example_inputs=(ramp_tensor(-1, 5, (16,)),),
    ),
    "tanh_zero": McuTestCase(
        model=_Tanh(),
        example_inputs=(_zero_input((16,)),),
    ),
    "silu_rank1": McuTestCase(
        model=_SiLU(),
        example_inputs=(ramp_tensor(-6, 6, (16,)),),
    ),
    "silu_rank4": McuTestCase(
        model=_SiLU(),
        example_inputs=(ramp_tensor(-4, 4, (1, 8, 4, 4)),),
    ),
    "silu_saturating": McuTestCase(
        model=_SiLU(),
        example_inputs=(ramp_tensor(-50, 50, (32,)),),
    ),
    "silu_asymmetric_zp": McuTestCase(
        model=_SiLU(),
        example_inputs=(ramp_tensor(-1, 9, (16,)),),
    ),
    "silu_zero": McuTestCase(
        model=_SiLU(),
        example_inputs=(_zero_input((16,)),),
    ),
}


@parametrize("test_case", test_cases)
def test_dialect_quantized_activation(test_case):
    """Run the tester's dialect check with the model's expected op counts
    before and after the transforms."""
    tester = CortexMTester(test_case.model, test_case.example_inputs)
    tester.test_dialect(
        test_case.model.ops_before_transforms,
        test_case.model.ops_after_transforms,
        qtol=1,
    )


@parametrize("test_case", test_cases)
def test_implementation_quantized_activation(test_case):
    """Run the tester's implementation check for each case with `qtol=1`."""
    tester = CortexMTester(test_case.model, test_case.example_inputs)
    tester.test_implementation(qtol=1)