diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt
index 627406c1935..f88a6306fed 100644
--- a/backends/cortex_m/CMakeLists.txt
+++ b/backends/cortex_m/CMakeLists.txt
@@ -81,6 +81,7 @@ set(_cortex_m_kernels__srcs
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_minimum.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_pad.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_activation.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_avg_pool2d.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_batch_matmul.cpp
diff --git a/backends/cortex_m/ops/op_quantized_activation.cpp b/backends/cortex_m/ops/op_quantized_activation.cpp
new file mode 100644
index 00000000000..fb9b4768acf
--- /dev/null
+++ b/backends/cortex_m/ops/op_quantized_activation.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "cortex_m_ops_common.h"
+
+#include <cstring>
+
+#if defined(__ARM_FEATURE_MVE) && (__ARM_FEATURE_MVE & 1)
+#include <arm_mve.h>
+#define HAS_HELIUM_SIMD 1
+#endif
+
+#if defined(ARM_MATH_DSP) && !defined(HAS_HELIUM_SIMD)
+#include <arm_acle.h>
+#define HAS_DSP_PACKED_LUT 1
+#endif
+
+namespace cortex_m {
+namespace native {
+
+#if defined(HAS_DSP_PACKED_LUT)
+// Local 4-byte read/write helpers. We deliberately don't include
+// `arm_nnsupportfunctions.h` for the equivalent CMSIS-NN `arm_nn_read_s8x4_ia`
+// / `arm_nn_write_s8x4_ia` -- the header is public but pulls in the entire
+// CMSIS-NN support surface (~1500 lines) just for two memcpy wrappers.
+static inline uint32_t read_u8x4_ia(const int8_t** in) {
+  uint32_t packed;  // filled by memcpy, so `*in` needs no 4-byte alignment
+  std::memcpy(&packed, *in, sizeof(packed));
+  *in += sizeof(packed);
+  return packed;
+}
+
+static inline void write_u8x4_ia(int8_t** out, uint32_t val) {
+  std::memcpy(*out, &val, sizeof(val));  // 4-byte store, alignment-agnostic
+  *out += sizeof(val);
+}
+#endif
+
+// LUT-based int8 activation: writes lut[input[k] + 128] to out[k] for every
+// element and returns `out`. Preconditions (enforced with ET_CHECK_MSG):
+// input, lut and out are int8, lut has 256 entries, and input / out hold
+// the same number of elements. `context` is not used.
+// cppcheck-suppress unusedFunction
+Tensor& quantized_activation_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    const Tensor& lut,
+    Tensor& out) {
+  ET_CHECK_MSG(
+      input.scalar_type() == ScalarType::Char,
+      "quantized_activation: input must be int8");
+  ET_CHECK_MSG(
+      out.scalar_type() == ScalarType::Char,
+      "quantized_activation: output must be int8");
+  ET_CHECK_MSG(
+      lut.scalar_type() == ScalarType::Char,
+      "quantized_activation: lut must be int8");
+  ET_CHECK_MSG(
+      lut.numel() == 256,
+      "quantized_activation: lut must have 256 entries, got %" PRId64,
+      static_cast<int64_t>(lut.numel()));
+  ET_CHECK_MSG(
+      input.numel() == out.numel(),
+      "quantized_activation: input and output must have the same numel");
+
+  // The table is built ahead of time from the qparams and the activation
+  // function, so this kernel is the same for sigmoid / tanh / silu / ...
+  // Biasing the signed input by 128 turns it into a [0, 255] table index.
+  const int8_t* src = input.const_data_ptr<int8_t>();
+  const int8_t* table = lut.const_data_ptr<int8_t>();
+  int8_t* dst = out.mutable_data_ptr<int8_t>();
+  const int64_t total = input.numel();
+  int64_t pos = 0;
+
+#if defined(HAS_HELIUM_SIMD)
+  // MVE: 16 lanes per pass. Loading the int8 bytes as uint8 and adding 128
+  // (mod 256) yields the table offsets consumed by the byte gather-load.
+  while (pos + 16 <= total) {
+    const uint8x16_t raw =
+        vldrbq_u8(reinterpret_cast<const uint8_t*>(src + pos));
+    const uint8x16_t offsets = vaddq_n_u8(raw, 128);
+    vstrbq_s8(dst + pos, vldrbq_gather_offset_s8(table, offsets));
+    pos += 16;
+  }
+#elif defined(HAS_DSP_PACKED_LUT)
+  // DSP without MVE: one word (4 elements) per pass. The word load, the
+  // `__uadd8` bias and the word store each cover all four bytes at once;
+  // the four table reads remain individual byte loads.
+  const int8_t* rd = src;
+  int8_t* wr = dst;
+  const int64_t words = total >> 2;
+  for (int64_t left = words; left > 0; --left) {
+    const uint32_t biased = __uadd8(read_u8x4_ia(&rd), 0x80808080u);
+    uint32_t packed = 0;
+    for (unsigned shift = 0; shift < 32; shift += 8) {
+      const uint8_t entry =
+          static_cast<uint8_t>(table[(biased >> shift) & 0xFFu]);
+      packed |= static_cast<uint32_t>(entry) << shift;
+    }
+    write_u8x4_ia(&wr, packed);
+  }
+  pos = words << 2;
+#endif
+
+  // Scalar path for whatever the vector / packed loop left over (or for the
+  // whole tensor on cores with neither). Casting `q + 128` to uint8_t maps
+  // [-128, 127] onto table index [0, 255].
+  const auto lookup = [table](int8_t q) {
+    return table[static_cast<uint8_t>(q + 128)];
+  };
+  for (; pos + 4 <= total; pos += 4) {  // 4x unrolled
+    dst[pos + 0] = lookup(src[pos + 0]);
+    dst[pos + 1] = lookup(src[pos + 1]);
+    dst[pos + 2] = lookup(src[pos + 2]);
+    dst[pos + 3] = lookup(src[pos + 3]);
+  }
+  while (pos < total) {
+    dst[pos] = lookup(src[pos]);
+    ++pos;
+  }
+
+  return out;
+}
+
+} // namespace native
+} // namespace cortex_m
diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py
index d4393bc7ada..4c6fb44e89d 100644
--- a/backends/cortex_m/ops/operators.py
+++ b/backends/cortex_m/ops/operators.py
@@ -264,6 +264,35 @@ def quantized_mul_impl(
     return result
 
 
+# ===================================================================
+# QUANTIZED ACTIVATION (LUT) OPERATION DEFINITION
+# ===================================================================
+# Generic table-lookup activation. The 256-entry int8 LUT is precomputed AoT
+# from the input/output qparams and the activation function (sigmoid, tanh,
+# silu, ...), so the kernel is identical regardless of which activation it
+# evaluates: out[i] = lut[input[i] + 128].
+lib.define("quantized_activation(Tensor input, Tensor lut) -> Tensor")
+lib.define(
+    "quantized_activation.out(Tensor input, Tensor lut, *, Tensor(a!) out) -> Tensor(a!)"
+)
+
+
+@register_fake("cortex_m::quantized_activation")  # type: ignore[misc]
+def quantized_activation_meta(input: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
+    # Fake kernel: checks the int8 / 256-entry contract; output mirrors `input`.
+    assert input.dtype == torch.int8, "quantized_activation input must be int8"
+    lut_ok = lut.dtype == torch.int8 and lut.numel() == 256
+    prefix = "quantized_activation lut must be int8 with 256 entries; "
+    assert lut_ok, f"{prefix}got dtype={lut.dtype}, numel={lut.numel()}"
+    return torch.empty_like(input)
+
+
+@impl(lib, "quantized_activation", "CompositeExplicitAutograd")  # type: ignore[misc]
+def quantized_activation_impl(input: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
+    # Bias int8 by 128, then index the flattened LUT like the C++ kernel does.
+    return lut.reshape(-1)[input.to(torch.int64) + 128].to(torch.int8)
+
+
 # ===================================================================
 # QUANTIZED BATCH MATMUL OPERATION DEFINITION
 # ===================================================================
diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml
index 8db109dea43..8eacf2f49b9 100644
--- a/backends/cortex_m/ops/operators.yaml
+++ b/backends/cortex_m/ops/operators.yaml
@@ -29,6 +29,12 @@
     - arg_meta: null
       kernel_name: cortex_m::quantized_mul_out
 
+- func: cortex_m::quantized_activation.out(Tensor input, Tensor lut, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  kernels:
+    - arg_meta: null
+      kernel_name: cortex_m::quantized_activation_out
+
 - func: cortex_m::minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
index 5704645caf8..24cc85bac66 100644
--- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py
+++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
@@ -13,7 +13,10 @@
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 
 from executorch.backends.cortex_m.passes.cortex_m_pass import CortexMPass
-from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot
+from executorch.backends.cortex_m.passes.passes_utils import (
+    build_activation_lut,
+    quantize_multiplier_aot,
+)
 from executorch.backends.cortex_m.passes.scratch_buffer_sizes import (
     required_cmsis_nn_buffer_sizes,
 )
@@ -483,6 +486,38 @@ def _get_bmm_replacement(self, node):
         )
         return exir_ops.edge.cortex_m.quantized_batch_matmul.default, args
 
+    def _get_activation_replacement(self, node):
+        """Swap a standalone quantized sigmoid / tanh / silu node for one
+        cortex_m.quantized_activation call.
+
+        Returns `(op, args)`, where `args` is the original input plus a constant
+        256-entry int8 LUT built from `node.target` and the node's qparams.
+        """
+        in_q = node.meta["input_qparams"][0]
+        out_q = node.meta["output_qparams"][0]
+        table = build_activation_lut(
+            node.target,
+            float(in_q.scale),
+            int(in_q.zp),
+            float(out_q.scale),
+            int(out_q.zp),
+        )
+
+        # Constant placeholders must precede user-input placeholders, so the
+        # LUT is inserted ahead of the first placeholder already in the graph.
+        anchor = next(n for n in node.graph.nodes if n.op == "placeholder")
+        with node.graph.inserting_before(anchor):
+            lut_placeholder = create_constant_placeholder(
+                self.exported_program,
+                node.graph,
+                f"{node.name}_lut",
+                InputKind.PARAMETER,
+                table,
+            )
+
+        replacement_args = (node.args[0], lut_placeholder)
+        return exir_ops.edge.cortex_m.quantized_activation.default, replacement_args
+
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         modified = False
         for node in graph_module.graph.nodes:
@@ -506,6 +541,12 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
                         op, args = self._get_convolution_replacement(node)
                 case exir_ops.edge.aten.bmm.default:
                     op, args = self._get_bmm_replacement(node)
+                case (
+                    exir_ops.edge.aten.sigmoid.default
+                    | exir_ops.edge.aten.tanh.default
+                    | exir_ops.edge.aten.silu.default
+                ):
+                    op, args = self._get_activation_replacement(node)
                 case _:
                     continue
 
diff --git a/backends/cortex_m/passes/passes_utils.py b/backends/cortex_m/passes/passes_utils.py
index fcbfa301b06..24e2da95dba 100644
--- a/backends/cortex_m/passes/passes_utils.py
+++ b/backends/cortex_m/passes/passes_utils.py
@@ -190,6 +190,67 @@ def is_qualified_int8_node(args) -> bool:
         return False
 
 
+def _stable_sigmoid(x: float) -> float:
+    # Overflow-safe logistic function. Only `exp(-|x|)` is ever evaluated, so
+    # `math.exp` cannot overflow for large `|x|` (e.g. wide-range input
+    # qparams); each branch below is algebraically `1 / (1 + exp(-x))`.
+    decay = math.exp(-abs(x))
+    if x >= 0:
+        return 1.0 / (1.0 + decay)
+    return decay / (1.0 + decay)
+
+
+def _stable_silu(x: float) -> float:
+    return _stable_sigmoid(x) * x  # silu(x) = x * sigmoid(x)
+
+
+_ACTIVATION_FNS = {
+    exir_ops.edge.aten.sigmoid.default: _stable_sigmoid,
+    exir_ops.edge.aten.tanh.default: math.tanh,
+    exir_ops.edge.aten.silu.default: _stable_silu,
+}
+
+
+def _round_half_away_from_zero(x: float) -> int:
+    # Round to nearest with ties away from zero: +0.5 goes toward +inf and
+    # -0.5 toward -inf. Python's built-in `round` uses banker's rounding and
+    # would disagree at exact half-integers.
+    magnitude = math.floor(abs(x) + 0.5)
+    return int(math.copysign(magnitude, x)) if x != 0 else 0
+
+
+def build_activation_lut(
+    target,
+    input_scale: float,
+    input_zp: int,
+    output_scale: float,
+    output_zp: int,
+) -> torch.Tensor:
+    """Precompute the 256-entry int8 table for one quantized activation.
+
+    `target` is the edge-dialect op being lowered (e.g.
+    `exir_ops.edge.aten.sigmoid.default`); other targets raise `ValueError`.
+
+    Entry `q_in + 128` holds the int8 output for int8 input `q_in`: the input
+    is dequantized, run through the float activation, requantized with
+    round-half-away-from-zero and clamped to [-128, 127].
+    """
+    activation = _ACTIVATION_FNS.get(target)
+    if activation is None:
+        raise ValueError(
+            f"build_activation_lut: unsupported activation target {target!r} "
+            f"(supported: {sorted(t.__name__ for t in _ACTIVATION_FNS)})"
+        )
+
+    def quantized_output(q_in: int) -> int:
+        real_out = activation((q_in - input_zp) * input_scale)
+        rounded = _round_half_away_from_zero(real_out / output_scale + output_zp)
+        return max(-128, min(127, rounded))
+
+    entries = [quantized_output(q) for q in range(-128, 128)]
+    return torch.tensor(entries, dtype=torch.int8)
+
+
 def quantize_multiplier_aot(scale: float) -> tuple[int, int]:
     if scale == 0.0:
         return 0, 0
diff --git a/backends/cortex_m/quantizer/pattern_checkers.py b/backends/cortex_m/quantizer/pattern_checkers.py
index 860d8345607..5715ca042de 100644
--- a/backends/cortex_m/quantizer/pattern_checkers.py
+++ b/backends/cortex_m/quantizer/pattern_checkers.py
@@ -99,6 +99,25 @@ def check_quantization_config(
         return is_int8
 
 
+class CortexMActivationCheck(PatternCheck):
+    """Pattern check for standalone sigmoid / tanh / silu, all of which lower
+    to the LUT-based cortex_m.quantized_activation op.
+
+    A config is accepted only when it is int8 and per-tensor on both sides.
+    """
+
+    @classmethod
+    def check_quantization_config(
+        cls, pattern: list[Node], quantization_config: QuantizationConfig
+    ) -> bool:
+        int8_ok = cls.is_int8_activations(quantization_config)
+        input_ok = cls.is_per_tensor(quantization_config.get_input_act_qspec())
+        output_ok = input_ok and cls.is_per_tensor(
+            quantization_config.get_output_act_qspec()
+        )
+        return int8_ok and output_ok
+
+
 class CortexMSoftmaxCheck(PatternCheck):
 
     @classmethod
diff --git a/backends/cortex_m/quantizer/quantizer_support.py b/backends/cortex_m/quantizer/quantizer_support.py
index 3dfbb67638a..317189a5f3e 100644
--- a/backends/cortex_m/quantizer/quantizer_support.py
+++ b/backends/cortex_m/quantizer/quantizer_support.py
@@ -5,6 +5,7 @@
 
 import torch
 from executorch.backends.cortex_m.quantizer.pattern_checkers import (
+    CortexMActivationCheck,
     CortexMAddMulCheck,
     CortexMAvgPool2DCheck,
     CortexMBmmCheck,
@@ -119,6 +120,12 @@
     (torch.ops.aten.softmax.int,): CortexMSoftmaxCheck,
 }
 
+ACTIVATION_OP_PATTERNS = {
+    (torch.ops.aten.sigmoid.default,): CortexMActivationCheck,
+    (torch.ops.aten.tanh.default,): CortexMActivationCheck,
+    (torch.ops.aten.silu.default,): CortexMActivationCheck,
+}
+
 POOL_OP_PATTERNS = {
     (torch.ops.aten.avg_pool2d.default,): CortexMAvgPool2DCheck,
     (torch.ops.aten.max_pool2d.default,): CortexMMaxPool2DCheck,
@@ -161,4 +168,5 @@
     | CONV_TRANSPOSE_OP_PATTERNS
     | POOL_OP_PATTERNS
     | BMM_OP_PATTERNS
+    | ACTIVATION_OP_PATTERNS
 )
diff --git a/backends/cortex_m/test/models/test_silero_vad.py b/backends/cortex_m/test/models/test_silero_vad.py
index 27b958627bb..9793f94f2c6 100644
--- a/backends/cortex_m/test/models/test_silero_vad.py
+++ b/backends/cortex_m/test/models/test_silero_vad.py
@@ -36,9 +36,18 @@
     "executorch_exir_dialects_edge__ops_aten_tanh_default": 2,
     "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2,
     "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1,
-    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 12,
-    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 11,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 15,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 14,
 }
+# The final `sigmoid(final_conv(x))` now lowers to cortex_m.quantized_activation.
+# The 3 remaining sigmoids and 2 tanhs are LSTMCell gates: PyTorch export
+# captures nn.LSTMCell as a single high-level op, so the quantizer never sees
+# the gate activations and can't annotate them. They're decomposed only at
+# to_edge -- which runs after the quantizer, so by then the gates have no
+# qparams to fold and the lowering pass correctly skips them. The unblocker
+# is a pre-annotation decompose pass that splits nn.LSTMCell into linear +
+# split + sigmoid + tanh + add + mul *before* prepare_pt2e runs; tracked as
+# the LSTMCell verification follow-up.
 ops_after_transforms: dict[str, int] = {
     "executorch_exir_dialects_edge__ops_aten_abs_default": 2,
     "executorch_exir_dialects_edge__ops_aten_add_Tensor": 2,
@@ -52,7 +61,7 @@
     "executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar": 2,
     "executorch_exir_dialects_edge__ops_aten_relu_default": 5,
     "executorch_exir_dialects_edge__ops_aten_select_copy_int": 2,
-    "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 4,
+    "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 3,
     "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 2,
     "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default": 1,
     "executorch_exir_dialects_edge__ops_aten_sqrt_default": 1,
@@ -61,8 +70,9 @@
     "executorch_exir_dialects_edge__ops_aten_tanh_default": 2,
     "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2,
     "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1,
-    "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 6,
-    "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 6,
+    "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 7,
+    "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 7,
+    "executorch_exir_dialects_edge__ops_cortex_m_quantized_activation_default": 1,
     "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1,
 }
 
diff --git a/backends/cortex_m/test/ops/test_activation_quant.py b/backends/cortex_m/test/ops/test_activation_quant.py
new file mode 100644
index 00000000000..6ae82e1e70c
--- /dev/null
+++ b/backends/cortex_m/test/ops/test_activation_quant.py
@@ -0,0 +1,152 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import torch
+from executorch.backends.arm.test.common import parametrize
+from executorch.backends.cortex_m.test.tester import (
+    CortexMTester,
+    McuTestCase,
+    ramp_tensor,
+)
+
+
+# A single per-op `ops_after_transforms` shape is enough: every supported
+# activation lowers to exactly one cortex_m.quantized_activation, with the
+# AoT LUT stored as a constant placeholder and a single quant/dequant pair
+# at the graph boundary.
+_OPS_BEFORE = {
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
+}
+_OPS_AFTER = {
+    "executorch_exir_dialects_edge__ops_cortex_m_quantized_activation_default": 1,
+    "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+    "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+}
+
+
+class _Sigmoid(torch.nn.Module):
+    # Edge-dialect op counts expected before / after the transforms.
+    ops_before_transforms = _OPS_BEFORE | {
+        "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 1,
+    }
+    ops_after_transforms = _OPS_AFTER
+
+    def forward(self, x):
+        return torch.sigmoid(x)
+
+
+class _Tanh(torch.nn.Module):
+    # Edge-dialect op counts expected before / after the transforms.
+    ops_before_transforms = _OPS_BEFORE | {
+        "executorch_exir_dialects_edge__ops_aten_tanh_default": 1,
+    }
+    ops_after_transforms = _OPS_AFTER
+
+    def forward(self, x):
+        return torch.tanh(x)
+
+
+class _SiLU(torch.nn.Module):
+    # Edge-dialect op counts expected before / after the transforms.
+    ops_before_transforms = _OPS_BEFORE | {
+        "executorch_exir_dialects_edge__ops_aten_silu_default": 1,
+    }
+    ops_after_transforms = _OPS_AFTER
+
+    def forward(self, x):
+        return torch.nn.functional.silu(x)
+
+
+def _zero_input(shape):
+    # All-zero float32 example input for the `*_zero` test cases. Uses the
+    # module-level `torch` import; a second, aliased import in the middle of
+    # the file is redundant and trips import-order linting (E402).
+    return torch.zeros(shape, dtype=torch.float32)
+
+
+# Wide-magnitude inputs exercise the `max(-128, min(127, q_out))` clamp inside
+# build_activation_lut; shifted-ramp inputs push the quantizer to pick a
+# non-zero `input_zp`, exercising the `(q - input_zp) * input_scale` term in
+# the LUT formula; all-zero inputs pin down the lut entry at `input_zp + 128`.
+test_cases = {
+    "sigmoid_rank1": McuTestCase(
+        model=_Sigmoid(),
+        example_inputs=(ramp_tensor(-6, 6, (16,)),),
+    ),
+    "sigmoid_rank4": McuTestCase(
+        model=_Sigmoid(),
+        example_inputs=(ramp_tensor(-4, 4, (1, 8, 4, 4)),),
+    ),
+    "sigmoid_saturating": McuTestCase(
+        model=_Sigmoid(),
+        example_inputs=(ramp_tensor(-50, 50, (32,)),),
+    ),
+    "sigmoid_asymmetric_zp": McuTestCase(
+        model=_Sigmoid(),
+        example_inputs=(ramp_tensor(-1, 9, (16,)),),
+    ),
+    "sigmoid_zero": McuTestCase(
+        model=_Sigmoid(),
+        example_inputs=(_zero_input((16,)),),
+    ),
+    "tanh_rank1": McuTestCase(
+        model=_Tanh(),
+        example_inputs=(ramp_tensor(-3, 3, (16,)),),
+    ),
+    "tanh_rank3": McuTestCase(
+        model=_Tanh(),
+        example_inputs=(ramp_tensor(-2, 2, (1, 4, 16)),),
+    ),
+    "tanh_saturating": McuTestCase(
+        model=_Tanh(),
+        example_inputs=(ramp_tensor(-30, 30, (32,)),),
+    ),
+    "tanh_asymmetric_zp": McuTestCase(
+        model=_Tanh(),
+        example_inputs=(ramp_tensor(-1, 5, (16,)),),
+    ),
+    "tanh_zero": McuTestCase(
+        model=_Tanh(),
+        example_inputs=(_zero_input((16,)),),
+    ),
+    "silu_rank1": McuTestCase(
+        model=_SiLU(),
+        example_inputs=(ramp_tensor(-6, 6, (16,)),),
+    ),
+    "silu_rank4": McuTestCase(
+        model=_SiLU(),
+        example_inputs=(ramp_tensor(-4, 4, (1, 8, 4, 4)),),
+    ),
+    "silu_saturating": McuTestCase(
+        model=_SiLU(),
+        example_inputs=(ramp_tensor(-50, 50, (32,)),),
+    ),
+    "silu_asymmetric_zp": McuTestCase(
+        model=_SiLU(),
+        example_inputs=(ramp_tensor(-1, 9, (16,)),),
+    ),
+    "silu_zero": McuTestCase(
+        model=_SiLU(),
+        example_inputs=(_zero_input((16,)),),
+    ),
+}
+
+
+@parametrize("test_case", test_cases)
+def test_dialect_quantized_activation(test_case):
+    module = test_case.model
+    CortexMTester(module, test_case.example_inputs).test_dialect(
+        module.ops_before_transforms,
+        module.ops_after_transforms,
+        qtol=1,
+    )
+
+
+@parametrize("test_case", test_cases)
+def test_implementation_quantized_activation(test_case):
+    # Build a fresh tester for this case and run its implementation test.
+    CortexMTester(test_case.model, test_case.example_inputs).test_implementation(qtol=1)
diff --git a/backends/cortex_m/test/tester.py b/backends/cortex_m/test/tester.py
index e9912d03cad..5a56ad62e92 100644
--- a/backends/cortex_m/test/tester.py
+++ b/backends/cortex_m/test/tester.py
@@ -42,6 +42,14 @@ def __init__(self):
                 torch.ops.aten.hardsigmoid_.default,
                 torch.ops.aten.hardswish.default,
                 torch.ops.aten.hardswish_.default,
+                # silu naturally decomposes to sigmoid*x at the to_edge step.
+                # Preserve it so the LUT lowering can collapse it into a single
+                # cortex_m.quantized_activation call rather than emitting an
+                # extra elementwise mul. Set globally because no per-test
+                # opt-out exists today; any new cortex_m test that uses SiLU
+                # must therefore expect a single aten.silu op in the edge graph
+                # (not sigmoid+mul).
+                torch.ops.aten.silu.default,
             ],
             _check_ir_validity=False,
             _core_aten_ops_exception_list=[torch.ops.aten.max_pool2d.default],