From 793438c3d177d95ecdbd1f9ebf66079c943f984c Mon Sep 17 00:00:00 2001
From: Github Executorch
Date: Fri, 29 Aug 2025 14:09:46 -0700
Subject: [PATCH 1/5] Summary: Minor cleanup post quantized_add op

Test Plan: examples/arm/run.sh --et_build_root=arm_test/test_run
 --target=ethos-u55-128 --model_name=qadd2 --no_delegate

Reviewers:

Subscribers:

Tasks:

Tags:
---
 backends/cortex_m/ops/cortex_m_ops_common.h | 21 ++++++++++++----
 backends/cortex_m/ops/op_quantized_add.cpp  | 27 +++++++++++----------
 2 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/backends/cortex_m/ops/cortex_m_ops_common.h b/backends/cortex_m/ops/cortex_m_ops_common.h
index 0bde2ddff17..5ef2d9d4bf9 100644
--- a/backends/cortex_m/ops/cortex_m_ops_common.h
+++ b/backends/cortex_m/ops/cortex_m_ops_common.h
@@ -17,11 +17,6 @@
 #include
 #include

-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 using Tensor = torch::executor::Tensor;
 using ScalarType = executorch::aten::ScalarType;
 using Scalar = torch::executor::Scalar;
@@ -139,3 +134,19 @@ inline Error resize_to_broadcast_target_size(
   return executorch::runtime::resize_tensor(
       output, {expected_output_size, expected_output_dim});
 }
+
+/**
+ * Convert Scalar to CMSIS-NN int32 format
+ * For multipliers, zero_points, etc. from quantize_multiplier_aot
+ */
+inline int32_t extractScalarToInt32(const Scalar& scalar_value) {
+  return static_cast<int32_t>(scalar_value.to<int64_t>());
+}
+
+/**
+ * Convert Scalar to CMSIS-NN int format
+ * For shift values from quantize_multiplier_aot
+ */
+inline int extractScalarToInt(const Scalar& scalar_value) {
+  return static_cast<int>(scalar_value.to<int64_t>());
+}
diff --git a/backends/cortex_m/ops/op_quantized_add.cpp b/backends/cortex_m/ops/op_quantized_add.cpp
index 47f6df6bfc5..044c2bd92d5 100644
--- a/backends/cortex_m/ops/op_quantized_add.cpp
+++ b/backends/cortex_m/ops/op_quantized_add.cpp
@@ -8,6 +8,11 @@
 #include "cortex_m_ops_common.h"

+// Include CMSIS-NN headers with C linkage
+extern "C" {
+#include "arm_nnfunctions.h"
+}
+
 namespace cortex_m {
 namespace native {

 using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
@@ -54,19 +59,15 @@ Tensor& quantized_add_out(
       "quantized_add_out: input1_int8.sizes() = %zu",
       input1_int8.sizes().size());

-  // FIX: Use template types that ExecutorTorch definitely provides
-  // Use to() and to() which are commonly instantiated
-  int32_t zp1 = static_cast<int32_t>(input1_zero_point.to<int64_t>());
-  int32_t input1_mult = static_cast<int32_t>(input1_multiplier.to<int64_t>());
-  int input1_shift_val = static_cast<int>(input1_shift.to<int64_t>());
-
-  int32_t zp2 = static_cast<int32_t>(input2_zero_point.to<int64_t>());
-  int32_t input2_mult = static_cast<int32_t>(input2_multiplier.to<int64_t>());
-  int input2_shift_val = static_cast<int>(input2_shift.to<int64_t>());
-
-  int32_t out_zp = static_cast<int32_t>(output_zero_point.to<int64_t>());
-  int32_t output_mult = static_cast<int32_t>(output_multiplier.to<int64_t>());
-  int output_shift_val = static_cast<int>(output_shift.to<int64_t>());
+  int32_t zp1 = extractScalarToInt32(input1_zero_point);
+  int32_t input1_mult = extractScalarToInt32(input1_multiplier);
+  int input1_shift_val = extractScalarToInt(input1_shift);
+  int32_t zp2 = extractScalarToInt32(input2_zero_point);
+  int32_t input2_mult = extractScalarToInt32(input2_multiplier);
+  int input2_shift_val = extractScalarToInt(input2_shift);
+  int32_t out_zp = extractScalarToInt32(output_zero_point);
+  int32_t output_mult = extractScalarToInt32(output_multiplier);
+  int output_shift_val = extractScalarToInt(output_shift);

   // Left shift to maximize precision (tune as needed)
   const int32_t left_shift = 20;
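A note on where those multiplier/shift pairs come from: per the helper
docstrings they are computed ahead of time by quantize_multiplier_aot. As
background, here is a minimal Python sketch of the standard way a float
requantization scale is split into a Q31 multiplier plus a base-2 shift.
This is an assumption about what the AoT helper does (the usual
TFLite/CMSIS-NN scheme), not code taken from this patch:

    import math

    def quantize_multiplier(scale: float) -> tuple[int, int]:
        # Split scale into mantissa m (0.5 <= m < 1) and exponent e,
        # so scale == m * 2**e, then encode m as a Q31 integer.
        if scale == 0.0:
            return 0, 0
        m, e = math.frexp(scale)
        q = round(m * (1 << 31))
        if q == (1 << 31):  # rounding pushed m up to 1.0; renormalize
            q //= 2
            e += 1
        return q, e

    # A requantization scale of 0.00784 becomes the kind of
    # (multiplier, shift) pair that extractScalarToInt32 /
    # extractScalarToInt unpack at runtime.
    print(quantize_multiplier(0.00784))  # multiplier ~0.50176 * 2**31, shift -6
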
From a62ee11b50560d37fe661e06ab55d35b232bbfca Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Fri, 29 Aug 2025 14:54:57 -0700
Subject: [PATCH 2/5] Add tensor accessor to umbrella includes (#13826)

---
 extension/tensor/tensor.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/extension/tensor/tensor.h b/extension/tensor/tensor.h
index 80a41018a20..467fbb1c160 100644
--- a/extension/tensor/tensor.h
+++ b/extension/tensor/tensor.h
@@ -9,5 +9,6 @@
 #pragma once

 // Umbrella header for the Tensor extension.
+#include <executorch/extension/tensor/tensor_accessor.h>
 #include <executorch/extension/tensor/tensor_ptr.h>
 #include <executorch/extension/tensor/tensor_ptr_maker.h>

From dc59852d06eeea34468cd32e579ddd2314c306aa Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Fri, 29 Aug 2025 15:26:06 -0700
Subject: [PATCH 3/5] Clean up includes in text_decoder_runner.h (#13827)

---
 extension/llm/runner/text_decoder_runner.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h
index 59761396f5c..9a090de50d6 100644
--- a/extension/llm/runner/text_decoder_runner.h
+++ b/extension/llm/runner/text_decoder_runner.h
@@ -12,9 +12,6 @@

 #include
 #include
-#include
-#include
-#include

 namespace executorch {
 namespace extension {

From 7097ee1d06c04807fbb9143685761ae94bac9348 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin
Date: Fri, 29 Aug 2025 15:26:28 -0700
Subject: [PATCH 4/5] Update executorch branch to swiftpm-0.8.0.20250829
 (#13828)

---
 .../demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
index 94c09dc9c32..8c5743291e7 100644
--- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
+++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
@@ -960,7 +960,7 @@
 			isa = XCRemoteSwiftPackageReference;
 			repositoryURL = "https://github.com/pytorch/executorch";
 			requirement = {
-				branch = "swiftpm-0.8.0.20250724";
+				branch = "swiftpm-0.8.0.20250829";
 				kind = branch;
 			};
 		};

From da132c333562be15c1d0c2360d93a133ffd68b77 Mon Sep 17 00:00:00 2001
From: Github Executorch
Date: Fri, 29 Aug 2025 15:33:27 -0700
Subject: [PATCH 5/5] Summary: Fix CI job: test-arm-backend
 (test_run_ethosu_fvp) / linux-job.

Add a 'qdq_fusion_pass' flag to run.sh and aot_arm_compiler. This lets
certain CI jobs pass, since the fusion op can now be conditionally
enabled or disabled with the flag. Because the fusion-op passes are
still WIP and not all ops are supported yet, it is good to rely on this
flag until we have full-fledged support for fusion ops.

Test Plan:
1. examples/arm/run.sh --et_build_root=arm_test/test_run
   --target=ethos-u55-128 --model_name=qops --bundleio --no_delegate
   --portable_kernels="aten::sub.out,aten::add.out,aten::mul.out"
   --qdq_fusion_op=true
   // The above fails, as 'QuantOpTest' has mixed binary operations.
2. examples/arm/run.sh --et_build_root=arm_test/test_run
   --target=ethos-u55-128 --model_name=qops --bundleio --no_delegate
   --portable_kernels="aten::sub.out,aten::add.out,aten::mul.out"
   --> passes (the default flag value is false)

Reviewers:

Subscribers:

Tasks:

Tags:
---
 examples/arm/aot_arm_compiler.py | 24 ++++++++++++++++++------
 examples/arm/run.sh              |  5 ++++-
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 72e91fc640d..69f4d719b32 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -600,6 +600,12 @@ def get_args():
         action="store_false",
         help="Disable strict checking while exporting models.",
     )
+    parser.add_argument(
+        "--enable_qdq_fusion_pass",
+        action="store",
+        default="false",
+        help="Enable the QuantizedOpFusionPass fusion step (default: false)",
+    )
     args = parser.parse_args()

     if args.evaluate and (
@@ -791,14 +797,20 @@ def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_
     return model_int8, edge


-def transform_for_cortex_m_backend(edge):
+def transform_for_cortex_m_backend(edge, args):
     # Let's make sure we are using optimized Cortex M backend
     # NB: If we can't find and replace ops those are expected to be replaced,
     # bad things will happen at runtime, like "missing operator" errors!
-    # Instantiate the pass
-    replace_quant_pass = ReplaceQuantNodesPass()
-    quantized_op_fusion_pass = QuantizedOpFusionPass()
-    edge = edge.transform([replace_quant_pass, quantized_op_fusion_pass])
+
+    # Instantiate the mandatory ReplaceQuantNodesPass
+    passes = [ReplaceQuantNodesPass()]
+
+    # Conditionally add the QuantizedOpFusionPass
+    if args.enable_qdq_fusion_pass.lower() == "true":
+        passes.append(QuantizedOpFusionPass())
+
+    # Apply the passes
+    edge = edge.transform(passes)
     return edge
@@ -835,7 +847,7 @@ def transform_for_cortex_m_backend(edge):
     )

     # Transform so we can use ops from the Cortex M backend
-    edge = transform_for_cortex_m_backend(edge)
+    edge = transform_for_cortex_m_backend(edge, args)

     dump_delegation_info(edge, args.intermediates)
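Before the run.sh half of the patch, one subtlety in the argparse wiring
above is worth a note: run.sh forwards the flag as the literal string
"true" or "false", which is why the check is
args.enable_qdq_fusion_pass.lower() == "true", and why the default should
itself be a string (a bool default would make the .lower() call raise
when the flag is omitted). A self-contained sketch of the same pattern,
with stand-in strings in place of the real pass objects:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--enable_qdq_fusion_pass",
        action="store",
        default="false",  # a string, so .lower() below is always valid
        help="Enable the QuantizedOpFusionPass fusion step (default: false)",
    )
    args = parser.parse_args(["--enable_qdq_fusion_pass=True"])

    passes = ["ReplaceQuantNodesPass"]  # mandatory pass (stand-in name)
    if args.enable_qdq_fusion_pass.lower() == "true":
        passes.append("QuantizedOpFusionPass")  # optional fusion (stand-in)
    print(passes)  # ['ReplaceQuantNodesPass', 'QuantizedOpFusionPass']
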
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index 172635a7744..b3828041068 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -40,6 +40,7 @@ ethos_u_scratch_dir=${script_dir}/ethos-u-scratch
 scratch_dir_set=false
 toolchain=arm-none-eabi-gcc
 select_ops_list="aten::_softmax.out"
+qdq_fusion_op=false

 function help() {
     echo "Usage: $(basename $0) [options]"
@@ -69,6 +70,7 @@ function help() {
     echo "  --pte_placement=   Ethos-U: Control if the runtime has the PTE baked into the elf or if it's placed in memory outside of the elf, defaults to ${pte_placement}"
     echo "  --et_build_root=   Executorch build output root folder to use, defaults to ${et_build_root}"
     echo "  --scratch-dir=     Path to your Ethos-U scratch dir if you're not using the default ${ethos_u_scratch_dir}"
+    echo "  --qdq_fusion_op=   Enable/disable the QDQ fusion op, defaults to ${qdq_fusion_op}"
     exit 0
 }
@@ -96,6 +98,7 @@ for arg in "$@"; do
       --pte_placement=*) pte_placement="${arg#*=}";;
       --et_build_root=*) et_build_root="${arg#*=}";;
       --scratch-dir=*) ethos_u_scratch_dir="${arg#*=}" ; scratch_dir_set=true ;;
+      --qdq_fusion_op=*) qdq_fusion_op="${arg#*=}";;
       *)
       ;;
   esac
@@ -275,7 +278,7 @@ for i in "${!test_model[@]}"; do
         model_compiler_flags="${model_compiler_flags} --model_input=${model_input}"
     fi

-    ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --intermediate=${output_folder} --output=${pte_file} --system_config=${system_config} --memory_mode=${memory_mode} $bundleio_flag ${etrecord_flag} --config=${config}"
+    ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --intermediate=${output_folder} --output=${pte_file} --system_config=${system_config} --memory_mode=${memory_mode} $bundleio_flag ${etrecord_flag} --config=${config} --enable_qdq_fusion_pass=${qdq_fusion_op}"

     echo "CALL ${ARM_AOT_CMD}" >&2
     ${ARM_AOT_CMD} 1>&2
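For end-to-end sanity checks of the no-delegate path exercised in the test
plan, a pure-Python reference of a requantized int8 add can be handy. The
sketch below assumes the standard TFLite/CMSIS-NN add dataflow (left-shift
into high precision, per-input requantize, sum, output requantize), which
is what the patch-1 kernel parameters correspond to; it is a reference
model under those assumptions, not code from the ExecuTorch tree:

    import math

    def quantize_multiplier(scale: float) -> tuple[int, int]:
        m, e = math.frexp(scale)  # scale == m * 2**e, 0.5 <= m < 1
        q = round(m * (1 << 31))  # Q31 mantissa
        if q == (1 << 31):
            q, e = q // 2, e + 1
        return q, e

    def requantize(v: int, mult: int, shift: int) -> int:
        # v * (mult / 2**31) * 2**shift, rounded to nearest
        n = 31 - shift
        return (v * mult + (1 << (n - 1))) >> n

    def quantized_add_s8(x1, zp1, s1, x2, zp2, s2, zp_out, s_out,
                         left_shift=20):  # matches left_shift in patch 1
        twice_max = 2.0 * max(s1, s2)
        m1 = quantize_multiplier(s1 / twice_max)
        m2 = quantize_multiplier(s2 / twice_max)
        mo = quantize_multiplier(twice_max / ((1 << left_shift) * s_out))
        a = requantize((x1 - zp1) << left_shift, *m1)
        b = requantize((x2 - zp2) << left_shift, *m2)
        out = requantize(a + b, *mo) + zp_out
        return max(-128, min(127, out))

    # 3.0 + 4.0 with input scale 0.05 and output scale 0.1:
    # inputs quantize to 60 and 80, and 7.0 should come back as ~70.
    print(quantized_add_s8(60, 0, 0.05, 80, 0, 0.05, 0, 0.1))  # 70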