diff --git a/backends/cortex_m/ops/cortex_m_ops_common.h b/backends/cortex_m/ops/cortex_m_ops_common.h
index 0bde2ddff17..5ef2d9d4bf9 100644
--- a/backends/cortex_m/ops/cortex_m_ops_common.h
+++ b/backends/cortex_m/ops/cortex_m_ops_common.h
@@ -17,11 +17,6 @@
 #include
 #include
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 using Tensor = torch::executor::Tensor;
 using ScalarType = executorch::aten::ScalarType;
 using Scalar = torch::executor::Scalar;
@@ -139,3 +134,19 @@ inline Error resize_to_broadcast_target_size(
   return executorch::runtime::resize_tensor(
       output, {expected_output_size, expected_output_dim});
 }
+
+/**
+ * Convert Scalar to CMSIS-NN int32 format
+ * For multipliers, zero_points, etc. from quantize_multiplier_aot
+ */
+inline int32_t extractScalarToInt32(const Scalar& scalar_value) {
+  return static_cast<int32_t>(scalar_value.to<int64_t>());
+}
+
+/**
+ * Convert Scalar to CMSIS-NN int format
+ * For shift values from quantize_multiplier_aot
+ */
+inline int extractScalarToInt(const Scalar& scalar_value) {
+  return static_cast<int>(scalar_value.to<int64_t>());
+}
diff --git a/backends/cortex_m/ops/op_quantized_add.cpp b/backends/cortex_m/ops/op_quantized_add.cpp
index 47f6df6bfc5..044c2bd92d5 100644
--- a/backends/cortex_m/ops/op_quantized_add.cpp
+++ b/backends/cortex_m/ops/op_quantized_add.cpp
@@ -8,6 +8,11 @@
 
 #include "cortex_m_ops_common.h"
 
+// Include CMSIS-NN headers with C linkage
+extern "C" {
+#include "arm_nnfunctions.h"
+}
+
 namespace cortex_m {
 namespace native {
 using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
@@ -54,19 +59,15 @@ Tensor& quantized_add_out(
       "quantized_add_out: input1_int8.sizes() = %zu",
       input1_int8.sizes().size());
 
-  // FIX: Use template types that ExecutorTorch definitely provides
-  // Use to() and to() which are commonly instantiated
-  int32_t zp1 = static_cast<int32_t>(input1_zero_point.to<int64_t>());
-  int32_t input1_mult = static_cast<int32_t>(input1_multiplier.to<int64_t>());
-  int input1_shift_val = static_cast<int>(input1_shift.to<int64_t>());
-
-  int32_t zp2 = static_cast<int32_t>(input2_zero_point.to<int64_t>());
-  int32_t input2_mult = static_cast<int32_t>(input2_multiplier.to<int64_t>());
-  int input2_shift_val = static_cast<int>(input2_shift.to<int64_t>());
-
-  int32_t out_zp = static_cast<int32_t>(output_zero_point.to<int64_t>());
-  int32_t output_mult = static_cast<int32_t>(output_multiplier.to<int64_t>());
-  int output_shift_val = static_cast<int>(output_shift.to<int64_t>());
+  int32_t zp1 = extractScalarToInt32(input1_zero_point);
+  int32_t input1_mult = extractScalarToInt32(input1_multiplier);
+  int input1_shift_val = extractScalarToInt(input1_shift);
+  int32_t zp2 = extractScalarToInt32(input2_zero_point);
+  int32_t input2_mult = extractScalarToInt32(input2_multiplier);
+  int input2_shift_val = extractScalarToInt(input2_shift);
+  int32_t out_zp = extractScalarToInt32(output_zero_point);
+  int32_t output_mult = extractScalarToInt32(output_multiplier);
+  int output_shift_val = extractScalarToInt(output_shift);
 
   // Left shift to maximize precision (tune as needed)
   const int32_t left_shift = 20;
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 72e91fc640d..69f4d719b32 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -600,6 +600,12 @@ def get_args():
         action="store_false",
         help="Disable strict checking while exporting models.",
     )
+    parser.add_argument(
+        "--enable_qdq_fusion_pass",
+        action="store",
+        default="false",
+        help="Enable the QuantizedOpFusionPass fusion step (default: false)",
+    )
 
     args = parser.parse_args()
     if args.evaluate and (
@@ -791,14 +797,20 @@ def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_
     return model_int8, edge
 
 
-def transform_for_cortex_m_backend(edge):
+def transform_for_cortex_m_backend(edge, args):
     # Let's make sure we are using optimized Cortex M backend
     # NB: If we can't find and replace ops those are expected to be replaced,
     # bad things will happen at runtime, like "missing operator" errors!
-    # Instantiate the pass
-    replace_quant_pass = ReplaceQuantNodesPass()
-    quantized_op_fusion_pass = QuantizedOpFusionPass()
-    edge = edge.transform([replace_quant_pass, quantized_op_fusion_pass])
+
+    # Instantiate the mandatory ReplaceQuantNodesPass
+    passes = [ReplaceQuantNodesPass()]
+
+    # Conditionally add the QuantizedOpFusionPass
+    if args.enable_qdq_fusion_pass.lower() == "true":
+        passes.append(QuantizedOpFusionPass())
+
+    # Apply the passes
+    edge = edge.transform(passes)
 
     return edge
 
@@ -835,7 +847,7 @@ def transform_for_cortex_m_backend(edge):
     )
 
     # Transform so we can use ops from the Cortex M backend
-    edge = transform_for_cortex_m_backend(edge)
+    edge = transform_for_cortex_m_backend(edge, args)
 
     dump_delegation_info(edge, args.intermediates)
 
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index 172635a7744..b3828041068 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -40,6 +40,7 @@ ethos_u_scratch_dir=${script_dir}/ethos-u-scratch
 scratch_dir_set=false
 toolchain=arm-none-eabi-gcc
 select_ops_list="aten::_softmax.out"
+qdq_fusion_op=false
 
 function help() {
     echo "Usage: $(basename $0) [options]"
@@ -69,6 +70,7 @@ function help() {
     echo "  --pte_placement=    Ethos-U: Control if runtime has PTE baked into the elf or if its placed in memory outside of the elf, defaults to ${pte_placement}"
     echo "  --et_build_root=    Executorch build output root folder to use, defaults to ${et_build_root}"
     echo "  --scratch-dir=      Path to your Ethos-U scrach dir if you not using default ${ethos_u_scratch_dir}"
+    echo "  --qdq_fusion_op=    Enable/Disable QDQ fusion op"
     exit 0
 }
 
@@ -96,6 +98,7 @@ for arg in "$@"; do
       --pte_placement=*) pte_placement="${arg#*=}";;
       --et_build_root=*) et_build_root="${arg#*=}";;
       --scratch-dir=*) ethos_u_scratch_dir="${arg#*=}" ; scratch_dir_set=true ;;
+      --qdq_fusion_op=*) qdq_fusion_op="${arg#*=}";;
       *)
       ;;
     esac
@@ -275,7 +278,7 @@ for i in "${!test_model[@]}"; do
         model_compiler_flags="${model_compiler_flags} --model_input=${model_input}"
     fi
 
-    ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --intermediate=${output_folder} --output=${pte_file} --system_config=${system_config} --memory_mode=${memory_mode} $bundleio_flag ${etrecord_flag} --config=${config}"
+    ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --intermediate=${output_folder} --output=${pte_file} --system_config=${system_config} --memory_mode=${memory_mode} $bundleio_flag ${etrecord_flag} --config=${config} --enable_qdq_fusion_pass=${qdq_fusion_op}"
 
     echo "CALL ${ARM_AOT_CMD}" >&2
     ${ARM_AOT_CMD} 1>&2
diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
index 94c09dc9c32..8c5743291e7 100644
--- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
+++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
@@ -960,7 +960,7 @@
 			isa = XCRemoteSwiftPackageReference;
 			repositoryURL = "https://github.com/pytorch/executorch";
 			requirement = {
"swiftpm-0.8.0.20250724"; + branch = "swiftpm-0.8.0.20250829"; kind = branch; }; }; diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 59761396f5c..9a090de50d6 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -12,9 +12,6 @@ #include #include -#include -#include -#include namespace executorch { namespace extension { diff --git a/extension/tensor/tensor.h b/extension/tensor/tensor.h index 80a41018a20..467fbb1c160 100644 --- a/extension/tensor/tensor.h +++ b/extension/tensor/tensor.h @@ -9,5 +9,6 @@ #pragma once // Umbrella header for the Tensor extension. +#include #include #include