diff --git a/backends/cortex_m/ops/cortex_m_ops_common.h b/backends/cortex_m/ops/cortex_m_ops_common.h
index 0bde2ddff17..5ef2d9d4bf9 100644
--- a/backends/cortex_m/ops/cortex_m_ops_common.h
+++ b/backends/cortex_m/ops/cortex_m_ops_common.h
@@ -17,11 +17,6 @@
 #include
 #include
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 using Tensor = torch::executor::Tensor;
 using ScalarType = executorch::aten::ScalarType;
 using Scalar = torch::executor::Scalar;
@@ -139,3 +134,19 @@ inline Error resize_to_broadcast_target_size(
   return executorch::runtime::resize_tensor(
       output, {expected_output_size, expected_output_dim});
 }
+
+/**
+ * Convert Scalar to CMSIS-NN int32 format
+ * For multipliers, zero_points, etc. from quantize_multiplier_aot
+ */
+inline int32_t extractScalarToInt32(const Scalar& scalar_value) {
+  return static_cast<int32_t>(scalar_value.to<int64_t>());
+}
+
+/**
+ * Convert Scalar to CMSIS-NN int format
+ * For shift values from quantize_multiplier_aot
+ */
+inline int extractScalarToInt(const Scalar& scalar_value) {
+  return static_cast<int>(scalar_value.to<int64_t>());
+}
diff --git a/backends/cortex_m/ops/op_quantized_add.cpp b/backends/cortex_m/ops/op_quantized_add.cpp
index 47f6df6bfc5..044c2bd92d5 100644
--- a/backends/cortex_m/ops/op_quantized_add.cpp
+++ b/backends/cortex_m/ops/op_quantized_add.cpp
@@ -8,6 +8,11 @@
 
 #include "cortex_m_ops_common.h"
 
+// Include CMSIS-NN headers with C linkage
+extern "C" {
+#include "arm_nnfunctions.h"
+}
+
 namespace cortex_m {
 namespace native {
 using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
@@ -54,19 +59,15 @@ Tensor& quantized_add_out(
       "quantized_add_out: input1_int8.sizes() = %zu",
       input1_int8.sizes().size());
 
-  // FIX: Use template types that ExecutorTorch definitely provides
-  // Use to() and to() which are commonly instantiated
-  int32_t zp1 = static_cast<int32_t>(input1_zero_point.to<int64_t>());
-  int32_t input1_mult = static_cast<int32_t>(input1_multiplier.to<int64_t>());
-  int input1_shift_val = static_cast<int>(input1_shift.to<int64_t>());
-
-  int32_t zp2 = static_cast<int32_t>(input2_zero_point.to<int64_t>());
-  int32_t input2_mult = static_cast<int32_t>(input2_multiplier.to<int64_t>());
-  int input2_shift_val = static_cast<int>(input2_shift.to<int64_t>());
-
-  int32_t out_zp = static_cast<int32_t>(output_zero_point.to<int64_t>());
-  int32_t output_mult = static_cast<int32_t>(output_multiplier.to<int64_t>());
-  int output_shift_val = static_cast<int>(output_shift.to<int64_t>());
+  int32_t zp1 = extractScalarToInt32(input1_zero_point);
+  int32_t input1_mult = extractScalarToInt32(input1_multiplier);
+  int input1_shift_val = extractScalarToInt(input1_shift);
+  int32_t zp2 = extractScalarToInt32(input2_zero_point);
+  int32_t input2_mult = extractScalarToInt32(input2_multiplier);
+  int input2_shift_val = extractScalarToInt(input2_shift);
+  int32_t out_zp = extractScalarToInt32(output_zero_point);
+  int32_t output_mult = extractScalarToInt32(output_multiplier);
+  int output_shift_val = extractScalarToInt(output_shift);
 
   // Left shift to maximize precision (tune as needed)
   const int32_t left_shift = 20;
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 72e91fc640d..69f4d719b32 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -600,6 +600,12 @@ def get_args():
         action="store_false",
         help="Disable strict checking while exporting models.",
     )
+    parser.add_argument(
+        "--enable_qdq_fusion_pass",
+        action="store",
+        default="false",
+        help="Enable the QuantizedOpFusionPass fusion step (default: false)",
+    )
 
     args = parser.parse_args()
     if args.evaluate and (
@@ -791,14 +797,20 @@ def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_
     return model_int8, edge
 
 
-def transform_for_cortex_m_backend(edge):
+def transform_for_cortex_m_backend(edge, args):
     # Let's make sure we are using optimized Cortex M backend
     # NB: If we can't find and replace ops those are expected to be replaced,
     # bad things will happen at runtime, like "missing operator" errors!
-    # Instantiate the pass
-    replace_quant_pass = ReplaceQuantNodesPass()
-    quantized_op_fusion_pass = QuantizedOpFusionPass()
-    edge = edge.transform([replace_quant_pass, quantized_op_fusion_pass])
+
+    # Instantiate the mandatory ReplaceQuantNodesPass
+    passes = [ReplaceQuantNodesPass()]
+
+    # Conditionally add the QuantizedOpFusionPass
+    if args.enable_qdq_fusion_pass.lower() == "true":
+        passes.append(QuantizedOpFusionPass())
+
+    # Apply the passes
+    edge = edge.transform(passes)
 
     return edge
 
@@ -835,7 +847,7 @@ def transform_for_cortex_m_backend(edge):
     )
 
     # Transform so we can use ops from the Cortex M backend
-    edge = transform_for_cortex_m_backend(edge)
+    edge = transform_for_cortex_m_backend(edge, args)
 
     dump_delegation_info(edge, args.intermediates)
 
diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index 172635a7744..b3828041068 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -40,6 +40,7 @@ ethos_u_scratch_dir=${script_dir}/ethos-u-scratch
 scratch_dir_set=false
 toolchain=arm-none-eabi-gcc
 select_ops_list="aten::_softmax.out"
+qdq_fusion_op=false
 
 function help() {
     echo "Usage: $(basename $0) [options]"
@@ -69,6 +70,7 @@ function help() {
     echo "  --pte_placement=    Ethos-U: Control if runtime has PTE baked into the elf or if its placed in memory outside of the elf, defaults to ${pte_placement}"
     echo "  --et_build_root=    Executorch build output root folder to use, defaults to ${et_build_root}"
     echo "  --scratch-dir=      Path to your Ethos-U scrach dir if you not using default ${ethos_u_scratch_dir}"
+    echo "  --qdq_fusion_op=    Enable/Disable QDQ fusion op"
     exit 0
 }
 
@@ -96,6 +98,7 @@ for arg in "$@"; do
       --pte_placement=*) pte_placement="${arg#*=}";;
       --et_build_root=*) et_build_root="${arg#*=}";;
       --scratch-dir=*) ethos_u_scratch_dir="${arg#*=}" ; scratch_dir_set=true ;;
+      --qdq_fusion_op=*) qdq_fusion_op="${arg#*=}";;
       *)
       ;;
     esac
@@ -275,7 +278,7 @@ for i in "${!test_model[@]}"; do
         model_compiler_flags="${model_compiler_flags} --model_input=${model_input}"
     fi
 
-    ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --intermediate=${output_folder} --output=${pte_file} --system_config=${system_config} --memory_mode=${memory_mode} $bundleio_flag ${etrecord_flag} --config=${config}"
+    ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --intermediate=${output_folder} --output=${pte_file} --system_config=${system_config} --memory_mode=${memory_mode} $bundleio_flag ${etrecord_flag} --config=${config} --enable_qdq_fusion_pass=${qdq_fusion_op}"
 
     echo "CALL ${ARM_AOT_CMD}" >&2
     ${ARM_AOT_CMD} 1>&2
diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
index 94c09dc9c32..8c5743291e7 100644
--- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
+++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj
@@ -960,7 +960,7 @@
 			isa = XCRemoteSwiftPackageReference;
 			repositoryURL = "https://github.com/pytorch/executorch";
 			requirement = {
"swiftpm-0.8.0.20250724"; + branch = "swiftpm-0.8.0.20250829"; kind = branch; }; }; diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h index 59761396f5c..9a090de50d6 100644 --- a/extension/llm/runner/text_decoder_runner.h +++ b/extension/llm/runner/text_decoder_runner.h @@ -12,9 +12,6 @@ #include #include -#include -#include -#include namespace executorch { namespace extension { diff --git a/extension/tensor/tensor.h b/extension/tensor/tensor.h index 80a41018a20..467fbb1c160 100644 --- a/extension/tensor/tensor.h +++ b/extension/tensor/tensor.h @@ -9,5 +9,6 @@ #pragma once // Umbrella header for the Tensor extension. +#include #include #include