Merged
21 changes: 16 additions & 5 deletions backends/cortex_m/ops/cortex_m_ops_common.h
@@ -17,11 +17,6 @@
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
 
-// Include CMSIS-NN headers with C linkage
-extern "C" {
-#include "arm_nnfunctions.h"
-}
-
 using Tensor = torch::executor::Tensor;
 using ScalarType = executorch::aten::ScalarType;
 using Scalar = torch::executor::Scalar;
@@ -139,3 +134,19 @@ inline Error resize_to_broadcast_target_size(
   return executorch::runtime::resize_tensor(
       output, {expected_output_size, expected_output_dim});
 }
+
+/**
+ * Convert Scalar to CMSIS-NN int32 format
+ * For multipliers, zero_points, etc. from quantize_multiplier_aot
+ */
+inline int32_t extractScalarToInt32(const Scalar& scalar_value) {
+  return static_cast<int32_t>(scalar_value.to<int64_t>());
+}
+
+/**
+ * Convert Scalar to CMSIS-NN int format
+ * For shift values from quantize_multiplier_aot
+ */
+inline int extractScalarToInt(const Scalar& scalar_value) {
+  return static_cast<int>(scalar_value.to<int64_t>());
+}
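
For context, quantize_multiplier_aot itself is not part of this diff. A minimal sketch of the decomposition it presumably performs at export time (the standard gemmlowp/TFLite scheme; the function name and rounding details here are assumptions, not this repo's implementation):

#include <cmath>
#include <cstdint>

// Sketch: decompose a positive float scale into a Q31 multiplier and a
// power-of-two shift such that scale ~= multiplier * 2^(shift - 31).
// These are the int values that extractScalarToInt32/extractScalarToInt
// unpack on the device side.
inline void quantize_multiplier(double scale, int32_t* multiplier, int* shift) {
  if (scale == 0.0) {
    *multiplier = 0;
    *shift = 0;
    return;
  }
  // scale = significand * 2^(*shift), with significand in [0.5, 1)
  const double significand = std::frexp(scale, shift);
  int64_t q = static_cast<int64_t>(std::llround(significand * (1LL << 31)));
  if (q == (1LL << 31)) { // rounding can push the significand up to 1.0
    q /= 2;
    *shift += 1;
  }
  *multiplier = static_cast<int32_t>(q);
}

On device, multiplying by the Q31 value and applying the shift reproduces the float rescale using only integer arithmetic.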
27 changes: 14 additions & 13 deletions backends/cortex_m/ops/op_quantized_add.cpp
@@ -8,6 +8,11 @@

#include "cortex_m_ops_common.h"

// Include CMSIS-NN headers with C linkage
extern "C" {
#include "arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
@@ -54,19 +59,15 @@ Tensor& quantized_add_out(
"quantized_add_out: input1_int8.sizes() = %zu",
input1_int8.sizes().size());

// FIX: Use template types that ExecutorTorch definitely provides
// Use to<int64_t>() and to<double>() which are commonly instantiated
int32_t zp1 = static_cast<int32_t>(input1_zero_point.to<int64_t>());
int32_t input1_mult = static_cast<int32_t>(input1_multiplier.to<int64_t>());
int input1_shift_val = static_cast<int>(input1_shift.to<int64_t>());

int32_t zp2 = static_cast<int32_t>(input2_zero_point.to<int64_t>());
int32_t input2_mult = static_cast<int32_t>(input2_multiplier.to<int64_t>());
int input2_shift_val = static_cast<int>(input2_shift.to<int64_t>());

int32_t out_zp = static_cast<int32_t>(output_zero_point.to<int64_t>());
int32_t output_mult = static_cast<int32_t>(output_multiplier.to<int64_t>());
int output_shift_val = static_cast<int>(output_shift.to<int64_t>());
int32_t zp1 = extractScalarToInt32(input1_zero_point);
int32_t input1_mult = extractScalarToInt32(input1_multiplier);
int input1_shift_val = extractScalarToInt(input1_shift);
int32_t zp2 = extractScalarToInt32(input2_zero_point);
int32_t input2_mult = extractScalarToInt32(input2_multiplier);
int input2_shift_val = extractScalarToInt(input2_shift);
int32_t out_zp = extractScalarToInt32(output_zero_point);
int32_t output_mult = extractScalarToInt32(output_multiplier);
int output_shift_val = extractScalarToInt(output_shift);

// Left shift to maximize precision (tune as needed)
const int32_t left_shift = 20;
Expand Down
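
The remainder of the hunk is collapsed in the diff, so the actual kernel invocation is not visible here. A hedged sketch of how these values would feed CMSIS-NN's arm_elementwise_add_s8 (assuming the CMSIS-NN convention of passing negated input zero points as offsets, and assuming the output tensor parameter is named out):

// Sketch only: wiring the unpacked parameters into the CMSIS-NN kernel.
// input1_int8, input2_int8, and out are the Tensor arguments of
// quantized_add_out; <limits> provides the int8 activation clamp bounds.
arm_cmsis_nn_status status = arm_elementwise_add_s8(
    input1_int8.const_data_ptr<int8_t>(),
    input2_int8.const_data_ptr<int8_t>(),
    -zp1, // inputs take offsets, i.e. negated zero points
    input1_mult,
    input1_shift_val,
    -zp2,
    input2_mult,
    input2_shift_val,
    left_shift,
    out.mutable_data_ptr<int8_t>(),
    out_zp, // the output offset is the zero point itself
    output_mult,
    output_shift_val,
    std::numeric_limits<int8_t>::min(),
    std::numeric_limits<int8_t>::max(),
    static_cast<int32_t>(out.numel()));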
24 changes: 18 additions & 6 deletions examples/arm/aot_arm_compiler.py
@@ -600,6 +600,12 @@ def get_args():
action="store_false",
help="Disable strict checking while exporting models.",
)
parser.add_argument(
"--enable_qdq_fusion_pass",
action="store",
default=False,
help="Skip the QuantizedOpFusionPass fusion step (default: False)",
)
args = parser.parse_args()

if args.evaluate and (
@@ -791,14 +797,20 @@ def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_
     return model_int8, edge
 
 
-def transform_for_cortex_m_backend(edge):
+def transform_for_cortex_m_backend(edge, args):
     # Let's make sure we are using the optimized Cortex M backend
     # NB: If we can't find and replace the ops that are expected to be replaced,
     # bad things will happen at runtime, like "missing operator" errors!
-    # Instantiate the pass
-    replace_quant_pass = ReplaceQuantNodesPass()
-    quantized_op_fusion_pass = QuantizedOpFusionPass()
-    edge = edge.transform([replace_quant_pass, quantized_op_fusion_pass])
+
+    # Instantiate the mandatory ReplaceQuantNodesPass
+    passes = [ReplaceQuantNodesPass()]
+
+    # Conditionally add the QuantizedOpFusionPass
+    if args.enable_qdq_fusion_pass.lower() == "true":
+        passes.append(QuantizedOpFusionPass())
+
+    # Apply the passes
+    edge = edge.transform(passes)
 
     return edge
 
@@ -835,7 +847,7 @@ def transform_for_cortex_m_backend(edge):
     )
 
     # Transform so we can use ops from the Cortex M backend
-    edge = transform_for_cortex_m_backend(edge)
+    edge = transform_for_cortex_m_backend(edge, args)
 
     dump_delegation_info(edge, args.intermediates)
 
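
Note that the new option is stored as a string rather than a store_true boolean, which is why the pass list is gated on a case-insensitive string compare. A self-contained sketch of that behavior (standalone example, not code from this diff):

from argparse import ArgumentParser

# Sketch: the flag value arrives as a string, so "True", "true", and "TRUE"
# all enable the fusion pass, while any other value leaves it off.
parser = ArgumentParser()
parser.add_argument("--enable_qdq_fusion_pass", action="store", default="false")

args = parser.parse_args(["--enable_qdq_fusion_pass", "True"])
assert args.enable_qdq_fusion_pass.lower() == "true"

args = parser.parse_args([])  # the string default keeps .lower() safe
assert args.enable_qdq_fusion_pass.lower() != "true"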
5 changes: 4 additions & 1 deletion examples/arm/run.sh
@@ -40,6 +40,7 @@ ethos_u_scratch_dir=${script_dir}/ethos-u-scratch
 scratch_dir_set=false
 toolchain=arm-none-eabi-gcc
 select_ops_list="aten::_softmax.out"
+qdq_fusion_op=false
 
 function help() {
     echo "Usage: $(basename $0) [options]"
@@ -69,6 +70,7 @@ function help() {
echo " --pte_placement=<elf|ADDR> Ethos-U: Control if runtime has PTE baked into the elf or if its placed in memory outside of the elf, defaults to ${pte_placement}"
echo " --et_build_root=<FOLDER> Executorch build output root folder to use, defaults to ${et_build_root}"
echo " --scratch-dir=<FOLDER> Path to your Ethos-U scrach dir if you not using default ${ethos_u_scratch_dir}"
echo " --qdq_fusion_op=<true/false> Enable/Disable QDQ fusion op"
exit 0
}

@@ -96,6 +98,7 @@ for arg in "$@"; do
         --pte_placement=*) pte_placement="${arg#*=}";;
         --et_build_root=*) et_build_root="${arg#*=}";;
         --scratch-dir=*) ethos_u_scratch_dir="${arg#*=}" ; scratch_dir_set=true ;;
+        --qdq_fusion_op=*) qdq_fusion_op="${arg#*=}";;
         *)
         ;;
     esac
@@ -275,7 +278,7 @@ for i in "${!test_model[@]}"; do
         model_compiler_flags="${model_compiler_flags} --model_input=${model_input}"
     fi
 
-    ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --intermediate=${output_folder} --output=${pte_file} --system_config=${system_config} --memory_mode=${memory_mode} $bundleio_flag ${etrecord_flag} --config=${config}"
+    ARM_AOT_CMD="python3 -m examples.arm.aot_arm_compiler --model_name=${model} --target=${target} ${model_compiler_flags} --intermediate=${output_folder} --output=${pte_file} --system_config=${system_config} --memory_mode=${memory_mode} $bundleio_flag ${etrecord_flag} --config=${config} --enable_qdq_fusion_pass=${qdq_fusion_op}"
     echo "CALL ${ARM_AOT_CMD}" >&2
     ${ARM_AOT_CMD} 1>&2
 
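
For reference, a hypothetical invocation exercising the new option end to end (only the --qdq_fusion_op flag is from this diff; the script forwards it to aot_arm_compiler as --enable_qdq_fusion_pass):

# Build with the QDQ fusion pass enabled.
./examples/arm/run.sh --qdq_fusion_op=true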
@@ -960,7 +960,7 @@
 			isa = XCRemoteSwiftPackageReference;
 			repositoryURL = "https://github.com/pytorch/executorch";
 			requirement = {
-				branch = "swiftpm-0.8.0.20250724";
+				branch = "swiftpm-0.8.0.20250829";
 				kind = branch;
 			};
 		};
3 changes: 0 additions & 3 deletions extension/llm/runner/text_decoder_runner.h
@@ -12,9 +12,6 @@

 #include <executorch/extension/llm/runner/io_manager/io_manager.h>
 #include <executorch/extension/llm/sampler/sampler.h>
-#include <executorch/extension/module/module.h>
-#include <executorch/extension/tensor/tensor.h>
-#include <executorch/runtime/platform/compiler.h>
 
 namespace executorch {
 namespace extension {
1 change: 1 addition & 0 deletions extension/tensor/tensor.h
@@ -9,5 +9,6 @@
 #pragma once
 
 // Umbrella header for the Tensor extension.
+#include <executorch/extension/tensor/tensor_accessor.h>
 #include <executorch/extension/tensor/tensor_ptr.h>
 #include <executorch/extension/tensor/tensor_ptr_maker.h>