51 changes: 41 additions & 10 deletions backends/cortex_m/CMakeLists.txt
@@ -5,11 +5,6 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Kernel library for Cortex-M operators. Please keep this file formatted by
# running:
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~
cmake_minimum_required(VERSION 3.19)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -24,29 +19,65 @@ endif()

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
include(FetchContent)

# CMSIS-NN version to download
set(CMSIS_NN_VERSION
"v4.1.0"
CACHE STRING "CMSIS-NN version to download"
)

# Declare CMSIS-NN as a FetchContent project
FetchContent_Declare(
cmsis_nn
GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
GIT_TAG ${CMSIS_NN_VERSION}
)

# Download and make CMSIS-NN available
FetchContent_MakeAvailable(cmsis_nn)

# Print paths for debugging
message(STATUS "CMSIS-NN source dir: ${cmsis_nn_SOURCE_DIR}")
message(STATUS "CMSIS-NN binary dir: ${cmsis_nn_BINARY_DIR}")

# Cortex-M ops kernel sources
set(_cortex_m_kernels__srcs
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
)

# Generate C++ bindings to register kernels into Executorch (for runtime). Here
# select all ops in operators.yaml
# Generate C++ bindings to register kernels into Executorch (for runtime)
set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml)
gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}")

# Generate bindings for the kernels
generate_bindings_for_kernels(
LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}"
)
message("Generated files ${gen_command_sources}")

# Build a library for _cortex_m_kernels_srcs
# Build a library for cortex_m_kernels
add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
target_link_libraries(cortex_m_kernels PRIVATE executorch)
target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})

# Include directories for cortex_m_kernels
Contributor: Besides _common_compile_options, don't you also need Cortex-M-specific compile options?
Contributor: They come from the toolchain file.
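(For context on the exchange above: a minimal sketch of the kind of Cortex-M-specific options a CMake toolchain file usually carries, assuming Arm GCC and a Cortex-M55 target; the exact flags are illustrative only and not part of this PR.)

set(CMAKE_SYSTEM_NAME Generic)
set(CMAKE_SYSTEM_PROCESSOR cortex-m55)
set(CMAKE_C_COMPILER arm-none-eabi-gcc)
set(CMAKE_CXX_COMPILER arm-none-eabi-g++)
add_compile_options(-mcpu=cortex-m55 -mthumb -mfloat-abi=hard)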

target_include_directories(
cortex_m_kernels
PRIVATE ${EXECUTORCH_ROOT}/..
${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
${cmsis_nn_SOURCE_DIR}/Include
)

# Link directly to the CMSIS-NN static library file
target_link_libraries(
Collaborator: Why is it done this way instead of directly adding the library target?
Contributor (author): The CMSIS-NN library is linked by specifying the static library file path directly because CMSIS-NN is brought in via FetchContent and does not define a CMake target in the build.

cortex_m_kernels PUBLIC ${cmsis_nn_BINARY_DIR}/libcmsis-nn.a executorch
)

# Add dependency to ensure CMSIS-NN builds before we try to link. Use the actual
# CMSIS-NN target name (usually 'cmsis-nn')
add_dependencies(cortex_m_kernels cmsis-nn)
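(If the FetchContent build of CMSIS-NN does export the cmsis-nn target that the add_dependencies call above refers to, a possible alternative is to link that target directly; a sketch only, not what this PR does:)

target_link_libraries(cortex_m_kernels PUBLIC cmsis-nn executorch)

(Linking the target would also make the explicit add_dependencies call unnecessary, since CMake tracks the build dependency through the target itself.)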

Contributor: There are no C++ tests?

if(CORTEX_M_BUILD_TESTS)
  enable_testing()
  add_subdirectory(tests)
endif()

Contributor (author, @psiddh, Aug 27, 2025): @mergennachin Created a task to track a dedicated test suite for cortex_m ops (covering Python tests, C++ unit tests & E2E tests): #13739 (initial work in PR #1357).

Contributor: FYI, we can't run cortex-m op unit tests without an FVP, but for C++-only utils, sure.
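(For the C++-only utilities mentioned above, a tests/CMakeLists.txt gated by CORTEX_M_BUILD_TESTS could look roughly like the sketch below; GoogleTest via FetchContent, the pinned version, and the test file name are assumptions, not part of this PR.)

FetchContent_Declare(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG v1.14.0
)
FetchContent_MakeAvailable(googletest)

# hypothetical host-buildable unit test for the C++-only utilities
add_executable(cortex_m_utils_test cortex_m_utils_test.cpp)
target_link_libraries(cortex_m_utils_test PRIVATE cortex_m_kernels gtest_main)

include(GoogleTest)
gtest_discover_tests(cortex_m_utils_test)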

# cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
gen_operators_lib(
LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch
1 change: 1 addition & 0 deletions backends/cortex_m/ops/TARGETS
@@ -16,6 +16,7 @@ python_library(
],
deps = [
"fbcode//caffe2:torch",
"//executorch/backends/cortex_m/passes:passes_utils",
],
)

141 changes: 141 additions & 0 deletions backends/cortex_m/ops/cortex_m_ops_common.h
@@ -0,0 +1,141 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/runtime/platform/assert.h>

#include <cstdint>
#include <limits>

// Include CMSIS-NN headers with C linkage
extern "C" {
#include "arm_nnfunctions.h"
}

using Tensor = torch::executor::Tensor;
using ScalarType = executorch::aten::ScalarType;
using Scalar = torch::executor::Scalar;
using Error = executorch::runtime::Error;

// Basic tensor type / layout validation and dimension order checking
inline void validate_cmsis_nn_tensor_requirements(
const Tensor& input1,
const Tensor& input2,
Tensor& output,
ScalarType expected_dtype = ScalarType::Char,
bool require_channels_last = false) {
// Basic dtype validation
ET_CHECK_MSG(
input1.scalar_type() == expected_dtype,
"Input1 dtype must be %hhd",
static_cast<int8_t>(expected_dtype));
ET_CHECK_MSG(
input2.scalar_type() == expected_dtype,
"Input2 dtype must be %hhd",
static_cast<int8_t>(expected_dtype));
ET_CHECK_MSG(
output.scalar_type() == expected_dtype,
"Output dtype must be %hhd",
static_cast<int8_t>(expected_dtype));

// Dim order consistency
ET_CHECK_MSG(
executorch::runtime::tensors_have_same_dim_order(input1, input2, output),
"Tensors must have same dimension order");

// TBD: Validate memory alignment (CMSIS-NN requirement)
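// A possible shape for that check, as a sketch only (the 4-byte alignment
// figure is an assumption to be confirmed against the CMSIS-NN kernel docs,
// and this snippet is not part of the PR):
//
// ET_CHECK_MSG(
// reinterpret_cast<uintptr_t>(input1.const_data_ptr()) % 4 == 0,
// "Input1 data must be 4-byte aligned for CMSIS-NN");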
}

inline void validate_single_quant_params(
const Scalar& zero_point,
const Scalar& multiplier,
const Scalar& shift,
const char* param_name) {
int64_t zp_val = zero_point.to<int64_t>();
int64_t mult_val = multiplier.to<int64_t>();
int64_t shift_val = shift.to<int64_t>();

ET_CHECK_MSG(
zp_val >= std::numeric_limits<int8_t>::min() &&
zp_val <= std::numeric_limits<int8_t>::max(),
"%s zero point must be in int8 range [Value: %lld]",
param_name,
static_cast<long long>(zp_val));

ET_CHECK_MSG(
mult_val >= std::numeric_limits<int32_t>::min() &&
mult_val <= std::numeric_limits<int32_t>::max(),
"%s multiplier must be in int32 range [Value: %lld]",
param_name,
static_cast<long long>(mult_val));

ET_CHECK_MSG(
shift_val >= -31 && shift_val <= 31,
"%s shift must be in range [-31, 31] [Value: %lld]",
param_name,
static_cast<long long>(shift_val));
}

/**
* Validate quantization parameters for inputs and output.
*
* Checks that zero points fit in int8 range, multipliers fit in int32 range,
* and shifts are within the valid bit-shift range [-31, 31].
*
* Ensures parameters comply with Ahead-Of-Time (AOT) quantization requirements
* and CMSIS-NN kernel expectations.
*
* Aborts via ET_CHECK_MSG if any check fails.
*/
inline void validate_quantization_params(
const Scalar& zero_point1,
const Scalar& multiplier1,
const Scalar& shift1,
const Scalar& zero_point2,
const Scalar& multiplier2,
const Scalar& shift2,
const Scalar& output_zero_point,
const Scalar& output_multiplier,
const Scalar& output_shift,
Tensor& output) {
Comment on lines +100 to +110 (Contributor): Make this a util that checks a single set of quant params.
Suggested change:
inline void validate_quantization_params(
const Scalar& zero_point,
const Scalar& multiplier,
const Scalar& shift) {

validate_single_quant_params(
zero_point1, multiplier1, shift1, "Single quant Input1");
validate_single_quant_params(
zero_point2, multiplier2, shift2, "Single quant Input2");
validate_single_quant_params(
output_zero_point,
output_multiplier,
output_shift,
"Single quant Output");
}

inline Error resize_to_broadcast_target_size(
const Tensor& input1,
const Tensor& input2,
Tensor& output) {
static constexpr int kTensorDimensionLimit = 5;
Tensor::SizesType expected_output_size[kTensorDimensionLimit];
size_t expected_output_dim = 0;
auto err = torch::executor::get_broadcast_target_size(
input1,
input2,
expected_output_size,
kTensorDimensionLimit,
&expected_output_dim);

if (err != Error::Ok)
return err;

return executorch::runtime::resize_tensor(
output, {expected_output_size, expected_output_dim});
}
149 changes: 149 additions & 0 deletions backends/cortex_m/ops/op_quantized_add.cpp
@@ -0,0 +1,149 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "cortex_m_ops_common.h"

namespace cortex_m {
namespace native {
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

Tensor& quantized_add_out(
KernelRuntimeContext& context,
const Tensor& input1_int8,
const Scalar& input1_zero_point,
const Scalar& input1_multiplier,
const Scalar& input1_shift,
const Tensor& input2_int8,
const Scalar& input2_zero_point,
const Scalar& input2_multiplier,
const Scalar& input2_shift,
const Scalar& output_zero_point,
const Scalar& output_multiplier,
const Scalar& output_shift,
Tensor& out) {
// Validate tensor types and dim order
validate_cmsis_nn_tensor_requirements(input1_int8, input2_int8, out);

// Validate quantization parameters
validate_quantization_params(
input1_zero_point,
input1_multiplier,
input1_shift,
input2_zero_point,
input2_multiplier,
input2_shift,
output_zero_point,
output_multiplier,
output_shift,
out);

// Broadcast if needed
auto result = resize_to_broadcast_target_size(input1_int8, input2_int8, out);
ET_CHECK_MSG(
(result == Error::Ok),
"Failed to resize output tensor. Status: [%d]",
result);

ET_LOG(
Info,
"quantized_add_out: input1_int8.sizes() = %zu",
input1_int8.sizes().size());

// Use Scalar::to<int64_t>(), which ExecuTorch commonly instantiates, then
// narrow to the CMSIS-NN parameter types.
int32_t zp1 = static_cast<int32_t>(input1_zero_point.to<int64_t>());
int32_t input1_mult = static_cast<int32_t>(input1_multiplier.to<int64_t>());
int input1_shift_val = static_cast<int>(input1_shift.to<int64_t>());

int32_t zp2 = static_cast<int32_t>(input2_zero_point.to<int64_t>());
int32_t input2_mult = static_cast<int32_t>(input2_multiplier.to<int64_t>());
int input2_shift_val = static_cast<int>(input2_shift.to<int64_t>());

int32_t out_zp = static_cast<int32_t>(output_zero_point.to<int64_t>());
int32_t output_mult = static_cast<int32_t>(output_multiplier.to<int64_t>());
int output_shift_val = static_cast<int>(output_shift.to<int64_t>());

// Left shift to maximize precision (tune as needed)
const int32_t left_shift = 20;
const int32_t activation_min = std::numeric_limits<int8_t>::min();
const int32_t activation_max = std::numeric_limits<int8_t>::max();
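// For context, an informal summary of what arm_elementwise_add_s8 does with
// these parameters: each input has its offset applied and is scaled up by
// 2^left_shift to preserve precision, then multiplied by its fixed-point
// (multiplier, shift) pair; the two results are summed, requantized with the
// output multiplier/shift and output offset, and clamped to
// [activation_min, activation_max]. The exact rounding details are defined by
// the CMSIS-NN implementation itself.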

ET_LOG(
Info,
"Using AoT-computed parameters: input1[mult=%d, shift=%d], input2[mult=%d, shift=%d], output[mult=%d, shift=%d]",
input1_mult,
input1_shift_val,
input2_mult,
input2_shift_val,
output_mult,
output_shift_val);

// Call CMSIS-NN kernel with precomputed parameters
arm_cmsis_nn_status status = arm_elementwise_add_s8(
Contributor (@digantdesai, Aug 19, 2025): Does this handle two inputs of different shapes, i.e. [1,2,2] + [2,2,2]?
Contributor (author): Yes, it handles that; I validated it in the e2e flow.
Collaborator: This could potentially be added as a pass as well: https://github.com/pytorch/executorch/blob/main/backends/arm/_passes/broadcast_args_pass.py. But long term the ideal solution would be to add broadcast support to CMSIS-NN to get it accelerated without memcopies.
Contributor: It can't be added as an AoT pass and must be handled at runtime because we support dynamic shapes with these operators.

input1_int8.const_data_ptr<int8_t>(),
input2_int8.const_data_ptr<int8_t>(),
static_cast<int32_t>(zp1),
input1_mult,
input1_shift_val,
static_cast<int32_t>(zp2),
input2_mult,
input2_shift_val,
left_shift,
out.mutable_data_ptr<int8_t>(),
static_cast<int32_t>(out_zp),
output_mult,
output_shift_val,
static_cast<int32_t>(out.numel()),
activation_min,
activation_max);

if (status != ARM_CMSIS_NN_SUCCESS) {
ET_LOG(
Error,
"quantized_add_out: arm_elementwise_add_s8 failed with status [%d]",
status);

context.fail(Error::Internal); // Fail the execution context
return out;
}
ET_LOG(
Info,
"quantized_add_out: Successfully completed with AoT-computed parameters!");

return out;
}

// Stub Implementation: Non-out variant for compatibility (functional variant)
// EXIR/ExecuTorch runs an out-variant pass that converts
// .default operations to .out variants before memory planning.
// In the pass we are calling quantized_add's default variant
// but ExecuTorch's kernel dispatch mechanism will end up calling the out
// variant. This stub is to make sure that the compiler doesn't complain.
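// Illustration (informal, following the description above): the pass turns a
// functional call such as
// out = cortex_m::quantized_add(a, ..., output_shift)
// into the out-variant form
// cortex_m::quantized_add.out(a, ..., output_shift, out)
// so only quantized_add_out above is executed at runtime.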
Contributor: What's the compiler complaint? I think you might need to write a version of add_out which doesn't take the context arg, IIRC.

Tensor quantized_add(
Contributor: Remove this?

KernelRuntimeContext& context,
const Tensor& input1_int8,
const Scalar& input1_zero_point,
const Scalar& input1_multiplier,
const Scalar& input1_shift,
const Tensor& input2_int8,
const Scalar& input2_zero_point,
const Scalar& input2_multiplier,
const Scalar& input2_shift,
const Scalar& output_zero_point,
const Scalar& output_multiplier,
const Scalar& output_shift) {
ET_LOG(Info, "quantized_add: input1_int8.sizes() = %zu", input1_int8.sizes().size());

// Crash on debug builds if invoked
assert(false);
// This is to make sure the compiler doesn't complain.
return const_cast<Tensor&>(input1_int8);
}

} // namespace native
} // namespace cortex_m