Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 52 additions & 32 deletions backends/cortex_m/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ if(NOT CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
endif()

# Source root directory for executorch.
# Source root directory for executorch
if(NOT EXECUTORCH_ROOT)
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
endif()
Expand All @@ -21,70 +21,90 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
include(FetchContent)

# CMSIS-NN version to download
# CMSIS-NN configuration with dynamic path detection
set(CMSIS_NN_VERSION
"v4.1.0"
"v7.0.0"
CACHE STRING "CMSIS-NN version to download"
)

# Declare CMSIS-NN as a FetchContent project
FetchContent_Declare(
cmsis_nn
GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
GIT_TAG ${CMSIS_NN_VERSION}
set(CMSIS_NN_LOCAL_PATH
""
CACHE PATH "Path to existing local CMSIS-NN installation"
)

# Download and make CMSIS-NN available
FetchContent_MakeAvailable(cmsis_nn)
# Try to find existing / local CMSIS-NN installation. This is useful for
# debugging and testing with local changes. This is not common, as the CMSIS-NN
# library is downloaded via FetchContent in the default/regular case.
if(CMSIS_NN_LOCAL_PATH AND EXISTS "${CMSIS_NN_LOCAL_PATH}")
message(STATUS "Using CMSIS-NN from specified path: ${CMSIS_NN_LOCAL_PATH}")
add_subdirectory(${CMSIS_NN_LOCAL_PATH} cmsis_nn_build)
else()
# No usable local checkout (CMSIS_NN_LOCAL_PATH unset or missing): fall back
# to downloading CMSIS-NN via FetchContent
message(STATUS "Using CMSIS-NN via FetchContent")

FetchContent_Declare(
cmsis_nn
GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
GIT_TAG ${CMSIS_NN_VERSION}
GIT_SHALLOW TRUE
)

FetchContent_GetProperties(cmsis_nn)
if(NOT cmsis_nn_POPULATED)
FetchContent_Populate(cmsis_nn)
add_subdirectory(${cmsis_nn_SOURCE_DIR} ${cmsis_nn_BINARY_DIR})
endif()
endif()

# Print paths for debugging
message(STATUS "CMSIS-NN source dir: ${cmsis_nn_SOURCE_DIR}")
message(STATUS "CMSIS-NN binary dir: ${cmsis_nn_BINARY_DIR}")
# Add MVEI define to cmsis-nn target
if(TARGET cmsis-nn)
target_compile_definitions(cmsis-nn PUBLIC ARM_MATH_MVEI=1)
get_target_property(CMSIS_NN_INCLUDES cmsis-nn INTERFACE_INCLUDE_DIRECTORIES)
message(STATUS "CMSIS-NN include dirs: ${CMSIS_NN_INCLUDES}")
else()
message(
FATAL_ERROR
"CMSIS-NN target not found. Check your CMSIS_NN_LOCAL_PATH or network connection."
)
endif()

# Cortex-M ops kernel sources
set(_cortex_m_kernels__srcs
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_linear.cpp
)

# Generate C++ bindings to register kernels into Executorch (for runtime)
# Generate C++ bindings to register kernels into Executorch
set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml)
gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}")

generate_bindings_for_kernels(
LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}"
)
message("Generated files ${gen_command_sources}")

# Build a library for cortex_m_kernels
# Build library for cortex_m_kernels
add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})

# Include directories for cortex_m_kernels
target_include_directories(
# Use PRIVATE for implementation dependencies to avoid INTERFACE pollution
target_link_libraries(
cortex_m_kernels
PRIVATE ${EXECUTORCH_ROOT}/..
${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
${cmsis_nn_SOURCE_DIR}/Include
PRIVATE cmsis-nn
PRIVATE executorch
)

# Link directly to the CMSIS-NN static library file
target_link_libraries(
cortex_m_kernels PUBLIC ${cmsis_nn_BINARY_DIR}/libcmsis-nn.a executorch
# Include directories for cortex_m_kernels
target_include_directories(
cortex_m_kernels PRIVATE ${EXECUTORCH_ROOT}/..
${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
)

# Add dependency to ensure CMSIS-NN builds before we try to link. Use the actual
# CMSIS-NN target name (usually 'cmsis-nn')
add_dependencies(cortex_m_kernels cmsis-nn)

# cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
gen_operators_lib(
LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch
)

install(
TARGETS cortex_m_kernels cortex_m_ops_lib
TARGETS cortex_m_kernels cortex_m_ops_lib cmsis-nn
EXPORT ExecuTorchTargets
DESTINATION lib
PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/ops/
Expand Down
187 changes: 187 additions & 0 deletions backends/cortex_m/ops/cmsis_scratch_buffer_context.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once

#include "cortex_m_ops_common.h"
extern "C" {
#include "arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

// During the AOT phase, quantized_linear_fusion_pass allocates the whole
// buffer and passes it in as a 'Tensor'. (Total buffer = 8-byte header + x
// bytes)
// ┌─────────────────┬─────────────────────────────────────┐
// │ KernelSum Header│ CMSIS Workspace │
// │ (8 bytes) │ (x bytes) │
// └─────────────────┴─────────────────────────────────────┘
// │ │
// │ └─> Passed to CMSIS API
// │
// └─> State for kernel sum

// C++ Runtime:
// ┌─────────────────┬─────────────────────────────────────┐
// │ KernelSum Header│ CMSIS Workspace │
// │ (8 bytes) │ (x bytes) │
// └─────────────────┴─────────────────────────────────────┘
// ^ ^
// │ │
// scratch_ptr cmsis_workspace_ptr
// │ │
// ▼ ▼
// arm_vector_sum_s8() writes kernel sums (with bias if avail):
// [sum₀+bias₀][sum₁+bias₁][sum₂+bias₂]...[sum_{n-1}+bias_{n-1}]
// (n * 4-byte int32_t values = x bytes)
//
// - n = out_features (number of output features)
// - x = n * 4 bytes (total CMSIS buffer size)
// - Total buffer = 8 + x bytes

// Non-owning view over the scratch buffer used by the quantized linear kernel.
//
// The buffer is split into an 8-byte KernelSumHeader followed by the CMSIS-NN
// workspace. The header's `updated` flag records whether the kernel sums in
// the workspace have been written, so compute_kernel_sums_if_needed() only
// calls arm_vector_sum_s8() while the flag is still false.
//
// The weight, weight zero-point and bias data pointers are stored as 32-bit
// byte offsets relative to the start of the scratch buffer; an offset of 0
// for the bias doubles as the "no bias" sentinel.
//
// NOTE(review): presumably the AOT pass hands over a zero-filled buffer so
// that `updated` starts out false -- confirm against the fusion pass.
class CMSISScratchBufferContext final {
 public:
  // Binds the context to `scratch_buffer` and captures the layer geometry.
  //
  // @param scratch_buffer     int8 tensor whose size(0) is taken as the total
  //                           byte count (header + workspace).
  // @param weights            int8 tensor; size(0) is read as out_features
  //                           and size(1) as in_features.
  // @param weight_zero_point  int32 tensor; more than one element marks the
  //                           layer as per-channel quantized.
  // @param bias               optional int32 tensor.
  //
  // Every data pointer must lie at or after the scratch buffer's base
  // address (checked in calculate_offset), and the buffer must be able to
  // hold the header plus the workspace size reported by CMSIS-NN (checked in
  // validate_size).
  CMSISScratchBufferContext(
      Tensor& scratch_buffer,
      const Tensor& weights,
      const Tensor& weight_zero_point,
      const torch::executor::optional<Tensor>& bias)
      : scratch_ptr_(scratch_buffer.mutable_data_ptr<int8_t>()),
        total_size_(scratch_buffer.size(0)),
        base_ptr_(reinterpret_cast<uint8_t*>(scratch_ptr_)),
        in_features_(weights.size(1)),
        out_features_(weights.size(0)),
        is_per_channel_(weight_zero_point.numel() > 1),
        weight_data_offset_(calculate_offset(weights.const_data_ptr<int8_t>())),
        weight_zp_data_offset_(
            calculate_offset(weight_zero_point.const_data_ptr<int32_t>())),
        bias_data_offset_(
            bias.has_value()
                ? calculate_offset(bias.value().const_data_ptr<int32_t>())
                : 0),
        header_(reinterpret_cast<KernelSumHeader*>(scratch_ptr_)),
        cmsis_workspace_ptr_(scratch_ptr_ + KERNEL_SUM_HEADER_SIZE) {
    cmsis_nn_dims filter_dims = {in_features_, 1, 1, out_features_};
    validate_size(filter_dims);
  }

  // Returns a cmsis_nn_context describing the workspace region (the bytes
  // after the header). The pointer is checked for 4-byte alignment before it
  // is handed to CMSIS-NN, as requested during code review.
  cmsis_nn_context get_cmsis_ctx() const {
    check_workspace_alignment();
    cmsis_nn_context ctx;
    ctx.buf = cmsis_workspace_ptr_;
    // Explicit narrowing: the workspace size is tracked as size_t here.
    ctx.size = static_cast<int32_t>(get_cmsis_workspace_size());
    return ctx;
  }

  // True once compute_kernel_sums_if_needed() has filled the workspace.
  bool is_kernel_sum_updated() const {
    return header_->updated;
  }

  // Writes the per-output-feature kernel sums into the workspace via
  // arm_vector_sum_s8(), using the first weight zero-point as the lhs offset
  // and 0 as the rhs offset, then marks the header as updated. Does nothing
  // when the header already says the sums are present.
  void compute_kernel_sums_if_needed() {
    if (header_->updated) {
      return;
    }

    // arm_vector_sum_s8 stores one int32_t per output feature; make sure the
    // workspace can actually hold them and that the int32_t view is aligned.
    const size_t sums_bytes =
        static_cast<size_t>(out_features_) * sizeof(int32_t);
    ET_CHECK_MSG(
        get_cmsis_workspace_size() >= sums_bytes,
        "Scratch buffer size %zu insufficient for %zu bytes of kernel sums",
        get_cmsis_workspace_size(),
        sums_bytes);
    check_workspace_alignment();

    arm_vector_sum_s8(
        reinterpret_cast<int32_t*>(cmsis_workspace_ptr_),
        in_features_,
        out_features_,
        get_weight_data(),
        get_weight_zp_data()[0],
        0,
        get_bias_data());
    header_->updated = true;
    // Cast for %d: int32_t is not necessarily `int` on every toolchain.
    ET_LOG(
        Info,
        "Computed kernel sums. [required_bytes : %d]",
        static_cast<int>(header_->required_size));
  }

  // Weight data pointer, rebuilt from the stored offset.
  const int8_t* get_weight_data() const {
    return reinterpret_cast<const int8_t*>(base_ptr_ + weight_data_offset_);
  }

  // Weight zero-point data pointer, rebuilt from the stored offset.
  const int32_t* get_weight_zp_data() const {
    return reinterpret_cast<const int32_t*>(base_ptr_ + weight_zp_data_offset_);
  }

  // Bias data pointer, or nullptr when no bias was supplied (offset 0).
  const int32_t* get_bias_data() const {
    return bias_data_offset_ == 0
        ? nullptr
        : reinterpret_cast<const int32_t*>(base_ptr_ + bias_data_offset_);
  }

  bool is_per_channel_quant() const {
    return is_per_channel_;
  }
  int32_t get_in_features() const {
    return in_features_;
  }
  int32_t get_out_features() const {
    return out_features_;
  }

 private:
  static constexpr size_t KERNEL_SUM_HEADER_SIZE = 8;

  // Header for kernel sum computation state only
  struct KernelSumHeader {
    bool updated = false;
    int32_t required_size = 0;
  };
  static_assert(
      sizeof(KernelSumHeader) == KERNEL_SUM_HEADER_SIZE,
      "KernelSumHeader must be exactly 8 bytes");

  // NOTE: declaration order below is also the initialization order used by
  // the constructor (base_ptr_ must precede the offset members).
  int8_t* scratch_ptr_;
  size_t total_size_;
  uint8_t* base_ptr_;

  // Context members
  const int32_t in_features_;
  const int32_t out_features_;
  const bool is_per_channel_;
  const uint32_t weight_data_offset_;
  const uint32_t weight_zp_data_offset_;
  const uint32_t bias_data_offset_;

  KernelSumHeader* header_;
  int8_t* cmsis_workspace_ptr_;

  // Shared alignment guard for every use of the workspace as int32_t data.
  void check_workspace_alignment() const {
    ET_CHECK_MSG(
        reinterpret_cast<uintptr_t>(cmsis_workspace_ptr_) % 4 == 0,
        "CMSIS workspace not 4-byte aligned");
  }

  // Converts `ptr` into a byte offset from the scratch buffer base.
  // Returns 0 for nullptr; otherwise checks that the pointer is not before
  // the base and that the distance fits in 32 bits.
  uint32_t calculate_offset(const void* ptr) const {
    if (ptr == nullptr) {
      return 0;
    }

    const uint8_t* ptr_bytes = reinterpret_cast<const uint8_t*>(ptr);
    ET_CHECK_MSG(ptr_bytes >= base_ptr_, "Pointer is before base address");

    const std::ptrdiff_t offset = ptr_bytes - base_ptr_;
    // Compare in an unsigned 64-bit domain so the upper-bound test is
    // meaningful (and warning-free) on both 32- and 64-bit targets.
    ET_CHECK_MSG(
        offset >= 0 && static_cast<uint64_t>(offset) <= UINT32_MAX,
        "Offset out of valid range");
    return static_cast<uint32_t>(offset);
  }

  // Bytes available to CMSIS-NN after the header. Clamped to 0 so a buffer
  // shorter than the header cannot wrap around to a huge size_t value.
  size_t get_cmsis_workspace_size() const {
    return total_size_ > KERNEL_SUM_HEADER_SIZE
        ? total_size_ - KERNEL_SUM_HEADER_SIZE
        : 0;
  }

  // Records the workspace size CMSIS-NN reports for `filter_dims` in the
  // header and checks that the scratch buffer is large enough for it.
  void validate_size(const cmsis_nn_dims& filter_dims) const {
    // The header lives inside the buffer, so the buffer must exist and be
    // able to hold it before header_ is dereferenced.
    ET_CHECK_MSG(
        scratch_ptr_ != nullptr, "Scratch buffer data pointer is null");
    ET_CHECK_MSG(
        total_size_ >= KERNEL_SUM_HEADER_SIZE,
        "Scratch buffer size %zu smaller than %zu byte header",
        total_size_,
        KERNEL_SUM_HEADER_SIZE);

    header_->required_size =
        arm_fully_connected_s8_get_buffer_size(&filter_dims);

    ET_CHECK_MSG(
        get_cmsis_workspace_size() >=
            static_cast<size_t>(header_->required_size),
        "Scratch buffer size %zu insufficient for required size %d",
        get_cmsis_workspace_size(),
        static_cast<int>(header_->required_size));
  }
};

} // namespace native
} // namespace cortex_m
46 changes: 40 additions & 6 deletions backends/cortex_m/ops/cortex_m_ops_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ using ScalarType = executorch::aten::ScalarType;
using Scalar = torch::executor::Scalar;
using Error = executorch::runtime::Error;

// Q31 limits, mirroring the values from arm_nn_math_types.h.
// Each definition is guarded so that a header providing the same name does
// not cause a macro redefinition diagnostic.
#ifndef ARM_NN_Q31_MAX
#define ARM_NN_Q31_MAX ((int32_t)(INT32_MAX))
#endif
#ifndef ARM_NN_Q31_MIN
// INT32_MIN instead of 0x80000000L: the hex literal does not fit a 32-bit
// `long`, so it is unsigned there and the narrowing cast to int32_t is
// implementation-defined before C++20.
#define ARM_NN_Q31_MIN ((int32_t)(INT32_MIN))
#endif

// Basic tensor type / layout validation and dimension order checking
inline void validate_cmsis_nn_tensor_requirements(
const Tensor& input1,
Expand All @@ -32,16 +36,19 @@ inline void validate_cmsis_nn_tensor_requirements(
// Basic dtype validation
ET_CHECK_MSG(
input1.scalar_type() == expected_dtype,
"Input1 dtype must be %hhd",
expected_dtype);
"Input1 dtype must be %hhd, got %hhd",
expected_dtype,
input1.scalar_type());
ET_CHECK_MSG(
input2.scalar_type() == expected_dtype,
"Input2 dtype must be %hhd",
expected_dtype);
"Input2 dtype must be %hhd, got %hhd",
expected_dtype,
input2.scalar_type());
ET_CHECK_MSG(
output.scalar_type() == expected_dtype,
"Output dtype must be %hhd",
expected_dtype);
"Output dtype must be %hhd, got %hhd",
expected_dtype,
output.scalar_type());

// Dim order consistency
ET_CHECK_MSG(
Expand Down Expand Up @@ -114,6 +121,33 @@ inline void validate_quantization_params(
"Single quant Output");
}

// Refer to CMSIS-NN 'arm_nn_requantize' implementation for details:
// https://github.com/ARM-software/CMSIS-NN/blob/main/Include/arm_nnsupportfunctions.h#L1625
// multiplier: Range {ARM_NN_Q31_MIN + 1, Q32_MAX}
// shift : Range {-31, 30}
inline bool validate_per_channel_quant_params(
const int32_t* multipliers,
const int32_t* shifts,
int num_channels) {
for (int i = 0; i < num_channels; ++i) {
// Multiplier: {ARM_NN_Q31_MIN + 1, ARM_NN_Q31_MAX}
if (multipliers[i] <= ARM_NN_Q31_MIN || multipliers[i] > ARM_NN_Q31_MAX) {
ET_LOG(
Error,
"weight_multiplier[%d] out of CMSIS-NN range: %d",
i,
multipliers[i]);
return false;
}
// Shift: {-31, 30} for arm_nn_requantize
if (shifts[i] < -31 || shifts[i] > 30) {
ET_LOG(Error, "weight_shift[%d] out of range: %d", i, shifts[i]);
return false;
}
}
return true;
}

inline Error resize_to_broadcast_target_size(
const Tensor& input1,
const Tensor& input2,
Expand Down
Loading
Loading