Summary: Initial CMSIS-NN Add Op #13296
Changes from all commits
CMakeLists.txt

@@ -5,11 +5,6 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

-# Kernel library for Cortex-M operators. Please keep this file formatted by
-# running:
-# ~~~
-# cmake-format -i CMakeLists.txt
-# ~~~
cmake_minimum_required(VERSION 3.19)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

@@ -24,29 +19,65 @@ endif()
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
include(FetchContent)

# CMSIS-NN version to download
set(CMSIS_NN_VERSION
    "v4.1.0"
    CACHE STRING "CMSIS-NN version to download"
)

# Declare CMSIS-NN as a FetchContent project
FetchContent_Declare(
  cmsis_nn
  GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
  GIT_TAG ${CMSIS_NN_VERSION}
)

# Download and make CMSIS-NN available
FetchContent_MakeAvailable(cmsis_nn)

# Print paths for debugging
message(STATUS "CMSIS-NN source dir: ${cmsis_nn_SOURCE_DIR}")
message(STATUS "CMSIS-NN binary dir: ${cmsis_nn_BINARY_DIR}")

# Cortex-M ops kernel sources
set(_cortex_m_kernels__srcs
    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
)

-# Generate C++ bindings to register kernels into Executorch (for runtime). Here
-# select all ops in operators.yaml
+# Generate C++ bindings to register kernels into Executorch (for runtime)
set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml)
gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}")

# Generate bindings for the kernels
generate_bindings_for_kernels(
  LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}"
)
message("Generated files ${gen_command_sources}")

-# Build a library for _cortex_m_kernels_srcs
+# Build a library for cortex_m_kernels
add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
target_link_libraries(cortex_m_kernels PRIVATE executorch)
target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})

# Include directories for cortex_m_kernels
target_include_directories(
  cortex_m_kernels
  PRIVATE ${EXECUTORCH_ROOT}/..
          ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
          ${cmsis_nn_SOURCE_DIR}/Include
)

# Link directly to the CMSIS-NN static library file
target_link_libraries(
  cortex_m_kernels PUBLIC ${cmsis_nn_BINARY_DIR}/libcmsis-nn.a executorch
)

Review comment: Why is it done this way instead of directly adding the library target?

Author reply: The CMSIS-NN library is linked by specifying the static library file path directly because CMSIS-NN is brought in via FetchContent and does not define a CMake target in the build.
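
Worth noting against the reply above: the `add_dependencies(... cmsis-nn)` call below assumes a `cmsis-nn` target does exist in the fetched build. A hedged sketch (not part of this PR) that prefers the target when present and falls back to the archive path otherwise:

```cmake
# Sketch only: link the built target when the fetched CMSIS-NN defines one,
# since a target carries include dirs and usage requirements automatically.
if(TARGET cmsis-nn)
  target_link_libraries(cortex_m_kernels PUBLIC cmsis-nn executorch)
else()
  target_link_libraries(
    cortex_m_kernels PUBLIC ${cmsis_nn_BINARY_DIR}/libcmsis-nn.a executorch
  )
endif()
```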

# Add dependency to ensure CMSIS-NN builds before we try to link. Use the
# actual CMSIS-NN target name (usually 'cmsis-nn')
add_dependencies(cortex_m_kernels cmsis-nn)

Review comment: Are there no C++ tests?

Author reply: @mergennachin Created a task to track a dedicated test suite for cortex_m ops (covering Python tests, C++ unit tests, and E2E tests): #13739. (Initial work on this in PR #1357.)

Review comment: FYI, we can't run cortex-m op unit tests without an FVP, but for C++-only utils, sure.

# cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
gen_operators_lib(
  LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch
)
cortex_m_ops_common.h

@@ -0,0 +1,141 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <limits>

#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

// Include CMSIS-NN headers with C linkage
extern "C" {
#include "arm_nnfunctions.h"
}

using Tensor = torch::executor::Tensor;
using ScalarType = executorch::aten::ScalarType;
using Scalar = torch::executor::Scalar;
using Error = executorch::runtime::Error;

// Basic tensor type / layout validation and dimension order checking
inline void validate_cmsis_nn_tensor_requirements(
    const Tensor& input1,
    const Tensor& input2,
    Tensor& output,
    ScalarType expected_dtype = ScalarType::Char,
    bool require_channels_last = false) {
  // Basic dtype validation
  ET_CHECK_MSG(
      input1.scalar_type() == expected_dtype,
      "Input1 dtype must be %hhd",
      static_cast<int8_t>(expected_dtype));
  ET_CHECK_MSG(
      input2.scalar_type() == expected_dtype,
      "Input2 dtype must be %hhd",
      static_cast<int8_t>(expected_dtype));
  ET_CHECK_MSG(
      output.scalar_type() == expected_dtype,
      "Output dtype must be %hhd",
      static_cast<int8_t>(expected_dtype));

  // Dim order consistency
  ET_CHECK_MSG(
      executorch::runtime::tensors_have_same_dim_order(input1, input2, output),
      "Tensors must have same dimension order");

  // TBD: Validate memory alignment (CMSIS-NN requirement)
}
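
For the TBD above, a hedged sketch of what an alignment check could look like. The 4-byte requirement is an assumption (CMSIS-NN's word-wise SIMD paths generally prefer 32-bit aligned buffers; the exact requirement depends on the kernel and target), and the helper is hypothetical, not part of this PR:

```cpp
#include <cstdint>

// Hypothetical helper: reject buffers that are not 32-bit aligned.
inline bool is_word_aligned(const void* ptr) {
  return reinterpret_cast<uintptr_t>(ptr) % alignof(int32_t) == 0;
}

// Possible usage inside the validator above (sketch):
//   ET_CHECK_MSG(is_word_aligned(input1.const_data_ptr()), "input1 unaligned");
```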

inline void validate_single_quant_params(
    const Scalar& zero_point,
    const Scalar& multiplier,
    const Scalar& shift,
    const char* param_name) {
  int64_t zp_val = zero_point.to<int64_t>();
  int64_t mult_val = multiplier.to<int64_t>();
  int64_t shift_val = shift.to<int64_t>();

  ET_CHECK_MSG(
      zp_val >= std::numeric_limits<int8_t>::min() &&
          zp_val <= std::numeric_limits<int8_t>::max(),
      "%s zero point must be in int8 range [Value: %lld]",
      param_name,
      static_cast<long long>(zp_val));

  ET_CHECK_MSG(
      mult_val >= std::numeric_limits<int32_t>::min() &&
          mult_val <= std::numeric_limits<int32_t>::max(),
      "%s multiplier must be in int32 range [Value: %lld]",
      param_name,
      static_cast<long long>(mult_val));

  ET_CHECK_MSG(
      shift_val >= -31 && shift_val <= 31,
      "%s shift must be in range [-31, 31] [Value: %lld]",
      param_name,
      static_cast<long long>(shift_val));
}

/**
 * Validate quantization parameters for inputs and output.
 *
 * Checks that zero points fit in int8 range, multipliers fit in int32 range,
 * and shifts are within a valid bit-shift range ([-31, 31]).
 *
 * Ensures parameters comply with Ahead-Of-Time (AOT) quantization requirements
 * and CMSIS-NN kernel expectations.
 *
 * Raises errors via ET_CHECK_MSG if any check fails.
 */
inline void validate_quantization_params(
    const Scalar& zero_point1,
    const Scalar& multiplier1,
    const Scalar& shift1,
    const Scalar& zero_point2,
    const Scalar& multiplier2,
    const Scalar& shift2,
    const Scalar& output_zero_point,
    const Scalar& output_multiplier,
    const Scalar& output_shift,
    Tensor& output) {
  validate_single_quant_params(
      zero_point1, multiplier1, shift1, "Single quant Input1");
  validate_single_quant_params(
      zero_point2, multiplier2, shift2, "Single quant Input2");
  validate_single_quant_params(
      output_zero_point,
      output_multiplier,
      output_shift,
      "Single quant Output");
}

Review comment (on lines +100 to +110): Make it a util to check a single set of quant params. (Suggested change — see validate_single_quant_params above.)
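
For context on where these multiplier/shift pairs come from, a hedged AoT-side sketch. The assumption is that the AOT pass uses the standard TFLite-style decomposition — a real scale s is split so that s ≈ multiplier × 2^(shift − 31) with the multiplier in Q31 — which is exactly why the checks above demand int32 multipliers and shifts in [-31, 31]. The helper name is hypothetical:

```cpp
#include <cmath>
#include <cstdint>

// Hypothetical AoT helper: decompose a positive double `scale` into a Q31
// fixed-point multiplier and a base-2 exponent, so that
// scale ~= multiplier * 2^(shift - 31).
void quantize_multiplier(double scale, int32_t* multiplier, int32_t* shift) {
  if (scale == 0.0) {
    *multiplier = 0;
    *shift = 0;
    return;
  }
  int exp = 0;
  const double q = std::frexp(scale, &exp);  // q in [0.5, 1), scale = q * 2^exp
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) {  // rounding pushed q up to exactly 1.0
    q_fixed /= 2;
    ++exp;
  }
  *multiplier = static_cast<int32_t>(q_fixed);
  *shift = exp;  // callers clamp/validate the [-31, 31] range separately
}
```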

inline Error resize_to_broadcast_target_size(
    const Tensor& input1,
    const Tensor& input2,
    Tensor& output) {
  static constexpr int kTensorDimensionLimit = 5;
  Tensor::SizesType expected_output_size[kTensorDimensionLimit];
  size_t expected_output_dim = 0;
  auto err = torch::executor::get_broadcast_target_size(
      input1,
      input2,
      expected_output_size,
      kTensorDimensionLimit,
      &expected_output_dim);

  if (err != Error::Ok) {
    return err;
  }

  return executorch::runtime::resize_tensor(
      output, {expected_output_size, expected_output_dim});
}
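
Relating to the testing thread earlier ("for C++-only utils, sure"): the broadcast helper above is pure host-side C++, so a hedged unit-test sketch is possible without an FVP. This assumes ExecuTorch's TensorFactory testing utility and a host gtest target, neither of which this PR sets up:

```cpp
#include <gtest/gtest.h>

#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>

#include "cortex_m_ops_common.h"

using torch::executor::testing::TensorFactory;

TEST(CortexMOpsCommonTest, ResizeToBroadcastTargetSize) {
  TensorFactory<ScalarType::Char> tf;
  Tensor a = tf.zeros({1, 2, 2});
  Tensor b = tf.zeros({2, 2, 2});
  // Pre-allocate the output at the broadcast shape; the resize then succeeds.
  Tensor out = tf.zeros({2, 2, 2});

  ASSERT_EQ(resize_to_broadcast_target_size(a, b, out), Error::Ok);
  EXPECT_EQ(out.dim(), 3);
  EXPECT_EQ(out.size(0), 2);
}
```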
op_quantized_add.cpp

@@ -0,0 +1,149 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "cortex_m_ops_common.h"

namespace cortex_m {
namespace native {
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

Tensor& quantized_add_out(
    KernelRuntimeContext& context,
    const Tensor& input1_int8,
    const Scalar& input1_zero_point,
    const Scalar& input1_multiplier,
    const Scalar& input1_shift,
    const Tensor& input2_int8,
    const Scalar& input2_zero_point,
    const Scalar& input2_multiplier,
    const Scalar& input2_shift,
    const Scalar& output_zero_point,
    const Scalar& output_multiplier,
    const Scalar& output_shift,
    Tensor& out) {
  // Validate tensor types and dim order
  validate_cmsis_nn_tensor_requirements(input1_int8, input2_int8, out);

  // Validate quantization parameters
  validate_quantization_params(
      input1_zero_point,
      input1_multiplier,
      input1_shift,
      input2_zero_point,
      input2_multiplier,
      input2_shift,
      output_zero_point,
      output_multiplier,
      output_shift,
      out);

  // Broadcast if needed
  auto result = resize_to_broadcast_target_size(input1_int8, input2_int8, out);
  ET_CHECK_MSG(
      (result == Error::Ok),
      "Failed to resize output tensor. Status: [%d]",
      static_cast<int>(result));

  ET_LOG(
      Info,
      "quantized_add_out: input1_int8.sizes() = %zu",
      input1_int8.sizes().size());

  // Use Scalar::to<int64_t>(), which ExecuTorch commonly instantiates, then
  // narrow to the widths CMSIS-NN expects.
  int32_t zp1 = static_cast<int32_t>(input1_zero_point.to<int64_t>());
  int32_t input1_mult = static_cast<int32_t>(input1_multiplier.to<int64_t>());
  int input1_shift_val = static_cast<int>(input1_shift.to<int64_t>());

  int32_t zp2 = static_cast<int32_t>(input2_zero_point.to<int64_t>());
  int32_t input2_mult = static_cast<int32_t>(input2_multiplier.to<int64_t>());
  int input2_shift_val = static_cast<int>(input2_shift.to<int64_t>());

  int32_t out_zp = static_cast<int32_t>(output_zero_point.to<int64_t>());
  int32_t output_mult = static_cast<int32_t>(output_multiplier.to<int64_t>());
  int output_shift_val = static_cast<int>(output_shift.to<int64_t>());

  // Left shift to maximize precision (tune as needed)
  const int32_t left_shift = 20;
  const int32_t activation_min = std::numeric_limits<int8_t>::min();
  const int32_t activation_max = std::numeric_limits<int8_t>::max();

  ET_LOG(
      Info,
      "Using AoT-computed parameters: input1[mult=%d, shift=%d], input2[mult=%d, shift=%d], output[mult=%d, shift=%d]",
      input1_mult,
      input1_shift_val,
      input2_mult,
      input2_shift_val,
      output_mult,
      output_shift_val);

  // Call CMSIS-NN kernel with precomputed parameters
  arm_cmsis_nn_status status = arm_elementwise_add_s8(
      input1_int8.const_data_ptr<int8_t>(),
      input2_int8.const_data_ptr<int8_t>(),
      zp1,
      input1_mult,
      input1_shift_val,
      zp2,
      input2_mult,
      input2_shift_val,
      left_shift,
      out.mutable_data_ptr<int8_t>(),
      out_zp,
      output_mult,
      output_shift_val,
      static_cast<int32_t>(out.numel()),
      activation_min,
      activation_max);

Review comment: Does this handle the case where the two inputs have different shapes, i.e. [1,2,2] + [2,2,2]?

Author reply: Yes, it handles that; I validated this in the e2e flow.

Review comment: This could potentially be added as a pass as well: https://github.com/pytorch/executorch/blob/main/backends/arm/_passes/broadcast_args_pass.py. But long term the ideal solution would be to add broadcast support to CMSIS-NN to get it accelerated without memcopies.

Review comment: It can't be added as an AoT pass and must be handled at runtime, because we support dynamic shapes with these operators.

  if (status != ARM_CMSIS_NN_SUCCESS) {
    ET_LOG(
        Error,
        "quantized_add_out: arm_elementwise_add_s8 failed with status [%d]",
        status);
    context.fail(Error::Internal); // Fail the execution context
    return out;
  }
  ET_LOG(
      Info,
      "quantized_add_out: Successfully completed with AoT-computed parameters!");

  return out;
}
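
For intuition about the runtime math (and the role of `left_shift`), a scalar sketch of the requantized add that `arm_elementwise_add_s8` performs per element, assuming the TFLite-style rounding-doubling high multiply that CMSIS-NN uses. This is a simplified reference for one element, not the optimized kernel, and it ignores saturating edge cases:

```cpp
#include <algorithm>
#include <cstdint>

// Rounding-doubling high multiply: roughly (a * b * 2) >> 32, round-to-nearest.
// (The a == b == INT32_MIN overflow edge case is ignored in this sketch.)
int32_t rdh_mul(int32_t a, int32_t b) {
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  return static_cast<int32_t>((ab + nudge) >> 31);
}

// Apply a (multiplier, shift) pair: x * multiplier * 2^(shift - 31), rounded.
int32_t requantize(int32_t x, int32_t multiplier, int shift) {
  const int left = shift > 0 ? shift : 0;
  const int right = shift > 0 ? 0 : -shift;
  const int32_t v = rdh_mul(x * (1 << left), multiplier);
  return right == 0 ? v : ((v + (1 << (right - 1))) >> right);
}

int8_t add_one_element(
    int8_t x1, int32_t zp1, int32_t m1, int s1,
    int8_t x2, int32_t zp2, int32_t m2, int s2,
    int32_t out_zp, int32_t out_m, int out_s,
    int left_shift, int32_t act_min, int32_t act_max) {
  // Shift both zero-point-corrected inputs up front so small scales keep
  // precision through the fixed-point multiplies (this is what left_shift=20
  // buys in the kernel above).
  const int32_t a = (static_cast<int32_t>(x1) - zp1) * (1 << left_shift);
  const int32_t b = (static_cast<int32_t>(x2) - zp2) * (1 << left_shift);
  // Bring both inputs onto a shared scale, add, then rescale to the output.
  const int32_t sum = requantize(a, m1, s1) + requantize(b, m2, s2);
  const int32_t y = requantize(sum, out_m, out_s) + out_zp;
  return static_cast<int8_t>(std::clamp(y, act_min, act_max));
}
```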

// Stub implementation: non-out variant for compatibility (functional variant).
// EXIR/ExecuTorch runs an out-variant pass that converts .default operations
// to .out variants before memory planning. In the pass we call quantized_add's
// default variant, but ExecuTorch's kernel dispatch mechanism will end up
// calling the out variant. This stub exists so that the compiler doesn't
// complain.

Review comment: What's the compiler complaint? I think you might need to write a version for …

Review comment: Remove this?

Tensor quantized_add(
    KernelRuntimeContext& context,
    const Tensor& input1_int8,
    const Scalar& input1_zero_point,
    const Scalar& input1_multiplier,
    const Scalar& input1_shift,
    const Tensor& input2_int8,
    const Scalar& input2_zero_point,
    const Scalar& input2_multiplier,
    const Scalar& input2_shift,
    const Scalar& output_zero_point,
    const Scalar& output_multiplier,
    const Scalar& output_shift) {
  ET_LOG(
      Info,
      "quantized_add: input1_int8.sizes() = %zu",
      input1_int8.sizes().size());

  // Crash on debug builds if invoked
  assert(false);
  // This is to make sure the compiler doesn't complain.
  return const_cast<Tensor&>(input1_int8);
}

} // namespace native
} // namespace cortex_m

Review comment: Besides _common_compile_options, don't you also need Cortex-M-specific compile options?

Author reply: They come from the toolchain file.
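
A hedged illustration of that reply — the CPU-specific codegen flags reach `cortex_m_kernels` (and the fetched CMSIS-NN build) via the CMake toolchain file rather than per-target options. The file name, CPU, and flag values below are assumptions, not taken from this PR:

```cmake
# arm-none-eabi-gcc.cmake (hypothetical excerpt of a toolchain file)
set(CMAKE_SYSTEM_NAME Generic)
set(CMAKE_SYSTEM_PROCESSOR cortex-m55)
set(CMAKE_C_COMPILER arm-none-eabi-gcc)
set(CMAKE_CXX_COMPILER arm-none-eabi-g++)

# Cortex-M-specific codegen flags, applied to every target in the build.
set(CMAKE_C_FLAGS_INIT "-mcpu=cortex-m55 -mthumb -mfloat-abi=hard")
set(CMAKE_CXX_FLAGS_INIT "-mcpu=cortex-m55 -mthumb -mfloat-abi=hard")
```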