51 changes: 41 additions & 10 deletions backends/cortex_m/CMakeLists.txt
@@ -5,11 +5,6 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Kernel library for Cortex-M operators. Please keep this file formatted by
# running:
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~
cmake_minimum_required(VERSION 3.19)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -24,29 +19,65 @@ endif()

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
include(FetchContent)

# CMSIS-NN version to download
set(CMSIS_NN_VERSION
"v4.1.0"
CACHE STRING "CMSIS-NN version to download"
)

# Declare CMSIS-NN as a FetchContent project
FetchContent_Declare(
cmsis_nn
GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
GIT_TAG ${CMSIS_NN_VERSION}
)

# Download and make CMSIS-NN available
FetchContent_MakeAvailable(cmsis_nn)

# Print paths for debugging
message(STATUS "CMSIS-NN source dir: ${cmsis_nn_SOURCE_DIR}")
message(STATUS "CMSIS-NN binary dir: ${cmsis_nn_BINARY_DIR}")

# Cortex-M ops kernel sources
set(_cortex_m_kernels__srcs
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
)

# Generate C++ bindings to register kernels into Executorch (for runtime). Here
# select all ops in operators.yaml
# Generate C++ bindings to register kernels into Executorch (for runtime)
set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml)
gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}")

# Generate bindings for the kernels
generate_bindings_for_kernels(
LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}"
)
message("Generated files ${gen_command_sources}")

# Build a library for _cortex_m_kernels_srcs
# Build a library for cortex_m_kernels
add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
target_link_libraries(cortex_m_kernels PRIVATE executorch)
target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})

# Include directories for cortex_m_kernels
Contributor: Besides _common_compile_options, don't you also need Cortex-M-specific compile options?
Contributor: They come from the toolchain file.
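(For context on the exchange above: a minimal sketch of the kind of Cortex-M-specific options a CMake toolchain file usually carries, assuming Arm GCC and a Cortex-M55 target; the exact flags are illustrative only and not part of this PR.)

set(CMAKE_SYSTEM_NAME Generic)
set(CMAKE_SYSTEM_PROCESSOR cortex-m55)
set(CMAKE_C_COMPILER arm-none-eabi-gcc)
set(CMAKE_CXX_COMPILER arm-none-eabi-g++)
add_compile_options(-mcpu=cortex-m55 -mthumb -mfloat-abi=hard)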

target_include_directories(
cortex_m_kernels
PRIVATE ${EXECUTORCH_ROOT}/..
${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
${cmsis_nn_SOURCE_DIR}/Include
)

# Link directly to the CMSIS-NN static library file
target_link_libraries(
Collaborator: Why is it done this way instead of directly adding the library target?
Contributor (author): The CMSIS-NN library is linked by specifying the static library file path directly because CMSIS-NN is brought in via FetchContent and does not define a CMake target in the build.

cortex_m_kernels PUBLIC ${cmsis_nn_BINARY_DIR}/libcmsis-nn.a executorch
)

# Add dependency to ensure CMSIS-NN builds before we try to link. Use the actual
# CMSIS-NN target name (usually 'cmsis-nn')
add_dependencies(cortex_m_kernels cmsis-nn)
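(If the FetchContent build of CMSIS-NN does export the cmsis-nn target that the add_dependencies call above refers to, a possible alternative is to link that target directly; a sketch only, not what this PR does:)

target_link_libraries(cortex_m_kernels PUBLIC cmsis-nn executorch)

(Linking the target would also make the explicit add_dependencies call unnecessary, since CMake tracks the build dependency through the target itself.)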

Contributor: There are no C++ tests?

if(CORTEX_M_BUILD_TESTS)
  enable_testing()
  add_subdirectory(tests)
endif()

Contributor (author, @psiddh, Aug 27, 2025): @mergennachin Created a task to track a dedicated test suite for cortex_m ops (covering Python tests, C++ unit tests & E2E tests): #13739 (initial work in PR #1357).

Contributor: FYI, we can't run cortex-m op unit tests without an FVP, but for C++-only utils, sure.
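(For the C++-only utilities mentioned above, a tests/CMakeLists.txt gated by CORTEX_M_BUILD_TESTS could look roughly like the sketch below; GoogleTest via FetchContent, the pinned version, and the test file name are assumptions, not part of this PR.)

FetchContent_Declare(
googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG v1.14.0
)
FetchContent_MakeAvailable(googletest)

# hypothetical host-buildable unit test for the C++-only utilities
add_executable(cortex_m_utils_test cortex_m_utils_test.cpp)
target_link_libraries(cortex_m_utils_test PRIVATE cortex_m_kernels gtest_main)

include(GoogleTest)
gtest_discover_tests(cortex_m_utils_test)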

# cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
gen_operators_lib(
LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch
1 change: 1 addition & 0 deletions backends/cortex_m/ops/TARGETS
@@ -16,6 +16,7 @@ python_library(
],
deps = [
"fbcode//caffe2:torch",
"//executorch/backends/cortex_m/passes:passes_utils",
],
)

141 changes: 141 additions & 0 deletions backends/cortex_m/ops/cortex_m_ops_common.h
@@ -0,0 +1,141 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/runtime/platform/assert.h>

#include <cstdint>
#include <limits>

// Include CMSIS-NN headers with C linkage
extern "C" {
#include "arm_nnfunctions.h"
}

using Tensor = torch::executor::Tensor;
using ScalarType = executorch::aten::ScalarType;
using Scalar = torch::executor::Scalar;
using Error = executorch::runtime::Error;

// Basic tensor type / layout validation and dimension order checking
inline void validate_cmsis_nn_tensor_requirements(
const Tensor& input1,
const Tensor& input2,
Tensor& output,
ScalarType expected_dtype = ScalarType::Char,
bool require_channels_last = false) {
// Basic dtype validation
ET_CHECK_MSG(
input1.scalar_type() == expected_dtype,
"Input1 dtype must be %hhd",
static_cast<int8_t>(expected_dtype));
ET_CHECK_MSG(
input2.scalar_type() == expected_dtype,
"Input2 dtype must be %hhd",
static_cast<int8_t>(expected_dtype));
ET_CHECK_MSG(
output.scalar_type() == expected_dtype,
"Output dtype must be %hhd",
static_cast<int8_t>(expected_dtype));

// Dim order consistency
ET_CHECK_MSG(
executorch::runtime::tensors_have_same_dim_order(input1, input2, output),
"Tensors must have same dimension order");

// TBD: Validate memory alignment (CMSIS-NN requirement)
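// A possible shape for that check, as a sketch only (the 4-byte alignment
// figure is an assumption to be confirmed against the CMSIS-NN kernel docs,
// and this snippet is not part of the PR):
//
// ET_CHECK_MSG(
// reinterpret_cast<uintptr_t>(input1.const_data_ptr()) % 4 == 0,
// "Input1 data must be 4-byte aligned for CMSIS-NN");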
}

inline void validate_single_quant_params(
const Scalar& zero_point,
const Scalar& multiplier,
const Scalar& shift,
const char* param_name) {
int64_t zp_val = zero_point.to<int64_t>();
int64_t mult_val = multiplier.to<int64_t>();
int64_t shift_val = shift.to<int64_t>();

ET_CHECK_MSG(
zp_val >= std::numeric_limits<int8_t>::min() &&
zp_val <= std::numeric_limits<int8_t>::max(),
"%s zero point must be in int8 range [Value: %lld]",
param_name,
static_cast<long long>(zp_val));

ET_CHECK_MSG(
mult_val >= std::numeric_limits<int32_t>::min() &&
mult_val <= std::numeric_limits<int32_t>::max(),
"%s multiplier must be in int32 range [Value: %lld]",
param_name,
static_cast<long long>(mult_val));

ET_CHECK_MSG(
shift_val >= -31 && shift_val <= 31,
"%s shift must be in range [-31, 31] [Value: %lld]",
param_name,
static_cast<long long>(shift_val));
}

/**
* Validate quantization parameters for inputs and output.
*
* Checks that zero points fit in int8 range, multipliers fit in int32 range,
* and shifts are within the valid bit-shift range [-31, 31].
*
* Ensures parameters comply with Ahead-Of-Time (AOT) quantization requirements
* and CMSIS-NN kernel expectations.
*
* Aborts via ET_CHECK_MSG if any check fails.
*/
inline void validate_quantization_params(
const Scalar& zero_point1,
const Scalar& multiplier1,
const Scalar& shift1,
const Scalar& zero_point2,
const Scalar& multiplier2,
const Scalar& shift2,
const Scalar& output_zero_point,
const Scalar& output_multiplier,
const Scalar& output_shift,
Tensor& output) {
Comment on lines +100 to +110 (Contributor): Make this a util that checks a single set of quant params.
Suggested change:
inline void validate_quantization_params(
const Scalar& zero_point,
const Scalar& multiplier,
const Scalar& shift) {

validate_single_quant_params(
zero_point1, multiplier1, shift1, "Single quant Input1");
validate_single_quant_params(
zero_point2, multiplier2, shift2, "Single quant Input2");
validate_single_quant_params(
output_zero_point,
output_multiplier,
output_shift,
"Single quant Output");
}

inline Error resize_to_broadcast_target_size(
const Tensor& input1,
const Tensor& input2,
Tensor& output) {
static constexpr int kTensorDimensionLimit = 5;
Tensor::SizesType expected_output_size[kTensorDimensionLimit];
size_t expected_output_dim = 0;
auto err = torch::executor::get_broadcast_target_size(
input1,
input2,
expected_output_size,
kTensorDimensionLimit,
&expected_output_dim);

if (err != Error::Ok)
return err;

return executorch::runtime::resize_tensor(
output, {expected_output_size, expected_output_dim});
}
149 changes: 149 additions & 0 deletions backends/cortex_m/ops/op_quantized_add.cpp
@@ -0,0 +1,149 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "cortex_m_ops_common.h"

namespace cortex_m {
namespace native {
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

Tensor& quantized_add_out(
KernelRuntimeContext& context,
const Tensor& input1_int8,
const Scalar& input1_zero_point,
const Scalar& input1_multiplier,
const Scalar& input1_shift,
const Tensor& input2_int8,
const Scalar& input2_zero_point,
const Scalar& input2_multiplier,
const Scalar& input2_shift,
const Scalar& output_zero_point,
const Scalar& output_multiplier,
const Scalar& output_shift,
Tensor& out) {
// Validate tensor types and dim order
validate_cmsis_nn_tensor_requirements(input1_int8, input2_int8, out);

// Validate quantization parameters
validate_quantization_params(
input1_zero_point,
input1_multiplier,
input1_shift,
input2_zero_point,
input2_multiplier,
input2_shift,
output_zero_point,
output_multiplier,
output_shift,
out);

// Broadcast if needed
auto result = resize_to_broadcast_target_size(input1_int8, input2_int8, out);
ET_CHECK_MSG(
(result == Error::Ok),
"Failed to resize output tensor. Status: [%d]",
result);

ET_LOG(
Info,
"quantized_add_out: input1_int8.sizes() = %zu",
input1_int8.sizes().size());

// Use Scalar::to<int64_t>(), which ExecuTorch commonly instantiates, then
// narrow to the CMSIS-NN parameter types.
int32_t zp1 = static_cast<int32_t>(input1_zero_point.to<int64_t>());
int32_t input1_mult = static_cast<int32_t>(input1_multiplier.to<int64_t>());
int input1_shift_val = static_cast<int>(input1_shift.to<int64_t>());

int32_t zp2 = static_cast<int32_t>(input2_zero_point.to<int64_t>());
int32_t input2_mult = static_cast<int32_t>(input2_multiplier.to<int64_t>());
int input2_shift_val = static_cast<int>(input2_shift.to<int64_t>());

int32_t out_zp = static_cast<int32_t>(output_zero_point.to<int64_t>());
int32_t output_mult = static_cast<int32_t>(output_multiplier.to<int64_t>());
int output_shift_val = static_cast<int>(output_shift.to<int64_t>());

// Left shift to maximize precision (tune as needed)
const int32_t left_shift = 20;
const int32_t activation_min = std::numeric_limits<int8_t>::min();
const int32_t activation_max = std::numeric_limits<int8_t>::max();
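// For context, an informal summary of what arm_elementwise_add_s8 does with
// these parameters: each input has its offset applied and is scaled up by
// 2^left_shift to preserve precision, then multiplied by its fixed-point
// (multiplier, shift) pair; the two results are summed, requantized with the
// output multiplier/shift and output offset, and clamped to
// [activation_min, activation_max]. The exact rounding details are defined by
// the CMSIS-NN implementation itself.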

ET_LOG(
Info,
"Using AoT-computed parameters: input1[mult=%d, shift=%d], input2[mult=%d, shift=%d], output[mult=%d, shift=%d]",
input1_mult,
input1_shift_val,
input2_mult,
input2_shift_val,
output_mult,
output_shift_val);

// Call CMSIS-NN kernel with precomputed parameters
arm_cmsis_nn_status status = arm_elementwise_add_s8(
Contributor (@digantdesai, Aug 19, 2025): Does this handle two inputs of different shapes, i.e. [1,2,2] + [2,2,2]?
Contributor (author): Yes, it handles that; I validated it in the e2e flow.
Collaborator: This could potentially be added as a pass as well: https://github.com/pytorch/executorch/blob/main/backends/arm/_passes/broadcast_args_pass.py. But long term the ideal solution would be to add broadcast support to CMSIS-NN to get it accelerated without memcopies.
Contributor: It can't be added as an AoT pass and must be handled at runtime because we support dynamic shapes with these operators.

input1_int8.const_data_ptr<int8_t>(),
input2_int8.const_data_ptr<int8_t>(),
static_cast<int32_t>(zp1),
input1_mult,
input1_shift_val,
static_cast<int32_t>(zp2),
input2_mult,
input2_shift_val,
left_shift,
out.mutable_data_ptr<int8_t>(),
static_cast<int32_t>(out_zp),
output_mult,
output_shift_val,
static_cast<int32_t>(out.numel()),
activation_min,
activation_max);

if (status != ARM_CMSIS_NN_SUCCESS) {
ET_LOG(
Error,
"quantized_add_out: arm_elementwise_add_s8 failed with status [%d]",
status);

context.fail(Error::Internal); // Fail the execution context
return out;
}
ET_LOG(
Info,
"quantized_add_out: Successfully completed with AoT-computed parameters!");

return out;
}

// Stub Implementation: Non-out variant for compatibility (functional variant)
// EXIR/ExecuTorch runs an out-variant pass that converts
// .default operations to .out variants before memory planning.
// In the pass we are calling quantized_add's default variant
// but ExecuTorch's kernel dispatch mechanism will end up calling the out
// variant. This stub is to make sure that the compiler doesn't complain.
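// Illustration (informal, following the description above): the pass turns a
// functional call such as
// out = cortex_m::quantized_add(a, ..., output_shift)
// into the out-variant form
// cortex_m::quantized_add.out(a, ..., output_shift, out)
// so only quantized_add_out above is executed at runtime.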
Contributor: What's the compiler complaint? I think you might need to write a version of add_out which doesn't take the context arg, IIRC.

Tensor quantized_add(
Contributor: Remove this?

KernelRuntimeContext& context,
const Tensor& input1_int8,
const Scalar& input1_zero_point,
const Scalar& input1_multiplier,
const Scalar& input1_shift,
const Tensor& input2_int8,
const Scalar& input2_zero_point,
const Scalar& input2_multiplier,
const Scalar& input2_shift,
const Scalar& output_zero_point,
const Scalar& output_multiplier,
const Scalar& output_shift) {
ET_LOG(Info, "quantized_add: input1_int8.sizes() = %zu", input1_int8.sizes().size());

// Crash on debug builds if invoked
assert(false);
// This is to make sure the compiler doesn't complain.
return const_cast<Tensor&>(input1_int8);
}

} // namespace native
} // namespace cortex_m