pytorch · psiddh · Sep 19, 2025 · Sep 8, 2025 · digantdesai · Sep 18, 2025
@@ -12,7 +12,7 @@ if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
 endif()
 
-# Source root directory for executorch.
+# Source root directory for executorch
 if(NOT EXECUTORCH_ROOT)
   set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
 endif()
@@ -21,70 +21,90 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake)
 include(FetchContent)
 
-# CMSIS-NN version to download
+# CMSIS-NN configuration: the release tag to download, plus an optional path to
+# a local checkout that is used instead of downloading when it exists
 set(CMSIS_NN_VERSION
-    "v4.1.0"
+    "v7.0.0"
     CACHE STRING "CMSIS-NN version to download"
 )
-
-# Declare CMSIS-NN as a FetchContent project
-FetchContent_Declare(
-  cmsis_nn
-  GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
-  GIT_TAG ${CMSIS_NN_VERSION}
+set(CMSIS_NN_LOCAL_PATH
+    ""
+    CACHE PATH "Path to existing local CMSIS-NN installation"
 )
 
-# Download and make CMSIS-NN available
-FetchContent_MakeAvailable(cmsis_nn)
+# Resolve CMSIS-NN. When CMSIS_NN_LOCAL_PATH names an existing directory, that
+# checkout is built in place, which is useful for debugging and testing with
+# local changes. This is not common: in the default/regular case the tagged
+# release is downloaded via FetchContent.
+if(CMSIS_NN_LOCAL_PATH AND EXISTS "${CMSIS_NN_LOCAL_PATH}")
+  message(STATUS "Using CMSIS-NN from specified path: ${CMSIS_NN_LOCAL_PATH}")
+  # An explicit binary directory is given because the path may lie outside of
+  # this source tree.
+  add_subdirectory("${CMSIS_NN_LOCAL_PATH}" cmsis_nn_build)
+else()
+  # Keep the fallback to FetchContent, but do not silently ignore a path that
+  # was requested and could not be found.
+  if(CMSIS_NN_LOCAL_PATH)
+    message(
+      WARNING
+        "CMSIS_NN_LOCAL_PATH does not exist: ${CMSIS_NN_LOCAL_PATH}. Falling back to FetchContent."
+    )
+  endif()
+  message(STATUS "Using CMSIS-NN via FetchContent")
+
+  FetchContent_Declare(
+    cmsis_nn
+    GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git
+    GIT_TAG ${CMSIS_NN_VERSION}
+    GIT_SHALLOW TRUE
+  )
+
+  # FetchContent_MakeAvailable populates the content at most once and adds it
+  # as a subdirectory. It replaces the manual FetchContent_Populate(<name>) +
+  # add_subdirectory() sequence, which is deprecated as of CMake 3.30
+  # (policy CMP0169).
+  FetchContent_MakeAvailable(cmsis_nn)
+endif()
 
-# Print paths for debugging
-message(STATUS "CMSIS-NN source dir: ${cmsis_nn_SOURCE_DIR}")
-message(STATUS "CMSIS-NN binary dir: ${cmsis_nn_BINARY_DIR}")
+# The cmsis-nn target must exist at this point; stop configuring otherwise.
+if(NOT TARGET cmsis-nn)
+  message(
+    FATAL_ERROR
+      "CMSIS-NN target not found. Check your CMSIS_NN_LOCAL_PATH or network connection."
+  )
+endif()
+
+# Add MVEI define to cmsis-nn target (PUBLIC, so it also reaches targets that
+# link against cmsis-nn) and report the include directories it exports.
+# NOTE(review): the define is applied unconditionally - confirm that every
+# supported Cortex-M target provides MVE.
+target_compile_definitions(cmsis-nn PUBLIC ARM_MATH_MVEI=1)
+get_target_property(CMSIS_NN_INCLUDES cmsis-nn INTERFACE_INCLUDE_DIRECTORIES)
+message(STATUS "CMSIS-NN include dirs: ${CMSIS_NN_INCLUDES}")
 
 # Cortex-M ops kernel sources
 set(_cortex_m_kernels__srcs
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_linear.cpp
 )
 
-# Generate C++ bindings to register kernels into Executorch (for runtime)
+# Generate C++ bindings to register kernels into Executorch
 set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml)
 gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}")
-
 generate_bindings_for_kernels(
   LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}"
 )
-message("Generated files ${gen_command_sources}")
 
-# Build a library for cortex_m_kernels
+# Build library for cortex_m_kernels
 add_library(cortex_m_kernels ${_cortex_m_kernels__srcs})
-target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options})
 
-# Include directories for cortex_m_kernels
-target_include_directories(
+# Use PRIVATE for implementation dependencies to avoid INTERFACE pollution
+target_link_libraries(
   cortex_m_kernels
-  PRIVATE ${EXECUTORCH_ROOT}/..
-          ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
-          ${cmsis_nn_SOURCE_DIR}/Include
+  PRIVATE cmsis-nn
+  PRIVATE executorch
 )
 
-# Link directly to the CMSIS-NN static library file
-target_link_libraries(
-  cortex_m_kernels PUBLIC ${cmsis_nn_BINARY_DIR}/libcmsis-nn.a executorch
+# Include directories for cortex_m_kernels
+target_include_directories(
+  cortex_m_kernels PRIVATE ${EXECUTORCH_ROOT}/..
+                           ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
 )
 
-# Add dependency to ensure CMSIS-NN builds before we try to link. Use the actual
-# CMSIS-NN target name (usually 'cmsis-nn')
-add_dependencies(cortex_m_kernels cmsis-nn)
-
 # cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime
 gen_operators_lib(
   LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch
 )
 
 install(
-  TARGETS cortex_m_kernels cortex_m_ops_lib
+  TARGETS cortex_m_kernels cortex_m_ops_lib cmsis-nn
   EXPORT ExecuTorchTargets
   DESTINATION lib
   PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/ops/

diff --git a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+
+#include "cortex_m_ops_common.h"
+extern "C" {
+#include "arm_nnfunctions.h"
+}
+
+namespace cortex_m {
+namespace native {
+
+// During AOT phase, quantized_linear_fusion_pass allocates total buffer
+// and passes in as 'Tensor'. (Total buffer = 8-byte header + x bytes)
+// ┌─────────────────┬─────────────────────────────────────┐
+// │ KernelSum Header│        CMSIS Workspace              │
+// │    (8 bytes)    │         (x bytes)                   │
+// └─────────────────┴─────────────────────────────────────┘
+//          │                           │
+//          │                           └─> Passed to CMSIS API
+//          │
+//          └─> State for kernel sum
+
+// C++ Runtime:
+// ┌─────────────────┬─────────────────────────────────────┐
+// │ KernelSum Header│        CMSIS Workspace              │
+// │    (8 bytes)    │         (x bytes)                   │
+// └─────────────────┴─────────────────────────────────────┘
+// ^                 ^
+// │                 │
+// scratch_ptr       cmsis_workspace_ptr
+// │                 │
+// ▼                 ▼
+//             arm_vector_sum_s8() writes kernel sums (with bias if avail):
+//             [sum₀+bias₀][sum₁+bias₁][sum₂+bias₂]...[sum_{n-1}+bias_{n-1}]
+//             (n * 4-byte int32_t values = x bytes)
+//
+// - n = out_features (number of output features)
+// - x = n * 4 bytes (total CMSIS buffer size)
+// - Total buffer = 8 + x bytes
+
+// Non-owning view over the scratch tensor, laid out as
+// [KernelSumHeader (8 bytes)][CMSIS workspace]. It caches the operand
+// pointers and dimensions needed to compute the CMSIS-NN kernel sums once and
+// to hand the workspace to the CMSIS-NN fully-connected API.
+class CMSISScratchBufferContext final {
+ public:
+  // Binds the context to `scratch_buffer` and records the weight, weight
+  // zero-point and optional bias data pointers.
+  //
+  // `weights` is read as [out_features, in_features] (size(0) / size(1)).
+  // Aborts via ET_CHECK_MSG when the scratch buffer has no data, cannot hold
+  // the header, or its workspace is smaller than the size reported by
+  // arm_fully_connected_s8_get_buffer_size(). As a side effect the required
+  // size is stored in the header.
+  CMSISScratchBufferContext(
+      Tensor& scratch_buffer,
+      const Tensor& weights,
+      const Tensor& weight_zero_point,
+      const torch::executor::optional<Tensor>& bias)
+      : scratch_ptr_(scratch_buffer.mutable_data_ptr<int8_t>()),
+        total_size_(static_cast<size_t>(scratch_buffer.size(0))),
+        in_features_(static_cast<int32_t>(weights.size(1))),
+        out_features_(static_cast<int32_t>(weights.size(0))),
+        is_per_channel_(weight_zero_point.numel() > 1),
+        weight_data_(weights.const_data_ptr<int8_t>()),
+        weight_zp_data_(weight_zero_point.const_data_ptr<int32_t>()),
+        bias_data_(
+            bias.has_value() ? bias.value().const_data_ptr<int32_t>()
+                             : nullptr),
+        header_(reinterpret_cast<KernelSumHeader*>(scratch_ptr_)),
+        cmsis_workspace_ptr_(scratch_ptr_ + KERNEL_SUM_HEADER_SIZE) {
+    // Make sure the header fits before touching it. Otherwise the workspace
+    // size (total - header) wraps around, the size validation below passes
+    // and the header write lands outside of the buffer.
+    ET_CHECK_MSG(scratch_ptr_ != nullptr, "Scratch buffer has no data");
+    ET_CHECK_MSG(
+        total_size_ >= KERNEL_SUM_HEADER_SIZE,
+        "Scratch buffer size %zu smaller than %zu-byte header",
+        total_size_,
+        KERNEL_SUM_HEADER_SIZE);
+
+    cmsis_nn_dims filter_dims = {in_features_, 1, 1, out_features_};
+    validate_size(filter_dims);
+  }
+
+  // Returns a CMSIS-NN context pointing at the workspace that follows the
+  // header. Aborts if that workspace is not 4-byte aligned.
+  cmsis_nn_context get_cmsis_ctx() const {
+    cmsis_nn_context ctx;
+    ET_CHECK_MSG(
+        reinterpret_cast<uintptr_t>(cmsis_workspace_ptr_) % 4 == 0,
+        "CMSIS workspace not 4-byte aligned");
+    ctx.buf = cmsis_workspace_ptr_;
+    ctx.size = static_cast<int32_t>(get_cmsis_workspace_size());
+    return ctx;
+  }
+
+  // True once the kernel sums have been written into the workspace.
+  bool is_kernel_sum_updated() const {
+    return header_->updated;
+  }
+
+  // Computes the kernel sums into the CMSIS workspace with
+  // arm_vector_sum_s8() unless the header already marks them as computed, then
+  // sets the `updated` flag so later calls are no-ops.
+  void compute_kernel_sums_if_needed() {
+    if (!header_->updated) {
+      // NOTE(review): the first weight zero-point is passed as lhs_offset and
+      // only element [0] is used even when is_per_channel_ is set - confirm
+      // against the arm_vector_sum_s8() contract.
+      arm_vector_sum_s8(
+          reinterpret_cast<int32_t*>(cmsis_workspace_ptr_),
+          in_features_,
+          out_features_,
+          get_weight_data(),
+          get_weight_zp_data()[0],
+          0,
+          get_bias_data());
+      header_->updated = true;
+      // int32_t is not `int` on every toolchain; cast so the argument matches
+      // the %d conversion.
+      ET_LOG(
+          Info,
+          "Computed kernel sums. [required_bytes : %d]",
+          static_cast<int>(header_->required_size));
+    }
+  }
+
+  // Weight data as passed to the constructor.
+  const int8_t* get_weight_data() const {
+    return weight_data_;
+  }
+
+  // Weight zero-point data as passed to the constructor.
+  const int32_t* get_weight_zp_data() const {
+    return weight_zp_data_;
+  }
+
+  // Bias data, or nullptr when no bias was supplied.
+  const int32_t* get_bias_data() const {
+    return bias_data_;
+  }
+
+  // True when the weight zero-point tensor holds more than one element.
+  bool is_per_channel_quant() const {
+    return is_per_channel_;
+  }
+  int32_t get_in_features() const {
+    return in_features_;
+  }
+  int32_t get_out_features() const {
+    return out_features_;
+  }
+
+ private:
+  static constexpr size_t KERNEL_SUM_HEADER_SIZE = 8;
+
+  // Header for kernel sum computation state only. It lives in the first
+  // KERNEL_SUM_HEADER_SIZE bytes of the scratch buffer.
+  struct KernelSumHeader {
+    bool updated = false;
+    int32_t required_size = 0;
+  };
+  static_assert(
+      sizeof(KernelSumHeader) == KERNEL_SUM_HEADER_SIZE,
+      "KernelSumHeader must be exactly 8 bytes");
+
+  int8_t* scratch_ptr_;
+  size_t total_size_;
+
+  // Context members. The operand pointers are kept as-is instead of as offsets
+  // from the scratch buffer, so they stay valid wherever the tensors are
+  // placed relative to it.
+  const int32_t in_features_;
+  const int32_t out_features_;
+  const bool is_per_channel_;
+  const int8_t* const weight_data_;
+  const int32_t* const weight_zp_data_;
+  const int32_t* const bias_data_;
+
+  KernelSumHeader* header_;
+  int8_t* cmsis_workspace_ptr_;
+
+  // Bytes available to CMSIS-NN after the header. The constructor guarantees
+  // total_size_ >= KERNEL_SUM_HEADER_SIZE, so this cannot wrap around.
+  size_t get_cmsis_workspace_size() const {
+    return total_size_ - KERNEL_SUM_HEADER_SIZE;
+  }
+
+  // Stores the workspace size CMSIS-NN reports for `filter_dims` in the header
+  // and aborts when the available workspace is smaller than that.
+  void validate_size(const cmsis_nn_dims& filter_dims) const {
+    header_->required_size =
+        arm_fully_connected_s8_get_buffer_size(&filter_dims);
+
+    ET_CHECK_MSG(
+        get_cmsis_workspace_size() >=
+            static_cast<size_t>(header_->required_size),
+        "Scratch buffer size %zu insufficient for required size %d",
+        get_cmsis_workspace_size(),
+        static_cast<int>(header_->required_size));
+  }
+};
+
+} // namespace native
+} // namespace cortex_m
diff --git a/backends/cortex_m/ops/cortex_m_ops_common.h b/backends/cortex_m/ops/cortex_m_ops_common.h
@@ -22,6 +22,10 @@ using ScalarType = executorch::aten::ScalarType;
 using Scalar = torch::executor::Scalar;
 using Error = executorch::runtime::Error;
 
+// Bounds of a signed 32-bit (Q31) value, taken from arm_nn_math_types.h. They
+// are used in this header to range-check requantization multipliers.
+#define ARM_NN_Q31_MAX ((int32_t)(0x7FFFFFFFL))
+#define ARM_NN_Q31_MIN ((int32_t)(0x80000000L))
+
 // Basic tensor type / layout validation and dimension order checking
 inline void validate_cmsis_nn_tensor_requirements(
     const Tensor& input1,
@@ -32,16 +36,19 @@ inline void validate_cmsis_nn_tensor_requirements(
   // Basic dtype validation
   ET_CHECK_MSG(
       input1.scalar_type() == expected_dtype,
-      "Input1 dtype must be %hhd",
-      expected_dtype);
+      "Input1 dtype must be %hhd, got %hhd",
+      expected_dtype,
+      input1.scalar_type());
   ET_CHECK_MSG(
       input2.scalar_type() == expected_dtype,
-      "Input2 dtype must be %hhd",
-      expected_dtype);
+      "Input2 dtype must be %hhd, got %hhd",
+      expected_dtype,
+      input2.scalar_type());
   ET_CHECK_MSG(
       output.scalar_type() == expected_dtype,
-      "Output dtype must be %hhd",
-      expected_dtype);
+      "Output dtype must be %hhd, got %hhd",
+      expected_dtype,
+      output.scalar_type());
 
   // Dim order consistency
   ET_CHECK_MSG(
@@ -114,6 +121,33 @@ inline void validate_quantization_params(
       "Single quant Output");
 }
 
+// Checks per-channel requantization parameters against the ranges accepted by
+// CMSIS-NN. Refer to the 'arm_nn_requantize' implementation for details:
+// https://github.com/ARM-software/CMSIS-NN/blob/main/Include/arm_nnsupportfunctions.h#L1625
+// multiplier: Range {ARM_NN_Q31_MIN + 1, ARM_NN_Q31_MAX}
+// shift     : Range {-31, 30}
+//
+// Returns true when the first `num_channels` entries of both arrays are in
+// range (trivially true for num_channels <= 0). Otherwise logs the first
+// offending entry and returns false.
+inline bool validate_per_channel_quant_params(
+    const int32_t* multipliers,
+    const int32_t* shifts,
+    int num_channels) {
+  // Missing arrays cannot be validated; reject them instead of dereferencing.
+  if (num_channels > 0 && (multipliers == nullptr || shifts == nullptr)) {
+    ET_LOG(
+        Error,
+        "Per-channel quant params missing for %d channels",
+        num_channels);
+    return false;
+  }
+
+  for (int i = 0; i < num_channels; ++i) {
+    // Multiplier: {ARM_NN_Q31_MIN + 1, ARM_NN_Q31_MAX}. Only the lower bound
+    // needs a test, since an int32_t can never exceed ARM_NN_Q31_MAX.
+    if (multipliers[i] <= ARM_NN_Q31_MIN) {
+      // int32_t is not `int` on every toolchain; cast so the argument matches
+      // the %d conversion.
+      ET_LOG(
+          Error,
+          "weight_multiplier[%d] out of CMSIS-NN range: %d",
+          i,
+          static_cast<int>(multipliers[i]));
+      return false;
+    }
+    // Shift: {-31, 30} for arm_nn_requantize
+    if (shifts[i] < -31 || shifts[i] > 30) {
+      ET_LOG(
+          Error,
+          "weight_shift[%d] out of range: %d",
+          i,
+          static_cast<int>(shifts[i]));
+      return false;
+    }
+  }
+  return true;
+}
+
 inline Error resize_to_broadcast_target_size(
     const Tensor& input1,
     const Tensor& input2,