pytorch · zonglinpeng · Oct 6, 2025 · Oct 7, 2025
diff --git a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp
@@ -31,25 +31,24 @@ void dequantize_per_tensor_out(
 
   if (input.scalar_type() == ScalarType::Byte) {
     const uint8_t* input_data = input.const_data_ptr<uint8_t>();
-    impl::vision::native::kernels::dequantize<uint8_t>(
+    kernels::dequantize<uint8_t>(
         out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Char) {
     const int8_t* input_data = input.const_data_ptr<int8_t>();
-    impl::vision::native::kernels::dequantize<int8_t>(
-        out_data, input_data, scale, zero_point, numel);
+    kernels::dequantize<int8_t>(out_data, input_data, scale, zero_point, numel);
   } else if (
       input.scalar_type() == ScalarType::Bits16 ||
       input.scalar_type() == ScalarType::UInt16) {
     const uint16_t* input_data = input.const_data_ptr<uint16_t>();
-    impl::vision::native::kernels::dequantize<uint16_t>(
+    kernels::dequantize<uint16_t>(
         out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Short) {
     const int16_t* input_data = input.const_data_ptr<int16_t>();
-    impl::vision::native::kernels::dequantize<int16_t>(
+    kernels::dequantize<int16_t>(
         out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Int) {
     const int32_t* input_data = input.const_data_ptr<int32_t>();
-    impl::vision::native::kernels::dequantize<int32_t>(
+    kernels::dequantize<int32_t>(
         out_data, input_data, scale, zero_point, numel);
   } else {
     ET_CHECK_MSG(

diff --git a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp
@@ -33,25 +33,25 @@ void quantize_per_tensor_out(
 
   if (out.scalar_type() == ScalarType::Byte) {
     uint8_t* out_data = out.mutable_data_ptr<uint8_t>();
-    impl::vision::native::kernels::quantize<uint8_t>(
+    kernels::quantize<uint8_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else if (out.scalar_type() == ScalarType::Char) {
     int8_t* out_data = out.mutable_data_ptr<int8_t>();
-    impl::vision::native::kernels::quantize<int8_t>(
+    kernels::quantize<int8_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else if (
       out.scalar_type() == ScalarType::Bits16 ||
       out.scalar_type() == ScalarType::UInt16) {
     uint16_t* out_data = out.mutable_data_ptr<uint16_t>();
-    impl::vision::native::kernels::quantize<uint16_t>(
+    kernels::quantize<uint16_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else if (out.scalar_type() == ScalarType::Short) {
     int16_t* out_data = out.mutable_data_ptr<int16_t>();
-    impl::vision::native::kernels::quantize<int16_t>(
+    kernels::quantize<int16_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else if (out.scalar_type() == ScalarType::Int) {
     int32_t* out_data = out.mutable_data_ptr<int32_t>();
-    impl::vision::native::kernels::quantize<int32_t>(
+    kernels::quantize<int32_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else {
     ET_CHECK_MSG(

diff --git a/backends/cadence/vision/operators/op_quantized_conv_out.cpp b/backends/cadence/vision/operators/op_quantized_conv_out.cpp
@@ -141,8 +141,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic(
             if (quantized) {
               float val = bias_scale * acc;
               out_plane[_oh * ow + _ow] =
-                  ::impl::vision::native::kernels::quantize<OT>(
-                      val, inv_out_scale, out_zero_point);
+                  kernels::quantize<OT>(val, inv_out_scale, out_zero_point);
             } else {
               out_plane[_oh * ow + _ow] = acc;
             }
@@ -267,8 +266,8 @@ __attribute__((noinline)) void conv2d_nhwc_core_generic(
             }
             if (quantized) {
               float val = bias_scale * acc;
-              out_line[_oc] = ::impl::vision::native::kernels::quantize<OT>(
-                  val, inv_out_scale, out_zero_point);
+              out_line[_oc] =
+                  kernels::quantize<OT>(val, inv_out_scale, out_zero_point);
             } else {
               out_line[_oc] = acc;
             }
@@ -530,6 +529,80 @@ void quantized_conv_per_tensor_out(
   }
 }
 
+void quantized_conv2d_nchw_per_tensor_out( // NCHW layout wrapper
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    Tensor& out) {
+  quantized_conv_per_tensor_out( // forward all arguments unchanged
+      ctx,
+      input,
+      weight,
+      bias,
+      stride,
+      padding,
+      dilation,
+      groups,
+      in_zero_point,
+      weight_zero_point,
+      bias_scale,
+      output_scale,
+      output_zero_point,
+      out_multiplier,
+      out_shift,
+      false, // channel_last = false for NCHW
+      out);
+}
+
+void quantized_conv2d_nhwc_per_tensor_out( // NHWC layout wrapper
+    KernelRuntimeContext& ctx,
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    double bias_scale,
+    double output_scale,
+    int64_t output_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    Tensor& out) {
+  quantized_conv_per_tensor_out( // forward all arguments unchanged
+      ctx,
+      input,
+      weight,
+      bias,
+      stride,
+      padding,
+      dilation,
+      groups,
+      in_zero_point,
+      weight_zero_point,
+      bias_scale,
+      output_scale,
+      output_zero_point,
+      out_multiplier,
+      out_shift,
+      true, // channel_last = true for NHWC
+      out);
+}
+
 } // namespace native
 } // namespace vision
 } // namespace impl
diff --git a/backends/cadence/vision/operators/op_softmax.cpp b/backends/cadence/vision/operators/op_softmax.cpp
@@ -6,13 +6,13 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <api.h>
 #include <executorch/backends/cadence/vision/kernels/kernels.h>
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include <idma_init.h>
+#include <include/api.h>
+#include <include_private/idma_init.h>
 #include <stdio.h>
 
 using executorch::aten::ScalarType;

diff --git a/backends/cadence/vision/operators/quantized_ops.h b/backends/cadence/vision/operators/quantized_ops.h
@@ -49,7 +49,7 @@ inline __attribute__((always_inline)) void quantized_linear_per_tensor_(
             (int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point;
         sum += x * w;
       }
-      out_data[i * out_dim + j] = ::impl::vision::native::kernels::quantize<T>(
+      out_data[i * out_dim + j] = impl::vision::kernels::quantize<T>(
           sum, requant_scale, out_zero_point);
     }
   }
@@ -121,8 +121,8 @@ inline __attribute__((always_inline)) void quantized_linear_per_channel_(
       // Compute the out_scale from out_multiplier and out_shift
       const float out_scale =
           -out_multiplier_data[j] * 1.0 / (1 << 31) * pow(2, out_shift_data[j]);
-      out_data[i * out_dim + j] = ::impl::vision::native::kernels::quantize<T>(
-          sum, out_scale, out_zero_point);
+      out_data[i * out_dim + j] =
+          impl::vision::kernels::quantize<T>(sum, out_scale, out_zero_point);
     }
   }
 }

diff --git a/backends/cadence/vision/operators/targets.bzl b/backends/cadence/vision/operators/targets.bzl
@@ -21,6 +21,25 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
     if deps == None:
         deps = []
 
+    # Determine which headers to export based on operator name
+    exported_headers = ["operators.h"]
+
+    # Add quantized_ops.h header for quantized operators
+    quantized_ops = [
+        "quantized_fully_connected_out",
+        "quantized_matmul_out", 
+        "quantized_layer_norm",
+        "quantized_relu_out",
+        "quantized_conv_out",
+        "quantized_linear_out",
+        "quantize_per_tensor",
+        "dequantize_per_tensor",
+        "requantize_out"
+    ]
+
+    if name in quantized_ops:
+        exported_headers.append("quantized_ops.h")
+
     runtime.cxx_library(
         name = op_name,
         srcs = [op_name + ".cpp"],
@@ -31,7 +50,7 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
         ],
         compatible_with = ["ovr_config//cpu:xtensa"],
         deps = deps + common_deps,
-        exported_headers = ["operators.h"],
+        exported_headers = exported_headers,
     )
 
 OPERATORS = [

diff --git a/backends/cadence/vision/third-party/include_private/idma_init.h b/backends/cadence/vision/third-party/include_private/idma_init.h
@@ -1,31 +1,36 @@
 #ifndef __IDMA__INIT_H__
 #define __IDMA__INIT_H__
 
-#include "dtypes.h"
+#include "../include/dtypes.h"
 #include "common.h"
 
-#define IDMA_BUFF_SIZE 16384 // 16 kb DRAM storage. Assume 4 buffers (2 input and 2 output)
+#define IDMA_BUFF_SIZE \
+  16384 // 16 KB DRAM storage. Assume 4 buffers (2 input and 2 output)
 
 #ifndef PLACE_IN_DRAM0
-	#define PLACE_IN_DRAM0 __attribute__ ((aligned(2*IVP_SIMD_WIDTH), section(".dram0.data")))
+#define PLACE_IN_DRAM0 \
+  __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram0.data")))
 #endif
 
 #ifndef PLACE_IN_DRAM1
-	#define PLACE_IN_DRAM1 __attribute__ ((aligned(2*IVP_SIMD_WIDTH), section(".dram1.data")))
+#define PLACE_IN_DRAM1 \
+  __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram1.data")))
 #endif
 
 float32_t data_dram0[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM0;
 float32_t data_dram1[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM1;
 
-float32_t *inpData[2] = {&data_dram0[0], &data_dram1[0]};
-float32_t *outData[2] = {&data_dram0[IDMA_BUFF_SIZE / 4], &data_dram1[IDMA_BUFF_SIZE / 4]};
+float32_t* inpData[2] = {&data_dram0[0], &data_dram1[0]};
+float32_t* outData[2] = {
+    &data_dram0[IDMA_BUFF_SIZE / 4],
+    &data_dram1[IDMA_BUFF_SIZE / 4]};
 
 IDMA_BUFFER_DEFINE(buffer_idma_ch0, 1, IDMA_2D_DESC);
 IDMA_BUFFER_DEFINE(buffer_idma_ch1, 1, IDMA_2D_DESC);
 
-idma_buffer_t * descbuf[] = {
-  buffer_idma_ch0,
-  buffer_idma_ch1,
+idma_buffer_t* descbuf[] = {
+    buffer_idma_ch0,
+    buffer_idma_ch1,
 };
 
-#endif // __IDMA__INIT_H__
+#endif // __IDMA__INIT_H__