pytorch · facebook-github-bot · Sep 6, 2025 · Sep 6, 2025
diff --git a/backends/cadence/hifi/kernels/targets.bzl b/backends/cadence/hifi/kernels/targets.bzl
@@ -16,6 +16,7 @@ def define_common_targets():
         compatible_with = ["ovr_config//cpu:xtensa"],
         visibility = [
             "//executorch/backends/cadence/...",
+            "@EXECUTORCH_CLIENTS",
         ],
         exported_deps = [
             "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib_common",

diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv_nchw_out.cpp
@@ -12,16 +12,17 @@
 
 #define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1)))
 
-using Tensor = executorch::aten::Tensor;
-using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
-using ScalarType = executorch::aten::ScalarType;
-using ::executorch::aten::IntArrayRef;
-
 namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
 
+using ::cadence::impl::HiFi::kernels::quantize;
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::torch::executor::KernelRuntimeContext;
+
 // This implements a generic 2d conv kernel that operates on raw pointers.
 // The version handles both quantized and fp32 convolutions.
 // The input is of shape [n x c x h x w]
@@ -145,7 +146,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic(
             if (quantized) {
               float val = bias_scale * acc;
               out_plane[_oh * ow + _ow] =
-                  kernels::quantize<OT>(val, inv_out_scale, out_zero_point);
+                  quantize<OT>(val, inv_out_scale, out_zero_point);
             } else {
               out_plane[_oh * ow + _ow] = acc;
             }

diff --git a/backends/cadence/reference/kernels/kernels.cpp b/backends/cadence/reference/kernels/kernels.cpp
@@ -7,21 +7,30 @@
  */
 
 #include <executorch/backends/cadence/reference/kernels/kernels.h>
-#include <math.h>
 #include <algorithm>
+#include <cmath>
 #include <cstring>
 #include <limits>
+
 namespace impl {
 namespace reference {
 namespace kernels {
 
 // Quantize a fp32 value to an int8_t/uint8_t value
 template <typename T>
 T quantize(const float x, float scale, int32_t zero_point) {
-  constexpr float min_val = std::numeric_limits<T>::min();
-  constexpr float max_val = std::numeric_limits<T>::max();
-  float tmp = roundf(x * scale + zero_point);
-  return std::max(std::min(tmp, max_val), min_val);
+  // Rounds with std::nearbyint (current rounding mode, ties-to-even by
+  // default) instead of the former roundf (ties away from zero), then adds
+  // zero_point and clamps to T's range. NOTE(review): the int64_t cast now
+  // precedes the clamp; confirm scale * x cannot exceed the int64_t range.
+  // Match Executorch CPU kernel implementation at
+  // https://fburl.com/code/fxizw6u6
+  int64_t qvalue;
+  qvalue = static_cast<int64_t>(zero_point + std::nearbyint(scale * x));
+
+  qvalue = std::max<int64_t>(qvalue, std::numeric_limits<T>::min());
+  qvalue = std::min<int64_t>(qvalue, std::numeric_limits<T>::max());
+  return static_cast<T>(qvalue);
 }
 
 // Quantize an fp32 array to an int8_t/uint8_t array

diff --git a/backends/cadence/reference/kernels/targets.bzl b/backends/cadence/reference/kernels/targets.bzl
@@ -10,6 +10,7 @@ def define_common_targets():
         ],
         visibility = [
             "//executorch/backends/cadence/...",
+            "@EXECUTORCH_CLIENTS",
         ],
         platforms = CXX,
     )