From a5d755e381f0101024981a4da50ebcd121fd0bfd Mon Sep 17 00:00:00 2001
From: Matthias Cremon
Date: Fri, 5 Sep 2025 17:24:33 -0700
Subject: [PATCH] Use OSS kernels everywhere (#13884)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/13884

As titled

Reviewed By: zonglinpeng

Differential Revision: D81203389
---
 backends/cadence/hifi/kernels/targets.bzl      |  1 +
 .../operators/op_quantized_conv_nchw_out.cpp   | 13 +++++++------
 .../cadence/reference/kernels/kernels.cpp      | 19 ++++++++++++++-----
 .../cadence/reference/kernels/targets.bzl      |  1 +
 4 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/backends/cadence/hifi/kernels/targets.bzl b/backends/cadence/hifi/kernels/targets.bzl
index 596acc90791..fe5feed69b5 100644
--- a/backends/cadence/hifi/kernels/targets.bzl
+++ b/backends/cadence/hifi/kernels/targets.bzl
@@ -16,6 +16,7 @@ def define_common_targets():
         compatible_with = ["ovr_config//cpu:xtensa"],
         visibility = [
             "//executorch/backends/cadence/...",
+            "@EXECUTORCH_CLIENTS",
         ],
         exported_deps = [
             "fbsource//third-party/nnlib-hifi4/xa_nnlib:libxa_nnlib_common",
diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv_nchw_out.cpp
index 297fd30e446..fbc97a4c37b 100644
--- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_conv_nchw_out.cpp
@@ -12,16 +12,17 @@
 
 #define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1)))
 
-using Tensor = executorch::aten::Tensor;
-using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
-using ScalarType = executorch::aten::ScalarType;
-using ::executorch::aten::IntArrayRef;
-
 namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
 
+using ::cadence::impl::HiFi::kernels::quantize;
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::torch::executor::KernelRuntimeContext;
+
 // This implements a generic 2d conv kernel that operates on raw pointers.
 // The version handles both quantized and fp32 convolutions.
 // The input is of shape [n x c x h x w]
@@ -145,7 +146,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic(
           if (quantized) {
             float val = bias_scale * acc;
             out_plane[_oh * ow + _ow] =
-                kernels::quantize(val, inv_out_scale, out_zero_point);
+                quantize(val, inv_out_scale, out_zero_point);
           } else {
             out_plane[_oh * ow + _ow] = acc;
           }
diff --git a/backends/cadence/reference/kernels/kernels.cpp b/backends/cadence/reference/kernels/kernels.cpp
index 9583ffc4a20..ad8746f51eb 100644
--- a/backends/cadence/reference/kernels/kernels.cpp
+++ b/backends/cadence/reference/kernels/kernels.cpp
@@ -7,10 +7,11 @@
  */
 
 #include <executorch/backends/cadence/reference/kernels/kernels.h>
-#include <math.h>
 #include <algorithm>
+#include <cmath>
 #include <cstring>
 #include <limits>
+
 namespace impl {
 namespace reference {
 namespace kernels {
@@ -18,10 +19,18 @@ namespace kernels {
 // Quantize a fp32 value to an int8_t/uint8_t value
 template <typename T>
 T quantize(const float x, float scale, int32_t zero_point) {
-  constexpr float min_val = std::numeric_limits<T>::min();
-  constexpr float max_val = std::numeric_limits<T>::max();
-  float tmp = roundf(x * scale + zero_point);
-  return std::max(std::min(tmp, max_val), min_val);
+  // constexpr float min_val = std::numeric_limits<T>::min();
+  // constexpr float max_val = std::numeric_limits<T>::max();
+  // float tmp = roundf(x * scale + zero_point);
+  // return std::max(std::min(tmp, max_val), min_val);
+  // Match Executorch CPU kernel implementation at
+  // https://fburl.com/code/fxizw6u6
+  int64_t qvalue;
+  qvalue = static_cast<int64_t>(zero_point + std::nearbyint(scale * x));
+
+  qvalue = std::max<int64_t>(qvalue, std::numeric_limits<T>::min());
+  qvalue = std::min<int64_t>(qvalue, std::numeric_limits<T>::max());
+  return static_cast<T>(qvalue);
 }
 
 // Quantize an fp32 array to an int8_t/uint8_t array
diff --git a/backends/cadence/reference/kernels/targets.bzl b/backends/cadence/reference/kernels/targets.bzl
index d3fe3fa39db..d50cfe8f130 100644
--- a/backends/cadence/reference/kernels/targets.bzl
+++ b/backends/cadence/reference/kernels/targets.bzl
@@ -10,6 +10,7 @@ def define_common_targets():
         ],
         visibility = [
             "//executorch/backends/cadence/...",
+            "@EXECUTORCH_CLIENTS",
        ],
         platforms = CXX,
     )