From 7d6ca43022ee825a33560795cc2df0e02fc7462d Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:29:11 -0700 Subject: [PATCH 1/7] Revert "Implement unary_ufunc functions using elementwise_util (#9386)" This reverts commit dffbd426a4c519a4dc1596595578b5fa9e68d585. --- kernels/portable/cpu/op_acos.cpp | 5 +- kernels/portable/cpu/op_acosh.cpp | 5 +- kernels/portable/cpu/op_asin.cpp | 5 +- kernels/portable/cpu/op_asinh.cpp | 5 +- kernels/portable/cpu/op_atan.cpp | 5 +- kernels/portable/cpu/op_atanh.cpp | 5 +- kernels/portable/cpu/op_ceil.cpp | 4 +- kernels/portable/cpu/op_cos.cpp | 4 +- kernels/portable/cpu/op_cosh.cpp | 5 +- kernels/portable/cpu/op_erf.cpp | 4 +- kernels/portable/cpu/op_exp.cpp | 4 +- kernels/portable/cpu/op_expm1.cpp | 7 +- kernels/portable/cpu/op_floor.cpp | 4 +- kernels/portable/cpu/op_isinf.cpp | 5 +- kernels/portable/cpu/op_isnan.cpp | 5 +- kernels/portable/cpu/op_log.cpp | 4 +- kernels/portable/cpu/op_log10.cpp | 5 +- kernels/portable/cpu/op_log1p.cpp | 5 +- kernels/portable/cpu/op_log2.cpp | 5 +- kernels/portable/cpu/op_reciprocal.cpp | 13 ++- kernels/portable/cpu/op_rsqrt.cpp | 11 ++- kernels/portable/cpu/op_sin.cpp | 4 +- kernels/portable/cpu/op_sinh.cpp | 5 +- kernels/portable/cpu/op_sqrt.cpp | 5 +- kernels/portable/cpu/op_tan.cpp | 4 +- kernels/portable/cpu/op_tanh.cpp | 5 +- kernels/portable/cpu/op_trunc.cpp | 4 +- kernels/portable/cpu/pattern/pattern.cpp | 28 ------ kernels/portable/cpu/pattern/pattern.h | 94 ++----------------- kernels/portable/cpu/pattern/targets.bzl | 8 +- .../pattern/unary_ufunc_realhb_to_bool.cpp | 60 ++++++++++++ .../unary_ufunc_realhbbf16_to_floathbf16.cpp | 60 ++++++++++++ .../cpu/pattern/unary_ufunc_realhbf16.cpp | 53 +++++++++++ kernels/portable/cpu/util/vectorized_math.h | 27 +----- .../kernels/portable/op_registration_util.bzl | 1 - 35 files changed, 251 insertions(+), 222 deletions(-) delete mode 100644 kernels/portable/cpu/pattern/pattern.cpp create mode 100644 kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp create mode 100644 kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp create mode 100644 kernels/portable/cpu/pattern/unary_ufunc_realhbf16.cpp diff --git a/kernels/portable/cpu/op_acos.cpp b/kernels/portable/cpu/op_acos.cpp index 81daf10c9a6..dac3b1546f3 100644 --- a/kernels/portable/cpu/op_acos.cpp +++ b/kernels/portable/cpu/op_acos.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& acos_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "acos.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::acos(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::acos, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_acosh.cpp b/kernels/portable/cpu/op_acosh.cpp index b402698d761..77f7edf4c5d 100644 --- a/kernels/portable/cpu/op_acosh.cpp +++ b/kernels/portable/cpu/op_acosh.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& acosh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "acosh.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::acosh(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::acosh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_asin.cpp b/kernels/portable/cpu/op_asin.cpp index ddb52c70e84..6affa6e4122 
100644 --- a/kernels/portable/cpu/op_asin.cpp +++ b/kernels/portable/cpu/op_asin.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& asin_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "asin.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::asin(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::asin, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_asinh.cpp b/kernels/portable/cpu/op_asinh.cpp index 9441db09589..bce8dcf6d5a 100644 --- a/kernels/portable/cpu/op_asinh.cpp +++ b/kernels/portable/cpu/op_asinh.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& asinh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "asinh.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::asinh(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::asinh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_atan.cpp b/kernels/portable/cpu/op_atan.cpp index 6a73341bf0d..23549627a3b 100644 --- a/kernels/portable/cpu/op_atan.cpp +++ b/kernels/portable/cpu/op_atan.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& atan_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "atan.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::atan(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::atan, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_atanh.cpp b/kernels/portable/cpu/op_atanh.cpp index 9e036a5fb3b..13e6e8ca141 100644 --- a/kernels/portable/cpu/op_atanh.cpp +++ b/kernels/portable/cpu/op_atanh.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& atanh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "atanh.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::atanh(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::atanh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_ceil.cpp b/kernels/portable/cpu/op_ceil.cpp index e2c8e6f07b6..5aa09ba0084 100644 --- a/kernels/portable/cpu/op_ceil.cpp +++ b/kernels/portable/cpu/op_ceil.cpp @@ -17,9 +17,7 @@ namespace native { using executorch::aten::Tensor; Tensor& ceil_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "ceil.out"; - return internal::unary_ufunc_realhbf16( - [](auto x) { return executorch::math::ceil(x); }, ctx, in, out); + return internal::unary_ufunc_realhbf16(std::ceil, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_cos.cpp b/kernels/portable/cpu/op_cos.cpp index e7876116f94..e536060d162 100644 --- a/kernels/portable/cpu/op_cos.cpp +++ b/kernels/portable/cpu/op_cos.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& cos_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "cos.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::cos(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::cos, ctx, in, 
out); } } // namespace native diff --git a/kernels/portable/cpu/op_cosh.cpp b/kernels/portable/cpu/op_cosh.cpp index 9703ff0336c..e622bbe6fcd 100644 --- a/kernels/portable/cpu/op_cosh.cpp +++ b/kernels/portable/cpu/op_cosh.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& cosh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "cosh.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::cosh(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::cosh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_erf.cpp b/kernels/portable/cpu/op_erf.cpp index aee0101fdb4..6897bcda95b 100644 --- a/kernels/portable/cpu/op_erf.cpp +++ b/kernels/portable/cpu/op_erf.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& erf_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "erf.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::erf(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::erf, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_exp.cpp b/kernels/portable/cpu/op_exp.cpp index f2241613609..cbfc8924cb0 100644 --- a/kernels/portable/cpu/op_exp.cpp +++ b/kernels/portable/cpu/op_exp.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "exp.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::exp(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::exp, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_expm1.cpp b/kernels/portable/cpu/op_expm1.cpp index 67af9b343bb..f2d49f615b1 100644 --- a/kernels/portable/cpu/op_expm1.cpp +++ b/kernels/portable/cpu/op_expm1.cpp @@ -7,19 +7,16 @@ */ #include -#include #include #include -#include namespace torch { namespace executor { namespace native { Tensor& expm1_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "expm1.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::expm1(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::expm1, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_floor.cpp b/kernels/portable/cpu/op_floor.cpp index 14b49cafbc1..4061722bd27 100644 --- a/kernels/portable/cpu/op_floor.cpp +++ b/kernels/portable/cpu/op_floor.cpp @@ -17,9 +17,7 @@ namespace native { using executorch::aten::Tensor; Tensor& floor_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "floor.out"; - return internal::unary_ufunc_realhbf16( - [](auto x) { return executorch::math::floor(x); }, ctx, in, out); + return internal::unary_ufunc_realhbf16(std::floor, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_isinf.cpp b/kernels/portable/cpu/op_isinf.cpp index 42798231a84..92d1e563a2e 100644 --- a/kernels/portable/cpu/op_isinf.cpp +++ b/kernels/portable/cpu/op_isinf.cpp @@ -17,9 +17,8 @@ namespace native { Tensor& isinf_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { // Lambda is syntactic sugar needed to workaround compilation on some 
older // non-compatible distros where isnan is returning int rather than bool - static constexpr const char op_name[] = "isinf.out"; - return internal::unary_ufunc_realhb_to_bool( - [](auto x) -> bool { return std::isinf(x); }, ctx, in, out); + return internal::unary_ufunc_realhb_to_bool( + [](double x) -> bool { return std::isinf(x); }, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_isnan.cpp b/kernels/portable/cpu/op_isnan.cpp index 817d314fd2b..51e189992ee 100644 --- a/kernels/portable/cpu/op_isnan.cpp +++ b/kernels/portable/cpu/op_isnan.cpp @@ -17,9 +17,8 @@ namespace native { Tensor& isnan_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { // Lambda is syntactic sugar needed to workaround compilation on some older // non-compatible distros where isnan is returning int rather than bool - static constexpr const char op_name[] = "isnan.out"; - return internal::unary_ufunc_realhb_to_bool( - [](auto x) -> bool { return std::isnan(x); }, ctx, in, out); + return internal::unary_ufunc_realhb_to_bool( + [](double x) -> bool { return std::isnan(x); }, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_log.cpp b/kernels/portable/cpu/op_log.cpp index 5b0c32549aa..8a36bce8c49 100644 --- a/kernels/portable/cpu/op_log.cpp +++ b/kernels/portable/cpu/op_log.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& log_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "log.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::log(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::log, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_log10.cpp b/kernels/portable/cpu/op_log10.cpp index 5251aea201d..89f9b672476 100644 --- a/kernels/portable/cpu/op_log10.cpp +++ b/kernels/portable/cpu/op_log10.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& log10_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "log10.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::log10(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::log10, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_log1p.cpp b/kernels/portable/cpu/op_log1p.cpp index f352750a944..2daa31e37ff 100644 --- a/kernels/portable/cpu/op_log1p.cpp +++ b/kernels/portable/cpu/op_log1p.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& log1p_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "log1p.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::log1p(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::log1p, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_log2.cpp b/kernels/portable/cpu/op_log2.cpp index 42d17ea83b9..4d7406832e4 100644 --- a/kernels/portable/cpu/op_log2.cpp +++ b/kernels/portable/cpu/op_log2.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& log2_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "log2.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::log2(x); }, ctx, in, out); + return 
internal::unary_ufunc_realhbbf16_to_floathbf16( + std::log2, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_reciprocal.cpp b/kernels/portable/cpu/op_reciprocal.cpp index a1bd116a962..f22f9883858 100644 --- a/kernels/portable/cpu/op_reciprocal.cpp +++ b/kernels/portable/cpu/op_reciprocal.cpp @@ -12,11 +12,18 @@ namespace torch { namespace executor { namespace native { +namespace { + +double reciprocal(double x) { + return 1.0 / x; +} + +} // namespace + Tensor& reciprocal_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "reciprocal.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::reciprocal(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + reciprocal, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_rsqrt.cpp b/kernels/portable/cpu/op_rsqrt.cpp index a14eb15d7ec..19c4c6c1a57 100644 --- a/kernels/portable/cpu/op_rsqrt.cpp +++ b/kernels/portable/cpu/op_rsqrt.cpp @@ -12,11 +12,16 @@ namespace torch { namespace executor { namespace native { +namespace { + +double rsqrt(double x) { + return 1.0 / std::sqrt(x); +} + +} // namespace Tensor& rsqrt_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "rsqrt.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::rsqrt(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(rsqrt, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_sin.cpp b/kernels/portable/cpu/op_sin.cpp index aeb73009729..ad65c4be18b 100644 --- a/kernels/portable/cpu/op_sin.cpp +++ b/kernels/portable/cpu/op_sin.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& sin_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "sin.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::sin(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::sin, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_sinh.cpp b/kernels/portable/cpu/op_sinh.cpp index f4cc67ad35f..21666392392 100644 --- a/kernels/portable/cpu/op_sinh.cpp +++ b/kernels/portable/cpu/op_sinh.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& sinh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "sinh.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::sinh(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::sinh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_sqrt.cpp b/kernels/portable/cpu/op_sqrt.cpp index 1b3d2ff6de5..bd2075f5b04 100644 --- a/kernels/portable/cpu/op_sqrt.cpp +++ b/kernels/portable/cpu/op_sqrt.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& sqrt_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "sqrt.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::sqrt(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::sqrt, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_tan.cpp b/kernels/portable/cpu/op_tan.cpp index 19ccb84935b..a2b921d5146 
100644 --- a/kernels/portable/cpu/op_tan.cpp +++ b/kernels/portable/cpu/op_tan.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& tan_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "tan.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::tan(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::tan, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_tanh.cpp b/kernels/portable/cpu/op_tanh.cpp index 623968ac721..ae9f93dc62c 100644 --- a/kernels/portable/cpu/op_tanh.cpp +++ b/kernels/portable/cpu/op_tanh.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& tanh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "tanh.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::tanh(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::tanh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_trunc.cpp b/kernels/portable/cpu/op_trunc.cpp index 9c96865db0e..2d70a3b1724 100644 --- a/kernels/portable/cpu/op_trunc.cpp +++ b/kernels/portable/cpu/op_trunc.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& trunc_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "trunc.out"; - return internal::unary_ufunc_realhbf16( - [](auto x) { return executorch::math::trunc(x); }, ctx, in, out); + return internal::unary_ufunc_realhbf16(std::trunc, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/pattern/pattern.cpp b/kernels/portable/cpu/pattern/pattern.cpp deleted file mode 100644 index 61571f25ddc..00000000000 --- a/kernels/portable/cpu/pattern/pattern.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace torch::executor::native::internal { - -bool check_and_resize_inputs( - KernelRuntimeContext& ctx, - const Tensor& in, - Tensor& out) { - ET_KERNEL_CHECK( - ctx, tensors_have_same_dim_order(in, out), InvalidArgument, false); - ET_KERNEL_CHECK_MSG( - ctx, - resize_tensor(out, in.sizes()) == Error::Ok, - InvalidArgument, - false, - "Failed to resize output tensor."); - return true; -} - -} // namespace torch::executor::native::internal diff --git a/kernels/portable/cpu/pattern/pattern.h b/kernels/portable/cpu/pattern/pattern.h index 02690739a01..2d4b2ac509c 100644 --- a/kernels/portable/cpu/pattern/pattern.h +++ b/kernels/portable/cpu/pattern/pattern.h @@ -46,7 +46,6 @@ question is a bit more specific, then add a descriptive sufix. */ #pragma once -#include #include namespace torch { @@ -54,78 +53,29 @@ namespace executor { namespace native { namespace internal { -// Implementation detail for the other helpers in this header. Returns -// true on success, false on failure. -bool check_and_resize_inputs( - KernelRuntimeContext& ctx, - const Tensor& in, - Tensor& out); - /** * Implements an op pattern for ops that take a single input tensor of any - * realhbf16 dtype, no additional arguments, and outputs a tensor of the same - * size and dtype. The function fn specifies the math operation which is applied - * to the input tensor element-wise. 
+ * realh dtye, no additional arguments, and outputs a tensor of the same size + * and dtype. The function fn specifies the math operation which is applied to + * the input tensor element-wise. */ -template Tensor& unary_ufunc_realhbf16( - const Op& fn, + double (*fn)(double), KernelRuntimeContext& ctx, const Tensor& in, - Tensor& out) { - if (!check_and_resize_inputs(ctx, in, out)) { - return out; - } - ET_KERNEL_CHECK( - ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); - - ET_SWITCH_REALHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE, [&] { - utils::apply_unitensor_elementwise_fn< - CTYPE, - op_name, - utils::SupportedTensorDtypes::SAME_AS_COMMON>( - fn, ctx, in, utils::SupportedTensorDtypes::REALHBF16, out); - }); - return out; -} + Tensor& out); /** * Implements an op pattern for ops that take a single input tensor of any - * realhb dtype (real, half and boolean), no additional arguments, and outputs a + * realhb dtye (real, half and boolean), no additional arguments, and outputs a * boolean tensor of the same size. The function fn specifies the math * operation which is applied to the input tensor element-wise. */ -template Tensor& unary_ufunc_realhb_to_bool( - const Op& fn, + bool (*fn)(double), KernelRuntimeContext& ctx, const Tensor& in, - Tensor& out) { - if (!check_and_resize_inputs(ctx, in, out)) { - return out; - } - ET_KERNEL_CHECK_MSG( - ctx, - out.scalar_type() == executorch::aten::ScalarType::Bool, - InvalidArgument, - out, - "Expected out tensor to have dtype Bool, but got %" PRId8 " instead.", - static_cast(out.scalar_type())); - - ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { - utils::apply_unitensor_elementwise_fn< - CTYPE_IN, - op_name, - utils::SupportedTensorDtypes::BOOL>( - [fn](const CTYPE_IN val_in) { return fn(val_in); }, - ctx, - in, - utils::SupportedTensorDtypes::REALHBBF16, - out); - }); - - return out; -} + Tensor& out); /** * Implements an op pattern for ops that take a single input tensor of any @@ -133,35 +83,11 @@ Tensor& unary_ufunc_realhb_to_bool( * outputs a floating point tensor of the same size. The function fn specifies * the math operation which is applied to the input tensor element-wise. */ -template Tensor& unary_ufunc_realhbbf16_to_floathbf16( - const Op& fn, + double (*fn)(double), KernelRuntimeContext& ctx, const Tensor& in, - Tensor& out) { - ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out); - - if (!check_and_resize_inputs(ctx, in, out)) { - return out; - } - - ScalarType compute_type = in.scalar_type() == ScalarType::Double - ? 
ScalarType::Double - : ScalarType::Float; - ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&] { - utils::apply_unitensor_elementwise_fn< - CTYPE_COMPUTE, - op_name, - utils::SupportedTensorDtypes::FLOATHBF16>( - [fn](const auto val_in) { return fn(val_in); }, - ctx, - in, - utils::SupportedTensorDtypes::REALHBBF16, - out); - }); - - return out; -} + Tensor& out); } // namespace internal } // namespace native diff --git a/kernels/portable/cpu/pattern/targets.bzl b/kernels/portable/cpu/pattern/targets.bzl index 4140e4e0f14..5fc73ccd911 100644 --- a/kernels/portable/cpu/pattern/targets.bzl +++ b/kernels/portable/cpu/pattern/targets.bzl @@ -49,14 +49,18 @@ def define_common_targets(): runtime.cxx_library( name = "pattern", - srcs = ["pattern.cpp"], + srcs = [ + "unary_ufunc_realhb_to_bool.cpp", + "unary_ufunc_realhbbf16_to_floathbf16.cpp", + "unary_ufunc_realhbf16.cpp", + ], exported_headers = [ "pattern.h", ], compiler_flags = ["-Wno-missing-prototypes"], exported_deps = [ "//executorch/kernels/portable/cpu/util:broadcast_util", - "//executorch/kernels/portable/cpu/util:elementwise_util", + "//executorch/kernels/portable/cpu/util:functional_util", "//executorch/runtime/kernel:kernel_includes", ], visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."], diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp new file mode 100644 index 00000000000..367137ad02c --- /dev/null +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { +namespace internal { + +Tensor& unary_ufunc_realhb_to_bool( + bool (*fn)(double), + KernelRuntimeContext& ctx, + const Tensor& in, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_KERNEL_CHECK_MSG( + ctx, + out.scalar_type() == executorch::aten::ScalarType::Bool, + InvalidArgument, + out, + "Expected out tensor to have dtype Bool, but got %" PRId8 " instead.", + static_cast(out.scalar_type())); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + const auto in_type = in.scalar_type(); + + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { + apply_unary_map_fn( + [fn](const CTYPE_IN val_in) { return fn(val_in); }, + in.const_data_ptr(), + out.mutable_data_ptr(), + in.numel()); + }); + + return out; +} + +} // namespace internal +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp new file mode 100644 index 00000000000..602b5b1bfd2 --- /dev/null +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { +namespace internal { + +Tensor& unary_ufunc_realhbbf16_to_floathbf16( + double (*fn)(double), + KernelRuntimeContext& ctx, + const Tensor& in, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out); + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + const auto in_type = in.scalar_type(); + const auto out_type = out.scalar_type(); + + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { + ET_SWITCH_FLOATHBF16_TYPES(out_type, ctx, __func__, CTYPE_OUT, [&] { + apply_unary_map_fn( + [fn](const CTYPE_IN val_in) { + CTYPE_OUT xi = static_cast(val_in); + return static_cast(fn(xi)); + }, + in.const_data_ptr(), + out.mutable_data_ptr(), + in.numel()); + }); + }); + + return out; +} + +} // namespace internal +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhbf16.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhbf16.cpp new file mode 100644 index 00000000000..3672e223b7e --- /dev/null +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhbf16.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { +namespace internal { + +Tensor& unary_ufunc_realhbf16( + double (*fn)(double), + KernelRuntimeContext& ctx, + const Tensor& in, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, __func__, CTYPE, [&] { + apply_unary_map_fn( + [fn](const CTYPE val_in) { return static_cast(fn(val_in)); }, + in.const_data_ptr(), + out.mutable_data_ptr(), + in.numel()); + }); + + return out; +} + +} // namespace internal +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/vectorized_math.h b/kernels/portable/cpu/util/vectorized_math.h index 823d0ccc39a..9e706ace56d 100644 --- a/kernels/portable/cpu/util/vectorized_math.h +++ b/kernels/portable/cpu/util/vectorized_math.h @@ -104,14 +104,11 @@ auto convert_to_vectorized_n_of_float(at::vec::Vectorized vec) { #endif // ET_USE_PYTORCH_HEADERS // To simplify client code, we provide coverage for a bunch of float ops (the -// same ones listed in ATen vml.h, plus acosh, asinh, atanh) here. +// same ones listed in ATen vml.h) here. 
ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(abs) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(acos) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(acosh) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(asin) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(asinh) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(atan) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(atanh) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(ceil) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(cos) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(cosh) @@ -134,30 +131,12 @@ ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(trunc) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(lgamma) #ifdef ET_USE_PYTORCH_HEADERS -ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(reciprocal) -ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(rsqrt) +ET_INTERNAL_VECTORIZED_FLOAT_BINARY_FUNC(rsqrt) #endif // ET_USE_PYTORCH_HEADERS namespace executorch { inline namespace math { -inline float reciprocal(float x) { - return 1.0f / x; -} - -inline double reciprocal(double x) { - return 1.0 / x; -} - -template < - typename Integer, - std::enable_if_t, bool> = true> -double reciprocal(Integer x) { - return reciprocal((double)x); -} - -template < - typename T, - std::enable_if_t, bool> = true> +template >> T rsqrt(T x) { return T(1) / std::sqrt(x); } diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index 84c6567b495..a731ce5c674 100644 --- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -533,7 +533,6 @@ ATEN_OPS = ( name = "op_expm1", deps = [ "//executorch/kernels/portable/cpu/pattern:pattern", - "//executorch/kernels/portable/cpu/util:elementwise_util", ], ), op_target( From 20d31faa087a9feb2c92400b822f115d7e0036a4 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:29:47 -0700 Subject: [PATCH 2/7] Revert "Add mixed integer precision test for op_mul (#11206)" This reverts commit 3a4ec6e8785a0c73fb44444493015a9abbf64881. --- kernels/test/op_mul_test.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index 34433fbe95c..c21cceeaae3 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -746,21 +746,6 @@ TEST_F(OpMulOutTest, DynamicShapeUnbound) { EXPECT_TENSOR_CLOSE(out, expected_result); } -// >>> torch.ops.aten.mul(torch.tensor([100], dtype=torch.int8), -// torch.tensor([100], dtype=torch.int8), out=torch.zeros([1], -// dtype=torch.long)) tensor([16]) -TEST_F(OpMulOutTest, MixedIntegerDtypeMatchesATen) { - TensorFactory tf_in; - TensorFactory tf_out; - - Tensor in = tf_in.make({1}, {100}); - Tensor out = tf_out.zeros({1}); - Tensor ret = op_mul_out(in, in, out); - - Tensor expected = tf_out.make({1}, {16}); - EXPECT_TENSOR_CLOSE(out, expected); -} - TEST_F(OpMulScalarOutTest, SanityCheck) { TensorFactory tf_a; TensorFactory tf_out; From 7a2934b37d9adb191af3af1faad14c1a791814e5 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:30:03 -0700 Subject: [PATCH 3/7] Revert "Add SupportedTensorDtypes::BOOL (#9584)" This reverts commit 9123e91323c3603330c73258d84a1256bc7e4da9. 
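Context for this revert (the sketch below uses hypothetical stand-in types, not the real dtype_util definitions): the BOOL entry being removed was used by the elementwise_util-based unary_ufunc_realhb_to_bool pattern reverted earlier in this stack. With it gone, boolean outputs are validated through BOOL_OR_BYTE again, which specialized_output_scalar_type already maps to ScalarType::Bool.

    // Illustrative stand-ins only; the real enums live in dtype_util.h.
    enum class ScalarType { Bool, Float };
    enum class SupportedTensorDtypes { REALHBBF16, FLOATHBF16, INTB, BOOL_OR_BYTE, SAME_AS_COMMON };

    // Shape of the output-type specialization after the revert: with no
    // dedicated BOOL member, BOOL_OR_BYTE is the only route to a Bool output.
    constexpr ScalarType specialized_output_scalar_type(SupportedTensorDtypes out_dtypes) {
      switch (out_dtypes) {
        case SupportedTensorDtypes::BOOL_OR_BYTE:
          return ScalarType::Bool;
        default:
          return ScalarType::Float;  // the real switch handles more cases
      }
    }

    static_assert(
        specialized_output_scalar_type(SupportedTensorDtypes::BOOL_OR_BYTE) ==
            ScalarType::Bool,
        "Bool outputs still specialize via BOOL_OR_BYTE");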
--- kernels/portable/cpu/util/dtype_util.cpp | 2 -- kernels/portable/cpu/util/dtype_util.h | 30 +----------------------- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.cpp b/kernels/portable/cpu/util/dtype_util.cpp index 525199a6f78..d240b9f83bc 100644 --- a/kernels/portable/cpu/util/dtype_util.cpp +++ b/kernels/portable/cpu/util/dtype_util.cpp @@ -27,8 +27,6 @@ bool check_tensor_dtype( return executorch::runtime::tensor_is_floating_type(t); case SupportedTensorDtypes::INTB: return executorch::runtime::tensor_is_integral_type(t, true); - case SupportedTensorDtypes::BOOL: - return executorch::runtime::tensor_is_type(t, ScalarType::Bool); case SupportedTensorDtypes::BOOL_OR_BYTE: return (executorch::runtime::tensor_is_type( t, ScalarType::Bool, ScalarType::Byte)); diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 15732219c8f..1e7901c80b2 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -72,16 +72,6 @@ load_to_compute_fn get_load_to_compute_fn_intb(const Tensor& t) { return result; } -template -load_to_compute_fn get_load_to_compute_fn_bool(const Tensor& t) { - ET_CHECK_MSG( - t.scalar_type() == ScalarType::Bool, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(t.scalar_type()), - op_name); - return internal::load_and_convert; -} - template load_to_compute_fn get_load_to_compute_fn_bool_or_byte( const Tensor& t) { @@ -175,17 +165,6 @@ store_compute_to_tensor_fn get_store_compute_to_tensor_fn_intb( return result; } -template -store_compute_to_tensor_fn get_store_compute_to_tensor_fn_bool( - const Tensor& t) { - ET_CHECK_MSG( - t.scalar_type() == ScalarType::Bool, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(t.scalar_type()), - op_name); - return internal::convert_and_store; -} - template store_compute_to_tensor_fn get_store_compute_to_tensor_fn_bool_or_byte(const Tensor& t) { @@ -240,7 +219,6 @@ enum class SupportedTensorDtypes { REALHBF16, FLOATHBF16, INTB, - BOOL, BOOL_OR_BYTE, // DEPRECATED: not likely to be correct; use SAME_AS_COMMON. SAME_AS_COMPUTE, @@ -262,8 +240,6 @@ load_to_compute_fn get_load_to_compute_fn_impl( return get_load_to_compute_fn_realhbf16(t); case SupportedTensorDtypes::INTB: return get_load_to_compute_fn_intb(t); - case SupportedTensorDtypes::BOOL: - return get_load_to_compute_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_load_to_compute_fn_bool_or_byte(t); case SupportedTensorDtypes::SAME_AS_COMPUTE: @@ -295,8 +271,6 @@ store_compute_to_tensor_fn get_store_compute_to_tensor_fn( t); case SupportedTensorDtypes::INTB: return get_store_compute_to_tensor_fn_intb(t); - case SupportedTensorDtypes::BOOL: - return get_store_compute_to_tensor_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_store_compute_to_tensor_fn_bool_or_byte< CTYPE_COMPUTE, @@ -344,14 +318,12 @@ bool check_tensor_dtype( const ScalarType compute_type); /// Return the one output type we are willing to emit specialized code -/// to handle, given a compute type of CTYPE_COMPUTE and supported +/// to handle, given a compute type of CTYPE_COMMON and supported /// output types of out_dtypes. 
template inline constexpr ScalarType specialized_output_scalar_type( SupportedTensorDtypes out_dtypes) { switch (out_dtypes) { - case SupportedTensorDtypes::BOOL: - return ScalarType::Bool; case SupportedTensorDtypes::BOOL_OR_BYTE: return ScalarType::Bool; case SupportedTensorDtypes::REALHBBF16: From a2e898eb9b3fab6e79c8f66f1000d6f02311ddd1 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:30:31 -0700 Subject: [PATCH 4/7] Revert "relax tolerances for all unary float ops (#9585)" This reverts commit 2dedc9e0e39047269b7762652b593c5c53883168. --- .../UnaryUfuncRealHBBF16ToFloatHBF16Test.h | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.h b/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.h index d1e812ec2c2..6e49dd9e57b 100644 --- a/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.h +++ b/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.h @@ -72,16 +72,20 @@ class UnaryUfuncRealHBBF16ToFloatHBF16Test : public OperatorTest { auto expected = tf_out.make({1, 6}, expected_vector); if (IN_DTYPE == ScalarType::BFloat16 || OUT_DTYPE == ScalarType::BFloat16) { - // Raise tolerance because both we and ATen run these - // computations at internal float32 precision rather than - // float64. - double rtol = 3e-3; + double rtol = executorch::runtime::testing::internal::kDefaultRtol; + // It appears we need a higher tolerance for at least some ATen + // tests, like aten_op_acosh_test. + if (get_supported_features()->is_aten) { + rtol = 3e-3; + } EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, rtol, executorch::runtime::testing::internal::kDefaultBFloat16Atol); } else if (IN_DTYPE == ScalarType::Half || OUT_DTYPE == ScalarType::Half) { - // Raise tolerance because both we and ATen run these - // computations at internal float32 precision rather than - // float64. - double rtol = 1e-3; + double rtol = executorch::runtime::testing::internal::kDefaultRtol; + // It appears we need a higher tolerance for at least some ATen + // tests, like aten_op_acosh_test. + if (get_supported_features()->is_aten) { + rtol = 1e-3; + } EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, rtol, executorch::runtime::testing::internal::kDefaultHalfAtol); } else { EXPECT_TENSOR_CLOSE(out, expected); From c275c6422ed442372eb3cadfb42a2c53a6fceccd Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:30:49 -0700 Subject: [PATCH 5/7] Revert "Add vectorization in elementwise_util (#9432)" This reverts commit 4c35fe04477efb99b2f2c1da7402de8f08c3129c. 
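For context, a self-contained sketch of the scalar-only shape this revert returns to (generic names, not the real elementwise_util API): with the at::vec::Vectorized fast path removed, the dtype-specialized implementation invokes the lambda one CTYPE_COMPUTE value at a time, so call sites in this stack go back to typed lambdas (for example op_mul's [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { return val_a * val_b; }) and plain std:: math calls instead of the executorch::math wrappers.

    #include <cstddef>

    // Scalar-only element-wise apply: the op sees one CTYPE value per call,
    // so it never has to be invocable with at::vec::Vectorized<CTYPE>.
    template <typename CTYPE, typename Op>
    void apply_bitensor_scalar(
        const Op& op, const CTYPE* a, const CTYPE* b, CTYPE* out, std::size_t n) {
      for (std::size_t i = 0; i < n; ++i) {
        out[i] = op(a[i], b[i]);
      }
    }

    // Usage, mirroring the post-revert op_mul lambda:
    //   float a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, out[4];
    //   apply_bitensor_scalar<float>(
    //       [](const float val_a, const float val_b) { return val_a * val_b; },
    //       a, b, out, 4);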
--- .lintrunner.toml | 2 - kernels/portable/cpu/op_add.cpp | 12 +- kernels/portable/cpu/op_atan2.cpp | 2 +- kernels/portable/cpu/op_clamp.cpp | 5 +- kernels/portable/cpu/op_elu.cpp | 3 +- kernels/portable/cpu/op_fmod.cpp | 8 +- kernels/portable/cpu/op_maximum.cpp | 2 +- kernels/portable/cpu/op_minimum.cpp | 3 +- kernels/portable/cpu/op_mul.cpp | 4 +- kernels/portable/cpu/op_native_dropout.cpp | 10 +- kernels/portable/cpu/op_pow.cpp | 23 +-- kernels/portable/cpu/op_sigmoid.cpp | 7 +- kernels/portable/cpu/op_where.cpp | 6 +- kernels/portable/cpu/util/elementwise_util.h | 139 +------------------ kernels/portable/cpu/util/math_util.h | 30 ---- kernels/portable/cpu/util/targets.bzl | 6 - kernels/test/op_atan2_test.cpp | 33 ----- kernels/test/op_clamp_test.cpp | 34 ----- kernels/test/op_fmod_test.cpp | 31 ----- kernels/test/op_maximum_test.cpp | 14 -- kernels/test/op_minimum_test.cpp | 14 -- kernels/test/op_mul_test.cpp | 6 +- kernels/test/op_pow_test.cpp | 13 -- kernels/test/op_sigmoid_test.cpp | 4 - 24 files changed, 41 insertions(+), 370 deletions(-) diff --git a/.lintrunner.toml b/.lintrunner.toml index 4c881940155..1e81c570c65 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -272,8 +272,6 @@ exclude_patterns = [ 'exir/verification/bindings.cpp', 'extension/**', # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include. - 'kernels/portable/cpu/util/elementwise_util.h', - 'kernels/portable/cpu/util/math_util.h', 'kernels/portable/cpu/util/vectorized_math.h', 'kernels/optimized/**', 'runtime/core/exec_aten/**', diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index 83642c4864d..555341b3447 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -102,18 +102,14 @@ Tensor& add_scalar_out( static constexpr const char op_name[] = "add.Scalar_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - CTYPE_COMPUTE val_b = utils::scalar_to(b); - CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - auto val_alpha_times_b = val_alpha * val_b; utils::apply_unitensor_elementwise_fn< CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [val_alpha_times_b](const auto val_a) { - // Cast here supports vectorization; either it does nothing - // or it casts from CTYPE_COMPUTE to - // Vectorized. - return val_a + decltype(val_a)(val_alpha_times_b); + [b, alpha](const auto val_a) { + CTYPE_COMPUTE val_b = utils::scalar_to(b); + CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); + return val_a + val_alpha * val_b; }, ctx, a, diff --git a/kernels/portable/cpu/op_atan2.cpp b/kernels/portable/cpu/op_atan2.cpp index 5390eb52820..33d66cf2ad7 100644 --- a/kernels/portable/cpu/op_atan2.cpp +++ b/kernels/portable/cpu/op_atan2.cpp @@ -60,7 +60,7 @@ Tensor& atan2_out( op_name, utils::SupportedTensorDtypes::FLOATHBF16>( [](const auto val_a, const auto val_b) { - return executorch::math::atan2(val_a, val_b); + return std::atan2(val_a, val_b); }, ctx, a, diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index c5bf28e8aec..6974789eccf 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -138,8 +138,9 @@ Tensor& clamp_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [has_min, min_opt, has_max, max_opt](const auto val_in) { - auto val_out = val_in; + [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) { + // TODO: rewrite this to be vectorization-capable. 
+ CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override( val_out, utils::scalar_to(min_opt.value())); diff --git a/kernels/portable/cpu/op_elu.cpp b/kernels/portable/cpu/op_elu.cpp index d7477717a3a..d6533642860 100644 --- a/kernels/portable/cpu/op_elu.cpp +++ b/kernels/portable/cpu/op_elu.cpp @@ -48,7 +48,8 @@ Tensor& elu_out( CTYPE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [negcoef, math_scale, math_input_scale](const CTYPE x) { + [negcoef, math_scale, math_input_scale](const auto x) { + // TODO: rewrite this to be vectorization-capable. return MathT(x) <= MathT(0) ? std::expm1(MathT(x) * math_input_scale) * negcoef : MathT(x) * math_scale; diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp index 40bb4a5e94c..96a971b166a 100644 --- a/kernels/portable/cpu/op_fmod.cpp +++ b/kernels/portable/cpu/op_fmod.cpp @@ -61,7 +61,7 @@ Tensor& fmod_Tensor_out( utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { - // TODO: rewrite this to be vectorization-capable? + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -138,8 +138,10 @@ Tensor& fmod_Scalar_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBF16>( - [val_b](const auto val_a) { - return executorch::math::fmod(val_a, (decltype(val_a))val_b); + [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. + CTYPE_COMPUTE value = std::fmod(val_a, val_b); + return value; }, ctx, a, diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index c7979e40d7c..3a84095a4df 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -49,7 +49,7 @@ Tensor& maximum_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBBF16>( - [](const auto val_a, const auto val_b) { + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { return utils::max_override(val_a, val_b); }, ctx, diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index 1bac23187d8..5c0e79eb9bb 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -49,7 +49,8 @@ Tensor& minimum_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBBF16>( - [](const auto val_a, const auto val_b) { + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
return utils::min_override(val_a, val_b); }, ctx, diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 6d4f30106ca..ba16ddc075a 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -72,7 +72,9 @@ Tensor& mul_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBBF16>( - [](const auto val_a, const auto val_b) { return val_a * val_b; }, + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + return val_a * val_b; + }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, diff --git a/kernels/portable/cpu/op_native_dropout.cpp b/kernels/portable/cpu/op_native_dropout.cpp index 8dafd9e0512..1c4d177e8ed 100644 --- a/kernels/portable/cpu/op_native_dropout.cpp +++ b/kernels/portable/cpu/op_native_dropout.cpp @@ -57,11 +57,8 @@ std::tuple native_dropout_out( } ET_SWITCH_FLOATHBF16_TYPES( input.scalar_type(), ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn< - CTYPE_COMPUTE, - op_name, - utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [](const CTYPE_COMPUTE val, const CTYPE_COMPUTE mask_val) { + utils::apply_bitensor_elementwise_fn( + [](const auto val, const auto mask_val) { if (!mask_val) { return static_cast(0); } @@ -73,7 +70,8 @@ std::tuple native_dropout_out( mask, // TODO: should really be just BOOL utils::SupportedTensorDtypes::BOOL_OR_BYTE, - out); + out, + utils::SupportedTensorDtypes::SAME_AS_COMMON); }); } else if (input.numel() > 0) { std::memcpy(out.mutable_data_ptr(), input.data_ptr(), input.nbytes()); diff --git a/kernels/portable/cpu/op_pow.cpp b/kernels/portable/cpu/op_pow.cpp index aaf934b9adf..4d2673cb72d 100644 --- a/kernels/portable/cpu/op_pow.cpp +++ b/kernels/portable/cpu/op_pow.cpp @@ -57,8 +57,9 @@ Tensor& pow_Tensor_Tensor_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBF16>( - [](const auto val_a, const auto val_b) { - return executorch::math::pow(val_a, val_b); + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. + return std::pow(val_a, val_b); }, ctx, a, @@ -110,13 +111,8 @@ Tensor& pow_Tensor_Scalar_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBF16>( - // Casting val_b here supports vectorization; it does - // nothing if we are not vectorizing (casts to - // CTYPE_COMPUTE) and casts to a vectorized type - // otherwise. - [val_b](const auto val_a) { - return executorch::math::pow(val_a, decltype(val_a)(val_b)); - }, + // TODO: rewrite this to be vectorization-capable. + [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, @@ -165,13 +161,8 @@ Tensor& pow_Scalar_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBF16>( - // Casting val_a here supports vectorization; it does - // nothing if we are not vectorizing (casts to - // CTYPE_COMPUTE) and casts to a vectorized type - // otherwise. - [val_a](const auto val_b) { - return executorch::math::pow(decltype(val_b)(val_a), val_b); - }, + // TODO: rewrite this to be vectorization-capable. 
+ [val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); }, ctx, b, utils::SupportedTensorDtypes::REALHBBF16, diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp index a1eb03c1869..acb743a2db6 100644 --- a/kernels/portable/cpu/op_sigmoid.cpp +++ b/kernels/portable/cpu/op_sigmoid.cpp @@ -49,9 +49,10 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::FLOATHBF16>( - [](const auto val_in) { - const auto one = static_cast(1.0); - auto out_val = one / (one + executorch::math::exp(-val_in)); + [](const auto val_in) -> CTYPE_COMPUTE { + // TODO: rewrite this to be vectorization-capable + CTYPE_COMPUTE out_val = static_cast(1.0) / + (static_cast(1.0) + exp(-val_in)); return out_val; }, ctx, diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp index b1eb4ff442c..692e296ee00 100644 --- a/kernels/portable/cpu/op_where.cpp +++ b/kernels/portable/cpu/op_where.cpp @@ -47,9 +47,9 @@ Tensor& where_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [](const CTYPE_COMPUTE val_a, - const CTYPE_COMPUTE val_b, - const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; }, + [](const auto val_a, const auto val_b, const auto val_c) { + return val_c ? val_a : val_b; + }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index d1e5d1e88f9..e30b8af7d89 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -12,14 +12,9 @@ #include #include #include -#include // Make vectorization support easy for clients. #include #include -#ifdef ET_USE_PYTORCH_HEADERS -#include -#endif // ET_USE_PYTORCH_HEADERS - #include #include @@ -56,38 +51,6 @@ inline int64_t scalar_to(const Scalar& s) { } namespace internal { -template -using ignore_first_yield_second = T; - -#ifdef ET_USE_PYTORCH_HEADERS -// Can I call a function of type Op with sizeof...(Args) arguments of type -// at::vec::Vectorized? -// -// See [NOTE: Generic lambdas] below for requirements on Op. -template -constexpr bool can_use_vectorized() { - using Vec = at::vec::Vectorized; - // NOTE: if we start building optimized kernels on platforms that - // ATen Vectorized doesn't support well, we will want to add a way - // to check that Vectorized actually does something on our target - // platform. For now, I see no concrete need for that. - if constexpr (std::is_invocable_v< - Op, - ignore_first_yield_second...>) { - // For bool, we will get a false positive if we rely on only the - // is_invocable_v check above because at::vec::Vectorized is - // implicitly convertible to a pointer, which makes it implicitly - // convertible to bool (which was 15 minutes of fun to debug). Also - // just seems like good hygiene to make sure we get the Vectorized - // we're expecting. - return std::is_same_v< - std::invoke_result_t...>, - Vec>; - } - return false; -} -#endif // ET_USE_PYTORCH_HEADERS - template < typename CTYPE_COMPUTE, typename CTYPE_OUT, @@ -98,90 +61,8 @@ inline void dtype_specialized_elementwise_fn_impl( KernelRuntimeContext& ctx, const Tensor& out, Args... inputs) { - static_assert( - (std::is_same_v> && - ...)); constexpr auto kNumInputs = sizeof...(inputs); - // All inputs must be of type CTYPE_COMPUTE. 
- ET_DCHECK( - ((inputs.first->scalar_type() == - CppTypeToScalarType::value) && - ...)); - -#ifdef ET_USE_PYTORCH_HEADERS - if constexpr (can_use_vectorized()) { - const bool any_is_broadcasted = - !(torch::executor::internal::sizes_match_ignoring_leading_1s( - inputs.first->sizes(), out.sizes()) && - ...); - if (!any_is_broadcasted) { - using Vec = at::vec::Vectorized; - ::executorch::extension::parallel_for( - 0, - out.numel(), - ::executorch::extension::internal::GRAIN_SIZE, - [&](const auto begin, const auto end) { - std::array inputs_data_ptrs = { - inputs.first->template const_data_ptr()...}; - - CTYPE_OUT* const data_out = out.mutable_data_ptr(); - - const auto vectorized_begin = - begin + (Vec::size() - begin % Vec::size()) % Vec::size(); - const auto vectorized_end = end - (end % Vec::size()); - // Scalar prologue. - for (const auto idx : c10::irange(begin, vectorized_begin)) { - // In debug mode, always use Vectorized so that even - // small-sized tests will test whether using Vectorized broke our - // lambda. -#ifndef NDEBUG - std::array loaded_inputs; -#else // NDEBUG - std::array loaded_inputs; -#endif // NDEBUG - for (const auto input_idx : c10::irange(kNumInputs)) { - loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx]; - } -#ifndef NDEBUG - std::apply(compute_fun, loaded_inputs).store(&data_out[idx], 1); -#else // NDEBUG - data_out[idx] = std::apply(compute_fun, loaded_inputs); -#endif // NDEBUG - } - - // Main vectorized loop. - for (auto idx = vectorized_begin; idx < vectorized_end; - idx += Vec::size()) { - std::array loaded_vec_inputs; - for (const auto input_idx : c10::irange(kNumInputs)) { - loaded_vec_inputs[input_idx] = - Vec::loadu(&inputs_data_ptrs[input_idx][idx]); - } - auto result_vec = std::apply(compute_fun, loaded_vec_inputs); - result_vec.store(&data_out[idx]); - } - - // Scalar epilogue. - for (const auto idx : c10::irange(vectorized_end, end)) { -#ifndef NDEBUG - std::array loaded_inputs; -#else // NDEBUG - std::array loaded_inputs; -#endif // NDEBUG - for (const auto input_idx : c10::irange(kNumInputs)) { - loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx]; - } -#ifndef NDEBUG - std::apply(compute_fun, loaded_inputs).store(&data_out[idx], 1); -#else // NDEBUG - data_out[idx] = std::apply(compute_fun, loaded_inputs); -#endif // NDEBUG - } - }); - return; - } - } -#endif // ET_USE_PYTORCH_HEADERS + ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMPUTE)) && ...)); ::executorch::extension::parallel_for( 0, @@ -359,19 +240,6 @@ inline void apply_unitensor_elementwise_fn( compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); } -/** - * Useful for unary elementwise operators. For each element of the - * input, call Op and write to the corresponding element of the - * output. Tensor broadcasting is applied wherever it is required. - * - * [NOTE: Generic lambdas]: If Op is a *generic* lambda (i.e., one with `auto` - * parameters; normal lambdas are fine), it must fulfill one of the - * following conditions. Either: - * 1) It must in fact compile when passed at::vec::Vectorized, or - * 2) It must be actively SFINAE-friendly, as per the C++17 examples in - * https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable - * . - */ template < typename CTYPE_COMPUTE, const char* op_name, @@ -413,8 +281,6 @@ inline void apply_bitensor_elementwise_fn( * Useful for bi-tensor elementwise operators. 
For each element of the inputs, * perform a computation and write to the corresponding element of the output. * Tensor broadcasting is applied wherever it is required. - * See [NOTE: Generic lambdas] if you want to pass a generic lambda for - * compute_fun. */ template < typename CTYPE_COMPUTE, @@ -481,9 +347,6 @@ inline void apply_tritensor_elementwise_fn( * * static constexpr const char op_name[] = "my_op"; * apply_ternary_elementwise_fn. - * - * See [NOTE: Generic lambdas] if you want to pass a generic lambda for - * compute_fun. */ template < typename CTYPE_COMPUTE, diff --git a/kernels/portable/cpu/util/math_util.h b/kernels/portable/cpu/util/math_util.h index d6d32217137..2ba068da18e 100644 --- a/kernels/portable/cpu/util/math_util.h +++ b/kernels/portable/cpu/util/math_util.h @@ -8,10 +8,6 @@ #pragma once -#ifdef ET_USE_PYTORCH_HEADERS -#include -#endif - namespace torch { namespace executor { namespace native { @@ -142,32 +138,6 @@ T max_override(T a, T b) { return b; } -#ifdef ET_USE_PYTORCH_HEADERS -template -at::vec::Vectorized min_override( - at::vec::Vectorized a, - at::vec::Vectorized b) { - return at::vec::minimum(a, b); -} - -template -at::vec::Vectorized min_override(at::vec::Vectorized a, T b) { - return min_override(a, at::vec::Vectorized(b)); -} - -template -at::vec::Vectorized max_override( - at::vec::Vectorized a, - at::vec::Vectorized b) { - return at::vec::maximum(a, b); -} - -template -at::vec::Vectorized max_override(at::vec::Vectorized a, T b) { - return max_override(a, at::vec::Vectorized(b)); -} - -#endif /** * There is a slight difference in how std::fmod works compared to how ATen * determines remainders: diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 65a0c9fc47a..1b432c736ae 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -33,7 +33,6 @@ def define_common_targets(): "//executorch/kernels/portable/cpu/util:slice_util", "//executorch/kernels/portable/cpu/util:elementwise_util", "//executorch/kernels/portable/cpu/util:upsample_util", - "//executorch/kernels/portable/cpu/util:vectorized_math", ], visibility = ["//executorch/...", "@EXECUTORCH_CLIENTS"], ) @@ -111,8 +110,6 @@ def define_common_targets(): ":broadcast_indexes_range", ":broadcast_util", ":dtype_util", - ":vectorized_math", - "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch", "//executorch/runtime/kernel:kernel_runtime_context", "//executorch/extension/threadpool:threadpool", ], @@ -263,9 +260,6 @@ def define_common_targets(): srcs = [], exported_headers = ["math_util.h"], visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/quantized/..."], - exported_deps = [ - "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch", - ], ) runtime.cxx_library( diff --git a/kernels/test/op_atan2_test.cpp b/kernels/test/op_atan2_test.cpp index ae19ef687bc..436826e2b6d 100644 --- a/kernels/test/op_atan2_test.cpp +++ b/kernels/test/op_atan2_test.cpp @@ -46,36 +46,3 @@ TEST(OpAtan2OutTest, SmokeTest) { op_atan2_out(self, other, out); EXPECT_TENSOR_CLOSE(out, out_expected); } - -TEST(OpAtan2OutTest, SmokeTestNoBroadcastingSameDtype) { - TensorFactory tfDouble; - - std::vector a(18); - std::iota(a.begin(), a.end(), -8); - std::vector b(18, 2.0); - Tensor self = tfDouble.make({18}, a); - Tensor other = tfDouble.make({18}, b); - Tensor out = tfDouble.zeros({18}); - Tensor out_expected = tfDouble.make( - {18}, - {-1.3258176636680326, - 
-1.2924966677897853, - -1.2490457723982544, - -1.1902899496825317, - -1.1071487177940904, - -0.9827937232473291, - -0.7853981633974483, - -0.4636476090008061, - 0.0000000000000000, - 0.4636476090008061, - 0.7853981633974483, - 0.9827937232473291, - 1.1071487177940904, - 1.1902899496825317, - 1.2490457723982544, - 1.2924966677897853, - 1.3258176636680326, - 1.3521273809209546}); - op_atan2_out(self, other, out); - EXPECT_TENSOR_CLOSE(out, out_expected); -} diff --git a/kernels/test/op_clamp_test.cpp b/kernels/test/op_clamp_test.cpp index 0b632ab7f8d..8a021c70303 100644 --- a/kernels/test/op_clamp_test.cpp +++ b/kernels/test/op_clamp_test.cpp @@ -31,15 +31,6 @@ using torch::executor::testing::TensorFactory; using OptScalar = executorch::aten::optional; -namespace { -template -std::vector arange(T stop) { - std::vector result(stop); - std::iota(result.begin(), result.end(), 0); - return result; -} -} // namespace - class OpClampOutTest : public OperatorTest { protected: Tensor& op_clamp_out( @@ -123,31 +114,6 @@ class OpClampOutTest : public OperatorTest { // Should set all elements to max. {6, 6, 6, 6}, // expected_data }, - { - std::string(__func__) + ": Simple clamp larger data", - {18}, // sizes - arange::ctype>(18), // input_data - OptScalar(1), // min - OptScalar(6), // max - {1, - 1, - 2, - 3, - 4, - 5, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6}, // expected_data - }, }; run_test_cases(test_cases); diff --git a/kernels/test/op_fmod_test.cpp b/kernels/test/op_fmod_test.cpp index 3227a01a17a..fa7cc4b63f7 100644 --- a/kernels/test/op_fmod_test.cpp +++ b/kernels/test/op_fmod_test.cpp @@ -45,34 +45,3 @@ TEST_F(OpFmodTest, SmokeTest) { op_fmod_tensor_out(self, other, out); EXPECT_TENSOR_CLOSE(out, out_expected); } - -TEST_F(OpFmodTest, ScalarSmokeTest) { - TensorFactory tfFloat; - std::vector a(18); - std::iota(a.begin(), a.end(), -8); - Tensor self = tfFloat.make({18}, a); - Scalar other = 3; - Tensor out = tfFloat.zeros({18}); - Tensor out_expected = tfFloat.make( - {18}, - {-2., - -1., - -0., - -2., - -1., - -0., - -2., - -1., - 0., - 1., - 2., - 0., - 1., - 2., - 0., - 1., - 2., - 0.}); - op_fmod_scalar_out(self, other, out); - EXPECT_TENSOR_CLOSE(out, out_expected); -} diff --git a/kernels/test/op_maximum_test.cpp b/kernels/test/op_maximum_test.cpp index c32cf571ff3..faa18fa56cd 100644 --- a/kernels/test/op_maximum_test.cpp +++ b/kernels/test/op_maximum_test.cpp @@ -37,17 +37,3 @@ TEST(OpMaximumOutTest, SmokeTest) { op_maximum_out(self, other, out); EXPECT_TENSOR_CLOSE(out, out_expected); } - -TEST(OpMaximumOutTest, SmokeTestLarger) { - TensorFactory tfFloat; - - std::vector a(18); - std::iota(a.begin(), a.end(), -8); - Tensor self = tfFloat.make({18}, a); - Tensor other = tfFloat.full({18}, 4); - Tensor out = tfFloat.zeros({18}); - Tensor out_expected = tfFloat.make( - {18}, {4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 6, 7, 8, 9}); - op_maximum_out(self, other, out); - EXPECT_TENSOR_CLOSE(out, out_expected); -} diff --git a/kernels/test/op_minimum_test.cpp b/kernels/test/op_minimum_test.cpp index 9c256963943..686e1feee64 100644 --- a/kernels/test/op_minimum_test.cpp +++ b/kernels/test/op_minimum_test.cpp @@ -266,17 +266,3 @@ TEST_F(OpMinimumOutTest, DynamicShapeUnbound) { op_minimum_out(x, y, out); EXPECT_TENSOR_EQ(out, expected); } - -TEST_F(OpMinimumOutTest, SmokeTestLarger) { - TensorFactory tfFloat; - - std::vector a(18); - std::iota(a.begin(), a.end(), -8); - Tensor self = tfFloat.make({18}, a); - Tensor other = tfFloat.full({18}, 4); - Tensor out = 
tfFloat.zeros({18}); - Tensor out_expected = tfFloat.make( - {18}, {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 4, 4, 4, 4, 4}); - op_minimum_out(self, other, out); - EXPECT_TENSOR_CLOSE(out, out_expected); -} diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index c21cceeaae3..8b858fa18c2 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -30,7 +30,7 @@ class OpMulOutTest : public OperatorTest { return torch::executor::aten::mul_outf(context_, self, other, out); } - // Common testing for multiplying two integer Tensors + // Common testing for multipling two integer Tensors template void test_mul() { TensorFactory tf_a; @@ -54,10 +54,6 @@ class OpMulOutTest : public OperatorTest { tf_b.make(sizes, /*data=*/{1, 2, 4, 8}), out); EXPECT_TENSOR_EQ(out, tf_out.make(sizes, /*data=*/{1, 4, 16, 64})); - - out = tf_out.zeros({18}); - op_mul_out(tf_a.full({18}, 4), tf_b.full({18}, 2), out); - EXPECT_TENSOR_EQ(out, tf_out.full({18}, 8)); } template diff --git a/kernels/test/op_pow_test.cpp b/kernels/test/op_pow_test.cpp index 25d0f97526c..f9234a748b9 100644 --- a/kernels/test/op_pow_test.cpp +++ b/kernels/test/op_pow_test.cpp @@ -54,19 +54,6 @@ TEST_F(OpPowTest, TensorTensorSanityCheck) { EXPECT_TENSOR_EQ(out, tf.make({2, 2}, {16, 16, 16, 16})); } -TEST_F(OpPowTest, TensorTensorSanityCheckLargerNoBroadcasting) { - TensorFactory tf; - Tensor self = tf.full({18}, 2); - Tensor exp = tf.full({18}, 4); - Tensor out = tf.zeros({18}); - Tensor out_expected = tf.full({18}, 16); - - Tensor ret = op_pow_tensor_tensor_out(self, exp, out); - - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_EQ(out_expected, out); -} - TEST_F(OpPowTest, TensorTensorSanityCheck2) { TensorFactory tf1; TensorFactory tf2; diff --git a/kernels/test/op_sigmoid_test.cpp b/kernels/test/op_sigmoid_test.cpp index 1e3499ba451..550cebda315 100644 --- a/kernels/test/op_sigmoid_test.cpp +++ b/kernels/test/op_sigmoid_test.cpp @@ -44,10 +44,6 @@ class OpSigmoidOutTest : public OperatorTest { EXPECT_TENSOR_CLOSE( out, tf_out.make(sizes, /*data=*/{0.731059, 0.880797, 0.982014, 0.999665})); - - out = tf_out.zeros({18}); - op_sigmoid_out(tf.full({18}, 2), out); - EXPECT_TENSOR_CLOSE(out, tf_out.full({18}, 0.880797)); } // Unhandled output dtypes. From d83bed311e431b4967783e28d11ca8812486058c Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:31:01 -0700 Subject: [PATCH 6/7] Revert "Add optimized_portable_kernels test (#11205)" This reverts commit bc42d8d696eff38c31050787efacaa8ff88285d8. 
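Context for reviewers: the fast path removed in the preceding patch split each parallel_for chunk
into a scalar prologue, a full-width vectorized main loop, and a scalar epilogue, which is also why
the deleted smoke tests used 18-element tensors (larger than a typical SIMD register) so that all
three phases were exercised. The sketch below illustrates that loop structure only; it is not the
ExecuTorch implementation. kLanes, sketch::apply_elementwise, and the 8-lane width are illustrative
assumptions, and the real code operates on at::vec::Vectorized registers rather than a scalar inner
loop.

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    namespace sketch {

    // Assumed SIMD width for illustration; the real kernel takes this from
    // at::vec::Vectorized<T>::size().
    constexpr std::size_t kLanes = 8;

    // Apply fn over [begin, end): a scalar prologue up to the first index that
    // is a multiple of kLanes, a blocked main loop over whole kLanes groups,
    // and a scalar epilogue for the tail. The main loop body is scalar here
    // purely for illustration; the real code loads and stores whole registers.
    template <typename Fn>
    void apply_elementwise(
        const float* in, float* out, std::size_t begin, std::size_t end, Fn fn) {
      const std::size_t prologue_end =
          std::min(end, begin + (kLanes - begin % kLanes) % kLanes);
      const std::size_t main_end =
          prologue_end + (end - prologue_end) / kLanes * kLanes;

      for (std::size_t i = begin; i < prologue_end; ++i) { // scalar prologue
        out[i] = fn(in[i]);
      }
      for (std::size_t i = prologue_end; i < main_end; i += kLanes) { // main loop
        for (std::size_t lane = 0; lane < kLanes; ++lane) { // stands in for one SIMD op
          out[i + lane] = fn(in[i + lane]);
        }
      }
      for (std::size_t i = main_end; i < end; ++i) { // scalar epilogue
        out[i] = fn(in[i]);
      }
    }

    } // namespace sketch

    int main() {
      // 18 elements: more than one 8-lane block, so the prologue, the main
      // loop, and the epilogue all run; this is the same reason the removed
      // tests used size-18 tensors.
      std::vector<float> in(18), out(18);
      for (std::size_t i = 0; i < in.size(); ++i) {
        in[i] = static_cast<float>(i);
      }
      sketch::apply_elementwise(
          in.data(), out.data(), 0, in.size(), [](float x) { return std::exp(x); });
      std::printf("out[17] = %f\n", out[17]);
      return 0;
    }

The prologue and epilogue exist so the main loop can assume full-width, in-bounds loads; unaligned
starts and ragged tails are handled element by element instead of with masked or partial vector
operations.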
--- kernels/portable/CMakeLists.txt | 9 +-------- kernels/test/CMakeLists.txt | 32 ++------------------------------ 2 files changed, 3 insertions(+), 38 deletions(-) diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index c687da62e21..d301ea564f6 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -69,15 +69,8 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED) target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options}) target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS}) target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS) - gen_selected_ops(LIB_NAME "optimized_portable_ops_lib" OPS_SCHEMA_YAML "${_yaml}") - generate_bindings_for_kernels( - LIB_NAME "optimized_portable_ops_lib" FUNCTIONS_YAML "${_yaml}" - ) - gen_operators_lib( - LIB_NAME "optimized_portable_ops_lib" KERNEL_LIBS optimized_portable_kernels DEPS executorch_core - ) install( - TARGETS optimized_portable_kernels optimized_portable_ops_lib + TARGETS optimized_portable_kernels DESTINATION lib ) endif() diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index f5997a1ee3f..4f174b5a652 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -17,7 +17,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) -set(_kernels portable optimized_portable optimized quantized) +set(_kernels portable optimized quantized) foreach(kernel ${_kernels}) set(_wrapper_dir "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/test" @@ -37,17 +37,13 @@ foreach(kernel ${_kernels}) VERBATIM ) - set(_supported_features_kernel ${kernel}) - if(${kernel} STREQUAL "optimized_portable") - set(_supported_features_kernel "portable") - endif() add_custom_command( OUTPUT "${_wrapper_dir}/supported_features.cpp" "${_wrapper_dir}/supported_features.h" COMMAND mkdir -p ${_wrapper_dir} COMMAND ${PYTHON_EXECUTABLE} kernels/test/gen_supported_features.py - kernels/${_supported_features_kernel}/test/supported_features_def.yaml > + kernels/${kernel}/test/supported_features_def.yaml > ${_wrapper_dir}/supported_features.cpp COMMAND ${PYTHON_EXECUTABLE} kernels/test/gen_supported_features.py @@ -61,11 +57,6 @@ foreach(kernel ${_kernels}) set(_kernel_ops_lib "optimized_native_cpu_ops_lib") set(_kernel_ops_lib_path "${CMAKE_CURRENT_BINARY_DIR}/../../configurations/optimized_native_cpu_ops_lib" - ) - elseif(${kernel} STREQUAL "optimized_portable") - set(_kernel_ops_lib "${kernel}_ops_lib") - set(_kernel_ops_lib_path - "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/portable/${kernel}_ops_lib" ) else() set(_kernel_ops_lib "${kernel}_ops_lib") @@ -97,9 +88,6 @@ add_custom_target( "${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/FunctionHeaderWrapper.h" "${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.h" "${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.cpp" - "${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable/executorch/kernels/test/FunctionHeaderWrapper.h" - "${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable/executorch/kernels/test/supported_features.h" - "${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable/executorch/kernels/test/supported_features.cpp" "${CMAKE_CURRENT_BINARY_DIR}/include/quantized/executorch/kernels/test/FunctionHeaderWrapper.h" 
"${CMAKE_CURRENT_BINARY_DIR}/include/quantized/executorch/kernels/test/supported_features.h" "${CMAKE_CURRENT_BINARY_DIR}/include/quantized/executorch/kernels/test/supported_features.cpp" @@ -311,22 +299,6 @@ set(_optimized_kernels_test_sources if(TARGET optimized_portable_kernels) list(APPEND _optimized_kernels_test_sources ${all_test_sources}) list(REMOVE_DUPLICATES _optimized_kernels_test_sources) - - # Make sure that we still test optimized versions of portable - # kernels even if they would currently be shadowed by specific - # optimized implementations. - et_cxx_test( - optimized_portable_kernels_test - SOURCES - ${all_test_sources} - ${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable/executorch/kernels/test/supported_features.cpp - EXTRA_LIBS - optimized_portable_kernels - ) - add_dependencies(optimized_portable_kernels_test generate_wrapper) - target_include_directories( - optimized_portable_kernels_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable" - ) endif() et_cxx_test( From 9aee068e3a8cba05582e1a27beb7a924ff8b8ab5 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:31:16 -0700 Subject: [PATCH 7/7] Revert "Add vectorized_math.h (#11204)" This reverts commit 1720f2f37644eb2aa9fd54fef87e17da6a83f07a. --- .lintrunner.toml | 2 - kernels/portable/cpu/util/targets.bzl | 10 -- kernels/portable/cpu/util/test/CMakeLists.txt | 16 +- kernels/portable/cpu/util/test/targets.bzl | 11 -- .../cpu/util/test/vectorized_math_test.cpp | 95 ----------- kernels/portable/cpu/util/vectorized_math.h | 148 ------------------ .../core/portable_type/c10/c10/targets.bzl | 6 +- runtime/core/portable_type/targets.bzl | 1 - test/utils/OSSTestConfig.json | 12 ++ 9 files changed, 23 insertions(+), 278 deletions(-) delete mode 100644 kernels/portable/cpu/util/test/vectorized_math_test.cpp delete mode 100644 kernels/portable/cpu/util/vectorized_math.h diff --git a/.lintrunner.toml b/.lintrunner.toml index 1e81c570c65..8912e65d66d 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -271,8 +271,6 @@ exclude_patterns = [ 'examples/**', 'exir/verification/bindings.cpp', 'extension/**', - # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include. - 'kernels/portable/cpu/util/vectorized_math.h', 'kernels/optimized/**', 'runtime/core/exec_aten/**', # Want to be able to keep c10 in sync with PyTorch core. diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 1b432c736ae..abf3f22c00b 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -307,16 +307,6 @@ def define_common_targets(): ], ) - runtime.cxx_library( - name = "vectorized_math", - exported_headers = ["vectorized_math.h"], - visibility = ["//executorch/..."], - exported_deps = [ - "//executorch/runtime/core/portable_type:portable_type", - "//executorch/runtime/core/exec_aten/util:scalar_type_util", - ], - ) - # Utility functions that can be used by operators that perform reduction for aten_mode in get_aten_mode_options(): suffix = "_aten" if aten_mode else "" diff --git a/kernels/portable/cpu/util/test/CMakeLists.txt b/kernels/portable/cpu/util/test/CMakeLists.txt index 41bfea54020..d95b3a81b5c 100644 --- a/kernels/portable/cpu/util/test/CMakeLists.txt +++ b/kernels/portable/cpu/util/test/CMakeLists.txt @@ -4,22 +4,26 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# @generated by test/utils/generate_gtest_cmakelists.py +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# + cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..) include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) -include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) set(_test_srcs broadcast_indexes_range_test.cpp broadcast_test.cpp - reduce_test.cpp vectorized_math_test.cpp + reduce_test.cpp ) et_cxx_test( kernels_portable_cpu_util_test SOURCES ${_test_srcs} EXTRA_LIBS portable_kernels portable_ops_lib ) - -find_package_torch_headers() -target_include_directories(kernels_portable_cpu_util_test PRIVATE ${TORCH_INCLUDE_DIRS}) -target_compile_definitions(kernels_portable_cpu_util_test PRIVATE ET_USE_PYTORCH_HEADERS) diff --git a/kernels/portable/cpu/util/test/targets.bzl b/kernels/portable/cpu/util/test/targets.bzl index 4b167c6e946..178eb25a79b 100644 --- a/kernels/portable/cpu/util/test/targets.bzl +++ b/kernels/portable/cpu/util/test/targets.bzl @@ -32,14 +32,3 @@ def define_common_targets(): "//executorch/kernels/portable/cpu/util:reduce_util", ], ) - - # this test requires ET_USE_PYTORCH_HEADERS, which doesn't work in OSS Buck. - if not runtime.is_oss: - runtime.cxx_test( - name = "vectorized_math_test", - srcs = ["vectorized_math_test.cpp"], - deps = [ - "//executorch/kernels/portable/cpu/util:vectorized_math", - "//executorch/runtime/core/portable_type/c10/c10:c10", - ], - ) diff --git a/kernels/portable/cpu/util/test/vectorized_math_test.cpp b/kernels/portable/cpu/util/test/vectorized_math_test.cpp deleted file mode 100644 index 2a2e8397ca4..00000000000 --- a/kernels/portable/cpu/util/test/vectorized_math_test.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include - -#include - -#ifndef ET_USE_PYTORCH_HEADERS -#error "This test requires ET_USE_PYTORCH_HEADERS!" 
-#endif // ET_USE_PYTORCH_HEADERS - -TEST(VectorizedMathTest, BasicUnary) { - __at_align__ float result_floats[at::vec::Vectorized::size()] = {0}; - const auto x_vec = at::vec::Vectorized::arange(0, 1); - const auto result_vec = executorch::math::exp(x_vec); - result_vec.store(result_floats); - for (const auto ii : c10::irange(at::vec::Vectorized::size())) { - EXPECT_FLOAT_EQ(result_floats[ii], std::exp(ii)); - } -} - -namespace { -template -void test_unary_t_to_float() { - __at_align__ float result_floats[at::vec::Vectorized::size()] = {0}; - const auto x_vec = at::vec::Vectorized::arange(0, 1); - const auto result_vec = executorch::math::exp(x_vec); - static_assert(decltype(result_vec)::size() >= at::vec::Vectorized::size()); - result_vec.store(result_floats, at::vec::Vectorized::size()); - for (const auto ii : c10::irange(at::vec::Vectorized::size())) { - EXPECT_EQ(result_floats[ii], std::exp((float)ii)) << ii; - } -} - -} // namespace - -TEST(VectorizedMathTest, UnaryInt16ToFloat) { - test_unary_t_to_float(); -} - -TEST(VectorizedMathTest, UnaryInt32ToFloat) { - test_unary_t_to_float(); -} - -TEST(VectorizedMathTest, UnaryInt64ToFloat) { - test_unary_t_to_float(); -} - -TEST(VectorizedMathTest, BasicBinary) { - __at_align__ float result_floats[at::vec::Vectorized::size()] = {0}; - const auto x_vec = at::vec::Vectorized::arange(0, 1); - const auto y_vec = at::vec::Vectorized(2); - const auto result_vec = executorch::math::pow(x_vec, y_vec); - result_vec.store(result_floats); - for (const auto ii : c10::irange(at::vec::Vectorized::size())) { - EXPECT_FLOAT_EQ(result_floats[ii], std::pow((float)ii, 2.0f)); - } -} - -namespace { -template -void test_binary_t_to_float() { - __at_align__ float result_floats[at::vec::Vectorized::size()] = {0}; - const auto x_vec = at::vec::Vectorized::arange(0, 1); - const auto y_vec = at::vec::Vectorized(2); - const auto result_vec = executorch::math::pow(x_vec, y_vec); - static_assert(decltype(result_vec)::size() >= at::vec::Vectorized::size()); - result_vec.store(result_floats, at::vec::Vectorized::size()); - for (const auto ii : c10::irange(at::vec::Vectorized::size())) { - EXPECT_EQ(result_floats[ii], std::pow((float)ii, 2.0f)) << ii; - } -} - -TEST(VectorizedMathTest, BinaryInt16ToFloat) { - test_binary_t_to_float(); -} - -TEST(VectorizedMathTest, BinaryInt32ToFloat) { - test_binary_t_to_float(); -} - -TEST(VectorizedMathTest, BinaryInt64ToFloat) { - test_binary_t_to_float(); -} - -} // namespace diff --git a/kernels/portable/cpu/util/vectorized_math.h b/kernels/portable/cpu/util/vectorized_math.h deleted file mode 100644 index 9e706ace56d..00000000000 --- a/kernels/portable/cpu/util/vectorized_math.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#ifdef ET_USE_PYTORCH_HEADERS -#include -#endif // ET_USE_PYTORCH_HEADERS - -#include -#include - -#ifdef ET_USE_PYTORCH_HEADERS -namespace executorch { -inline namespace math { -namespace internal { -template -auto convert_to_vectorized_n_of_float(at::vec::Vectorized vec) { - static constexpr auto float_vec_size = at::vec::Vectorized::size(); - static constexpr auto t_vec_size = at::vec::Vectorized::size(); - static constexpr auto result_size = - t_vec_size < float_vec_size ? 
1 : t_vec_size / float_vec_size; - static_assert(result_size >= 1); - return at::vec::convert( - at::vec::VectorizedN(vec)); -} -} // namespace internal -} // namespace math -} // namespace executorch -#endif // ET_USE_PYTORCH_HEADERS - -#define _ET_INTERNAL_STD_MATH_FUNC(name) \ - namespace executorch { \ - inline namespace math { \ - using std::name; \ - } \ - } // namespace executorch - -#ifdef ET_USE_PYTORCH_HEADERS -/** - * Internal-usage macro for making a vectorized variant of a unary - * function available in the executorch::math namespace. - */ -#define ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(func_name) \ - namespace executorch { \ - inline namespace math { \ - template \ - auto func_name(at::vec::Vectorized vec) { \ - if constexpr (!::executorch::runtime::is_floating_point::value) { \ - return internal::convert_to_vectorized_n_of_float(vec).func_name(); \ - } else { \ - return vec.func_name(); \ - } \ - } \ - } \ - } - -#define ET_INTERNAL_VECTORIZED_FLOAT_BINARY_FUNC(func_name) \ - namespace executorch { \ - inline namespace math { \ - template \ - auto func_name(at::vec::Vectorized vec0, at::vec::Vectorized vec1) { \ - if constexpr (!::executorch::runtime::is_floating_point::value) { \ - const auto vec_float0 = \ - internal::convert_to_vectorized_n_of_float(vec0); \ - const auto vec_float1 = \ - internal::convert_to_vectorized_n_of_float(vec1); \ - return vec_float0.func_name(vec_float1); \ - } else { \ - return vec0.func_name(vec1); \ - } \ - } \ - } \ - } - -/** - * Internal-usage macro for making a C++ standard library - * floating-point function and a vectorized variant of it available in - * the c10::math namespace. Should be used with functions where the - * corresponding operator is a "float op" in TensorIterator parlance - * (i.e., uses something like build_borrowing_binary_float_op()), - * because it converts non-floating-point arguments to floating point. - */ -#define ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(func_name) \ - _ET_INTERNAL_STD_MATH_FUNC(func_name) \ - ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(func_name) - -#define ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(func_name) \ - _ET_INTERNAL_STD_MATH_FUNC(func_name) \ - ET_INTERNAL_VECTORIZED_FLOAT_BINARY_FUNC(func_name) - -#else // ET_USE_PYTORCH_HEADERS -#define ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(name) \ - _ET_INTERNAL_STD_MATH_FUNC(name) -#define ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(name) \ - _ET_INTERNAL_STD_MATH_FUNC(name) -#endif // ET_USE_PYTORCH_HEADERS - -// To simplify client code, we provide coverage for a bunch of float ops (the -// same ones listed in ATen vml.h) here. 
-ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(abs) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(acos) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(asin) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(atan) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(ceil) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(cos) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(cosh) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(erf) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(erfc) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(exp) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(expm1) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(floor) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log10) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log1p) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log2) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(sin) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(sinh) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(sqrt) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(round) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(tan) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(tanh) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(trunc) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(lgamma) - -#ifdef ET_USE_PYTORCH_HEADERS -ET_INTERNAL_VECTORIZED_FLOAT_BINARY_FUNC(rsqrt) -#endif // ET_USE_PYTORCH_HEADERS - -namespace executorch { -inline namespace math { -template >> -T rsqrt(T x) { - return T(1) / std::sqrt(x); -} -} // namespace math -} // namespace executorch - -ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(atan2) -ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(fmod) -ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(pow) diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl index 025cd127021..827a63d2cef 100644 --- a/runtime/core/portable_type/c10/c10/targets.bzl +++ b/runtime/core/portable_type/c10/c10/targets.bzl @@ -53,11 +53,7 @@ def define_common_targets(): runtime.cxx_library( name = "aten_headers_for_executorch", srcs = [], - visibility = [ - "//executorch/kernels/optimized/...", - "//executorch/kernels/portable/cpu/util/...", - "@EXECUTORCH_CLIENTS", - ], + visibility = ["//executorch/kernels/optimized/...", "@EXECUTORCH_CLIENTS"], exported_deps = select({ "DEFAULT": [], "ovr_config//cpu:arm64": [ diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index 5b6e67fa213..41bc6050524 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -26,7 +26,6 @@ def define_common_targets(): visibility = [ "//executorch/backends/...", "//executorch/extension/fb/dynamic_shim/...", - "//executorch/kernels/portable/cpu/...", "//executorch/runtime/core/exec_aten/...", "//executorch/runtime/core/portable_type/test/...", ], diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index 182d0bfd58a..2cfc4b8a995 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -68,6 +68,18 @@ "extension_threadpool" ] }, + { + "directory": "kernels/portable/cpu/util/test", + "sources": [ + "broadcast_indexes_range_test.cpp", + "broadcast_test.cpp", + "reduce_test.cpp" + ], + "additional_libs": [ + "portable_kernels", + "portable_ops_lib" + ] + }, { "directory": "runtime/core/portable_type/test", "sources": [