From 7d6ca43022ee825a33560795cc2df0e02fc7462d Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:29:11 -0700 Subject: [PATCH 1/7] Revert "Implement unary_ufunc functions using elementwise_util (#9386)" This reverts commit dffbd426a4c519a4dc1596595578b5fa9e68d585. --- kernels/portable/cpu/op_acos.cpp | 5 +- kernels/portable/cpu/op_acosh.cpp | 5 +- kernels/portable/cpu/op_asin.cpp | 5 +- kernels/portable/cpu/op_asinh.cpp | 5 +- kernels/portable/cpu/op_atan.cpp | 5 +- kernels/portable/cpu/op_atanh.cpp | 5 +- kernels/portable/cpu/op_ceil.cpp | 4 +- kernels/portable/cpu/op_cos.cpp | 4 +- kernels/portable/cpu/op_cosh.cpp | 5 +- kernels/portable/cpu/op_erf.cpp | 4 +- kernels/portable/cpu/op_exp.cpp | 4 +- kernels/portable/cpu/op_expm1.cpp | 7 +- kernels/portable/cpu/op_floor.cpp | 4 +- kernels/portable/cpu/op_isinf.cpp | 5 +- kernels/portable/cpu/op_isnan.cpp | 5 +- kernels/portable/cpu/op_log.cpp | 4 +- kernels/portable/cpu/op_log10.cpp | 5 +- kernels/portable/cpu/op_log1p.cpp | 5 +- kernels/portable/cpu/op_log2.cpp | 5 +- kernels/portable/cpu/op_reciprocal.cpp | 13 ++- kernels/portable/cpu/op_rsqrt.cpp | 11 ++- kernels/portable/cpu/op_sin.cpp | 4 +- kernels/portable/cpu/op_sinh.cpp | 5 +- kernels/portable/cpu/op_sqrt.cpp | 5 +- kernels/portable/cpu/op_tan.cpp | 4 +- kernels/portable/cpu/op_tanh.cpp | 5 +- kernels/portable/cpu/op_trunc.cpp | 4 +- kernels/portable/cpu/pattern/pattern.cpp | 28 ------ kernels/portable/cpu/pattern/pattern.h | 94 ++----------------- kernels/portable/cpu/pattern/targets.bzl | 8 +- .../pattern/unary_ufunc_realhb_to_bool.cpp | 60 ++++++++++++ .../unary_ufunc_realhbbf16_to_floathbf16.cpp | 60 ++++++++++++ .../cpu/pattern/unary_ufunc_realhbf16.cpp | 53 +++++++++++ kernels/portable/cpu/util/vectorized_math.h | 27 +----- .../kernels/portable/op_registration_util.bzl | 1 - 35 files changed, 251 insertions(+), 222 deletions(-) delete mode 100644 kernels/portable/cpu/pattern/pattern.cpp create mode 100644 kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp create mode 100644 kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp create mode 100644 kernels/portable/cpu/pattern/unary_ufunc_realhbf16.cpp diff --git a/kernels/portable/cpu/op_acos.cpp b/kernels/portable/cpu/op_acos.cpp index 81daf10c9a6..dac3b1546f3 100644 --- a/kernels/portable/cpu/op_acos.cpp +++ b/kernels/portable/cpu/op_acos.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& acos_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "acos.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::acos(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::acos, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_acosh.cpp b/kernels/portable/cpu/op_acosh.cpp index b402698d761..77f7edf4c5d 100644 --- a/kernels/portable/cpu/op_acosh.cpp +++ b/kernels/portable/cpu/op_acosh.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& acosh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "acosh.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::acosh(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::acosh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_asin.cpp b/kernels/portable/cpu/op_asin.cpp index ddb52c70e84..6affa6e4122 
100644 --- a/kernels/portable/cpu/op_asin.cpp +++ b/kernels/portable/cpu/op_asin.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& asin_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "asin.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::asin(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::asin, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_asinh.cpp b/kernels/portable/cpu/op_asinh.cpp index 9441db09589..bce8dcf6d5a 100644 --- a/kernels/portable/cpu/op_asinh.cpp +++ b/kernels/portable/cpu/op_asinh.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& asinh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "asinh.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::asinh(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::asinh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_atan.cpp b/kernels/portable/cpu/op_atan.cpp index 6a73341bf0d..23549627a3b 100644 --- a/kernels/portable/cpu/op_atan.cpp +++ b/kernels/portable/cpu/op_atan.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& atan_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "atan.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::atan(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::atan, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_atanh.cpp b/kernels/portable/cpu/op_atanh.cpp index 9e036a5fb3b..13e6e8ca141 100644 --- a/kernels/portable/cpu/op_atanh.cpp +++ b/kernels/portable/cpu/op_atanh.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& atanh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "atanh.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::atanh(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::atanh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_ceil.cpp b/kernels/portable/cpu/op_ceil.cpp index e2c8e6f07b6..5aa09ba0084 100644 --- a/kernels/portable/cpu/op_ceil.cpp +++ b/kernels/portable/cpu/op_ceil.cpp @@ -17,9 +17,7 @@ namespace native { using executorch::aten::Tensor; Tensor& ceil_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "ceil.out"; - return internal::unary_ufunc_realhbf16( - [](auto x) { return executorch::math::ceil(x); }, ctx, in, out); + return internal::unary_ufunc_realhbf16(std::ceil, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_cos.cpp b/kernels/portable/cpu/op_cos.cpp index e7876116f94..e536060d162 100644 --- a/kernels/portable/cpu/op_cos.cpp +++ b/kernels/portable/cpu/op_cos.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& cos_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "cos.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::cos(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::cos, ctx, in, 
out); } } // namespace native diff --git a/kernels/portable/cpu/op_cosh.cpp b/kernels/portable/cpu/op_cosh.cpp index 9703ff0336c..e622bbe6fcd 100644 --- a/kernels/portable/cpu/op_cosh.cpp +++ b/kernels/portable/cpu/op_cosh.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& cosh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "cosh.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::cosh(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::cosh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_erf.cpp b/kernels/portable/cpu/op_erf.cpp index aee0101fdb4..6897bcda95b 100644 --- a/kernels/portable/cpu/op_erf.cpp +++ b/kernels/portable/cpu/op_erf.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& erf_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "erf.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::erf(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::erf, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_exp.cpp b/kernels/portable/cpu/op_exp.cpp index f2241613609..cbfc8924cb0 100644 --- a/kernels/portable/cpu/op_exp.cpp +++ b/kernels/portable/cpu/op_exp.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "exp.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::exp(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::exp, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_expm1.cpp b/kernels/portable/cpu/op_expm1.cpp index 67af9b343bb..f2d49f615b1 100644 --- a/kernels/portable/cpu/op_expm1.cpp +++ b/kernels/portable/cpu/op_expm1.cpp @@ -7,19 +7,16 @@ */ #include -#include #include #include -#include namespace torch { namespace executor { namespace native { Tensor& expm1_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "expm1.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::expm1(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::expm1, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_floor.cpp b/kernels/portable/cpu/op_floor.cpp index 14b49cafbc1..4061722bd27 100644 --- a/kernels/portable/cpu/op_floor.cpp +++ b/kernels/portable/cpu/op_floor.cpp @@ -17,9 +17,7 @@ namespace native { using executorch::aten::Tensor; Tensor& floor_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "floor.out"; - return internal::unary_ufunc_realhbf16( - [](auto x) { return executorch::math::floor(x); }, ctx, in, out); + return internal::unary_ufunc_realhbf16(std::floor, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_isinf.cpp b/kernels/portable/cpu/op_isinf.cpp index 42798231a84..92d1e563a2e 100644 --- a/kernels/portable/cpu/op_isinf.cpp +++ b/kernels/portable/cpu/op_isinf.cpp @@ -17,9 +17,8 @@ namespace native { Tensor& isinf_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { // Lambda is syntactic sugar needed to workaround compilation on some 
older // non-compatible distros where isnan is returning int rather than bool - static constexpr const char op_name[] = "isinf.out"; - return internal::unary_ufunc_realhb_to_bool( - [](auto x) -> bool { return std::isinf(x); }, ctx, in, out); + return internal::unary_ufunc_realhb_to_bool( + [](double x) -> bool { return std::isinf(x); }, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_isnan.cpp b/kernels/portable/cpu/op_isnan.cpp index 817d314fd2b..51e189992ee 100644 --- a/kernels/portable/cpu/op_isnan.cpp +++ b/kernels/portable/cpu/op_isnan.cpp @@ -17,9 +17,8 @@ namespace native { Tensor& isnan_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { // Lambda is syntactic sugar needed to workaround compilation on some older // non-compatible distros where isnan is returning int rather than bool - static constexpr const char op_name[] = "isnan.out"; - return internal::unary_ufunc_realhb_to_bool( - [](auto x) -> bool { return std::isnan(x); }, ctx, in, out); + return internal::unary_ufunc_realhb_to_bool( + [](double x) -> bool { return std::isnan(x); }, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_log.cpp b/kernels/portable/cpu/op_log.cpp index 5b0c32549aa..8a36bce8c49 100644 --- a/kernels/portable/cpu/op_log.cpp +++ b/kernels/portable/cpu/op_log.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& log_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "log.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::log(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::log, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_log10.cpp b/kernels/portable/cpu/op_log10.cpp index 5251aea201d..89f9b672476 100644 --- a/kernels/portable/cpu/op_log10.cpp +++ b/kernels/portable/cpu/op_log10.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& log10_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "log10.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::log10(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::log10, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_log1p.cpp b/kernels/portable/cpu/op_log1p.cpp index f352750a944..2daa31e37ff 100644 --- a/kernels/portable/cpu/op_log1p.cpp +++ b/kernels/portable/cpu/op_log1p.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& log1p_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "log1p.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::log1p(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::log1p, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_log2.cpp b/kernels/portable/cpu/op_log2.cpp index 42d17ea83b9..4d7406832e4 100644 --- a/kernels/portable/cpu/op_log2.cpp +++ b/kernels/portable/cpu/op_log2.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& log2_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "log2.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::log2(x); }, ctx, in, out); + return 
internal::unary_ufunc_realhbbf16_to_floathbf16( + std::log2, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_reciprocal.cpp b/kernels/portable/cpu/op_reciprocal.cpp index a1bd116a962..f22f9883858 100644 --- a/kernels/portable/cpu/op_reciprocal.cpp +++ b/kernels/portable/cpu/op_reciprocal.cpp @@ -12,11 +12,18 @@ namespace torch { namespace executor { namespace native { +namespace { + +double reciprocal(double x) { + return 1.0 / x; +} + +} // namespace + Tensor& reciprocal_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "reciprocal.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::reciprocal(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + reciprocal, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_rsqrt.cpp b/kernels/portable/cpu/op_rsqrt.cpp index a14eb15d7ec..19c4c6c1a57 100644 --- a/kernels/portable/cpu/op_rsqrt.cpp +++ b/kernels/portable/cpu/op_rsqrt.cpp @@ -12,11 +12,16 @@ namespace torch { namespace executor { namespace native { +namespace { + +double rsqrt(double x) { + return 1.0 / std::sqrt(x); +} + +} // namespace Tensor& rsqrt_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "rsqrt.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::rsqrt(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(rsqrt, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_sin.cpp b/kernels/portable/cpu/op_sin.cpp index aeb73009729..ad65c4be18b 100644 --- a/kernels/portable/cpu/op_sin.cpp +++ b/kernels/portable/cpu/op_sin.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& sin_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "sin.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::sin(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::sin, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_sinh.cpp b/kernels/portable/cpu/op_sinh.cpp index f4cc67ad35f..21666392392 100644 --- a/kernels/portable/cpu/op_sinh.cpp +++ b/kernels/portable/cpu/op_sinh.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& sinh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "sinh.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::sinh(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::sinh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_sqrt.cpp b/kernels/portable/cpu/op_sqrt.cpp index 1b3d2ff6de5..bd2075f5b04 100644 --- a/kernels/portable/cpu/op_sqrt.cpp +++ b/kernels/portable/cpu/op_sqrt.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& sqrt_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "sqrt.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::sqrt(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::sqrt, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_tan.cpp b/kernels/portable/cpu/op_tan.cpp index 19ccb84935b..a2b921d5146 
100644 --- a/kernels/portable/cpu/op_tan.cpp +++ b/kernels/portable/cpu/op_tan.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& tan_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "tan.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::tan(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16(std::tan, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_tanh.cpp b/kernels/portable/cpu/op_tanh.cpp index 623968ac721..ae9f93dc62c 100644 --- a/kernels/portable/cpu/op_tanh.cpp +++ b/kernels/portable/cpu/op_tanh.cpp @@ -15,9 +15,8 @@ namespace executor { namespace native { Tensor& tanh_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "tanh.out"; - return internal::unary_ufunc_realhbbf16_to_floathbf16( - [](auto x) { return executorch::math::tanh(x); }, ctx, in, out); + return internal::unary_ufunc_realhbbf16_to_floathbf16( + std::tanh, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/op_trunc.cpp b/kernels/portable/cpu/op_trunc.cpp index 9c96865db0e..2d70a3b1724 100644 --- a/kernels/portable/cpu/op_trunc.cpp +++ b/kernels/portable/cpu/op_trunc.cpp @@ -15,9 +15,7 @@ namespace executor { namespace native { Tensor& trunc_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { - static constexpr const char op_name[] = "trunc.out"; - return internal::unary_ufunc_realhbf16( - [](auto x) { return executorch::math::trunc(x); }, ctx, in, out); + return internal::unary_ufunc_realhbf16(std::trunc, ctx, in, out); } } // namespace native diff --git a/kernels/portable/cpu/pattern/pattern.cpp b/kernels/portable/cpu/pattern/pattern.cpp deleted file mode 100644 index 61571f25ddc..00000000000 --- a/kernels/portable/cpu/pattern/pattern.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace torch::executor::native::internal { - -bool check_and_resize_inputs( - KernelRuntimeContext& ctx, - const Tensor& in, - Tensor& out) { - ET_KERNEL_CHECK( - ctx, tensors_have_same_dim_order(in, out), InvalidArgument, false); - ET_KERNEL_CHECK_MSG( - ctx, - resize_tensor(out, in.sizes()) == Error::Ok, - InvalidArgument, - false, - "Failed to resize output tensor."); - return true; -} - -} // namespace torch::executor::native::internal diff --git a/kernels/portable/cpu/pattern/pattern.h b/kernels/portable/cpu/pattern/pattern.h index 02690739a01..2d4b2ac509c 100644 --- a/kernels/portable/cpu/pattern/pattern.h +++ b/kernels/portable/cpu/pattern/pattern.h @@ -46,7 +46,6 @@ question is a bit more specific, then add a descriptive sufix. */ #pragma once -#include #include namespace torch { @@ -54,78 +53,29 @@ namespace executor { namespace native { namespace internal { -// Implementation detail for the other helpers in this header. Returns -// true on success, false on failure. -bool check_and_resize_inputs( - KernelRuntimeContext& ctx, - const Tensor& in, - Tensor& out); - /** * Implements an op pattern for ops that take a single input tensor of any - * realhbf16 dtype, no additional arguments, and outputs a tensor of the same - * size and dtype. The function fn specifies the math operation which is applied - * to the input tensor element-wise. 
+ * realh dtye, no additional arguments, and outputs a tensor of the same size + * and dtype. The function fn specifies the math operation which is applied to + * the input tensor element-wise. */ -template Tensor& unary_ufunc_realhbf16( - const Op& fn, + double (*fn)(double), KernelRuntimeContext& ctx, const Tensor& in, - Tensor& out) { - if (!check_and_resize_inputs(ctx, in, out)) { - return out; - } - ET_KERNEL_CHECK( - ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); - - ET_SWITCH_REALHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE, [&] { - utils::apply_unitensor_elementwise_fn< - CTYPE, - op_name, - utils::SupportedTensorDtypes::SAME_AS_COMMON>( - fn, ctx, in, utils::SupportedTensorDtypes::REALHBF16, out); - }); - return out; -} + Tensor& out); /** * Implements an op pattern for ops that take a single input tensor of any - * realhb dtype (real, half and boolean), no additional arguments, and outputs a + * realhb dtye (real, half and boolean), no additional arguments, and outputs a * boolean tensor of the same size. The function fn specifies the math * operation which is applied to the input tensor element-wise. */ -template Tensor& unary_ufunc_realhb_to_bool( - const Op& fn, + bool (*fn)(double), KernelRuntimeContext& ctx, const Tensor& in, - Tensor& out) { - if (!check_and_resize_inputs(ctx, in, out)) { - return out; - } - ET_KERNEL_CHECK_MSG( - ctx, - out.scalar_type() == executorch::aten::ScalarType::Bool, - InvalidArgument, - out, - "Expected out tensor to have dtype Bool, but got %" PRId8 " instead.", - static_cast(out.scalar_type())); - - ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { - utils::apply_unitensor_elementwise_fn< - CTYPE_IN, - op_name, - utils::SupportedTensorDtypes::BOOL>( - [fn](const CTYPE_IN val_in) { return fn(val_in); }, - ctx, - in, - utils::SupportedTensorDtypes::REALHBBF16, - out); - }); - - return out; -} + Tensor& out); /** * Implements an op pattern for ops that take a single input tensor of any @@ -133,35 +83,11 @@ Tensor& unary_ufunc_realhb_to_bool( * outputs a floating point tensor of the same size. The function fn specifies * the math operation which is applied to the input tensor element-wise. */ -template Tensor& unary_ufunc_realhbbf16_to_floathbf16( - const Op& fn, + double (*fn)(double), KernelRuntimeContext& ctx, const Tensor& in, - Tensor& out) { - ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out); - - if (!check_and_resize_inputs(ctx, in, out)) { - return out; - } - - ScalarType compute_type = in.scalar_type() == ScalarType::Double - ? 
ScalarType::Double - : ScalarType::Float; - ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&] { - utils::apply_unitensor_elementwise_fn< - CTYPE_COMPUTE, - op_name, - utils::SupportedTensorDtypes::FLOATHBF16>( - [fn](const auto val_in) { return fn(val_in); }, - ctx, - in, - utils::SupportedTensorDtypes::REALHBBF16, - out); - }); - - return out; -} + Tensor& out); } // namespace internal } // namespace native diff --git a/kernels/portable/cpu/pattern/targets.bzl b/kernels/portable/cpu/pattern/targets.bzl index 4140e4e0f14..5fc73ccd911 100644 --- a/kernels/portable/cpu/pattern/targets.bzl +++ b/kernels/portable/cpu/pattern/targets.bzl @@ -49,14 +49,18 @@ def define_common_targets(): runtime.cxx_library( name = "pattern", - srcs = ["pattern.cpp"], + srcs = [ + "unary_ufunc_realhb_to_bool.cpp", + "unary_ufunc_realhbbf16_to_floathbf16.cpp", + "unary_ufunc_realhbf16.cpp", + ], exported_headers = [ "pattern.h", ], compiler_flags = ["-Wno-missing-prototypes"], exported_deps = [ "//executorch/kernels/portable/cpu/util:broadcast_util", - "//executorch/kernels/portable/cpu/util:elementwise_util", + "//executorch/kernels/portable/cpu/util:functional_util", "//executorch/runtime/kernel:kernel_includes", ], visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."], diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp new file mode 100644 index 00000000000..367137ad02c --- /dev/null +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { +namespace internal { + +Tensor& unary_ufunc_realhb_to_bool( + bool (*fn)(double), + KernelRuntimeContext& ctx, + const Tensor& in, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_KERNEL_CHECK_MSG( + ctx, + out.scalar_type() == executorch::aten::ScalarType::Bool, + InvalidArgument, + out, + "Expected out tensor to have dtype Bool, but got %" PRId8 " instead.", + static_cast(out.scalar_type())); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + const auto in_type = in.scalar_type(); + + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { + apply_unary_map_fn( + [fn](const CTYPE_IN val_in) { return fn(val_in); }, + in.const_data_ptr(), + out.mutable_data_ptr(), + in.numel()); + }); + + return out; +} + +} // namespace internal +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp new file mode 100644 index 00000000000..602b5b1bfd2 --- /dev/null +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { +namespace internal { + +Tensor& unary_ufunc_realhbbf16_to_floathbf16( + double (*fn)(double), + KernelRuntimeContext& ctx, + const Tensor& in, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out); + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + const auto in_type = in.scalar_type(); + const auto out_type = out.scalar_type(); + + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { + ET_SWITCH_FLOATHBF16_TYPES(out_type, ctx, __func__, CTYPE_OUT, [&] { + apply_unary_map_fn( + [fn](const CTYPE_IN val_in) { + CTYPE_OUT xi = static_cast(val_in); + return static_cast(fn(xi)); + }, + in.const_data_ptr(), + out.mutable_data_ptr(), + in.numel()); + }); + }); + + return out; +} + +} // namespace internal +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhbf16.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhbf16.cpp new file mode 100644 index 00000000000..3672e223b7e --- /dev/null +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhbf16.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { +namespace internal { + +Tensor& unary_ufunc_realhbf16( + double (*fn)(double), + KernelRuntimeContext& ctx, + const Tensor& in, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, __func__, CTYPE, [&] { + apply_unary_map_fn( + [fn](const CTYPE val_in) { return static_cast(fn(val_in)); }, + in.const_data_ptr(), + out.mutable_data_ptr(), + in.numel()); + }); + + return out; +} + +} // namespace internal +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/vectorized_math.h b/kernels/portable/cpu/util/vectorized_math.h index 823d0ccc39a..9e706ace56d 100644 --- a/kernels/portable/cpu/util/vectorized_math.h +++ b/kernels/portable/cpu/util/vectorized_math.h @@ -104,14 +104,11 @@ auto convert_to_vectorized_n_of_float(at::vec::Vectorized vec) { #endif // ET_USE_PYTORCH_HEADERS // To simplify client code, we provide coverage for a bunch of float ops (the -// same ones listed in ATen vml.h, plus acosh, asinh, atanh) here. +// same ones listed in ATen vml.h) here. 
ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(abs) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(acos) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(acosh) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(asin) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(asinh) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(atan) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(atanh) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(ceil) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(cos) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(cosh) @@ -134,30 +131,12 @@ ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(trunc) ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(lgamma) #ifdef ET_USE_PYTORCH_HEADERS -ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(reciprocal) -ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(rsqrt) +ET_INTERNAL_VECTORIZED_FLOAT_BINARY_FUNC(rsqrt) #endif // ET_USE_PYTORCH_HEADERS namespace executorch { inline namespace math { -inline float reciprocal(float x) { - return 1.0f / x; -} - -inline double reciprocal(double x) { - return 1.0 / x; -} - -template < - typename Integer, - std::enable_if_t, bool> = true> -double reciprocal(Integer x) { - return reciprocal((double)x); -} - -template < - typename T, - std::enable_if_t, bool> = true> +template >> T rsqrt(T x) { return T(1) / std::sqrt(x); } diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index 84c6567b495..a731ce5c674 100644 --- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -533,7 +533,6 @@ ATEN_OPS = ( name = "op_expm1", deps = [ "//executorch/kernels/portable/cpu/pattern:pattern", - "//executorch/kernels/portable/cpu/util:elementwise_util", ], ), op_target( From 20d31faa087a9feb2c92400b822f115d7e0036a4 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:29:47 -0700 Subject: [PATCH 2/7] Revert "Add mixed integer precision test for op_mul (#11206)" This reverts commit 3a4ec6e8785a0c73fb44444493015a9abbf64881. --- kernels/test/op_mul_test.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index 34433fbe95c..c21cceeaae3 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -746,21 +746,6 @@ TEST_F(OpMulOutTest, DynamicShapeUnbound) { EXPECT_TENSOR_CLOSE(out, expected_result); } -// >>> torch.ops.aten.mul(torch.tensor([100], dtype=torch.int8), -// torch.tensor([100], dtype=torch.int8), out=torch.zeros([1], -// dtype=torch.long)) tensor([16]) -TEST_F(OpMulOutTest, MixedIntegerDtypeMatchesATen) { - TensorFactory tf_in; - TensorFactory tf_out; - - Tensor in = tf_in.make({1}, {100}); - Tensor out = tf_out.zeros({1}); - Tensor ret = op_mul_out(in, in, out); - - Tensor expected = tf_out.make({1}, {16}); - EXPECT_TENSOR_CLOSE(out, expected); -} - TEST_F(OpMulScalarOutTest, SanityCheck) { TensorFactory tf_a; TensorFactory tf_out; From 7a2934b37d9adb191af3af1faad14c1a791814e5 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:30:03 -0700 Subject: [PATCH 3/7] Revert "Add SupportedTensorDtypes::BOOL (#9584)" This reverts commit 9123e91323c3603330c73258d84a1256bc7e4da9. 
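Context for this revert (the sketch below uses hypothetical stand-in types, not the real dtype_util definitions): the BOOL entry being removed was used by the elementwise_util-based unary_ufunc_realhb_to_bool pattern reverted earlier in this stack. With it gone, boolean outputs are validated through BOOL_OR_BYTE again, which specialized_output_scalar_type already maps to ScalarType::Bool.

    // Illustrative stand-ins only; the real enums live in dtype_util.h.
    enum class ScalarType { Bool, Float };
    enum class SupportedTensorDtypes { REALHBBF16, FLOATHBF16, INTB, BOOL_OR_BYTE, SAME_AS_COMMON };

    // Shape of the output-type specialization after the revert: with no
    // dedicated BOOL member, BOOL_OR_BYTE is the only route to a Bool output.
    constexpr ScalarType specialized_output_scalar_type(SupportedTensorDtypes out_dtypes) {
      switch (out_dtypes) {
        case SupportedTensorDtypes::BOOL_OR_BYTE:
          return ScalarType::Bool;
        default:
          return ScalarType::Float;  // the real switch handles more cases
      }
    }

    static_assert(
        specialized_output_scalar_type(SupportedTensorDtypes::BOOL_OR_BYTE) ==
            ScalarType::Bool,
        "Bool outputs still specialize via BOOL_OR_BYTE");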
--- kernels/portable/cpu/util/dtype_util.cpp | 2 -- kernels/portable/cpu/util/dtype_util.h | 30 +----------------------- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.cpp b/kernels/portable/cpu/util/dtype_util.cpp index 525199a6f78..d240b9f83bc 100644 --- a/kernels/portable/cpu/util/dtype_util.cpp +++ b/kernels/portable/cpu/util/dtype_util.cpp @@ -27,8 +27,6 @@ bool check_tensor_dtype( return executorch::runtime::tensor_is_floating_type(t); case SupportedTensorDtypes::INTB: return executorch::runtime::tensor_is_integral_type(t, true); - case SupportedTensorDtypes::BOOL: - return executorch::runtime::tensor_is_type(t, ScalarType::Bool); case SupportedTensorDtypes::BOOL_OR_BYTE: return (executorch::runtime::tensor_is_type( t, ScalarType::Bool, ScalarType::Byte)); diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 15732219c8f..1e7901c80b2 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -72,16 +72,6 @@ load_to_compute_fn get_load_to_compute_fn_intb(const Tensor& t) { return result; } -template -load_to_compute_fn get_load_to_compute_fn_bool(const Tensor& t) { - ET_CHECK_MSG( - t.scalar_type() == ScalarType::Bool, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(t.scalar_type()), - op_name); - return internal::load_and_convert; -} - template load_to_compute_fn get_load_to_compute_fn_bool_or_byte( const Tensor& t) { @@ -175,17 +165,6 @@ store_compute_to_tensor_fn get_store_compute_to_tensor_fn_intb( return result; } -template -store_compute_to_tensor_fn get_store_compute_to_tensor_fn_bool( - const Tensor& t) { - ET_CHECK_MSG( - t.scalar_type() == ScalarType::Bool, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(t.scalar_type()), - op_name); - return internal::convert_and_store; -} - template store_compute_to_tensor_fn get_store_compute_to_tensor_fn_bool_or_byte(const Tensor& t) { @@ -240,7 +219,6 @@ enum class SupportedTensorDtypes { REALHBF16, FLOATHBF16, INTB, - BOOL, BOOL_OR_BYTE, // DEPRECATED: not likely to be correct; use SAME_AS_COMMON. SAME_AS_COMPUTE, @@ -262,8 +240,6 @@ load_to_compute_fn get_load_to_compute_fn_impl( return get_load_to_compute_fn_realhbf16(t); case SupportedTensorDtypes::INTB: return get_load_to_compute_fn_intb(t); - case SupportedTensorDtypes::BOOL: - return get_load_to_compute_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_load_to_compute_fn_bool_or_byte(t); case SupportedTensorDtypes::SAME_AS_COMPUTE: @@ -295,8 +271,6 @@ store_compute_to_tensor_fn get_store_compute_to_tensor_fn( t); case SupportedTensorDtypes::INTB: return get_store_compute_to_tensor_fn_intb(t); - case SupportedTensorDtypes::BOOL: - return get_store_compute_to_tensor_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_store_compute_to_tensor_fn_bool_or_byte< CTYPE_COMPUTE, @@ -344,14 +318,12 @@ bool check_tensor_dtype( const ScalarType compute_type); /// Return the one output type we are willing to emit specialized code -/// to handle, given a compute type of CTYPE_COMPUTE and supported +/// to handle, given a compute type of CTYPE_COMMON and supported /// output types of out_dtypes. 
template inline constexpr ScalarType specialized_output_scalar_type( SupportedTensorDtypes out_dtypes) { switch (out_dtypes) { - case SupportedTensorDtypes::BOOL: - return ScalarType::Bool; case SupportedTensorDtypes::BOOL_OR_BYTE: return ScalarType::Bool; case SupportedTensorDtypes::REALHBBF16: From a2e898eb9b3fab6e79c8f66f1000d6f02311ddd1 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:30:31 -0700 Subject: [PATCH 4/7] Revert "relax tolerances for all unary float ops (#9585)" This reverts commit 2dedc9e0e39047269b7762652b593c5c53883168. --- .../UnaryUfuncRealHBBF16ToFloatHBF16Test.h | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.h b/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.h index d1e812ec2c2..6e49dd9e57b 100644 --- a/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.h +++ b/kernels/test/UnaryUfuncRealHBBF16ToFloatHBF16Test.h @@ -72,16 +72,20 @@ class UnaryUfuncRealHBBF16ToFloatHBF16Test : public OperatorTest { auto expected = tf_out.make({1, 6}, expected_vector); if (IN_DTYPE == ScalarType::BFloat16 || OUT_DTYPE == ScalarType::BFloat16) { - // Raise tolerance because both we and ATen run these - // computations at internal float32 precision rather than - // float64. - double rtol = 3e-3; + double rtol = executorch::runtime::testing::internal::kDefaultRtol; + // It appears we need a higher tolerance for at least some ATen + // tests, like aten_op_acosh_test. + if (get_supported_features()->is_aten) { + rtol = 3e-3; + } EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, rtol, executorch::runtime::testing::internal::kDefaultBFloat16Atol); } else if (IN_DTYPE == ScalarType::Half || OUT_DTYPE == ScalarType::Half) { - // Raise tolerance because both we and ATen run these - // computations at internal float32 precision rather than - // float64. - double rtol = 1e-3; + double rtol = executorch::runtime::testing::internal::kDefaultRtol; + // It appears we need a higher tolerance for at least some ATen + // tests, like aten_op_acosh_test. + if (get_supported_features()->is_aten) { + rtol = 1e-3; + } EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, rtol, executorch::runtime::testing::internal::kDefaultHalfAtol); } else { EXPECT_TENSOR_CLOSE(out, expected); From c275c6422ed442372eb3cadfb42a2c53a6fceccd Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:30:49 -0700 Subject: [PATCH 5/7] Revert "Add vectorization in elementwise_util (#9432)" This reverts commit 4c35fe04477efb99b2f2c1da7402de8f08c3129c. 
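For context, a self-contained sketch of the scalar-only shape this revert returns to (generic names, not the real elementwise_util API): with the at::vec::Vectorized fast path removed, the dtype-specialized implementation invokes the lambda one CTYPE_COMPUTE value at a time, so call sites in this stack go back to typed lambdas (for example op_mul's [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { return val_a * val_b; }) and plain std:: math calls instead of the executorch::math wrappers.

    #include <cstddef>

    // Scalar-only element-wise apply: the op sees one CTYPE value per call,
    // so it never has to be invocable with at::vec::Vectorized<CTYPE>.
    template <typename CTYPE, typename Op>
    void apply_bitensor_scalar(
        const Op& op, const CTYPE* a, const CTYPE* b, CTYPE* out, std::size_t n) {
      for (std::size_t i = 0; i < n; ++i) {
        out[i] = op(a[i], b[i]);
      }
    }

    // Usage, mirroring the post-revert op_mul lambda:
    //   float a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, out[4];
    //   apply_bitensor_scalar<float>(
    //       [](const float val_a, const float val_b) { return val_a * val_b; },
    //       a, b, out, 4);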
--- .lintrunner.toml | 2 - kernels/portable/cpu/op_add.cpp | 12 +- kernels/portable/cpu/op_atan2.cpp | 2 +- kernels/portable/cpu/op_clamp.cpp | 5 +- kernels/portable/cpu/op_elu.cpp | 3 +- kernels/portable/cpu/op_fmod.cpp | 8 +- kernels/portable/cpu/op_maximum.cpp | 2 +- kernels/portable/cpu/op_minimum.cpp | 3 +- kernels/portable/cpu/op_mul.cpp | 4 +- kernels/portable/cpu/op_native_dropout.cpp | 10 +- kernels/portable/cpu/op_pow.cpp | 23 +-- kernels/portable/cpu/op_sigmoid.cpp | 7 +- kernels/portable/cpu/op_where.cpp | 6 +- kernels/portable/cpu/util/elementwise_util.h | 139 +------------------ kernels/portable/cpu/util/math_util.h | 30 ---- kernels/portable/cpu/util/targets.bzl | 6 - kernels/test/op_atan2_test.cpp | 33 ----- kernels/test/op_clamp_test.cpp | 34 ----- kernels/test/op_fmod_test.cpp | 31 ----- kernels/test/op_maximum_test.cpp | 14 -- kernels/test/op_minimum_test.cpp | 14 -- kernels/test/op_mul_test.cpp | 6 +- kernels/test/op_pow_test.cpp | 13 -- kernels/test/op_sigmoid_test.cpp | 4 - 24 files changed, 41 insertions(+), 370 deletions(-) diff --git a/.lintrunner.toml b/.lintrunner.toml index 4c881940155..1e81c570c65 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -272,8 +272,6 @@ exclude_patterns = [ 'exir/verification/bindings.cpp', 'extension/**', # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include. - 'kernels/portable/cpu/util/elementwise_util.h', - 'kernels/portable/cpu/util/math_util.h', 'kernels/portable/cpu/util/vectorized_math.h', 'kernels/optimized/**', 'runtime/core/exec_aten/**', diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index 83642c4864d..555341b3447 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -102,18 +102,14 @@ Tensor& add_scalar_out( static constexpr const char op_name[] = "add.Scalar_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - CTYPE_COMPUTE val_b = utils::scalar_to(b); - CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - auto val_alpha_times_b = val_alpha * val_b; utils::apply_unitensor_elementwise_fn< CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [val_alpha_times_b](const auto val_a) { - // Cast here supports vectorization; either it does nothing - // or it casts from CTYPE_COMPUTE to - // Vectorized. - return val_a + decltype(val_a)(val_alpha_times_b); + [b, alpha](const auto val_a) { + CTYPE_COMPUTE val_b = utils::scalar_to(b); + CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); + return val_a + val_alpha * val_b; }, ctx, a, diff --git a/kernels/portable/cpu/op_atan2.cpp b/kernels/portable/cpu/op_atan2.cpp index 5390eb52820..33d66cf2ad7 100644 --- a/kernels/portable/cpu/op_atan2.cpp +++ b/kernels/portable/cpu/op_atan2.cpp @@ -60,7 +60,7 @@ Tensor& atan2_out( op_name, utils::SupportedTensorDtypes::FLOATHBF16>( [](const auto val_a, const auto val_b) { - return executorch::math::atan2(val_a, val_b); + return std::atan2(val_a, val_b); }, ctx, a, diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index c5bf28e8aec..6974789eccf 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -138,8 +138,9 @@ Tensor& clamp_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [has_min, min_opt, has_max, max_opt](const auto val_in) { - auto val_out = val_in; + [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) { + // TODO: rewrite this to be vectorization-capable. 
+ CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override( val_out, utils::scalar_to(min_opt.value())); diff --git a/kernels/portable/cpu/op_elu.cpp b/kernels/portable/cpu/op_elu.cpp index d7477717a3a..d6533642860 100644 --- a/kernels/portable/cpu/op_elu.cpp +++ b/kernels/portable/cpu/op_elu.cpp @@ -48,7 +48,8 @@ Tensor& elu_out( CTYPE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [negcoef, math_scale, math_input_scale](const CTYPE x) { + [negcoef, math_scale, math_input_scale](const auto x) { + // TODO: rewrite this to be vectorization-capable. return MathT(x) <= MathT(0) ? std::expm1(MathT(x) * math_input_scale) * negcoef : MathT(x) * math_scale; diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp index 40bb4a5e94c..96a971b166a 100644 --- a/kernels/portable/cpu/op_fmod.cpp +++ b/kernels/portable/cpu/op_fmod.cpp @@ -61,7 +61,7 @@ Tensor& fmod_Tensor_out( utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { - // TODO: rewrite this to be vectorization-capable? + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -138,8 +138,10 @@ Tensor& fmod_Scalar_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBF16>( - [val_b](const auto val_a) { - return executorch::math::fmod(val_a, (decltype(val_a))val_b); + [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. + CTYPE_COMPUTE value = std::fmod(val_a, val_b); + return value; }, ctx, a, diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index c7979e40d7c..3a84095a4df 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -49,7 +49,7 @@ Tensor& maximum_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBBF16>( - [](const auto val_a, const auto val_b) { + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { return utils::max_override(val_a, val_b); }, ctx, diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index 1bac23187d8..5c0e79eb9bb 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -49,7 +49,8 @@ Tensor& minimum_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBBF16>( - [](const auto val_a, const auto val_b) { + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
return utils::min_override(val_a, val_b); }, ctx, diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 6d4f30106ca..ba16ddc075a 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -72,7 +72,9 @@ Tensor& mul_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBBF16>( - [](const auto val_a, const auto val_b) { return val_a * val_b; }, + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + return val_a * val_b; + }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, diff --git a/kernels/portable/cpu/op_native_dropout.cpp b/kernels/portable/cpu/op_native_dropout.cpp index 8dafd9e0512..1c4d177e8ed 100644 --- a/kernels/portable/cpu/op_native_dropout.cpp +++ b/kernels/portable/cpu/op_native_dropout.cpp @@ -57,11 +57,8 @@ std::tuple native_dropout_out( } ET_SWITCH_FLOATHBF16_TYPES( input.scalar_type(), ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn< - CTYPE_COMPUTE, - op_name, - utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [](const CTYPE_COMPUTE val, const CTYPE_COMPUTE mask_val) { + utils::apply_bitensor_elementwise_fn( + [](const auto val, const auto mask_val) { if (!mask_val) { return static_cast(0); } @@ -73,7 +70,8 @@ std::tuple native_dropout_out( mask, // TODO: should really be just BOOL utils::SupportedTensorDtypes::BOOL_OR_BYTE, - out); + out, + utils::SupportedTensorDtypes::SAME_AS_COMMON); }); } else if (input.numel() > 0) { std::memcpy(out.mutable_data_ptr(), input.data_ptr(), input.nbytes()); diff --git a/kernels/portable/cpu/op_pow.cpp b/kernels/portable/cpu/op_pow.cpp index aaf934b9adf..4d2673cb72d 100644 --- a/kernels/portable/cpu/op_pow.cpp +++ b/kernels/portable/cpu/op_pow.cpp @@ -57,8 +57,9 @@ Tensor& pow_Tensor_Tensor_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBF16>( - [](const auto val_a, const auto val_b) { - return executorch::math::pow(val_a, val_b); + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. + return std::pow(val_a, val_b); }, ctx, a, @@ -110,13 +111,8 @@ Tensor& pow_Tensor_Scalar_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBF16>( - // Casting val_b here supports vectorization; it does - // nothing if we are not vectorizing (casts to - // CTYPE_COMPUTE) and casts to a vectorized type - // otherwise. - [val_b](const auto val_a) { - return executorch::math::pow(val_a, decltype(val_a)(val_b)); - }, + // TODO: rewrite this to be vectorization-capable. + [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, @@ -165,13 +161,8 @@ Tensor& pow_Scalar_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBF16>( - // Casting val_a here supports vectorization; it does - // nothing if we are not vectorizing (casts to - // CTYPE_COMPUTE) and casts to a vectorized type - // otherwise. - [val_a](const auto val_b) { - return executorch::math::pow(decltype(val_b)(val_a), val_b); - }, + // TODO: rewrite this to be vectorization-capable. 
+ [val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); }, ctx, b, utils::SupportedTensorDtypes::REALHBBF16, diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp index a1eb03c1869..acb743a2db6 100644 --- a/kernels/portable/cpu/op_sigmoid.cpp +++ b/kernels/portable/cpu/op_sigmoid.cpp @@ -49,9 +49,10 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::FLOATHBF16>( - [](const auto val_in) { - const auto one = static_cast(1.0); - auto out_val = one / (one + executorch::math::exp(-val_in)); + [](const auto val_in) -> CTYPE_COMPUTE { + // TODO: rewrite this to be vectorization-capable + CTYPE_COMPUTE out_val = static_cast(1.0) / + (static_cast(1.0) + exp(-val_in)); return out_val; }, ctx, diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp index b1eb4ff442c..692e296ee00 100644 --- a/kernels/portable/cpu/op_where.cpp +++ b/kernels/portable/cpu/op_where.cpp @@ -47,9 +47,9 @@ Tensor& where_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::SAME_AS_COMMON>( - [](const CTYPE_COMPUTE val_a, - const CTYPE_COMPUTE val_b, - const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; }, + [](const auto val_a, const auto val_b, const auto val_c) { + return val_c ? val_a : val_b; + }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index d1e5d1e88f9..e30b8af7d89 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -12,14 +12,9 @@ #include #include #include -#include // Make vectorization support easy for clients. #include #include -#ifdef ET_USE_PYTORCH_HEADERS -#include -#endif // ET_USE_PYTORCH_HEADERS - #include #include @@ -56,38 +51,6 @@ inline int64_t scalar_to(const Scalar& s) { } namespace internal { -template -using ignore_first_yield_second = T; - -#ifdef ET_USE_PYTORCH_HEADERS -// Can I call a function of type Op with sizeof...(Args) arguments of type -// at::vec::Vectorized? -// -// See [NOTE: Generic lambdas] below for requirements on Op. -template -constexpr bool can_use_vectorized() { - using Vec = at::vec::Vectorized; - // NOTE: if we start building optimized kernels on platforms that - // ATen Vectorized doesn't support well, we will want to add a way - // to check that Vectorized actually does something on our target - // platform. For now, I see no concrete need for that. - if constexpr (std::is_invocable_v< - Op, - ignore_first_yield_second...>) { - // For bool, we will get a false positive if we rely on only the - // is_invocable_v check above because at::vec::Vectorized is - // implicitly convertible to a pointer, which makes it implicitly - // convertible to bool (which was 15 minutes of fun to debug). Also - // just seems like good hygiene to make sure we get the Vectorized - // we're expecting. - return std::is_same_v< - std::invoke_result_t...>, - Vec>; - } - return false; -} -#endif // ET_USE_PYTORCH_HEADERS - template < typename CTYPE_COMPUTE, typename CTYPE_OUT, @@ -98,90 +61,8 @@ inline void dtype_specialized_elementwise_fn_impl( KernelRuntimeContext& ctx, const Tensor& out, Args... inputs) { - static_assert( - (std::is_same_v> && - ...)); constexpr auto kNumInputs = sizeof...(inputs); - // All inputs must be of type CTYPE_COMPUTE. 
- ET_DCHECK( - ((inputs.first->scalar_type() == - CppTypeToScalarType::value) && - ...)); - -#ifdef ET_USE_PYTORCH_HEADERS - if constexpr (can_use_vectorized()) { - const bool any_is_broadcasted = - !(torch::executor::internal::sizes_match_ignoring_leading_1s( - inputs.first->sizes(), out.sizes()) && - ...); - if (!any_is_broadcasted) { - using Vec = at::vec::Vectorized; - ::executorch::extension::parallel_for( - 0, - out.numel(), - ::executorch::extension::internal::GRAIN_SIZE, - [&](const auto begin, const auto end) { - std::array inputs_data_ptrs = { - inputs.first->template const_data_ptr()...}; - - CTYPE_OUT* const data_out = out.mutable_data_ptr(); - - const auto vectorized_begin = - begin + (Vec::size() - begin % Vec::size()) % Vec::size(); - const auto vectorized_end = end - (end % Vec::size()); - // Scalar prologue. - for (const auto idx : c10::irange(begin, vectorized_begin)) { - // In debug mode, always use Vectorized so that even - // small-sized tests will test whether using Vectorized broke our - // lambda. -#ifndef NDEBUG - std::array loaded_inputs; -#else // NDEBUG - std::array loaded_inputs; -#endif // NDEBUG - for (const auto input_idx : c10::irange(kNumInputs)) { - loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx]; - } -#ifndef NDEBUG - std::apply(compute_fun, loaded_inputs).store(&data_out[idx], 1); -#else // NDEBUG - data_out[idx] = std::apply(compute_fun, loaded_inputs); -#endif // NDEBUG - } - - // Main vectorized loop. - for (auto idx = vectorized_begin; idx < vectorized_end; - idx += Vec::size()) { - std::array loaded_vec_inputs; - for (const auto input_idx : c10::irange(kNumInputs)) { - loaded_vec_inputs[input_idx] = - Vec::loadu(&inputs_data_ptrs[input_idx][idx]); - } - auto result_vec = std::apply(compute_fun, loaded_vec_inputs); - result_vec.store(&data_out[idx]); - } - - // Scalar epilogue. - for (const auto idx : c10::irange(vectorized_end, end)) { -#ifndef NDEBUG - std::array loaded_inputs; -#else // NDEBUG - std::array loaded_inputs; -#endif // NDEBUG - for (const auto input_idx : c10::irange(kNumInputs)) { - loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx]; - } -#ifndef NDEBUG - std::apply(compute_fun, loaded_inputs).store(&data_out[idx], 1); -#else // NDEBUG - data_out[idx] = std::apply(compute_fun, loaded_inputs); -#endif // NDEBUG - } - }); - return; - } - } -#endif // ET_USE_PYTORCH_HEADERS + ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMPUTE)) && ...)); ::executorch::extension::parallel_for( 0, @@ -359,19 +240,6 @@ inline void apply_unitensor_elementwise_fn( compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); } -/** - * Useful for unary elementwise operators. For each element of the - * input, call Op and write to the corresponding element of the - * output. Tensor broadcasting is applied wherever it is required. - * - * [NOTE: Generic lambdas]: If Op is a *generic* lambda (i.e., one with `auto` - * parameters; normal lambdas are fine), it must fulfill one of the - * following conditions. Either: - * 1) It must in fact compile when passed at::vec::Vectorized, or - * 2) It must be actively SFINAE-friendly, as per the C++17 examples in - * https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable - * . - */ template < typename CTYPE_COMPUTE, const char* op_name, @@ -413,8 +281,6 @@ inline void apply_bitensor_elementwise_fn( * Useful for bi-tensor elementwise operators. 
For each element of the inputs, * perform a computation and write to the corresponding element of the output. * Tensor broadcasting is applied wherever it is required. - * See [NOTE: Generic lambdas] if you want to pass a generic lambda for - * compute_fun. */ template < typename CTYPE_COMPUTE, @@ -481,9 +347,6 @@ inline void apply_tritensor_elementwise_fn( * * static constexpr const char op_name[] = "my_op"; * apply_ternary_elementwise_fn. - * - * See [NOTE: Generic lambdas] if you want to pass a generic lambda for - * compute_fun. */ template < typename CTYPE_COMPUTE, diff --git a/kernels/portable/cpu/util/math_util.h b/kernels/portable/cpu/util/math_util.h index d6d32217137..2ba068da18e 100644 --- a/kernels/portable/cpu/util/math_util.h +++ b/kernels/portable/cpu/util/math_util.h @@ -8,10 +8,6 @@ #pragma once -#ifdef ET_USE_PYTORCH_HEADERS -#include -#endif - namespace torch { namespace executor { namespace native { @@ -142,32 +138,6 @@ T max_override(T a, T b) { return b; } -#ifdef ET_USE_PYTORCH_HEADERS -template -at::vec::Vectorized min_override( - at::vec::Vectorized a, - at::vec::Vectorized b) { - return at::vec::minimum(a, b); -} - -template -at::vec::Vectorized min_override(at::vec::Vectorized a, T b) { - return min_override(a, at::vec::Vectorized(b)); -} - -template -at::vec::Vectorized max_override( - at::vec::Vectorized a, - at::vec::Vectorized b) { - return at::vec::maximum(a, b); -} - -template -at::vec::Vectorized max_override(at::vec::Vectorized a, T b) { - return max_override(a, at::vec::Vectorized(b)); -} - -#endif /** * There is a slight difference in how std::fmod works compared to how ATen * determines remainders: diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 65a0c9fc47a..1b432c736ae 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -33,7 +33,6 @@ def define_common_targets(): "//executorch/kernels/portable/cpu/util:slice_util", "//executorch/kernels/portable/cpu/util:elementwise_util", "//executorch/kernels/portable/cpu/util:upsample_util", - "//executorch/kernels/portable/cpu/util:vectorized_math", ], visibility = ["//executorch/...", "@EXECUTORCH_CLIENTS"], ) @@ -111,8 +110,6 @@ def define_common_targets(): ":broadcast_indexes_range", ":broadcast_util", ":dtype_util", - ":vectorized_math", - "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch", "//executorch/runtime/kernel:kernel_runtime_context", "//executorch/extension/threadpool:threadpool", ], @@ -263,9 +260,6 @@ def define_common_targets(): srcs = [], exported_headers = ["math_util.h"], visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/quantized/..."], - exported_deps = [ - "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch", - ], ) runtime.cxx_library( diff --git a/kernels/test/op_atan2_test.cpp b/kernels/test/op_atan2_test.cpp index ae19ef687bc..436826e2b6d 100644 --- a/kernels/test/op_atan2_test.cpp +++ b/kernels/test/op_atan2_test.cpp @@ -46,36 +46,3 @@ TEST(OpAtan2OutTest, SmokeTest) { op_atan2_out(self, other, out); EXPECT_TENSOR_CLOSE(out, out_expected); } - -TEST(OpAtan2OutTest, SmokeTestNoBroadcastingSameDtype) { - TensorFactory tfDouble; - - std::vector a(18); - std::iota(a.begin(), a.end(), -8); - std::vector b(18, 2.0); - Tensor self = tfDouble.make({18}, a); - Tensor other = tfDouble.make({18}, b); - Tensor out = tfDouble.zeros({18}); - Tensor out_expected = tfDouble.make( - {18}, - {-1.3258176636680326, - 
-1.2924966677897853, - -1.2490457723982544, - -1.1902899496825317, - -1.1071487177940904, - -0.9827937232473291, - -0.7853981633974483, - -0.4636476090008061, - 0.0000000000000000, - 0.4636476090008061, - 0.7853981633974483, - 0.9827937232473291, - 1.1071487177940904, - 1.1902899496825317, - 1.2490457723982544, - 1.2924966677897853, - 1.3258176636680326, - 1.3521273809209546}); - op_atan2_out(self, other, out); - EXPECT_TENSOR_CLOSE(out, out_expected); -} diff --git a/kernels/test/op_clamp_test.cpp b/kernels/test/op_clamp_test.cpp index 0b632ab7f8d..8a021c70303 100644 --- a/kernels/test/op_clamp_test.cpp +++ b/kernels/test/op_clamp_test.cpp @@ -31,15 +31,6 @@ using torch::executor::testing::TensorFactory; using OptScalar = executorch::aten::optional; -namespace { -template -std::vector arange(T stop) { - std::vector result(stop); - std::iota(result.begin(), result.end(), 0); - return result; -} -} // namespace - class OpClampOutTest : public OperatorTest { protected: Tensor& op_clamp_out( @@ -123,31 +114,6 @@ class OpClampOutTest : public OperatorTest { // Should set all elements to max. {6, 6, 6, 6}, // expected_data }, - { - std::string(__func__) + ": Simple clamp larger data", - {18}, // sizes - arange::ctype>(18), // input_data - OptScalar(1), // min - OptScalar(6), // max - {1, - 1, - 2, - 3, - 4, - 5, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6}, // expected_data - }, }; run_test_cases(test_cases); diff --git a/kernels/test/op_fmod_test.cpp b/kernels/test/op_fmod_test.cpp index 3227a01a17a..fa7cc4b63f7 100644 --- a/kernels/test/op_fmod_test.cpp +++ b/kernels/test/op_fmod_test.cpp @@ -45,34 +45,3 @@ TEST_F(OpFmodTest, SmokeTest) { op_fmod_tensor_out(self, other, out); EXPECT_TENSOR_CLOSE(out, out_expected); } - -TEST_F(OpFmodTest, ScalarSmokeTest) { - TensorFactory tfFloat; - std::vector a(18); - std::iota(a.begin(), a.end(), -8); - Tensor self = tfFloat.make({18}, a); - Scalar other = 3; - Tensor out = tfFloat.zeros({18}); - Tensor out_expected = tfFloat.make( - {18}, - {-2., - -1., - -0., - -2., - -1., - -0., - -2., - -1., - 0., - 1., - 2., - 0., - 1., - 2., - 0., - 1., - 2., - 0.}); - op_fmod_scalar_out(self, other, out); - EXPECT_TENSOR_CLOSE(out, out_expected); -} diff --git a/kernels/test/op_maximum_test.cpp b/kernels/test/op_maximum_test.cpp index c32cf571ff3..faa18fa56cd 100644 --- a/kernels/test/op_maximum_test.cpp +++ b/kernels/test/op_maximum_test.cpp @@ -37,17 +37,3 @@ TEST(OpMaximumOutTest, SmokeTest) { op_maximum_out(self, other, out); EXPECT_TENSOR_CLOSE(out, out_expected); } - -TEST(OpMaximumOutTest, SmokeTestLarger) { - TensorFactory tfFloat; - - std::vector a(18); - std::iota(a.begin(), a.end(), -8); - Tensor self = tfFloat.make({18}, a); - Tensor other = tfFloat.full({18}, 4); - Tensor out = tfFloat.zeros({18}); - Tensor out_expected = tfFloat.make( - {18}, {4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 6, 7, 8, 9}); - op_maximum_out(self, other, out); - EXPECT_TENSOR_CLOSE(out, out_expected); -} diff --git a/kernels/test/op_minimum_test.cpp b/kernels/test/op_minimum_test.cpp index 9c256963943..686e1feee64 100644 --- a/kernels/test/op_minimum_test.cpp +++ b/kernels/test/op_minimum_test.cpp @@ -266,17 +266,3 @@ TEST_F(OpMinimumOutTest, DynamicShapeUnbound) { op_minimum_out(x, y, out); EXPECT_TENSOR_EQ(out, expected); } - -TEST_F(OpMinimumOutTest, SmokeTestLarger) { - TensorFactory tfFloat; - - std::vector a(18); - std::iota(a.begin(), a.end(), -8); - Tensor self = tfFloat.make({18}, a); - Tensor other = tfFloat.full({18}, 4); - Tensor out = 
tfFloat.zeros({18}); - Tensor out_expected = tfFloat.make( - {18}, {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 4, 4, 4, 4, 4}); - op_minimum_out(self, other, out); - EXPECT_TENSOR_CLOSE(out, out_expected); -} diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index c21cceeaae3..8b858fa18c2 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -30,7 +30,7 @@ class OpMulOutTest : public OperatorTest { return torch::executor::aten::mul_outf(context_, self, other, out); } - // Common testing for multiplying two integer Tensors + // Common testing for multipling two integer Tensors template void test_mul() { TensorFactory tf_a; @@ -54,10 +54,6 @@ class OpMulOutTest : public OperatorTest { tf_b.make(sizes, /*data=*/{1, 2, 4, 8}), out); EXPECT_TENSOR_EQ(out, tf_out.make(sizes, /*data=*/{1, 4, 16, 64})); - - out = tf_out.zeros({18}); - op_mul_out(tf_a.full({18}, 4), tf_b.full({18}, 2), out); - EXPECT_TENSOR_EQ(out, tf_out.full({18}, 8)); } template diff --git a/kernels/test/op_pow_test.cpp b/kernels/test/op_pow_test.cpp index 25d0f97526c..f9234a748b9 100644 --- a/kernels/test/op_pow_test.cpp +++ b/kernels/test/op_pow_test.cpp @@ -54,19 +54,6 @@ TEST_F(OpPowTest, TensorTensorSanityCheck) { EXPECT_TENSOR_EQ(out, tf.make({2, 2}, {16, 16, 16, 16})); } -TEST_F(OpPowTest, TensorTensorSanityCheckLargerNoBroadcasting) { - TensorFactory tf; - Tensor self = tf.full({18}, 2); - Tensor exp = tf.full({18}, 4); - Tensor out = tf.zeros({18}); - Tensor out_expected = tf.full({18}, 16); - - Tensor ret = op_pow_tensor_tensor_out(self, exp, out); - - EXPECT_TENSOR_EQ(out, ret); - EXPECT_TENSOR_EQ(out_expected, out); -} - TEST_F(OpPowTest, TensorTensorSanityCheck2) { TensorFactory tf1; TensorFactory tf2; diff --git a/kernels/test/op_sigmoid_test.cpp b/kernels/test/op_sigmoid_test.cpp index 1e3499ba451..550cebda315 100644 --- a/kernels/test/op_sigmoid_test.cpp +++ b/kernels/test/op_sigmoid_test.cpp @@ -44,10 +44,6 @@ class OpSigmoidOutTest : public OperatorTest { EXPECT_TENSOR_CLOSE( out, tf_out.make(sizes, /*data=*/{0.731059, 0.880797, 0.982014, 0.999665})); - - out = tf_out.zeros({18}); - op_sigmoid_out(tf.full({18}, 2), out); - EXPECT_TENSOR_CLOSE(out, tf_out.full({18}, 0.880797)); } // Unhandled output dtypes. From d83bed311e431b4967783e28d11ca8812486058c Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:31:01 -0700 Subject: [PATCH 6/7] Revert "Add optimized_portable_kernels test (#11205)" This reverts commit bc42d8d696eff38c31050787efacaa8ff88285d8. 
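Context for reviewers: the fast path removed in the preceding patch split each parallel_for chunk
into a scalar prologue, a full-width vectorized main loop, and a scalar epilogue, which is also why
the deleted smoke tests used 18-element tensors (larger than a typical SIMD register) so that all
three phases were exercised. The sketch below illustrates that loop structure only; it is not the
ExecuTorch implementation. kLanes, sketch::apply_elementwise, and the 8-lane width are illustrative
assumptions, and the real code operates on at::vec::Vectorized registers rather than a scalar inner
loop.

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    namespace sketch {

    // Assumed SIMD width for illustration; the real kernel takes this from
    // at::vec::Vectorized<T>::size().
    constexpr std::size_t kLanes = 8;

    // Apply fn over [begin, end): a scalar prologue up to the first index that
    // is a multiple of kLanes, a blocked main loop over whole kLanes groups,
    // and a scalar epilogue for the tail. The main loop body is scalar here
    // purely for illustration; the real code loads and stores whole registers.
    template <typename Fn>
    void apply_elementwise(
        const float* in, float* out, std::size_t begin, std::size_t end, Fn fn) {
      const std::size_t prologue_end =
          std::min(end, begin + (kLanes - begin % kLanes) % kLanes);
      const std::size_t main_end =
          prologue_end + (end - prologue_end) / kLanes * kLanes;

      for (std::size_t i = begin; i < prologue_end; ++i) { // scalar prologue
        out[i] = fn(in[i]);
      }
      for (std::size_t i = prologue_end; i < main_end; i += kLanes) { // main loop
        for (std::size_t lane = 0; lane < kLanes; ++lane) { // stands in for one SIMD op
          out[i + lane] = fn(in[i + lane]);
        }
      }
      for (std::size_t i = main_end; i < end; ++i) { // scalar epilogue
        out[i] = fn(in[i]);
      }
    }

    } // namespace sketch

    int main() {
      // 18 elements: more than one 8-lane block, so the prologue, the main
      // loop, and the epilogue all run; this is the same reason the removed
      // tests used size-18 tensors.
      std::vector<float> in(18), out(18);
      for (std::size_t i = 0; i < in.size(); ++i) {
        in[i] = static_cast<float>(i);
      }
      sketch::apply_elementwise(
          in.data(), out.data(), 0, in.size(), [](float x) { return std::exp(x); });
      std::printf("out[17] = %f\n", out[17]);
      return 0;
    }

The prologue and epilogue exist so the main loop can assume full-width, in-bounds loads; unaligned
starts and ragged tails are handled element by element instead of with masked or partial vector
operations.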
--- kernels/portable/CMakeLists.txt | 9 +-------- kernels/test/CMakeLists.txt | 32 ++------------------------------ 2 files changed, 3 insertions(+), 38 deletions(-) diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index c687da62e21..d301ea564f6 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -69,15 +69,8 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED) target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options}) target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS}) target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS) - gen_selected_ops(LIB_NAME "optimized_portable_ops_lib" OPS_SCHEMA_YAML "${_yaml}") - generate_bindings_for_kernels( - LIB_NAME "optimized_portable_ops_lib" FUNCTIONS_YAML "${_yaml}" - ) - gen_operators_lib( - LIB_NAME "optimized_portable_ops_lib" KERNEL_LIBS optimized_portable_kernels DEPS executorch_core - ) install( - TARGETS optimized_portable_kernels optimized_portable_ops_lib + TARGETS optimized_portable_kernels DESTINATION lib ) endif() diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index f5997a1ee3f..4f174b5a652 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -17,7 +17,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) -set(_kernels portable optimized_portable optimized quantized) +set(_kernels portable optimized quantized) foreach(kernel ${_kernels}) set(_wrapper_dir "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/test" @@ -37,17 +37,13 @@ foreach(kernel ${_kernels}) VERBATIM ) - set(_supported_features_kernel ${kernel}) - if(${kernel} STREQUAL "optimized_portable") - set(_supported_features_kernel "portable") - endif() add_custom_command( OUTPUT "${_wrapper_dir}/supported_features.cpp" "${_wrapper_dir}/supported_features.h" COMMAND mkdir -p ${_wrapper_dir} COMMAND ${PYTHON_EXECUTABLE} kernels/test/gen_supported_features.py - kernels/${_supported_features_kernel}/test/supported_features_def.yaml > + kernels/${kernel}/test/supported_features_def.yaml > ${_wrapper_dir}/supported_features.cpp COMMAND ${PYTHON_EXECUTABLE} kernels/test/gen_supported_features.py @@ -61,11 +57,6 @@ foreach(kernel ${_kernels}) set(_kernel_ops_lib "optimized_native_cpu_ops_lib") set(_kernel_ops_lib_path "${CMAKE_CURRENT_BINARY_DIR}/../../configurations/optimized_native_cpu_ops_lib" - ) - elseif(${kernel} STREQUAL "optimized_portable") - set(_kernel_ops_lib "${kernel}_ops_lib") - set(_kernel_ops_lib_path - "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/portable/${kernel}_ops_lib" ) else() set(_kernel_ops_lib "${kernel}_ops_lib") @@ -97,9 +88,6 @@ add_custom_target( "${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/FunctionHeaderWrapper.h" "${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.h" "${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.cpp" - "${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable/executorch/kernels/test/FunctionHeaderWrapper.h" - "${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable/executorch/kernels/test/supported_features.h" - "${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable/executorch/kernels/test/supported_features.cpp" "${CMAKE_CURRENT_BINARY_DIR}/include/quantized/executorch/kernels/test/FunctionHeaderWrapper.h" 
"${CMAKE_CURRENT_BINARY_DIR}/include/quantized/executorch/kernels/test/supported_features.h" "${CMAKE_CURRENT_BINARY_DIR}/include/quantized/executorch/kernels/test/supported_features.cpp" @@ -311,22 +299,6 @@ set(_optimized_kernels_test_sources if(TARGET optimized_portable_kernels) list(APPEND _optimized_kernels_test_sources ${all_test_sources}) list(REMOVE_DUPLICATES _optimized_kernels_test_sources) - - # Make sure that we still test optimized versions of portable - # kernels even if they would currently be shadowed by specific - # optimized implementations. - et_cxx_test( - optimized_portable_kernels_test - SOURCES - ${all_test_sources} - ${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable/executorch/kernels/test/supported_features.cpp - EXTRA_LIBS - optimized_portable_kernels - ) - add_dependencies(optimized_portable_kernels_test generate_wrapper) - target_include_directories( - optimized_portable_kernels_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include/optimized_portable" - ) endif() et_cxx_test( From 9aee068e3a8cba05582e1a27beb7a924ff8b8ab5 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Tue, 10 Jun 2025 13:31:16 -0700 Subject: [PATCH 7/7] Revert "Add vectorized_math.h (#11204)" This reverts commit 1720f2f37644eb2aa9fd54fef87e17da6a83f07a. --- .lintrunner.toml | 2 - kernels/portable/cpu/util/targets.bzl | 10 -- kernels/portable/cpu/util/test/CMakeLists.txt | 16 +- kernels/portable/cpu/util/test/targets.bzl | 11 -- .../cpu/util/test/vectorized_math_test.cpp | 95 ----------- kernels/portable/cpu/util/vectorized_math.h | 148 ------------------ .../core/portable_type/c10/c10/targets.bzl | 6 +- runtime/core/portable_type/targets.bzl | 1 - test/utils/OSSTestConfig.json | 12 ++ 9 files changed, 23 insertions(+), 278 deletions(-) delete mode 100644 kernels/portable/cpu/util/test/vectorized_math_test.cpp delete mode 100644 kernels/portable/cpu/util/vectorized_math.h diff --git a/.lintrunner.toml b/.lintrunner.toml index 1e81c570c65..8912e65d66d 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -271,8 +271,6 @@ exclude_patterns = [ 'examples/**', 'exir/verification/bindings.cpp', 'extension/**', - # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include. - 'kernels/portable/cpu/util/vectorized_math.h', 'kernels/optimized/**', 'runtime/core/exec_aten/**', # Want to be able to keep c10 in sync with PyTorch core. diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 1b432c736ae..abf3f22c00b 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -307,16 +307,6 @@ def define_common_targets(): ], ) - runtime.cxx_library( - name = "vectorized_math", - exported_headers = ["vectorized_math.h"], - visibility = ["//executorch/..."], - exported_deps = [ - "//executorch/runtime/core/portable_type:portable_type", - "//executorch/runtime/core/exec_aten/util:scalar_type_util", - ], - ) - # Utility functions that can be used by operators that perform reduction for aten_mode in get_aten_mode_options(): suffix = "_aten" if aten_mode else "" diff --git a/kernels/portable/cpu/util/test/CMakeLists.txt b/kernels/portable/cpu/util/test/CMakeLists.txt index 41bfea54020..d95b3a81b5c 100644 --- a/kernels/portable/cpu/util/test/CMakeLists.txt +++ b/kernels/portable/cpu/util/test/CMakeLists.txt @@ -4,22 +4,26 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+# @generated by test/utils/generate_gtest_cmakelists.py +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# + cmake_minimum_required(VERSION 3.19) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..) include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) -include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) set(_test_srcs broadcast_indexes_range_test.cpp broadcast_test.cpp - reduce_test.cpp vectorized_math_test.cpp + reduce_test.cpp ) et_cxx_test( kernels_portable_cpu_util_test SOURCES ${_test_srcs} EXTRA_LIBS portable_kernels portable_ops_lib ) - -find_package_torch_headers() -target_include_directories(kernels_portable_cpu_util_test PRIVATE ${TORCH_INCLUDE_DIRS}) -target_compile_definitions(kernels_portable_cpu_util_test PRIVATE ET_USE_PYTORCH_HEADERS) diff --git a/kernels/portable/cpu/util/test/targets.bzl b/kernels/portable/cpu/util/test/targets.bzl index 4b167c6e946..178eb25a79b 100644 --- a/kernels/portable/cpu/util/test/targets.bzl +++ b/kernels/portable/cpu/util/test/targets.bzl @@ -32,14 +32,3 @@ def define_common_targets(): "//executorch/kernels/portable/cpu/util:reduce_util", ], ) - - # this test requires ET_USE_PYTORCH_HEADERS, which doesn't work in OSS Buck. - if not runtime.is_oss: - runtime.cxx_test( - name = "vectorized_math_test", - srcs = ["vectorized_math_test.cpp"], - deps = [ - "//executorch/kernels/portable/cpu/util:vectorized_math", - "//executorch/runtime/core/portable_type/c10/c10:c10", - ], - ) diff --git a/kernels/portable/cpu/util/test/vectorized_math_test.cpp b/kernels/portable/cpu/util/test/vectorized_math_test.cpp deleted file mode 100644 index 2a2e8397ca4..00000000000 --- a/kernels/portable/cpu/util/test/vectorized_math_test.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include - -#include - -#ifndef ET_USE_PYTORCH_HEADERS -#error "This test requires ET_USE_PYTORCH_HEADERS!" 
-#endif // ET_USE_PYTORCH_HEADERS - -TEST(VectorizedMathTest, BasicUnary) { - __at_align__ float result_floats[at::vec::Vectorized::size()] = {0}; - const auto x_vec = at::vec::Vectorized::arange(0, 1); - const auto result_vec = executorch::math::exp(x_vec); - result_vec.store(result_floats); - for (const auto ii : c10::irange(at::vec::Vectorized::size())) { - EXPECT_FLOAT_EQ(result_floats[ii], std::exp(ii)); - } -} - -namespace { -template -void test_unary_t_to_float() { - __at_align__ float result_floats[at::vec::Vectorized::size()] = {0}; - const auto x_vec = at::vec::Vectorized::arange(0, 1); - const auto result_vec = executorch::math::exp(x_vec); - static_assert(decltype(result_vec)::size() >= at::vec::Vectorized::size()); - result_vec.store(result_floats, at::vec::Vectorized::size()); - for (const auto ii : c10::irange(at::vec::Vectorized::size())) { - EXPECT_EQ(result_floats[ii], std::exp((float)ii)) << ii; - } -} - -} // namespace - -TEST(VectorizedMathTest, UnaryInt16ToFloat) { - test_unary_t_to_float(); -} - -TEST(VectorizedMathTest, UnaryInt32ToFloat) { - test_unary_t_to_float(); -} - -TEST(VectorizedMathTest, UnaryInt64ToFloat) { - test_unary_t_to_float(); -} - -TEST(VectorizedMathTest, BasicBinary) { - __at_align__ float result_floats[at::vec::Vectorized::size()] = {0}; - const auto x_vec = at::vec::Vectorized::arange(0, 1); - const auto y_vec = at::vec::Vectorized(2); - const auto result_vec = executorch::math::pow(x_vec, y_vec); - result_vec.store(result_floats); - for (const auto ii : c10::irange(at::vec::Vectorized::size())) { - EXPECT_FLOAT_EQ(result_floats[ii], std::pow((float)ii, 2.0f)); - } -} - -namespace { -template -void test_binary_t_to_float() { - __at_align__ float result_floats[at::vec::Vectorized::size()] = {0}; - const auto x_vec = at::vec::Vectorized::arange(0, 1); - const auto y_vec = at::vec::Vectorized(2); - const auto result_vec = executorch::math::pow(x_vec, y_vec); - static_assert(decltype(result_vec)::size() >= at::vec::Vectorized::size()); - result_vec.store(result_floats, at::vec::Vectorized::size()); - for (const auto ii : c10::irange(at::vec::Vectorized::size())) { - EXPECT_EQ(result_floats[ii], std::pow((float)ii, 2.0f)) << ii; - } -} - -TEST(VectorizedMathTest, BinaryInt16ToFloat) { - test_binary_t_to_float(); -} - -TEST(VectorizedMathTest, BinaryInt32ToFloat) { - test_binary_t_to_float(); -} - -TEST(VectorizedMathTest, BinaryInt64ToFloat) { - test_binary_t_to_float(); -} - -} // namespace diff --git a/kernels/portable/cpu/util/vectorized_math.h b/kernels/portable/cpu/util/vectorized_math.h deleted file mode 100644 index 9e706ace56d..00000000000 --- a/kernels/portable/cpu/util/vectorized_math.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#ifdef ET_USE_PYTORCH_HEADERS -#include -#endif // ET_USE_PYTORCH_HEADERS - -#include -#include - -#ifdef ET_USE_PYTORCH_HEADERS -namespace executorch { -inline namespace math { -namespace internal { -template -auto convert_to_vectorized_n_of_float(at::vec::Vectorized vec) { - static constexpr auto float_vec_size = at::vec::Vectorized::size(); - static constexpr auto t_vec_size = at::vec::Vectorized::size(); - static constexpr auto result_size = - t_vec_size < float_vec_size ? 
1 : t_vec_size / float_vec_size; - static_assert(result_size >= 1); - return at::vec::convert( - at::vec::VectorizedN(vec)); -} -} // namespace internal -} // namespace math -} // namespace executorch -#endif // ET_USE_PYTORCH_HEADERS - -#define _ET_INTERNAL_STD_MATH_FUNC(name) \ - namespace executorch { \ - inline namespace math { \ - using std::name; \ - } \ - } // namespace executorch - -#ifdef ET_USE_PYTORCH_HEADERS -/** - * Internal-usage macro for making a vectorized variant of a unary - * function available in the executorch::math namespace. - */ -#define ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(func_name) \ - namespace executorch { \ - inline namespace math { \ - template \ - auto func_name(at::vec::Vectorized vec) { \ - if constexpr (!::executorch::runtime::is_floating_point::value) { \ - return internal::convert_to_vectorized_n_of_float(vec).func_name(); \ - } else { \ - return vec.func_name(); \ - } \ - } \ - } \ - } - -#define ET_INTERNAL_VECTORIZED_FLOAT_BINARY_FUNC(func_name) \ - namespace executorch { \ - inline namespace math { \ - template \ - auto func_name(at::vec::Vectorized vec0, at::vec::Vectorized vec1) { \ - if constexpr (!::executorch::runtime::is_floating_point::value) { \ - const auto vec_float0 = \ - internal::convert_to_vectorized_n_of_float(vec0); \ - const auto vec_float1 = \ - internal::convert_to_vectorized_n_of_float(vec1); \ - return vec_float0.func_name(vec_float1); \ - } else { \ - return vec0.func_name(vec1); \ - } \ - } \ - } \ - } - -/** - * Internal-usage macro for making a C++ standard library - * floating-point function and a vectorized variant of it available in - * the c10::math namespace. Should be used with functions where the - * corresponding operator is a "float op" in TensorIterator parlance - * (i.e., uses something like build_borrowing_binary_float_op()), - * because it converts non-floating-point arguments to floating point. - */ -#define ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(func_name) \ - _ET_INTERNAL_STD_MATH_FUNC(func_name) \ - ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(func_name) - -#define ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(func_name) \ - _ET_INTERNAL_STD_MATH_FUNC(func_name) \ - ET_INTERNAL_VECTORIZED_FLOAT_BINARY_FUNC(func_name) - -#else // ET_USE_PYTORCH_HEADERS -#define ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(name) \ - _ET_INTERNAL_STD_MATH_FUNC(name) -#define ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(name) \ - _ET_INTERNAL_STD_MATH_FUNC(name) -#endif // ET_USE_PYTORCH_HEADERS - -// To simplify client code, we provide coverage for a bunch of float ops (the -// same ones listed in ATen vml.h) here. 
-ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(abs) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(acos) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(asin) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(atan) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(ceil) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(cos) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(cosh) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(erf) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(erfc) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(exp) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(expm1) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(floor) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log10) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log1p) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log2) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(sin) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(sinh) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(sqrt) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(round) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(tan) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(tanh) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(trunc) -ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(lgamma) - -#ifdef ET_USE_PYTORCH_HEADERS -ET_INTERNAL_VECTORIZED_FLOAT_BINARY_FUNC(rsqrt) -#endif // ET_USE_PYTORCH_HEADERS - -namespace executorch { -inline namespace math { -template >> -T rsqrt(T x) { - return T(1) / std::sqrt(x); -} -} // namespace math -} // namespace executorch - -ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(atan2) -ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(fmod) -ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(pow) diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl index 025cd127021..827a63d2cef 100644 --- a/runtime/core/portable_type/c10/c10/targets.bzl +++ b/runtime/core/portable_type/c10/c10/targets.bzl @@ -53,11 +53,7 @@ def define_common_targets(): runtime.cxx_library( name = "aten_headers_for_executorch", srcs = [], - visibility = [ - "//executorch/kernels/optimized/...", - "//executorch/kernels/portable/cpu/util/...", - "@EXECUTORCH_CLIENTS", - ], + visibility = ["//executorch/kernels/optimized/...", "@EXECUTORCH_CLIENTS"], exported_deps = select({ "DEFAULT": [], "ovr_config//cpu:arm64": [ diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index 5b6e67fa213..41bc6050524 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -26,7 +26,6 @@ def define_common_targets(): visibility = [ "//executorch/backends/...", "//executorch/extension/fb/dynamic_shim/...", - "//executorch/kernels/portable/cpu/...", "//executorch/runtime/core/exec_aten/...", "//executorch/runtime/core/portable_type/test/...", ], diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json index 182d0bfd58a..2cfc4b8a995 100644 --- a/test/utils/OSSTestConfig.json +++ b/test/utils/OSSTestConfig.json @@ -68,6 +68,18 @@ "extension_threadpool" ] }, + { + "directory": "kernels/portable/cpu/util/test", + "sources": [ + "broadcast_indexes_range_test.cpp", + "broadcast_test.cpp", + "reduce_test.cpp" + ], + "additional_libs": [ + "portable_kernels", + "portable_ops_lib" + ] + }, { "directory": "runtime/core/portable_type/test", "sources": [