From 03e1aaf161dc15c816f195fa8c2a4d4d742b8422 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Wed, 3 Sep 2025 13:26:40 -0700 Subject: [PATCH] Revert exposed_as_util functionality and refactor stack utils (#13822) Summary: This diff reverts the exposed_as_util functionality introduced in D79654142, and consequently, refactors the stack utils. Pull Request resolved: https://github.com/pytorch/executorch/pull/13822 Differential Revision: D81199586 --- kernels/portable/cpu/op_add.cpp | 72 ---------- kernels/portable/cpu/op_add.h | 65 --------- kernels/portable/cpu/op_stack.cpp | 125 +----------------- kernels/portable/cpu/util/stack_util.cpp | 121 +++++++++++++++++ .../cpu/{op_stack.h => util/stack_util.h} | 33 ++--- kernels/portable/cpu/util/targets.bzl | 12 ++ .../kernels/portable/op_registration_util.bzl | 40 +----- 7 files changed, 148 insertions(+), 320 deletions(-) delete mode 100644 kernels/portable/cpu/op_add.h create mode 100644 kernels/portable/cpu/util/stack_util.cpp rename kernels/portable/cpu/{op_stack.h => util/stack_util.h} (57%) diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index 7dead2bf5a7..122b2a2c97e 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -15,7 +15,6 @@ namespace torch { namespace executor { namespace native { -namespace impl { Tensor& add_out( KernelRuntimeContext& ctx, @@ -152,77 +151,6 @@ Tensor& add_scalar_out( return out; } -} // namespace impl - -Tensor& add_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Tensor& b, - const Scalar& alpha, - Tensor& out) { - return impl::add_out(ctx, a, b, alpha, out); -} - -Tensor& add_scalar_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Scalar& b, - const Scalar& alpha, - Tensor& out) { - return impl::add_scalar_out(ctx, a, b, alpha, out); -} - -namespace utils { - -Tensor& add_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Tensor& b, - const Scalar& alpha, - Tensor& out) { - return impl::add_out(ctx, a, b, alpha, out); -} - -Tensor& add_scalar_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Scalar& b, - const Scalar& alpha, - Tensor& out) { - return impl::add_scalar_out(ctx, a, b, alpha, out); -} - -std::tuple< - Error, - std::array, - size_t> -add_out_shape(const Tensor& a, const Tensor& b, ET_UNUSED const Scalar& alpha) { - std::array out_sizes{}; - size_t out_dim = 0; - - Error err = get_broadcast_target_size( - a, b, out_sizes.data(), kTensorDimensionLimit, &out_dim); - - return std::make_tuple(err, out_sizes, out_dim); -} - -std::tuple< - Error, - std::array, - size_t> -add_scalar_out_shape( - const Tensor& a, - ET_UNUSED const Scalar& b, - ET_UNUSED const Scalar& alpha) { - std::array out_sizes{}; - size_t out_dim = a.dim(); - - std::copy(a.sizes().begin(), a.sizes().end(), out_sizes.begin()); - - return std::make_tuple(Error::Ok, out_sizes, out_dim); -} - -} // namespace utils } // namespace native } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/op_add.h b/kernels/portable/cpu/op_add.h deleted file mode 100644 index f19d7e98b12..00000000000 --- a/kernels/portable/cpu/op_add.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#pragma once - -namespace torch { -namespace executor { -namespace native { -namespace utils { - -Tensor& add_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Tensor& b, - const Scalar& alpha, - Tensor& out); - -Tensor& add_scalar_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Scalar& b, - const Scalar& alpha, - Tensor& out); - -/** - * Computes the output shape for tensor addition with broadcasting. - * - * @param[in] a First input tensor - * @param[in] b Second input tensor - * @param[in] alpha Scalar multiplier for b (unused for shape computation) - * @return Tuple containing the Error, output shape array, and number of - * dimensions - */ -std::tuple< - Error, - std::array, - size_t> -add_out_shape(const Tensor& a, const Tensor& b, const Scalar& alpha); - -/** - * Computes the output shape for tensor-scalar addition. - * - * @param[in] a Input tensor - * @param[in] b Scalar value (unused for shape computation) - * @param[in] alpha Scalar multiplier for b (unused for shape computation) - * @return Tuple containing the Error, output shape array, and number of - * dimensions - */ -std::tuple< - Error, - std::array, - size_t> -add_scalar_out_shape(const Tensor& a, const Scalar& b, const Scalar& alpha); - -} // namespace utils -} // namespace native -} // namespace executor -} // namespace torch diff --git a/kernels/portable/cpu/op_stack.cpp b/kernels/portable/cpu/op_stack.cpp index b78d03c6970..961a907a723 100644 --- a/kernels/portable/cpu/op_stack.cpp +++ b/kernels/portable/cpu/op_stack.cpp @@ -6,142 +6,21 @@ * LICENSE file in the root directory of this source tree. */ -#include - -#include -#include +#include #include namespace torch { namespace executor { namespace native { -namespace impl { - -using Tensor = executorch::aten::Tensor; Tensor& stack_out( KernelRuntimeContext& ctx, executorch::aten::ArrayRef tensors, int64_t dim, Tensor& out) { - (void)ctx; - - if (dim < 0) { - dim += out.dim(); - } - - ET_KERNEL_CHECK( - ctx, check_stack_args(tensors, dim, out), InvalidArgument, out); - - for (size_t i = 0; i < tensors.size(); ++i) { - ET_KERNEL_CHECK( - ctx, - tensors_have_same_dim_order(tensors[i], out), - InvalidArgument, - out); - } - - ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(out), InvalidArgument, out); - - Tensor::SizesType expected_out_size[kTensorDimensionLimit]; - size_t expected_out_dim = 0; - get_stack_out_target_size(tensors, dim, expected_out_size, &expected_out_dim); - ET_KERNEL_CHECK( - ctx, - resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, - InvalidArgument, - out); - - const size_t outer = getLeadingDims(out, dim); - const size_t inner = getTrailingDims(out, dim); - const size_t ninputs = tensors.size(); - - const auto out_type = out.scalar_type(); - ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "stack.out", CTYPE_OUT, [&] { - CTYPE_OUT* out_ptr = out.mutable_data_ptr(); - for (size_t i = 0; i < outer; ++i) { - for (size_t j = 0; j < ninputs; ++j) { - const auto in_type = tensors[j].scalar_type(); - ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "stack.out", CTYPE_IN, [&] { - const CTYPE_IN* const in_ptr = - tensors[j].const_data_ptr() + i * inner; - - for (size_t k = 0; k < inner; ++k) { - out_ptr[k] = static_cast(in_ptr[k]); - } - out_ptr += inner; - }); - } - } - }); - - return out; -} - -} // namespace impl - -Tensor& stack_out( - KernelRuntimeContext& ctx, - executorch::aten::ArrayRef tensors, - int64_t dim, - Tensor& out) { - return impl::stack_out(ctx, tensors, dim, out); -} - -namespace utils { 
- -Tensor& stack_out( - KernelRuntimeContext& ctx, - executorch::aten::ArrayRef tensors, - int64_t dim, - Tensor& out) { - return impl::stack_out(ctx, tensors, dim, out); -} - -std::tuple< - Error, - std::array, - size_t> -stack_out_shape(executorch::aten::ArrayRef tensors, int64_t dim) { - std::array out_sizes{}; - size_t out_dim = 0; - - // Check if tensors array is empty - if (tensors.size() == 0) { - return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); - } - - // Normalize negative dimension - int64_t normalized_dim = dim; - if (normalized_dim < 0) { - normalized_dim += tensors[0].dim() + 1; - } - - // Check if dimension is valid - if (normalized_dim < 0 || normalized_dim > tensors[0].dim()) { - return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); - } - - // Check that all tensors have the same shape - for (size_t i = 1; i < tensors.size(); ++i) { - if (tensors[i].dim() != tensors[0].dim()) { - return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); - } - for (const auto d : c10::irange(tensors[0].dim())) { - if (tensors[i].size(d) != tensors[0].size(d)) { - return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); - } - } - } - - // Compute output shape using the existing utility - ::torch::executor::get_stack_out_target_size( - tensors, normalized_dim, out_sizes.data(), &out_dim); - - return std::make_tuple(Error::Ok, out_sizes, out_dim); + return utils::stack_out_impl(ctx, tensors, dim, out); } -} // namespace utils } // namespace native } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/stack_util.cpp b/kernels/portable/cpu/util/stack_util.cpp new file mode 100644 index 00000000000..22e540ad3ec --- /dev/null +++ b/kernels/portable/cpu/util/stack_util.cpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include +#include +#include +#include + +namespace torch::executor::native::utils { + +using Tensor = executorch::aten::Tensor; + +std::tuple< + Error, + std::array, + size_t> +stack_out_shape(executorch::aten::ArrayRef tensors, int64_t dim) { + std::array out_sizes{}; + size_t out_dim = 0; + + // Check if tensors array is empty + if (tensors.size() == 0) { + return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); + } + + // Normalize negative dimension + int64_t normalized_dim = dim; + if (normalized_dim < 0) { + normalized_dim += tensors[0].dim() + 1; + } + + // Check if dimension is valid + if (normalized_dim < 0 || normalized_dim > tensors[0].dim()) { + return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); + } + + // Check that all tensors have the same shape + for (size_t i = 1; i < tensors.size(); ++i) { + if (tensors[i].dim() != tensors[0].dim()) { + return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); + } + for (const auto d : c10::irange(tensors[0].dim())) { + if (tensors[i].size(d) != tensors[0].size(d)) { + return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); + } + } + } + + // Compute output shape using the existing utility + get_stack_out_target_size( + tensors, normalized_dim, out_sizes.data(), &out_dim); + + return std::make_tuple(Error::Ok, out_sizes, out_dim); +} + +Tensor& stack_out_impl( + KernelRuntimeContext& ctx, + executorch::aten::ArrayRef tensors, + int64_t dim, + Tensor& out) { + if (dim < 0) { + dim += out.dim(); + } + + ET_KERNEL_CHECK( + ctx, check_stack_args(tensors, dim, out), InvalidArgument, out); + + for (size_t i = 0; i < tensors.size(); ++i) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(tensors[i], out), + InvalidArgument, + out); + } + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(out), InvalidArgument, out); + + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_stack_out_target_size(tensors, dim, expected_out_size, &expected_out_dim); + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + const size_t outer = getLeadingDims(out, dim); + const size_t inner = getTrailingDims(out, dim); + const size_t ninputs = tensors.size(); + + const auto out_type = out.scalar_type(); + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "stack.out", CTYPE_OUT, [&] { + CTYPE_OUT* out_ptr = out.mutable_data_ptr(); + for (size_t i = 0; i < outer; ++i) { + for (size_t j = 0; j < ninputs; ++j) { + const auto in_type = tensors[j].scalar_type(); + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "stack.out", CTYPE_IN, [&] { + const CTYPE_IN* const in_ptr = + tensors[j].const_data_ptr() + i * inner; + + for (size_t k = 0; k < inner; ++k) { + out_ptr[k] = static_cast(in_ptr[k]); + } + out_ptr += inner; + }); + } + } + }); + + return out; +} + +} // namespace torch::executor::native::utils diff --git a/kernels/portable/cpu/op_stack.h b/kernels/portable/cpu/util/stack_util.h similarity index 57% rename from kernels/portable/cpu/op_stack.h rename to kernels/portable/cpu/util/stack_util.h index 6a507b7dcd5..a8a8821c49c 100644 --- a/kernels/portable/cpu/op_stack.h +++ b/kernels/portable/cpu/util/stack_util.h @@ -6,36 +6,23 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include - #pragma once -namespace torch { -namespace executor { -namespace native { -namespace utils { +#include +#include -Tensor& stack_out( - KernelRuntimeContext& ctx, - executorch::aten::ArrayRef tensors, - int64_t dim, - Tensor& out); +namespace torch::executor::native::utils { -/** - * Computes the output shape for tensor stacking. - * - * @param[in] tensors Array of input tensors to stack - * @param[in] dim Dimension along which to stack - * @return Tuple containing the Error, output shape array, and number of - * dimensions - */ std::tuple< Error, std::array, size_t> stack_out_shape(executorch::aten::ArrayRef tensors, int64_t dim); -} // namespace utils -} // namespace native -} // namespace executor -} // namespace torch +Tensor& stack_out_impl( + KernelRuntimeContext& ctx, + executorch::aten::ArrayRef tensors, + int64_t dim, + Tensor& out); + +} // namespace torch::executor::native::utils diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 8194b37f319..d0fd9f4c72b 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -32,6 +32,7 @@ def define_common_targets(): "//executorch/kernels/portable/cpu/util:select_copy_util", "//executorch/kernels/portable/cpu/util:advanced_index_util", "//executorch/kernels/portable/cpu/util:slice_util", + "//executorch/kernels/portable/cpu/util:stack_util", "//executorch/kernels/portable/cpu/util:elementwise_util", "//executorch/kernels/portable/cpu/util:upsample_util", "//executorch/kernels/portable/cpu/util:vectorized_math", @@ -295,6 +296,17 @@ def define_common_targets(): visibility = ["//executorch/kernels/portable/cpu/..."], ) + runtime.cxx_library( + name = "stack_util", + srcs = ["stack_util.cpp"], + exported_headers = ["stack_util.h"], + deps = [ + "//executorch/kernels/portable/cpu/util:copy_ops_util", + "//executorch/runtime/kernel:kernel_includes", + ], + visibility = ["//executorch/kernels/portable/cpu/...", "@EXECUTORCH_CLIENTS"], + ) + runtime.cxx_library( name = "upsample_util", srcs = ["upsample_util.cpp"], diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index 158d2cd2769..a2dc66d3a60 100644 --- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -5,7 +5,7 @@ def get_compiler_optimization_flags(): # App size regressons requires this to be baktraced until I have a better solution return [] -def op_target(name, deps = [], android_deps = [], _allow_third_party_deps = False, _aten_mode_deps = [], exposed_as_util = False): +def op_target(name, deps = [], android_deps = [], _allow_third_party_deps = False, _aten_mode_deps = []): """Registers an implementation of an operator overload group. An operator overload group is a set of operator overloads with a common @@ -45,8 +45,6 @@ def op_target(name, deps = [], android_deps = [], _allow_third_party_deps = Fals from third-party optimization libraries. _aten_mode_deps: List of deps to add to the cxx_library() when building for ATen mode. - exposed_as_util: If True, this op has a utils namespace that should be exposed - as a separate library target for reuse by other operators. 
""" # Note that this doesn't actually define the target, but helps register @@ -57,7 +55,6 @@ def op_target(name, deps = [], android_deps = [], _allow_third_party_deps = Fals "name": name, "_allow_third_party_deps": _allow_third_party_deps, "_aten_mode_deps": _aten_mode_deps, - "exposed_as_util": exposed_as_util, } def _enforce_deps(deps, name, allow_third_party_deps): @@ -157,7 +154,7 @@ def define_op_library(name, deps, android_deps, aten_target, _allow_third_party_ link_whole = True, ) -def define_op_target(name, deps, android_deps, is_aten_op, is_et_op = True, _allow_third_party_deps = False, _aten_mode_deps = [], exposed_as_util = False): +def define_op_target(name, deps, android_deps, is_aten_op, is_et_op = True, _allow_third_party_deps = False, _aten_mode_deps = []): """Possibly defines cxx_library targets for the named operator group. Args: @@ -169,37 +166,8 @@ def define_op_target(name, deps, android_deps, is_aten_op, is_et_op = True, _all _allow_third_party_deps: If True, the op is allowed to depend on third-party deps outside of //executorch. Should only be used by targets under //executorch/kernels/optimized. - exposed_as_util: If True, this op has a utils namespace that should be exposed - as a separate library target for reuse by other operators. """ - # If this op has utils, create a separate utils library target - if exposed_as_util: - utils_name = name + "_util" - runtime.cxx_library( - name = utils_name, - srcs = ["{}.cpp".format(name)], - exported_headers = ["{}.h".format(name)], - visibility = [ - "//executorch/kernels/portable/...", - "//executorch/kernels/quantized/...", - "//executorch/kernels/optimized/...", - "//executorch/kernels/test/...", - "@EXECUTORCH_CLIENTS", - ], - fbandroid_platform_deps = android_deps, - compiler_flags = select({ - "DEFAULT": ["-Wno-missing-prototypes"], - "ovr_config//os:windows": [], - }) + ( - ["-fvisibility=hidden"] if is_xplat() else [] - ) + get_compiler_optimization_flags(), - deps = [ - "//executorch/runtime/kernel:kernel_includes", - ] + deps, - force_static = True, - ) - # If this is a custom op, define a target that builds it with at::Tensor # so that it can be imported into a host PyTorch environment for authoring. if not is_aten_op and True in get_aten_mode_options(): @@ -258,7 +226,6 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:kernel_ops_util", ":scalar_utils", ], - exposed_as_util = True, ), op_target( name = "op_addmm", @@ -1233,9 +1200,8 @@ ATEN_OPS = ( op_target( name = "op_stack", deps = [ - "//executorch/kernels/portable/cpu/util:copy_ops_util", + "//executorch/kernels/portable/cpu/util:stack_util", ], - exposed_as_util = True, ), op_target( name = "op_sub",