From 03e1aaf161dc15c816f195fa8c2a4d4d742b8422 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Wed, 3 Sep 2025 13:26:40 -0700 Subject: [PATCH] Revert exposed_as_util functionality and refactor stack utils (#13822) Summary: This diff reverts the exposed_as_util functionality introduced in D79654142, and consequently, refactors the stack utils. Pull Request resolved: https://github.com/pytorch/executorch/pull/13822 Differential Revision: D81199586 --- kernels/portable/cpu/op_add.cpp | 72 ---------- kernels/portable/cpu/op_add.h | 65 --------- kernels/portable/cpu/op_stack.cpp | 125 +----------------- kernels/portable/cpu/util/stack_util.cpp | 121 +++++++++++++++++ .../cpu/{op_stack.h => util/stack_util.h} | 33 ++--- kernels/portable/cpu/util/targets.bzl | 12 ++ .../kernels/portable/op_registration_util.bzl | 40 +----- 7 files changed, 148 insertions(+), 320 deletions(-) delete mode 100644 kernels/portable/cpu/op_add.h create mode 100644 kernels/portable/cpu/util/stack_util.cpp rename kernels/portable/cpu/{op_stack.h => util/stack_util.h} (57%) diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index 7dead2bf5a7..122b2a2c97e 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -15,7 +15,6 @@ namespace torch { namespace executor { namespace native { -namespace impl { Tensor& add_out( KernelRuntimeContext& ctx, @@ -152,77 +151,6 @@ Tensor& add_scalar_out( return out; } -} // namespace impl - -Tensor& add_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Tensor& b, - const Scalar& alpha, - Tensor& out) { - return impl::add_out(ctx, a, b, alpha, out); -} - -Tensor& add_scalar_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Scalar& b, - const Scalar& alpha, - Tensor& out) { - return impl::add_scalar_out(ctx, a, b, alpha, out); -} - -namespace utils { - -Tensor& add_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Tensor& b, - const Scalar& alpha, - Tensor& out) { - return impl::add_out(ctx, a, b, alpha, out); -} - -Tensor& add_scalar_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Scalar& b, - const Scalar& alpha, - Tensor& out) { - return impl::add_scalar_out(ctx, a, b, alpha, out); -} - -std::tuple< - Error, - std::array, - size_t> -add_out_shape(const Tensor& a, const Tensor& b, ET_UNUSED const Scalar& alpha) { - std::array out_sizes{}; - size_t out_dim = 0; - - Error err = get_broadcast_target_size( - a, b, out_sizes.data(), kTensorDimensionLimit, &out_dim); - - return std::make_tuple(err, out_sizes, out_dim); -} - -std::tuple< - Error, - std::array, - size_t> -add_scalar_out_shape( - const Tensor& a, - ET_UNUSED const Scalar& b, - ET_UNUSED const Scalar& alpha) { - std::array out_sizes{}; - size_t out_dim = a.dim(); - - std::copy(a.sizes().begin(), a.sizes().end(), out_sizes.begin()); - - return std::make_tuple(Error::Ok, out_sizes, out_dim); -} - -} // namespace utils } // namespace native } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/op_add.h b/kernels/portable/cpu/op_add.h deleted file mode 100644 index f19d7e98b12..00000000000 --- a/kernels/portable/cpu/op_add.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#pragma once - -namespace torch { -namespace executor { -namespace native { -namespace utils { - -Tensor& add_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Tensor& b, - const Scalar& alpha, - Tensor& out); - -Tensor& add_scalar_out( - KernelRuntimeContext& ctx, - const Tensor& a, - const Scalar& b, - const Scalar& alpha, - Tensor& out); - -/** - * Computes the output shape for tensor addition with broadcasting. - * - * @param[in] a First input tensor - * @param[in] b Second input tensor - * @param[in] alpha Scalar multiplier for b (unused for shape computation) - * @return Tuple containing the Error, output shape array, and number of - * dimensions - */ -std::tuple< - Error, - std::array, - size_t> -add_out_shape(const Tensor& a, const Tensor& b, const Scalar& alpha); - -/** - * Computes the output shape for tensor-scalar addition. - * - * @param[in] a Input tensor - * @param[in] b Scalar value (unused for shape computation) - * @param[in] alpha Scalar multiplier for b (unused for shape computation) - * @return Tuple containing the Error, output shape array, and number of - * dimensions - */ -std::tuple< - Error, - std::array, - size_t> -add_scalar_out_shape(const Tensor& a, const Scalar& b, const Scalar& alpha); - -} // namespace utils -} // namespace native -} // namespace executor -} // namespace torch diff --git a/kernels/portable/cpu/op_stack.cpp b/kernels/portable/cpu/op_stack.cpp index b78d03c6970..961a907a723 100644 --- a/kernels/portable/cpu/op_stack.cpp +++ b/kernels/portable/cpu/op_stack.cpp @@ -6,142 +6,21 @@ * LICENSE file in the root directory of this source tree. */ -#include - -#include -#include +#include #include namespace torch { namespace executor { namespace native { -namespace impl { - -using Tensor = executorch::aten::Tensor; Tensor& stack_out( KernelRuntimeContext& ctx, executorch::aten::ArrayRef tensors, int64_t dim, Tensor& out) { - (void)ctx; - - if (dim < 0) { - dim += out.dim(); - } - - ET_KERNEL_CHECK( - ctx, check_stack_args(tensors, dim, out), InvalidArgument, out); - - for (size_t i = 0; i < tensors.size(); ++i) { - ET_KERNEL_CHECK( - ctx, - tensors_have_same_dim_order(tensors[i], out), - InvalidArgument, - out); - } - - ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(out), InvalidArgument, out); - - Tensor::SizesType expected_out_size[kTensorDimensionLimit]; - size_t expected_out_dim = 0; - get_stack_out_target_size(tensors, dim, expected_out_size, &expected_out_dim); - ET_KERNEL_CHECK( - ctx, - resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, - InvalidArgument, - out); - - const size_t outer = getLeadingDims(out, dim); - const size_t inner = getTrailingDims(out, dim); - const size_t ninputs = tensors.size(); - - const auto out_type = out.scalar_type(); - ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "stack.out", CTYPE_OUT, [&] { - CTYPE_OUT* out_ptr = out.mutable_data_ptr(); - for (size_t i = 0; i < outer; ++i) { - for (size_t j = 0; j < ninputs; ++j) { - const auto in_type = tensors[j].scalar_type(); - ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "stack.out", CTYPE_IN, [&] { - const CTYPE_IN* const in_ptr = - tensors[j].const_data_ptr() + i * inner; - - for (size_t k = 0; k < inner; ++k) { - out_ptr[k] = static_cast(in_ptr[k]); - } - out_ptr += inner; - }); - } - } - }); - - return out; -} - -} // namespace impl - -Tensor& stack_out( - KernelRuntimeContext& ctx, - executorch::aten::ArrayRef tensors, - int64_t dim, - Tensor& out) { - return impl::stack_out(ctx, tensors, dim, out); -} - -namespace utils { 
- -Tensor& stack_out( - KernelRuntimeContext& ctx, - executorch::aten::ArrayRef tensors, - int64_t dim, - Tensor& out) { - return impl::stack_out(ctx, tensors, dim, out); -} - -std::tuple< - Error, - std::array, - size_t> -stack_out_shape(executorch::aten::ArrayRef tensors, int64_t dim) { - std::array out_sizes{}; - size_t out_dim = 0; - - // Check if tensors array is empty - if (tensors.size() == 0) { - return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); - } - - // Normalize negative dimension - int64_t normalized_dim = dim; - if (normalized_dim < 0) { - normalized_dim += tensors[0].dim() + 1; - } - - // Check if dimension is valid - if (normalized_dim < 0 || normalized_dim > tensors[0].dim()) { - return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); - } - - // Check that all tensors have the same shape - for (size_t i = 1; i < tensors.size(); ++i) { - if (tensors[i].dim() != tensors[0].dim()) { - return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); - } - for (const auto d : c10::irange(tensors[0].dim())) { - if (tensors[i].size(d) != tensors[0].size(d)) { - return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); - } - } - } - - // Compute output shape using the existing utility - ::torch::executor::get_stack_out_target_size( - tensors, normalized_dim, out_sizes.data(), &out_dim); - - return std::make_tuple(Error::Ok, out_sizes, out_dim); + return utils::stack_out_impl(ctx, tensors, dim, out); } -} // namespace utils } // namespace native } // namespace executor } // namespace torch diff --git a/kernels/portable/cpu/util/stack_util.cpp b/kernels/portable/cpu/util/stack_util.cpp new file mode 100644 index 00000000000..22e540ad3ec --- /dev/null +++ b/kernels/portable/cpu/util/stack_util.cpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include +#include +#include +#include + +namespace torch::executor::native::utils { + +using Tensor = executorch::aten::Tensor; + +std::tuple< + Error, + std::array, + size_t> +stack_out_shape(executorch::aten::ArrayRef tensors, int64_t dim) { + std::array out_sizes{}; + size_t out_dim = 0; + + // Check if tensors array is empty + if (tensors.size() == 0) { + return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); + } + + // Normalize negative dimension + int64_t normalized_dim = dim; + if (normalized_dim < 0) { + normalized_dim += tensors[0].dim() + 1; + } + + // Check if dimension is valid + if (normalized_dim < 0 || normalized_dim > tensors[0].dim()) { + return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); + } + + // Check that all tensors have the same shape + for (size_t i = 1; i < tensors.size(); ++i) { + if (tensors[i].dim() != tensors[0].dim()) { + return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); + } + for (const auto d : c10::irange(tensors[0].dim())) { + if (tensors[i].size(d) != tensors[0].size(d)) { + return std::make_tuple(Error::InvalidArgument, out_sizes, out_dim); + } + } + } + + // Compute output shape using the existing utility + get_stack_out_target_size( + tensors, normalized_dim, out_sizes.data(), &out_dim); + + return std::make_tuple(Error::Ok, out_sizes, out_dim); +} + +Tensor& stack_out_impl( + KernelRuntimeContext& ctx, + executorch::aten::ArrayRef tensors, + int64_t dim, + Tensor& out) { + if (dim < 0) { + dim += out.dim(); + } + + ET_KERNEL_CHECK( + ctx, check_stack_args(tensors, dim, out), InvalidArgument, out); + + for (size_t i = 0; i < tensors.size(); ++i) { + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(tensors[i], out), + InvalidArgument, + out); + } + + ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(out), InvalidArgument, out); + + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_stack_out_target_size(tensors, dim, expected_out_size, &expected_out_dim); + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + const size_t outer = getLeadingDims(out, dim); + const size_t inner = getTrailingDims(out, dim); + const size_t ninputs = tensors.size(); + + const auto out_type = out.scalar_type(); + ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "stack.out", CTYPE_OUT, [&] { + CTYPE_OUT* out_ptr = out.mutable_data_ptr(); + for (size_t i = 0; i < outer; ++i) { + for (size_t j = 0; j < ninputs; ++j) { + const auto in_type = tensors[j].scalar_type(); + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "stack.out", CTYPE_IN, [&] { + const CTYPE_IN* const in_ptr = + tensors[j].const_data_ptr() + i * inner; + + for (size_t k = 0; k < inner; ++k) { + out_ptr[k] = static_cast(in_ptr[k]); + } + out_ptr += inner; + }); + } + } + }); + + return out; +} + +} // namespace torch::executor::native::utils diff --git a/kernels/portable/cpu/op_stack.h b/kernels/portable/cpu/util/stack_util.h similarity index 57% rename from kernels/portable/cpu/op_stack.h rename to kernels/portable/cpu/util/stack_util.h index 6a507b7dcd5..a8a8821c49c 100644 --- a/kernels/portable/cpu/op_stack.h +++ b/kernels/portable/cpu/util/stack_util.h @@ -6,36 +6,23 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include - #pragma once -namespace torch { -namespace executor { -namespace native { -namespace utils { +#include +#include -Tensor& stack_out( - KernelRuntimeContext& ctx, - executorch::aten::ArrayRef tensors, - int64_t dim, - Tensor& out); +namespace torch::executor::native::utils { -/** - * Computes the output shape for tensor stacking. - * - * @param[in] tensors Array of input tensors to stack - * @param[in] dim Dimension along which to stack - * @return Tuple containing the Error, output shape array, and number of - * dimensions - */ std::tuple< Error, std::array, size_t> stack_out_shape(executorch::aten::ArrayRef tensors, int64_t dim); -} // namespace utils -} // namespace native -} // namespace executor -} // namespace torch +Tensor& stack_out_impl( + KernelRuntimeContext& ctx, + executorch::aten::ArrayRef tensors, + int64_t dim, + Tensor& out); + +} // namespace torch::executor::native::utils diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 8194b37f319..d0fd9f4c72b 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -32,6 +32,7 @@ def define_common_targets(): "//executorch/kernels/portable/cpu/util:select_copy_util", "//executorch/kernels/portable/cpu/util:advanced_index_util", "//executorch/kernels/portable/cpu/util:slice_util", + "//executorch/kernels/portable/cpu/util:stack_util", "//executorch/kernels/portable/cpu/util:elementwise_util", "//executorch/kernels/portable/cpu/util:upsample_util", "//executorch/kernels/portable/cpu/util:vectorized_math", @@ -295,6 +296,17 @@ def define_common_targets(): visibility = ["//executorch/kernels/portable/cpu/..."], ) + runtime.cxx_library( + name = "stack_util", + srcs = ["stack_util.cpp"], + exported_headers = ["stack_util.h"], + deps = [ + "//executorch/kernels/portable/cpu/util:copy_ops_util", + "//executorch/runtime/kernel:kernel_includes", + ], + visibility = ["//executorch/kernels/portable/cpu/...", "@EXECUTORCH_CLIENTS"], + ) + runtime.cxx_library( name = "upsample_util", srcs = ["upsample_util.cpp"], diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index 158d2cd2769..a2dc66d3a60 100644 --- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -5,7 +5,7 @@ def get_compiler_optimization_flags(): # App size regressons requires this to be baktraced until I have a better solution return [] -def op_target(name, deps = [], android_deps = [], _allow_third_party_deps = False, _aten_mode_deps = [], exposed_as_util = False): +def op_target(name, deps = [], android_deps = [], _allow_third_party_deps = False, _aten_mode_deps = []): """Registers an implementation of an operator overload group. An operator overload group is a set of operator overloads with a common @@ -45,8 +45,6 @@ def op_target(name, deps = [], android_deps = [], _allow_third_party_deps = Fals from third-party optimization libraries. _aten_mode_deps: List of deps to add to the cxx_library() when building for ATen mode. - exposed_as_util: If True, this op has a utils namespace that should be exposed - as a separate library target for reuse by other operators. 
""" # Note that this doesn't actually define the target, but helps register @@ -57,7 +55,6 @@ def op_target(name, deps = [], android_deps = [], _allow_third_party_deps = Fals "name": name, "_allow_third_party_deps": _allow_third_party_deps, "_aten_mode_deps": _aten_mode_deps, - "exposed_as_util": exposed_as_util, } def _enforce_deps(deps, name, allow_third_party_deps): @@ -157,7 +154,7 @@ def define_op_library(name, deps, android_deps, aten_target, _allow_third_party_ link_whole = True, ) -def define_op_target(name, deps, android_deps, is_aten_op, is_et_op = True, _allow_third_party_deps = False, _aten_mode_deps = [], exposed_as_util = False): +def define_op_target(name, deps, android_deps, is_aten_op, is_et_op = True, _allow_third_party_deps = False, _aten_mode_deps = []): """Possibly defines cxx_library targets for the named operator group. Args: @@ -169,37 +166,8 @@ def define_op_target(name, deps, android_deps, is_aten_op, is_et_op = True, _all _allow_third_party_deps: If True, the op is allowed to depend on third-party deps outside of //executorch. Should only be used by targets under //executorch/kernels/optimized. - exposed_as_util: If True, this op has a utils namespace that should be exposed - as a separate library target for reuse by other operators. """ - # If this op has utils, create a separate utils library target - if exposed_as_util: - utils_name = name + "_util" - runtime.cxx_library( - name = utils_name, - srcs = ["{}.cpp".format(name)], - exported_headers = ["{}.h".format(name)], - visibility = [ - "//executorch/kernels/portable/...", - "//executorch/kernels/quantized/...", - "//executorch/kernels/optimized/...", - "//executorch/kernels/test/...", - "@EXECUTORCH_CLIENTS", - ], - fbandroid_platform_deps = android_deps, - compiler_flags = select({ - "DEFAULT": ["-Wno-missing-prototypes"], - "ovr_config//os:windows": [], - }) + ( - ["-fvisibility=hidden"] if is_xplat() else [] - ) + get_compiler_optimization_flags(), - deps = [ - "//executorch/runtime/kernel:kernel_includes", - ] + deps, - force_static = True, - ) - # If this is a custom op, define a target that builds it with at::Tensor # so that it can be imported into a host PyTorch environment for authoring. if not is_aten_op and True in get_aten_mode_options(): @@ -258,7 +226,6 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:kernel_ops_util", ":scalar_utils", ], - exposed_as_util = True, ), op_target( name = "op_addmm", @@ -1233,9 +1200,8 @@ ATEN_OPS = ( op_target( name = "op_stack", deps = [ - "//executorch/kernels/portable/cpu/util:copy_ops_util", + "//executorch/kernels/portable/cpu/util:stack_util", ], - exposed_as_util = True, ), op_target( name = "op_sub",