From 4050fd9d042ae72286e57b226f679017dd50f1fa Mon Sep 17 00:00:00 2001
From: tkaruturi
Date: Fri, 27 Sep 2024 00:00:35 -0700
Subject: [PATCH 1/2] Add sort util for 1D tensors

Differential Revision: D55577025
---
 kernels/portable/cpu/util/sort_util.cpp       | 73 +++++++++++++++++++
 kernels/portable/cpu/util/sort_util.h         | 25 +++++++
 kernels/portable/cpu/util/targets.bzl         | 11 +++
 .../portable/cpu/util/test/sort_util_test.cpp | 45 ++++++++++++
 kernels/portable/cpu/util/test/targets.bzl    | 10 +++
 5 files changed, 164 insertions(+)
 create mode 100644 kernels/portable/cpu/util/sort_util.cpp
 create mode 100644 kernels/portable/cpu/util/sort_util.h
 create mode 100644 kernels/portable/cpu/util/test/sort_util_test.cpp

diff --git a/kernels/portable/cpu/util/sort_util.cpp b/kernels/portable/cpu/util/sort_util.cpp
new file mode 100644
index 00000000000..c57053a5088
--- /dev/null
+++ b/kernels/portable/cpu/util/sort_util.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "executorch/kernels/portable/cpu/util/sort_util.h"
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <algorithm>
+
+namespace torch {
+namespace executor {
+
+using Tensor = exec_aten::Tensor;
+
+Error sort_tensor(
+    const Tensor& tensor,
+    Tensor& sorted_tensor,
+    Tensor& sorted_indices,
+    bool descending) {
+  // Check that the input tensor is valid
+  ET_CHECK_MSG(tensor.dim() == 1, "Input tensor must be 1D");
+
+  // Check that the output tensors are valid
+  ET_CHECK_MSG(sorted_tensor.dim() == 1, "Output tensor must be 1D");
+  ET_CHECK_MSG(sorted_indices.dim() == 1, "Output tensor must be 1D");
+
+  // Check that the tensors have the expected dtypes
+  ET_CHECK_MSG(
+      tensor.scalar_type() == sorted_tensor.scalar_type(),
+      "Input and output tensors must have the same dtype");
+  ET_CHECK_MSG(
+      tensor.scalar_type() == ScalarType::Float,
+      "Only float inputs are supported currently");
+  ET_CHECK_MSG(
+      sorted_indices.scalar_type() == exec_aten::ScalarType::Long,
+      "Output indices tensor must be of type int64");
+
+  // Get the number of elements in the tensor
+  int size = tensor.numel();
+
+  // Initialize the indices tensor with the identity permutation
+  for (int i = 0; i < size; i++) {
+    sorted_indices.mutable_data_ptr<int64_t>()[i] = i;
+  }
+
+  // Sort the indices based on the corresponding tensor values
+  std::sort(
+      sorted_indices.mutable_data_ptr<int64_t>(),
+      sorted_indices.mutable_data_ptr<int64_t>() + size,
+      [&tensor, descending](int64_t i, int64_t j) {
+        if (descending) {
+          return tensor.const_data_ptr<float>()[i] >
+              tensor.const_data_ptr<float>()[j];
+        } else {
+          return tensor.const_data_ptr<float>()[i] <
+              tensor.const_data_ptr<float>()[j];
+        }
+      });
+
+  // Rearrange the tensor values based on the sorted indices
+  for (int i = 0; i < size; i++) {
+    sorted_tensor.mutable_data_ptr<float>()[i] = tensor.const_data_ptr<
+        float>()[sorted_indices.const_data_ptr<int64_t>()[i]];
+  }
+
+  return Error::Ok;
+}
+
+} // namespace executor
+} // namespace torch
diff --git a/kernels/portable/cpu/util/sort_util.h b/kernels/portable/cpu/util/sort_util.h
new file mode 100644
index 00000000000..9095490b327
--- /dev/null
+++ b/kernels/portable/cpu/util/sort_util.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace torch {
+namespace executor {
+
+using Tensor = exec_aten::Tensor;
+
+Error sort_tensor(
+    const Tensor& tensor,
+    Tensor& sorted_tensor,
+    Tensor& sorted_indices,
+    bool descending = false);
+
+} // namespace executor
+} // namespace torch
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index 82d3d84fa23..e271ab06bff 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -237,6 +237,17 @@ def define_common_targets():
         visibility = ["//executorch/kernels/portable/cpu/..."],
     )
 
+    runtime.cxx_library(
+        name = "sort_util",
+        srcs = ["sort_util.cpp"],
+        exported_headers = ["sort_util.h"],
+        deps = [
+            "//executorch/runtime/kernel:kernel_includes",
+            "//executorch/runtime/core/exec_aten/util:tensor_util",
+        ],
+        visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/torchvision/..."],
+    )
+
     # Utility functions that can be used by operators that perform reduction
     for aten_mode in [True, False]:
         suffix = "_aten" if aten_mode else ""
diff --git a/kernels/portable/cpu/util/test/sort_util_test.cpp b/kernels/portable/cpu/util/test/sort_util_test.cpp
new file mode 100644
index 00000000000..e5dbfbd4b30
--- /dev/null
+++ b/kernels/portable/cpu/util/test/sort_util_test.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/portable/cpu/util/sort_util.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+using exec_aten::ScalarType;
+using exec_aten::Tensor;
+using torch::executor::ArrayRef;
+using torch::executor::testing::TensorFactory;
+
+TEST(SortUtilTest, SortTensorTest) {
+  TensorFactory<ScalarType::Float> tf;
+  TensorFactory<ScalarType::Long> lf;
+
+  Tensor a = tf.make({4}, {3, 2, 1, 4});
+  Tensor b = tf.zeros({4});
+  Tensor c = lf.zeros({4});
+
+  // Ascending order sort test
+  sort_tensor(a, b, c);
+
+  Tensor expected = tf.make({4}, {1, 2, 3, 4});
+  Tensor expected_indices = lf.make({4}, {2, 1, 0, 3});
+  EXPECT_TENSOR_EQ(b, expected);
+  EXPECT_TENSOR_EQ(c, expected_indices);
+
+  // Descending order sort test
+  sort_tensor(a, b, c, true);
+  expected = tf.make({4}, {4, 3, 2, 1});
+  expected_indices = lf.make({4}, {3, 0, 1, 2});
+  EXPECT_TENSOR_EQ(b, expected);
+  EXPECT_TENSOR_EQ(c, expected_indices);
+}
diff --git a/kernels/portable/cpu/util/test/targets.bzl b/kernels/portable/cpu/util/test/targets.bzl
index 28988b90dcc..23a6a7bfe01 100644
--- a/kernels/portable/cpu/util/test/targets.bzl
+++ b/kernels/portable/cpu/util/test/targets.bzl
@@ -21,3 +21,13 @@ def define_common_targets():
             "//executorch/kernels/portable/cpu/util:reduce_util",
         ],
     )
+
+    runtime.cxx_test(
+        name = "sort_util_test",
+        srcs = ["sort_util_test.cpp"],
+        deps = [
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+            "//executorch/kernels/portable/cpu/util:sort_util",
+        ],
+    )

From af92d4e9cfeec7bfc7125a0069e675284031d121 Mon Sep 17 00:00:00 2001
From: Tarun Karuturi
Date: Mon, 30 Sep 2024 14:57:10 -0700
Subject: [PATCH 2/2] Make make_tensor in broadcast utilities public and
 rename free_broadcast_tensor (#2785)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/2785

This diff does a couple of things:
- Makes `make_tensor` a public function, renamed to `allocate_tensor`, so
  that we can create temporary intermediate tensors in operators that need
  to do so.
  (Such as NMS that is implemented above in this stack)
- Renames `free_broadcast_tensor` to the more generic name `free_tensor`

Differential Revision: D55577026
---
 kernels/portable/cpu/util/broadcast_util.cpp  | 20 +++++------
 kernels/portable/cpu/util/broadcast_util.h    | 33 +++++++++++++++----
 .../portable/cpu/util/test/broadcast_test.cpp |  8 ++---
 3 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/kernels/portable/cpu/util/broadcast_util.cpp b/kernels/portable/cpu/util/broadcast_util.cpp
index 943219490b0..0ebe78d5d56 100644
--- a/kernels/portable/cpu/util/broadcast_util.cpp
+++ b/kernels/portable/cpu/util/broadcast_util.cpp
@@ -18,17 +18,15 @@ namespace executor {
 using Tensor = exec_aten::Tensor;
 using ScalarType = exec_aten::ScalarType;
 
-void free_broadcast_tensor(const Tensor& broadcast_tensor) {
-  free((void*)broadcast_tensor.const_data_ptr());
-  free((void*)broadcast_tensor.sizes().data());
-  free((void*)broadcast_tensor.dim_order().data());
-  free((void*)broadcast_tensor.strides().data());
-  free(broadcast_tensor.unsafeGetTensorImpl());
+void free_tensor(const Tensor& tensor) {
+  free((void*)tensor.const_data_ptr());
+  free((void*)tensor.sizes().data());
+  free((void*)tensor.dim_order().data());
+  free((void*)tensor.strides().data());
+  free(tensor.unsafeGetTensorImpl());
 }
 
-namespace {
-
-Tensor make_tensor(
+Tensor allocate_tensor(
     const ArrayRef<Tensor::SizesType>& sizes,
     const ArrayRef<Tensor::DimOrderType>& dim_order,
     const ArrayRef<Tensor::StridesType>& strides,
     const ScalarType& dtype) {
@@ -73,8 +71,6 @@ Tensor make_tensor(
   return Tensor{tensor_impl};
 }
 
-} // namespace
-
 bool tensor_is_broadcastable_to(
     const exec_aten::ArrayRef<Tensor::SizesType> broadcast_from_shape,
     const exec_aten::ArrayRef<Tensor::SizesType> broadcast_to_shape) {
@@ -171,7 +167,7 @@ Tensor broadcast_tensor(
 
   // Once we have discovered that broadcast_from can be broadcasted into
   // broadcast_to, use repeat() to do the broadcast.
-  Tensor out = make_tensor(
+  Tensor out = allocate_tensor(
       broadcast_to_shape,
       broadcast_to_dim_order,
       broadcast_to_strides,
diff --git a/kernels/portable/cpu/util/broadcast_util.h b/kernels/portable/cpu/util/broadcast_util.h
index 92d35f322fb..a563ac36c41 100644
--- a/kernels/portable/cpu/util/broadcast_util.h
+++ b/kernels/portable/cpu/util/broadcast_util.h
@@ -62,6 +62,23 @@ bool tensors_are_broadcastable_between(
  */
 bool tensors_are_broadcastable_between(const Tensor& a, const Tensor& b);
 
+/**
+ * Create a new tensor with the given sizes, dim_order, and strides. Memory
+ * is dynamically allocated within this function and the tensor must be freed
+ * only using free_tensor.
+ *
+ * @param[in] sizes The sizes of the tensor.
+ * @param[in] dim_order The dim order of the tensor.
+ * @param[in] strides The strides of the tensor.
+ * @param[in] dtype The data type of the tensor.
+ * @returns A new tensor with the given sizes, dim_order, and strides.
+ */
+Tensor allocate_tensor(
+    const ArrayRef<Tensor::SizesType>& sizes,
+    const ArrayRef<Tensor::DimOrderType>& dim_order,
+    const ArrayRef<Tensor::StridesType>& strides,
+    const ScalarType& dtype);
+
 /**
  * DEPRECATED: Use `delinearize_index()` and `linearize_access_indexes()` for
  * index remapping to avoid memory allocation.
@@ -75,7 +92,7 @@ bool tensors_are_broadcastable_between(const Tensor& a, const Tensor& b);
  * @param[in] broadcast_to The tensor to which we want to broadcast to.
  * @returns A new tensor with the same shape as broadcast_to and the data
  * repeated as appropriate. This tensor contains dynamically allocated memory
- * and must be freed using free_broadcast_tensor.
+ * and must be freed using free_tensor.
  */
 ET_DEPRECATED exec_aten::Tensor broadcast_tensor(
     const exec_aten::Tensor& broadcast_from,
@@ -192,19 +209,21 @@ ET_NODISCARD inline Error resize_to_broadcast_target_size(
 }
 
 /**
- * DEPRECATED: Use `delinearize_index()` and `linearize_access_indexes()` for
- * index remapping to avoid memory allocation.
- *
- * Free the dynamically allocated memory in broadcast_tensor. This should only
- * be used on a tensor returned by broadcast_tensor.
  *
  * @param[in] The tensor that was previously returned by a call to
- * broadcast_tensor.
+ * allocate_tensor.
  * @returns void
  */
 ET_DEPRECATED void free_broadcast_tensor(
     const exec_aten::Tensor& broadcast_tensor);
 
+/**
+ * Free the dynamically allocated memory in allocate_tensor. This should only
+ * be used on a tensor returned by allocate_tensor.
+ *
+ */
+void free_tensor(const exec_aten::Tensor& allocated_tensor);
+
 /**
  * Delinearize a flattened index to per-dimension indexes.
  *
diff --git a/kernels/portable/cpu/util/test/broadcast_test.cpp b/kernels/portable/cpu/util/test/broadcast_test.cpp
index d87e8ecec85..87ea8714236 100644
--- a/kernels/portable/cpu/util/test/broadcast_test.cpp
+++ b/kernels/portable/cpu/util/test/broadcast_test.cpp
@@ -38,11 +38,11 @@ TEST(BroadcastUtilTest, BroadcastTensor) {
 
   Tensor d = torch::executor::broadcast_tensor(a, c);
   EXPECT_TENSOR_DATA_EQ(d, tf.make({2, 2}, {2, 2, 2, 2}));
-  torch::executor::free_broadcast_tensor(d);
+  torch::executor::free_tensor(d);
 
   d = torch::executor::broadcast_tensor(b, c);
   EXPECT_TENSOR_DATA_EQ(d, tf.make({2, 2}, {2, 2, 2, 2}));
-  torch::executor::free_broadcast_tensor(d);
+  torch::executor::free_tensor(d);
 }
 
 TEST(BroadcastUtilTest, BroadcastableBetween) {
@@ -69,12 +69,12 @@ TEST(BroadcastUtilTest, BroadcastableToFrom) {
   ASSERT_TRUE(tensor_is_broadcastable_to(a, c));
   Tensor d = torch::executor::broadcast_tensor(a, c);
   EXPECT_TENSOR_DATA_EQ(d, tf.make({2, 2}, {2, 2, 2, 2}));
-  torch::executor::free_broadcast_tensor(d);
+  torch::executor::free_tensor(d);
 
   ASSERT_TRUE(tensor_is_broadcastable_to(b, c));
   d = torch::executor::broadcast_tensor(b, c);
   EXPECT_TENSOR_DATA_EQ(d, tf.make({2, 2}, {2, 2, 2, 2}));
-  torch::executor::free_broadcast_tensor(d);
+  torch::executor::free_tensor(d);
 }
 
 TEST(BroadcastUtilTest, NotBroadcastableTo) {
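
Usage sketch (reviewer aid, not part of either patch): the snippet below shows how the two new APIs compose, with `sort_tensor` from patch 1 and the newly public `allocate_tensor` and `free_tensor` from patch 2. The includes and the `TensorFactory` helper mirror the test files above; the function name and the score values are hypothetical.

// Hypothetical usage sketch; assumes both patches are applied.
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/kernels/portable/cpu/util/sort_util.h>
#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>

using exec_aten::ScalarType;
using exec_aten::Tensor;
using torch::executor::ArrayRef;
using torch::executor::testing::TensorFactory;

void rank_scores_example() {
  TensorFactory<ScalarType::Float> tf;
  TensorFactory<ScalarType::Long> lf;

  // Rank four confidence scores, highest first (descending = true).
  Tensor scores = tf.make({4}, {0.3, 0.9, 0.1, 0.7});
  Tensor sorted_scores = tf.zeros({4});
  Tensor order = lf.zeros({4});
  torch::executor::sort_tensor(scores, sorted_scores, order, /*descending=*/true);
  // sorted_scores == {0.9, 0.7, 0.3, 0.1}, order == {1, 3, 0, 2}

  // Create a temporary 1-D float scratch tensor with the newly public
  // allocate_tensor; it owns heap memory and must be released via free_tensor.
  Tensor::SizesType sizes[] = {4};
  Tensor::DimOrderType dim_order[] = {0};
  Tensor::StridesType strides[] = {1};
  Tensor scratch = torch::executor::allocate_tensor(
      ArrayRef<Tensor::SizesType>(sizes, 1),
      ArrayRef<Tensor::DimOrderType>(dim_order, 1),
      ArrayRef<Tensor::StridesType>(strides, 1),
      ScalarType::Float);
  // ... use scratch as an intermediate buffer ...
  torch::executor::free_tensor(scratch);
}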