From 4e18b4b840e3b1ff44f14b9aee90544d65bf59d4 Mon Sep 17 00:00:00 2001 From: tkaruturi Date: Wed, 9 Oct 2024 16:15:14 -0700 Subject: [PATCH 1/2] Add allocate tensor util that uses temp allocator Differential Revision: D64072692 --- .../cpu/util/allocate_tensor_util.cpp | 74 +++++++++++++++++++ .../portable/cpu/util/allocate_tensor_util.h | 18 +++++ kernels/portable/cpu/util/targets.bzl | 10 +++ .../cpu/util/test/allocate_tensor_test.cpp | 68 +++++++++++++++++ kernels/portable/cpu/util/test/targets.bzl | 10 +++ 5 files changed, 180 insertions(+) create mode 100644 kernels/portable/cpu/util/allocate_tensor_util.cpp create mode 100644 kernels/portable/cpu/util/allocate_tensor_util.h create mode 100644 kernels/portable/cpu/util/test/allocate_tensor_test.cpp diff --git a/kernels/portable/cpu/util/allocate_tensor_util.cpp b/kernels/portable/cpu/util/allocate_tensor_util.cpp new file mode 100644 index 00000000000..0bb10b6caff --- /dev/null +++ b/kernels/portable/cpu/util/allocate_tensor_util.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "executorch/kernels/portable/cpu/util/allocate_tensor_util.h" + + +namespace torch { +namespace executor { + +using Tensor = exec_aten::Tensor; +using ScalarType = exec_aten::ScalarType; + +Tensor allocate_tensor( + KernelRuntimeContext& ctx, + const ArrayRef& sizes, + const ArrayRef& dim_order, + const ArrayRef& strides, + const ScalarType& dtype) { + int dim = sizes.size(); + int size_nbytes = dim * sizeof(Tensor::SizesType); + Result temp_mem_res_size = ctx.allocate_temp(size_nbytes); + void* size_data_ptr = + temp_mem_res_size.ok() ? 
temp_mem_res_size.get() : nullptr;
+  ET_CHECK_MSG(size_data_ptr != nullptr, "Failed to malloc for size bytes");
+  memcpy(size_data_ptr, sizes.data(), size_nbytes);
+
+  // TODO(T145322324): can we remove the static cast once size is unsigned?
+  size_t dim_order_nbytes =
+      static_cast<size_t>(dim) * sizeof(Tensor::DimOrderType);
+  Result<void*> temp_mem_res_dim_order = ctx.allocate_temp(dim_order_nbytes);
+  void* dim_order_data_ptr =
+      temp_mem_res_dim_order.ok() ? temp_mem_res_dim_order.get() : nullptr;
+  ET_CHECK_MSG(
+      dim_order_data_ptr != nullptr, "Failed to malloc for dim order bytes");
+  memcpy(dim_order_data_ptr, dim_order.data(), dim_order_nbytes);
+
+  int strides_nbytes = dim * sizeof(Tensor::StridesType);
+  Result<void*> temp_mem_res_strides = ctx.allocate_temp(strides_nbytes);
+  void* strides_data_ptr =
+      temp_mem_res_strides.ok() ? temp_mem_res_strides.get() : nullptr;
+  // NOTE(review): stray debug printf/fflush removed; two comment lines kept
+  // in their place so the hunk's "+1,74" line count still applies cleanly.
+  ET_CHECK_MSG(
+      strides_data_ptr != nullptr, "Failed to malloc for strides bytes");
+  memcpy(strides_data_ptr, strides.data(), strides_nbytes);
+
+  Result<void*> temp_mem_res_tensor = ctx.allocate_temp(sizeof(TensorImpl));
+  auto tensor_impl = static_cast<TensorImpl*>(
+      temp_mem_res_tensor.ok() ? temp_mem_res_tensor.get() : nullptr);
+  ET_CHECK_MSG(tensor_impl != nullptr, "Failed to malloc for data TensorImpl");
+
+  new (tensor_impl) TensorImpl(
+      dtype,
+      dim,
+      reinterpret_cast<Tensor::SizesType*>(size_data_ptr),
+      nullptr,
+      reinterpret_cast<Tensor::DimOrderType*>(dim_order_data_ptr),
+      reinterpret_cast<Tensor::StridesType*>(strides_data_ptr));
+
+  Result<void*> temp_mem_res_data = ctx.allocate_temp(tensor_impl->nbytes());
+  void* data_ptr = temp_mem_res_data.ok() ? 
temp_mem_res_data.get() : nullptr; + ET_CHECK_MSG(data_ptr != nullptr, "Failed to malloc for data buffer"); + tensor_impl->set_data(data_ptr); + + return Tensor{tensor_impl}; +} + +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/allocate_tensor_util.h b/kernels/portable/cpu/util/allocate_tensor_util.h new file mode 100644 index 00000000000..cd9b10e0444 --- /dev/null +++ b/kernels/portable/cpu/util/allocate_tensor_util.h @@ -0,0 +1,18 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include + +namespace torch { +namespace executor { + +Tensor allocate_tensor( + KernelRuntimeContext& ctx, + const ArrayRef& sizes, + const ArrayRef& dim_order, + const ArrayRef& strides, + const ScalarType& dtype); + +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 82d3d84fa23..3ee3ceff6dc 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -237,6 +237,16 @@ def define_common_targets(): visibility = ["//executorch/kernels/portable/cpu/..."], ) + runtime.cxx_library( + name = "allocate_tensor_util", + srcs = ["allocate_tensor_util.cpp"], + exported_headers = ["allocate_tensor_util.h"], + deps = [ + "//executorch/runtime/kernel:kernel_includes", + ], + visibility = ["//executorch/kernels/portable/cpu/..."], + ) + # Utility functions that can be used by operators that perform reduction for aten_mode in [True, False]: suffix = "_aten" if aten_mode else "" diff --git a/kernels/portable/cpu/util/test/allocate_tensor_test.cpp b/kernels/portable/cpu/util/test/allocate_tensor_test.cpp new file mode 100644 index 00000000000..dcfea3687a6 --- /dev/null +++ b/kernels/portable/cpu/util/test/allocate_tensor_test.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/portable/cpu/util/allocate_tensor_util.h>
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/kernel/kernel_runtime_context.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <gtest/gtest.h>
+using ScalarType = exec_aten::ScalarType;
+
+class AllocateTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Since these tests cause ET_LOG to be called, the PAL must be initialized
+    // first.
+    torch::executor::runtime_init();
+  }
+};
+
+TEST_F(AllocateTest, AllocateTensor) {
+  uint8_t* temp_allocator_ptr = (uint8_t*)malloc(2048);
+  executorch::runtime::MemoryAllocator temp_allocator(2048, temp_allocator_ptr);
+  executorch::runtime::KernelRuntimeContext ctx(nullptr, &temp_allocator);
+
+  executorch::aten::SizesType sizes[3] = {1, 2, 3};
+  executorch::aten::DimOrderType dim_order[3] = {0, 1, 2};
+  executorch::aten::StridesType strides[3] = {3, 3, 1};
+
+  torch::executor::ArrayRef<executorch::aten::SizesType> sizes_ref(sizes, 3);
+  torch::executor::ArrayRef<executorch::aten::StridesType> strides_ref(strides, 3);
+  torch::executor::ArrayRef<executorch::aten::DimOrderType> dim_orders_ref(
+      dim_order, 3);
+
+  torch::executor::allocate_tensor(
+      ctx, sizes_ref, dim_orders_ref, strides_ref, ScalarType::Float);
+
+  free(temp_allocator_ptr);
+}
+
+TEST_F(AllocateTest, FailAllocateTensor) {
+  torch::executor::runtime_init();
+
+  uint8_t* temp_allocator_ptr = (uint8_t*)malloc(16);
+  executorch::runtime::MemoryAllocator temp_allocator(16, temp_allocator_ptr);
+  executorch::runtime::KernelRuntimeContext ctx(nullptr, &temp_allocator);
+
+  executorch::aten::SizesType sizes[3] = {1, 2, 3};
+  executorch::aten::DimOrderType dim_order[3] = {0, 1, 2};
+  executorch::aten::StridesType strides[3] = {3, 3, 1};
+
+  torch::executor::ArrayRef<executorch::aten::SizesType> sizes_ref(sizes, 3);
+  torch::executor::ArrayRef<executorch::aten::StridesType> strides_ref(strides, 3);
+  torch::executor::ArrayRef<executorch::aten::DimOrderType> dim_orders_ref(
+      dim_order, 3);
+
+  ET_EXPECT_DEATH(
+      torch::executor::allocate_tensor(
+          ctx, sizes_ref, dim_orders_ref, strides_ref, ScalarType::Float),
+      "Failed to malloc");
+
+  free(temp_allocator_ptr);
+}
diff --git 
a/kernels/portable/cpu/util/test/targets.bzl b/kernels/portable/cpu/util/test/targets.bzl index 28988b90dcc..39ac40fa603 100644 --- a/kernels/portable/cpu/util/test/targets.bzl +++ b/kernels/portable/cpu/util/test/targets.bzl @@ -21,3 +21,13 @@ def define_common_targets(): "//executorch/kernels/portable/cpu/util:reduce_util", ], ) + + runtime.cxx_test( + name = "allocate_tensor_test", + srcs = ["allocate_tensor_test.cpp"], + deps = [ + "//executorch/runtime/core/exec_aten:lib", + "//executorch/kernels/portable/cpu/util:allocate_tensor_util", + "//executorch/runtime/kernel:kernel_includes", + ], + ) From e52a09f85a9d4eef65ea5c162d5b39575bbfeafe Mon Sep 17 00:00:00 2001 From: Tarun Karuturi Date: Wed, 9 Oct 2024 16:20:36 -0700 Subject: [PATCH 2/2] Add sort util for 1D tensors (#2786) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/2786 This diff adds a simple sort utility that sorts a tensor's values and returns the sorted values and the sorted indices in the out tensors that are provided. There are currently two limitations to this sort: - It only supports 1D tensors currently, has to be extended to support 2D and greater tensors. - Input types are assumed to be float and it currently asserts on that. This has to be templatized to support all dtypes. 
Reviewed By: iseeyuan Differential Revision: D55577025 --- kernels/portable/cpu/util/sort_util.cpp | 73 +++++++++++++++++++ kernels/portable/cpu/util/sort_util.h | 25 +++++++ kernels/portable/cpu/util/targets.bzl | 11 +++ .../portable/cpu/util/test/sort_util_test.cpp | 45 ++++++++++++ kernels/portable/cpu/util/test/targets.bzl | 8 ++ 5 files changed, 162 insertions(+) create mode 100644 kernels/portable/cpu/util/sort_util.cpp create mode 100644 kernels/portable/cpu/util/sort_util.h create mode 100644 kernels/portable/cpu/util/test/sort_util_test.cpp diff --git a/kernels/portable/cpu/util/sort_util.cpp b/kernels/portable/cpu/util/sort_util.cpp new file mode 100644 index 00000000000..c57053a5088 --- /dev/null +++ b/kernels/portable/cpu/util/sort_util.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "executorch/kernels/portable/cpu/util/sort_util.h" +#include +#include + +namespace torch { +namespace executor { + +using Tensor = exec_aten::Tensor; + +Error sort_tensor( + const Tensor& tensor, + Tensor& sorted_tensor, + Tensor& sorted_indices, + bool descending) { + // Check if the input tensor is a valid input + ET_CHECK_MSG(tensor.dim() == 1, "Input tensor must be 1D"); + + // Check if the output tensors are valid + ET_CHECK_MSG(sorted_tensor.dim() == 1, "Output tensor must be 1D"); + ET_CHECK_MSG(sorted_indices.dim() == 1, "Output tensor must be 1D"); + + // Check if the output tensors have the same dtype + ET_CHECK_MSG( + tensor.scalar_type() == sorted_tensor.scalar_type(), + "Input and output tensors must have the same dtype"); + ET_CHECK_MSG( + tensor.scalar_type() == ScalarType::Float, + "Only float inputs are supported currently"); + ET_CHECK_MSG( + sorted_indices.scalar_type() == exec_aten::ScalarType::Long, + "Output tensor must be of type int64"); + + // Get the number of elements in the tensor + int size = tensor.numel(); + + // Create a tensor to store the indices + for (int i = 0; i < size; i++) { + sorted_indices.mutable_data_ptr()[i] = i; + } + + // Sort the indices based on the corresponding tensor values + std::sort( + sorted_indices.mutable_data_ptr(), + sorted_indices.mutable_data_ptr() + size, + [&tensor, descending](int64_t i, int64_t j) { + if (descending) { + return tensor.const_data_ptr()[i] > + tensor.const_data_ptr()[j]; + } else { + return tensor.const_data_ptr()[i] < + tensor.const_data_ptr()[j]; + } + }); + + // Rearrange the tensor values based on the sorted indices + for (int i = 0; i < size; i++) { + sorted_tensor.mutable_data_ptr()[i] = tensor.const_data_ptr< + float>()[sorted_indices.const_data_ptr()[i]]; + } + + return Error::Ok; +} + +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/sort_util.h b/kernels/portable/cpu/util/sort_util.h new file mode 100644 index 
00000000000..9095490b327 --- /dev/null +++ b/kernels/portable/cpu/util/sort_util.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace torch { +namespace executor { + +using Tensor = exec_aten::Tensor; + +Error sort_tensor( + const Tensor& tensor, + Tensor& sorted_tensor, + Tensor& sorted_indice, + bool descending = false); + +} // namespace executor +} // namespace torch diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl index 3ee3ceff6dc..7212915c5f9 100644 --- a/kernels/portable/cpu/util/targets.bzl +++ b/kernels/portable/cpu/util/targets.bzl @@ -247,6 +247,17 @@ def define_common_targets(): visibility = ["//executorch/kernels/portable/cpu/..."], ) + runtime.cxx_library( + name = "sort_util", + srcs = ["sort_util.cpp"], + exported_headers = ["sort_util.h"], + deps = [ + "//executorch/runtime/kernel:kernel_includes", + "//executorch/runtime/core/exec_aten/util:tensor_util", + ], + visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/torchvision/..."], + ) + # Utility functions that can be used by operators that perform reduction for aten_mode in [True, False]: suffix = "_aten" if aten_mode else "" diff --git a/kernels/portable/cpu/util/test/sort_util_test.cpp b/kernels/portable/cpu/util/test/sort_util_test.cpp new file mode 100644 index 00000000000..e5dbfbd4b30 --- /dev/null +++ b/kernels/portable/cpu/util/test/sort_util_test.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#include <executorch/kernels/portable/cpu/util/sort_util.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+#include <executorch/runtime/platform/runtime.h>
+
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+using exec_aten::ScalarType;
+using exec_aten::Tensor;
+using torch::executor::ArrayRef;
+using torch::executor::testing::TensorFactory;
+
+TEST(SortUtilTest, SortTensorTest) {
+  TensorFactory<ScalarType::Float> tf;
+  TensorFactory<ScalarType::Long> lf;
+
+  Tensor a = tf.make({4}, {3, 2, 1, 4});
+  Tensor b = tf.zeros({4});
+  Tensor c = lf.zeros({4});
+
+  // Ascending order sort test
+  sort_tensor(a, b, c);
+
+  Tensor expected = tf.make({4}, {1, 2, 3, 4});
+  Tensor expected_indices = lf.make({4}, {2, 1, 0, 3});
+  EXPECT_TENSOR_EQ(b, expected);
+  EXPECT_TENSOR_EQ(c, expected_indices);
+
+  // Descending order sort test
+  sort_tensor(a, b, c, true);
+  expected = tf.make({4}, {4, 3, 2, 1});
+  expected_indices = lf.make({4}, {3, 0, 1, 2});
+  EXPECT_TENSOR_EQ(b, expected);
+  EXPECT_TENSOR_EQ(c, expected_indices);
+}
diff --git a/kernels/portable/cpu/util/test/targets.bzl b/kernels/portable/cpu/util/test/targets.bzl
index 39ac40fa603..45687fd28bb 100644
--- a/kernels/portable/cpu/util/test/targets.bzl
+++ b/kernels/portable/cpu/util/test/targets.bzl
@@ -29,5 +29,15 @@ def define_common_targets():
             "//executorch/runtime/core/exec_aten:lib",
             "//executorch/kernels/portable/cpu/util:allocate_tensor_util",
             "//executorch/runtime/kernel:kernel_includes",
         ],
     )
+
+    runtime.cxx_test(
+        name = "sort_util_test",
+        srcs = ["sort_util_test.cpp"],
+        deps = [
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+            "//executorch/kernels/portable/cpu/util:sort_util",
+        ],
+    )