From f8bc0914e9d070b89491ceef0897f08a32de8da8 Mon Sep 17 00:00:00 2001 From: Manuel Candales Date: Fri, 29 Aug 2025 12:22:09 -0700 Subject: [PATCH] Optimize index_out via fast path (#13731) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/13731 Differential Revision: D81142086 --- kernels/portable/cpu/op_index.cpp | 174 ++++++++++++++++- kernels/test/op_index_test.cpp | 299 ++++++++++++++++++++++++++++-- 2 files changed, 451 insertions(+), 22 deletions(-) diff --git a/kernels/portable/cpu/op_index.cpp b/kernels/portable/cpu/op_index.cpp index a81ce6ad737..e0ca951de85 100644 --- a/kernels/portable/cpu/op_index.cpp +++ b/kernels/portable/cpu/op_index.cpp @@ -22,21 +22,189 @@ namespace native { using Tensor = executorch::aten::Tensor; using TensorOptList = executorch::aten::ArrayRef>; -Tensor& index_Tensor_out( +namespace { + +bool check_fast_path_conditions( + ET_UNUSED const Tensor& in, + TensorOptList indices, + size_t* dim) { + bool found_index = false; + for (const auto i : c10::irange(indices.size())) { + if (indices[i].has_value()) { + *dim = i; + // Fast path only supports a single non-null index tensor + if (found_index) { + return false; + } + found_index = true; + const Tensor& index = indices[i].value(); + ScalarType ix_type = index.scalar_type(); + // Fast path only supports Long or Int index tensors + if (ix_type != ScalarType::Long && ix_type != ScalarType::Int) { + return false; + } + // Fast path only supports a 1-dimensional index tensor + if (index.dim() != 1) { + return false; + } + } + } + + // Fast path needs at least one non-null index tensor + if (!found_index) { + return false; + } + + return true; +} + +bool check_fast_path_args( KernelRuntimeContext& ctx, const Tensor& in, TensorOptList indices, + size_t dim, Tensor& out) { - (void)ctx; + ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out)); + + ET_CHECK_OR_RETURN_FALSE( + static_cast(indices.size()) <= in.dim(), + "Indexing too many dimensions"); + + const Tensor& index = indices[dim].value(); + + bool is_valid_index = true; + ET_SWITCH_TWO_TYPES( + Long, Int, index.scalar_type(), ctx, "index.Tensor", CTYPE, [&]() { + const CTYPE* const index_arr = index.const_data_ptr(); + for (const auto i : c10::irange(index.numel())) { + if (index_arr[i] < 0 || + index_arr[i] >= static_cast(in.size(dim))) { + ET_LOG( + Error, + "Index %" PRId64 + " out of range for tensor with size %zd" + " at dimension %zu", + static_cast(index_arr[i]), + in.size(dim), + dim); + is_valid_index = false; + break; + } + } + }); + + ET_CHECK_OR_RETURN_FALSE( + is_valid_index, + "Some index values are not within bounds of input tensor at indexed dim"); + return true; +} + +void get_fast_path_index_out_target_size( + const Tensor& in, + TensorOptList indices, + size_t dim, + Tensor::SizesType* out_sizes, + size_t* out_ndim) { + *out_ndim = in.dim(); + + for (const auto d : c10::irange(static_cast(in.dim()))) { + if (d != dim) { + out_sizes[d] = static_cast(in.size(d)); + } else { + out_sizes[d] = + static_cast(indices[dim].value().numel()); + } + } +} + +Tensor& fast_path( + KernelRuntimeContext& ctx, + const Tensor& in, + TensorOptList indices, + size_t dim, + Tensor& out) { ET_KERNEL_CHECK( - ctx, check_index_args(in, indices, out), InvalidArgument, out); + ctx, + check_fast_path_args(ctx, in, indices, dim, out), + InvalidArgument, + out); + + const Tensor& index = indices[dim].value(); + ScalarType index_type = index.scalar_type(); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + Tensor::SizesType expected_size[kTensorDimensionLimit]; + size_t expected_ndim = 0; + get_fast_path_index_out_target_size( + in, indices, dim, expected_size, &expected_ndim); + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_size, expected_ndim}) == Error::Ok, + InvalidArgument, + out); + + if (out.dim() == 0) { + memcpy(out.mutable_data_ptr(), in.const_data_ptr(), out.nbytes()); + return out; + } + + size_t leading_dims = getLeadingDims(in, dim); + size_t trailing_dims = getTrailingDims(in, dim); + + if (leading_dims == 0 || trailing_dims == 0) { + return out; + } + + size_t in_dim_length = in.size(dim); + size_t out_dim_length = out.size(dim); + + size_t length_per_step = trailing_dims * in.element_size(); + + const char* in_data = in.const_data_ptr(); + char* out_data = out.mutable_data_ptr(); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "index.Tensor_out"; + + ET_SWITCH_TWO_TYPES(Long, Int, index_type, ctx, op_name, CTYPE, [&]() { + const CTYPE* const index_arr = index.const_data_ptr(); + for (const auto i : c10::irange(leading_dims)) { + const char* src = in_data + i * in_dim_length * length_per_step; + char* dest = out_data + i * out_dim_length * length_per_step; + for (const auto j : c10::irange(out_dim_length)) { + const char* copy_src = src + index_arr[j] * length_per_step; + char* copy_dest = dest + j * length_per_step; + memcpy(copy_dest, copy_src, length_per_step); + } + } + }); + + return out; +} + +} // namespace + +Tensor& index_Tensor_out( + KernelRuntimeContext& ctx, + const Tensor& in, + TensorOptList indices, + Tensor& out) { ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out); + size_t dim = 0; + bool is_fast_path = check_fast_path_conditions(in, indices, &dim); + if (is_fast_path) { + return fast_path(ctx, in, indices, dim, out); + } + + ET_KERNEL_CHECK( + ctx, check_index_args(in, indices, out), InvalidArgument, out); + ScalarType in_type = in.scalar_type(); size_t block_count = count_index_blocks(indices); diff --git a/kernels/test/op_index_test.cpp b/kernels/test/op_index_test.cpp index 2471d44b0a3..9f1f8e3e9f7 100644 --- a/kernels/test/op_index_test.cpp +++ b/kernels/test/op_index_test.cpp @@ -72,12 +72,12 @@ class OpIndexTensorOutTest : public OperatorTest { // clang-format on // indices [0, 1, 2], [1, 0, 3], expressed two different ways - optional indices[] = { + std::array, 3> indices = { optional(tfl.make({2}, {0, 1})), optional(tfl.make({2}, {1, 0})), optional(tfl.make({2}, {2, 3}))}; - optional indices_mixed[] = { + std::array, 3> indices_mixed = { optional(tfl.make({2}, {0, 1})), optional(tfb.make({2}, {false, true})), optional(tfl.make({2}, {2, 3}))}; @@ -203,27 +203,27 @@ TEST_F(OpIndexTensorOutTest, SelectFrontDimAllIndexes) { // Try to select the input value at indices // [1, 0, 1], [1, 0, 2]. This is expressed in various ways to test different // indexing expressions. - optional indices[] = { + std::array, 3> indices = { optional(tfl.make({1}, {1})), optional(tfl.make({1}, {0})), optional(tfl.make({2}, {1, 2}))}; - optional indices_int[] = { + std::array, 3> indices_int = { optional(tfi.make({1}, {1})), optional(tfi.make({1}, {0})), optional(tfi.make({2}, {1, 2}))}; - optional indices_negative[] = { + std::array, 3> indices_negative = { optional(tfl.make({1}, {-1})), optional(tfl.make({1}, {0})), optional(tfl.make({2}, {-3, -2}))}; - optional indices_bool[] = { + std::array, 3> indices_bool = { optional(tfb.make({2}, {false, true})), optional(tfb.make({3}, {true, false, false})), optional(tfl.make({2}, {-3, -2}))}; - optional indices_mixed[] = { + std::array, 3> indices_mixed = { optional(tfb.make({2}, {false, true})), optional(tfl.make({1}, {0})), optional(tfl.make({2}, {-3, -2}))}; @@ -264,7 +264,7 @@ TEST_F(OpIndexTensorOutTest, SelectTwoValuesAtSameIndex) { // clang-format on // Try to select the value at the same index - optional indices[] = { + std::array, 3> indices = { optional(tfl.make({1, 2}, {0, 0})), optional(tfl.make({1, 2}, {1, 1})), optional(tfl.make({1, 2}, {2, 2}))}; @@ -306,11 +306,11 @@ TEST_F(OpIndexTensorOutTest, IndicesFewerThanInputDimSupported) { // [1, 0, :], [1, 1, :]. This is expressed in various ways to test different // indexing expressions. - optional indices[] = { + std::array, 2> indices = { optional(tfl.make({1}, {1})), optional(tfl.make({2}, {0, 1}))}; - optional indices_mixed[] = { + std::array, 2> indices_mixed = { optional(tfi.make({1}, {-1})), optional(tfb.make({3}, {true, true, false}))}; @@ -349,7 +349,7 @@ TEST_F(OpIndexTensorOutTest, IndicesWithNullTensorsSupported) { }); // clang-format on - optional indices0[] = { + std::array, 3> indices0 = { optional(), optional(tfl.make({1}, {1})), optional(tfl.make({2}, {0, 1}))}; @@ -366,7 +366,7 @@ TEST_F(OpIndexTensorOutTest, IndicesWithNullTensorsSupported) { run_test_cases(x, /*indices=*/indices0, expected0); - optional indices1[] = { + std::array, 3> indices1 = { optional(tfl.make({1}, {1})), optional(), optional(tfl.make({2}, {0, 1}))}; @@ -383,7 +383,7 @@ TEST_F(OpIndexTensorOutTest, IndicesWithNullTensorsSupported) { run_test_cases(x, /*indices=*/indices1, expected1); - optional indices2[] = { + std::array, 3> indices2 = { optional(tfl.make({1}, {1})), optional(tfl.make({2}, {0, 1})), optional()}; @@ -408,13 +408,14 @@ TEST_F(OpIndexTensorOutTest, IndicesWithOnlyNullTensorsSupported) { TensorFactory tf; Tensor x = tf.make({2, 3}, {1., 2., 3., 4., 5., 6.}); - optional indices0[] = {optional()}; + std::array, 1> indices0 = {optional()}; run_test_cases(x, indices0, x); - optional indices1[] = {optional(), std::optional()}; + std::array, 2> indices1 = { + optional(), std::optional()}; run_test_cases(x, indices1, x); - optional indices2[] = { + std::array, 3> indices2 = { optional(), std::optional(), std::optional()}; Tensor out = tf.ones({2, 3}); ET_EXPECT_KERNEL_FAILURE_WITH_MSG( @@ -550,7 +551,7 @@ TEST_F(OpIndexTensorOutTest, InvalidIndicesShapesDies) { Tensor x = tf.zeros({2, 4, 7, 5}); // clang-format off - optional indices[] = { + std::array, 2> indices = { optional(tfl.make({3}, {1, 1, 1,})), optional(tfl.make({2}, {1, 2}))}; @@ -570,7 +571,7 @@ TEST_F(OpIndexTensorOutTest, InvalidIndicesShapeDies2) { Tensor x = tf.zeros({4, 4}); // clang-format off - optional indices[] = { + std::array, 2> indices = { optional(tfl.make({2, 2}, {1, 1, 1, 1,})), optional(tfl.make({1, 2}, {3, 0,}))}; @@ -607,7 +608,7 @@ TEST_F(OpIndexTensorOutTest, UpperBoundOutTensor) { // Try to select the tensor from the input // indices [0, 2, 2], [1, 1, 2] - optional indices[] = { + std::array, 3> indices = { optional(tfl.make({1, 2}, {0, 1})), optional(tfl.make({1, 2}, {2, 1})), optional(tfl.make({1, 2}, {2, 2}))}; @@ -627,3 +628,263 @@ TEST_F(OpIndexTensorOutTest, UpperBoundOutTensor) { EXPECT_TENSOR_EQ(out, ret); EXPECT_TENSOR_EQ(ret, expected); } + +// +// Fast Path Tests +// + +TEST_F(OpIndexTensorOutTest, FastPathFirstDim) { + TensorFactory tf; + TensorFactory tfl; + + // clang-format off + Tensor x = tf.make( + {2, 3, 4}, + { + // [0, :, :] + 1., 2., 3., 4., // [0, 0, :] + 5., 6., 7., 8., // [0, 1, :] + 9., 10., 11., 12., // [0, 2, :] + + // [1, :, :] + -1., -2., -3., -4., // [1, 0, :] + -5., -6., -7., -8., // [1, 1, :] + -9., -10., -11., -12., // [1, 2, :] + }); + // clang-format on + + std::array, 3> indices = { + optional(tfl.make({3}, {1, 0, 1})), + optional(), + optional()}; + + Tensor out = tf.zeros({3, 3, 4}); + // clang-format off + Tensor expected = tf.make( + {3, 3, 4}, + { + // [1, :, :] + -1., -2., -3., -4., // [1, 0, :] + -5., -6., -7., -8., // [1, 1, :] + -9., -10., -11., -12., // [1, 2, :] + + // [0, :, :] + 1., 2., 3., 4., // [0, 0, :] + 5., 6., 7., 8., // [0, 1, :] + 9., 10., 11., 12., // [0, 2, :] + + // [1, :, :] + -1., -2., -3., -4., // [1, 0, :] + -5., -6., -7., -8., // [1, 1, :] + -9., -10., -11., -12., // [1, 2, :] + }); + // clang-format on + + op_index_tensor_out(x, indices, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST_F(OpIndexTensorOutTest, FastPathMiddleDim) { + TensorFactory tf; + TensorFactory tfl; + + // clang-format off + Tensor x = tf.make( + {2, 3, 4}, + { + // [0, :, :] + 1., 2., 3., 4., // [0, 0, :] + 5., 6., 7., 8., // [0, 1, :] + 9., 10., 11., 12., // [0, 2, :] + + // [1, :, :] + -1., -2., -3., -4., // [1, 0, :] + -5., -6., -7., -8., // [1, 1, :] + -9., -10., -11., -12., // [1, 2, :] + }); + // clang-format on + + std::array, 2> indices = { + optional(), optional(tfl.make({5}, {2, 0, 1, 0, 2}))}; + + Tensor out = tf.zeros({2, 5, 4}); + // clang-format off + Tensor expected = tf.make( + {2, 5, 4}, + { + // [0, :, :] + 9., 10., 11., 12., // [0, 2, :] + 1., 2., 3., 4., // [0, 0, :] + 5., 6., 7., 8., // [0, 1, :] + 1., 2., 3., 4., // [0, 0, :] + 9., 10., 11., 12., // [0, 2, :] + + // [1, :, :] + -9., -10., -11., -12., // [1, 2, :] + -1., -2., -3., -4., // [1, 0, :] + -5., -6., -7., -8., // [1, 1, :] + -1., -2., -3., -4., // [1, 0, :] + -9., -10., -11., -12., // [1, 2, :] + }); + // clang-format on + + op_index_tensor_out(x, indices, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST_F(OpIndexTensorOutTest, FastPathLastDim) { + TensorFactory tf; + TensorFactory tfl; + + // clang-format off + Tensor x = tf.make( + {2, 3, 4}, + { + // [0, :, :] + 1., 2., 3., 4., // [0, 0, :] + 5., 6., 7., 8., // [0, 1, :] + 9., 10., 11., 12., // [0, 2, :] + + // [1, :, :] + -1., -2., -3., -4., // [1, 0, :] + -5., -6., -7., -8., // [1, 1, :] + -9., -10., -11., -12., // [1, 2, :] + }); + // clang-format on + + std::array, 3> indices = { + optional(), + optional(), + optional(tfl.make({3}, {2, 0, 1}))}; + + Tensor out = tf.zeros({2, 3, 3}); + // clang-format off + Tensor expected = tf.make( + {2, 3, 3}, + { + 3., 1., 2., + 7., 5., 6., + 11., 9., 10., + + -3., -1., -2., + -7., -5., -6., + -11., -9., -10., + }); + // clang-format on + + op_index_tensor_out(x, indices, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST_F(OpIndexTensorOutTest, FastPathZeroDim) { + TensorFactory tf; + TensorFactory tfl; + + Tensor x = tf.ones({0}); + std::array, 1> indices = {optional(tfl.zeros({0}))}; + Tensor out = tf.zeros({0}); + Tensor expected = tf.ones({0}); + op_index_tensor_out(x, indices, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST_F(OpIndexTensorOutTest, FastPath1DLessElements) { + TensorFactory tf; + TensorFactory tfl; + + Tensor x = tf.make({5}, {1., 2., 3., 4., 5.}); + std::array, 1> indices = { + optional(tfl.make({3}, {2, 0, 1}))}; + Tensor out = tf.zeros({3}); + Tensor expected = tf.make({3}, {3., 1., 2.}); + op_index_tensor_out(x, indices, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST_F(OpIndexTensorOutTest, FastPath1DMoreElements) { + TensorFactory tf; + TensorFactory tfl; + + Tensor x = tf.make({5}, {1., 2., 3., 4., 5.}); + std::array, 1> indices = { + optional(tfl.make({7}, {2, 0, 1, 3, 3, 4, 1}))}; + Tensor out = tf.zeros({7}); + Tensor expected = tf.make({7}, {3., 1., 2., 4., 4., 5., 2.}); + op_index_tensor_out(x, indices, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST_F(OpIndexTensorOutTest, FastPathUpperBoundOutTensor) { + TensorFactory tf; + TensorFactory tfl; + + // clang-format off + Tensor x = tf.make( + {2, 3, 4}, + { + // [0, :, :] + 1., 2., 3., 4., // [0, 0, :] + 5., 6., 7., 8., // [0, 1, :] + 9., 10., 11., 12., // [0, 2, :] + + // [1, :, :] + -1., -2., -3., -4., // [1, 0, :] + -5., -6., -7., -8., // [1, 1, :] + -9., -10., -11., -12., // [1, 2, :] + }); + // clang-format on + + std::array, 3> indices = { + optional(), + optional(tfl.make({5}, {2, 0, 1, 0, 2})), + optional()}; + + Tensor out = + tf.zeros({5, 5, 5}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); + // clang-format off + Tensor expected = tf.make( + {2, 5, 4}, + { + // [0, :, :] + 9., 10., 11., 12., // [0, 2, :] + 1., 2., 3., 4., // [0, 0, :] + 5., 6., 7., 8., // [0, 1, :] + 1., 2., 3., 4., // [0, 0, :] + 9., 10., 11., 12., // [0, 2, :] + + // [1, :, :] + -9., -10., -11., -12., // [1, 2, :] + -1., -2., -3., -4., // [1, 0, :] + -5., -6., -7., -8., // [1, 1, :] + -1., -2., -3., -4., // [1, 0, :] + -9., -10., -11., -12., // [1, 2, :] + }); + // clang-format on + + op_index_tensor_out(x, indices, out); + + EXPECT_TENSOR_EQ(out, expected); +} + +TEST_F(OpIndexTensorOutTest, FastPathEmptyInput) { + TensorFactory tf; + TensorFactory tfl; + + Tensor x = tf.ones({2, 3, 0, 4}); + std::array, 3> indices = { + optional(), + optional(tfl.make({5}, {2, 0, 1, 0, 2})), + optional()}; + Tensor out = tf.zeros( + {5, 5, 5, 5}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); + Tensor expected = tf.ones({2, 5, 0, 4}); + op_index_tensor_out(x, indices, out); + + EXPECT_TENSOR_EQ(out, expected); +}