From f8bc0914e9d070b89491ceef0897f08a32de8da8 Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Fri, 29 Aug 2025 12:22:09 -0700
Subject: [PATCH] Optimize index_out via fast path (#13731)

Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/13731

Differential Revision: D81142086
---
 kernels/portable/cpu/op_index.cpp | 174 ++++++++++++++++-
 kernels/test/op_index_test.cpp    | 299 ++++++++++++++++++++++++++++--
 2 files changed, 451 insertions(+), 22 deletions(-)
diff --git a/kernels/portable/cpu/op_index.cpp b/kernels/portable/cpu/op_index.cpp
index a81ce6ad737..e0ca951de85 100644
--- a/kernels/portable/cpu/op_index.cpp
+++ b/kernels/portable/cpu/op_index.cpp
@@ -22,21 +22,189 @@ namespace native {
 using Tensor = executorch::aten::Tensor;
 using TensorOptList = executorch::aten::ArrayRef<std::optional<Tensor>>;
 
-Tensor& index_Tensor_out(
+namespace {
+
+// Reports whether `indices` qualifies for the fast path: exactly one entry is
+// non-null, and that entry is a 1-dimensional Long or Int tensor. The position
+// of the last non-null entry visited is written to `*dim`, which is only
+// meaningful when true is returned.
+bool check_fast_path_conditions(
+    ET_UNUSED const Tensor& in,
+    TensorOptList indices,
+    size_t* dim) {
+  size_t num_index_tensors = 0;
+
+  for (const auto pos : c10::irange(indices.size())) {
+    if (!indices[pos].has_value()) {
+      continue;
+    }
+    *dim = pos;
+    // A second non-null index tensor rules out the fast path.
+    if (++num_index_tensors > 1) {
+      return false;
+    }
+
+    const Tensor& candidate = indices[pos].value();
+    const ScalarType index_type = candidate.scalar_type();
+    const bool is_long_or_int =
+        index_type == ScalarType::Long || index_type == ScalarType::Int;
+    // The index tensor must hold Long/Int values and be 1-dimensional.
+    if (!is_long_or_int || candidate.dim() != 1) {
+      return false;
+    }
+  }
+
+  // The fast path needs at least one non-null index tensor.
+  return num_index_tensors == 1;
+}
+
+// Validates fast-path arguments: `in` and `out` must share a dtype, `indices`
+// must not have more entries than `in` has dimensions, and every value of the
+// index tensor at `dim` must lie in [-in.size(dim), in.size(dim)); negative
+// values count back from the end of the dimension. Returns false otherwise.
+bool check_fast_path_args(
     KernelRuntimeContext& ctx,
     const Tensor& in,
     TensorOptList indices,
+    size_t dim,
     Tensor& out) {
-  (void)ctx;
+  ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, out));
+  ET_CHECK_OR_RETURN_FALSE(
+      static_cast<ssize_t>(indices.size()) <= in.dim(),
+      "Indexing too many dimensions");
+
+  const Tensor& index = indices[dim].value();
+  // Widen to 64 bits so Int indices are not compared against a truncated size.
+  const int64_t dim_size = static_cast<int64_t>(in.size(dim));
+
+  bool is_valid_index = true;
+  ET_SWITCH_TWO_TYPES(
+      Long, Int, index.scalar_type(), ctx, "index.Tensor", CTYPE, [&]() {
+        const CTYPE* const index_arr = index.const_data_ptr<CTYPE>();
+        for (const auto i : c10::irange(index.numel())) {
+          const int64_t value = static_cast<int64_t>(index_arr[i]);
+          if (value < -dim_size || value >= dim_size) {
+            ET_LOG(
+                Error,
+                "Index %" PRId64
+                " out of range for tensor with size %zd at dimension %zu",
+                value,
+                in.size(dim),
+                dim);
+            is_valid_index = false;
+            break;
+          }
+        }
+      });
+  ET_CHECK_OR_RETURN_FALSE(
+      is_valid_index,
+      "Some index values are not within bounds of input tensor at indexed dim");
 
+  return true;
+}
+
+// Computes the output shape of the fast path: identical to the shape of `in`,
+// except that dimension `dim` has one entry per element of the index tensor.
+void get_fast_path_index_out_target_size(
+    const Tensor& in,
+    TensorOptList indices,
+    size_t dim,
+    Tensor::SizesType* out_sizes,
+    size_t* out_ndim) {
+  *out_ndim = in.dim();
+  for (const auto d : c10::irange(static_cast<size_t>(in.dim()))) {
+    const auto extent = (d == dim) ? indices[dim].value().numel() : in.size(d);
+    out_sizes[d] = static_cast<Tensor::SizesType>(extent);
+  }
+}
+
+// Fast path for one 1-D Long/Int index tensor at `dim`: validates the args,
+// resizes `out`, then memcpy's one slice of `in` per index (negatives wrap).
+Tensor& fast_path(
+    KernelRuntimeContext& ctx,
+    const Tensor& in,
+    TensorOptList indices,
+    size_t dim,
+    Tensor& out) {
   ET_KERNEL_CHECK(
-      ctx, check_index_args(in, indices, out), InvalidArgument, out);
+      ctx,
+      check_fast_path_args(ctx, in, indices, dim, out),
+      InvalidArgument,
+      out);
+
+  const Tensor& index = indices[dim].value();
+  ScalarType index_type = index.scalar_type();
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  Tensor::SizesType expected_size[kTensorDimensionLimit];
+  size_t expected_ndim = 0;
+  get_fast_path_index_out_target_size(
+      in, indices, dim, expected_size, &expected_ndim);
 
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_tensor(out, {expected_size, expected_ndim}) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  size_t leading_dims = getLeadingDims(in, dim);
+  size_t trailing_dims = getTrailingDims(in, dim);
+  // Nothing to copy when a non-indexed dimension of `in` is empty.
+  if (leading_dims == 0 || trailing_dims == 0) {
+    return out;
+  }
+
+  size_t in_dim_length = in.size(dim);
+  size_t out_dim_length = out.size(dim);
+  // Number of bytes in one slice taken along `dim`.
+  size_t length_per_step = trailing_dims * in.element_size();
+
+  const char* in_data = in.const_data_ptr<char>();
+  char* out_data = out.mutable_data_ptr<char>();
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "index.Tensor_out";
+
+  ET_SWITCH_TWO_TYPES(Long, Int, index_type, ctx, op_name, CTYPE, [&]() {
+    const CTYPE* const index_arr = index.const_data_ptr<CTYPE>();
+    for (const auto i : c10::irange(leading_dims)) {
+      const char* src = in_data + i * in_dim_length * length_per_step;
+      char* dest = out_data + i * out_dim_length * length_per_step;
+      for (const auto j : c10::irange(out_dim_length)) {
+        // Validated above: -in_dim_length <= index value < in_dim_length.
+        const int64_t ix = static_cast<int64_t>(index_arr[j]);
+        const size_t pos = static_cast<size_t>(
+            ix < 0 ? ix + static_cast<int64_t>(in_dim_length) : ix);
+        const char* copy_src = src + pos * length_per_step;
+        char* copy_dest = dest + j * length_per_step;
+        memcpy(copy_dest, copy_src, length_per_step);
+      }
+    }
+  });
+
+  return out;
+}
+
+} // namespace
+
+Tensor& index_Tensor_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& in,
+    TensorOptList indices,
+    Tensor& out) {
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
 
   ET_KERNEL_CHECK(ctx, tensor_is_default_dim_order(in), InvalidArgument, out);
 
+  size_t dim = 0;
+  bool is_fast_path = check_fast_path_conditions(in, indices, &dim);
+  if (is_fast_path) {
+    return fast_path(ctx, in, indices, dim, out);
+  }
+
+  ET_KERNEL_CHECK(
+      ctx, check_index_args(in, indices, out), InvalidArgument, out);
+
   ScalarType in_type = in.scalar_type();
   size_t block_count = count_index_blocks(indices);
 
diff --git a/kernels/test/op_index_test.cpp b/kernels/test/op_index_test.cpp
index 2471d44b0a3..9f1f8e3e9f7 100644
--- a/kernels/test/op_index_test.cpp
+++ b/kernels/test/op_index_test.cpp
@@ -72,12 +72,12 @@ class OpIndexTensorOutTest : public OperatorTest {
     // clang-format on
 
     // indices [0, 1, 2], [1, 0, 3], expressed two different ways
-    optional<Tensor> indices[] = {
+    std::array<optional<Tensor>, 3> indices = {
         optional<Tensor>(tfl.make({2}, {0, 1})),
         optional<Tensor>(tfl.make({2}, {1, 0})),
         optional<Tensor>(tfl.make({2}, {2, 3}))};
 
-    optional<Tensor> indices_mixed[] = {
+    std::array<optional<Tensor>, 3> indices_mixed = {
         optional<Tensor>(tfl.make({2}, {0, 1})),
         optional<Tensor>(tfb.make({2}, {false, true})),
         optional<Tensor>(tfl.make({2}, {2, 3}))};
@@ -203,27 +203,27 @@ TEST_F(OpIndexTensorOutTest, SelectFrontDimAllIndexes) {
   // Try to select the input value at indices
   // [1, 0, 1], [1, 0, 2]. This is expressed in various ways to test different
   // indexing expressions.
-  optional<Tensor> indices[] = {
+  std::array<optional<Tensor>, 3> indices = {
       optional<Tensor>(tfl.make({1}, {1})),
       optional<Tensor>(tfl.make({1}, {0})),
       optional<Tensor>(tfl.make({2}, {1, 2}))};
 
-  optional<Tensor> indices_int[] = {
+  std::array<optional<Tensor>, 3> indices_int = {
       optional<Tensor>(tfi.make({1}, {1})),
       optional<Tensor>(tfi.make({1}, {0})),
       optional<Tensor>(tfi.make({2}, {1, 2}))};
 
-  optional<Tensor> indices_negative[] = {
+  std::array<optional<Tensor>, 3> indices_negative = {
       optional<Tensor>(tfl.make({1}, {-1})),
       optional<Tensor>(tfl.make({1}, {0})),
       optional<Tensor>(tfl.make({2}, {-3, -2}))};
 
-  optional<Tensor> indices_bool[] = {
+  std::array<optional<Tensor>, 3> indices_bool = {
       optional<Tensor>(tfb.make({2}, {false, true})),
       optional<Tensor>(tfb.make({3}, {true, false, false})),
       optional<Tensor>(tfl.make({2}, {-3, -2}))};
 
-  optional<Tensor> indices_mixed[] = {
+  std::array<optional<Tensor>, 3> indices_mixed = {
       optional<Tensor>(tfb.make({2}, {false, true})),
       optional<Tensor>(tfl.make({1}, {0})),
       optional<Tensor>(tfl.make({2}, {-3, -2}))};
@@ -264,7 +264,7 @@ TEST_F(OpIndexTensorOutTest, SelectTwoValuesAtSameIndex) {
   // clang-format on
 
   // Try to select the value at the same index
-  optional<Tensor> indices[] = {
+  std::array<optional<Tensor>, 3> indices = {
       optional<Tensor>(tfl.make({1, 2}, {0, 0})),
       optional<Tensor>(tfl.make({1, 2}, {1, 1})),
       optional<Tensor>(tfl.make({1, 2}, {2, 2}))};
@@ -306,11 +306,11 @@ TEST_F(OpIndexTensorOutTest, IndicesFewerThanInputDimSupported) {
   // [1, 0, :], [1, 1, :]. This is expressed in various ways to test different
   // indexing expressions.
 
-  optional<Tensor> indices[] = {
+  std::array<optional<Tensor>, 2> indices = {
       optional<Tensor>(tfl.make({1}, {1})),
       optional<Tensor>(tfl.make({2}, {0, 1}))};
 
-  optional<Tensor> indices_mixed[] = {
+  std::array<optional<Tensor>, 2> indices_mixed = {
       optional<Tensor>(tfi.make({1}, {-1})),
       optional<Tensor>(tfb.make({3}, {true, true, false}))};
 
@@ -349,7 +349,7 @@ TEST_F(OpIndexTensorOutTest, IndicesWithNullTensorsSupported) {
       });
   // clang-format on
 
-  optional<Tensor> indices0[] = {
+  std::array<optional<Tensor>, 3> indices0 = {
       optional<Tensor>(),
       optional<Tensor>(tfl.make({1}, {1})),
       optional<Tensor>(tfl.make({2}, {0, 1}))};
@@ -366,7 +366,7 @@ TEST_F(OpIndexTensorOutTest, IndicesWithNullTensorsSupported) {
 
   run_test_cases(x, /*indices=*/indices0, expected0);
 
-  optional<Tensor> indices1[] = {
+  std::array<optional<Tensor>, 3> indices1 = {
       optional<Tensor>(tfl.make({1}, {1})),
       optional<Tensor>(),
       optional<Tensor>(tfl.make({2}, {0, 1}))};
@@ -383,7 +383,7 @@ TEST_F(OpIndexTensorOutTest, IndicesWithNullTensorsSupported) {
 
   run_test_cases(x, /*indices=*/indices1, expected1);
 
-  optional<Tensor> indices2[] = {
+  std::array<optional<Tensor>, 3> indices2 = {
       optional<Tensor>(tfl.make({1}, {1})),
       optional<Tensor>(tfl.make({2}, {0, 1})),
       optional<Tensor>()};
@@ -408,13 +408,14 @@ TEST_F(OpIndexTensorOutTest, IndicesWithOnlyNullTensorsSupported) {
   TensorFactory<ScalarType::Double> tf;
 
   Tensor x = tf.make({2, 3}, {1., 2., 3., 4., 5., 6.});
-  optional<Tensor> indices0[] = {optional<Tensor>()};
+  std::array<optional<Tensor>, 1> indices0 = {optional<Tensor>()};
   run_test_cases(x, indices0, x);
 
-  optional<Tensor> indices1[] = {optional<Tensor>(), std::optional<Tensor>()};
+  std::array<optional<Tensor>, 2> indices1 = {
+      optional<Tensor>(), std::optional<Tensor>()};
   run_test_cases(x, indices1, x);
 
-  optional<Tensor> indices2[] = {
+  std::array<optional<Tensor>, 3> indices2 = {
       optional<Tensor>(), std::optional<Tensor>(), std::optional<Tensor>()};
   Tensor out = tf.ones({2, 3});
   ET_EXPECT_KERNEL_FAILURE_WITH_MSG(
@@ -550,7 +551,7 @@ TEST_F(OpIndexTensorOutTest, InvalidIndicesShapesDies) {
 
   Tensor x = tf.zeros({2, 4, 7, 5});
   // clang-format off
-  optional<Tensor> indices[] = {
+  std::array<optional<Tensor>, 2> indices = {
       optional<Tensor>(tfl.make({3}, {1, 1, 1,})),
       optional<Tensor>(tfl.make({2}, {1, 2}))};
 
@@ -570,7 +571,7 @@ TEST_F(OpIndexTensorOutTest, InvalidIndicesShapeDies2) {
 
   Tensor x = tf.zeros({4, 4});
   // clang-format off
-  optional<Tensor> indices[] = {
+  std::array<optional<Tensor>, 2> indices = {
       optional<Tensor>(tfl.make({2, 2}, {1, 1, 1, 1,})),
       optional<Tensor>(tfl.make({1, 2}, {3, 0,}))};
 
@@ -607,7 +608,7 @@ TEST_F(OpIndexTensorOutTest, UpperBoundOutTensor) {
 
   // Try to select the tensor from the input
   // indices [0, 2, 2], [1, 1, 2]
-  optional<Tensor> indices[] = {
+  std::array<optional<Tensor>, 3> indices = {
       optional<Tensor>(tfl.make({1, 2}, {0, 1})),
       optional<Tensor>(tfl.make({1, 2}, {2, 1})),
       optional<Tensor>(tfl.make({1, 2}, {2, 2}))};
@@ -627,3 +628,263 @@ TEST_F(OpIndexTensorOutTest, UpperBoundOutTensor) {
   EXPECT_TENSOR_EQ(out, ret);
   EXPECT_TENSOR_EQ(ret, expected);
 }
+
+//
+// Fast Path Tests
+//
+
+TEST_F(OpIndexTensorOutTest, FastPathFirstDim) {
+  TensorFactory<ScalarType::Float> tf;
+  TensorFactory<ScalarType::Long> tfl;
+  // Input of shape {2, 3, 4}; each row comment names the slice the row holds.
+  // clang-format off
+  Tensor x = tf.make(
+    {2, 3, 4},
+    {
+        // [0, :, :]
+        1.,   2.,   3.,   4., // [0, 0, :]
+        5.,   6.,   7.,   8., // [0, 1, :]
+        9.,  10.,  11.,  12., // [0, 2, :]
+
+        // [1, :, :]
+       -1.,  -2.,  -3.,  -4., // [1, 0, :]
+       -5.,  -6.,  -7.,  -8., // [1, 1, :]
+       -9., -10., -11., -12., // [1, 2, :]
+    });
+  // clang-format on
+  // A single non-null 1-D Long index on dim 0; the other entries are null.
+  std::array<optional<Tensor>, 3> indices = {
+      optional<Tensor>(tfl.make({3}, {1, 0, 1})),
+      optional<Tensor>(),
+      optional<Tensor>()};
+  // Index values {1, 0, 1} gather x[1], x[0], x[1]: expected shape {3, 3, 4}.
+  Tensor out = tf.zeros({3, 3, 4});
+  // clang-format off
+  Tensor expected = tf.make(
+    {3, 3, 4},
+    {
+        // [1, :, :]
+       -1.,  -2.,  -3.,  -4., // [1, 0, :]
+       -5.,  -6.,  -7.,  -8., // [1, 1, :]
+       -9., -10., -11., -12., // [1, 2, :]
+
+        // [0, :, :]
+        1.,   2.,   3.,   4., // [0, 0, :]
+        5.,   6.,   7.,   8., // [0, 1, :]
+        9.,  10.,  11.,  12., // [0, 2, :]
+
+        // [1, :, :]
+       -1.,  -2.,  -3.,  -4., // [1, 0, :]
+       -5.,  -6.,  -7.,  -8., // [1, 1, :]
+       -9., -10., -11., -12., // [1, 2, :]
+    });
+  // clang-format on
+
+  op_index_tensor_out(x, indices, out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpIndexTensorOutTest, FastPathMiddleDim) {
+  TensorFactory<ScalarType::Float> tf;
+  TensorFactory<ScalarType::Long> tfl;
+  // Input of shape {2, 3, 4}; each row comment names the slice the row holds.
+  // clang-format off
+  Tensor x = tf.make(
+    {2, 3, 4},
+    {
+        // [0, :, :]
+        1.,   2.,   3.,   4., // [0, 0, :]
+        5.,   6.,   7.,   8., // [0, 1, :]
+        9.,  10.,  11.,  12., // [0, 2, :]
+
+        // [1, :, :]
+       -1.,  -2.,  -3.,  -4., // [1, 0, :]
+       -5.,  -6.,  -7.,  -8., // [1, 1, :]
+       -9., -10., -11., -12., // [1, 2, :]
+    });
+  // clang-format on
+  // Only two entries for the rank-3 input: dim 0 is null, dim 1 is indexed.
+  std::array<optional<Tensor>, 2> indices = {
+      optional<Tensor>(), optional<Tensor>(tfl.make({5}, {2, 0, 1, 0, 2}))};
+  // Five index values along dim 1 give an expected shape of {2, 5, 4}.
+  Tensor out = tf.zeros({2, 5, 4});
+  // clang-format off
+  Tensor expected = tf.make(
+    {2, 5, 4},
+    {
+        // [0, :, :]
+        9.,  10.,  11.,  12., // [0, 2, :]
+        1.,   2.,   3.,   4., // [0, 0, :]
+        5.,   6.,   7.,   8., // [0, 1, :]
+        1.,   2.,   3.,   4., // [0, 0, :]
+        9.,  10.,  11.,  12., // [0, 2, :]
+
+        // [1, :, :]
+       -9., -10., -11., -12., // [1, 2, :]
+       -1.,  -2.,  -3.,  -4., // [1, 0, :]
+       -5.,  -6.,  -7.,  -8., // [1, 1, :]
+       -1.,  -2.,  -3.,  -4., // [1, 0, :]
+       -9., -10., -11., -12., // [1, 2, :]
+    });
+  // clang-format on
+
+  op_index_tensor_out(x, indices, out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpIndexTensorOutTest, FastPathLastDim) {
+  TensorFactory<ScalarType::Float> tf;
+  TensorFactory<ScalarType::Long> tfl;
+  // Input of shape {2, 3, 4}; each row comment names the slice the row holds.
+  // clang-format off
+  Tensor x = tf.make(
+    {2, 3, 4},
+    {
+        // [0, :, :]
+        1.,   2.,   3.,   4., // [0, 0, :]
+        5.,   6.,   7.,   8., // [0, 1, :]
+        9.,  10.,  11.,  12., // [0, 2, :]
+
+        // [1, :, :]
+       -1.,  -2.,  -3.,  -4., // [1, 0, :]
+       -5.,  -6.,  -7.,  -8., // [1, 1, :]
+       -9., -10., -11., -12., // [1, 2, :]
+    });
+  // clang-format on
+  // The only non-null entry indexes the last dimension (dim 2).
+  std::array<optional<Tensor>, 3> indices = {
+      optional<Tensor>(),
+      optional<Tensor>(),
+      optional<Tensor>(tfl.make({3}, {2, 0, 1}))};
+  // Each input row {a, b, c, d} is expected to become {c, a, b}.
+  Tensor out = tf.zeros({2, 3, 3});
+  // clang-format off
+  Tensor expected = tf.make(
+    {2, 3, 3},
+    {
+        3.,   1.,   2.,
+        7.,   5.,   6.,
+       11.,   9.,  10.,
+
+       -3.,  -1.,  -2.,
+       -7.,  -5.,  -6.,
+      -11.,  -9., -10.,
+    });
+  // clang-format on
+
+  op_index_tensor_out(x, indices, out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpIndexTensorOutTest, FastPathZeroDim) {
+  // Despite the name, the input is an empty rank-1 tensor (shape {0}), not a
+  // rank-0 tensor; it is gathered with an equally empty rank-1 index.
+  TensorFactory<ScalarType::Long> index_factory;
+  TensorFactory<ScalarType::Float> value_factory;
+  Tensor input = value_factory.ones({0});
+  Tensor want = value_factory.ones({0});
+  Tensor result = value_factory.zeros({0});
+  std::array<optional<Tensor>, 1> index_list = {
+      optional<Tensor>(index_factory.zeros({0}))};
+  op_index_tensor_out(input, index_list, result);
+  EXPECT_TENSOR_EQ(result, want);
+}
+
+TEST_F(OpIndexTensorOutTest, FastPath1DLessElements) {
+  // Gathers 3 of the 5 input elements, so the output is shorter than the input.
+  TensorFactory<ScalarType::Long> index_factory;
+  TensorFactory<ScalarType::Float> value_factory;
+
+  Tensor input = value_factory.make({5}, {1., 2., 3., 4., 5.});
+  Tensor want = value_factory.make({3}, {3., 1., 2.});
+  Tensor result = value_factory.zeros({3});
+  std::array<optional<Tensor>, 1> index_list = {
+      optional<Tensor>(index_factory.make({3}, {2, 0, 1}))};
+  op_index_tensor_out(input, index_list, result);
+  EXPECT_TENSOR_EQ(result, want);
+}
+
+TEST_F(OpIndexTensorOutTest, FastPath1DMoreElements) {
+  // Repeated index values make the output (7) longer than the input (5).
+  TensorFactory<ScalarType::Long> index_factory;
+  TensorFactory<ScalarType::Float> value_factory;
+
+  Tensor input = value_factory.make({5}, {1., 2., 3., 4., 5.});
+  Tensor want = value_factory.make({7}, {3., 1., 2., 4., 4., 5., 2.});
+  Tensor result = value_factory.zeros({7});
+  std::array<optional<Tensor>, 1> index_list = {
+      optional<Tensor>(index_factory.make({7}, {2, 0, 1, 3, 3, 4, 1}))};
+  op_index_tensor_out(input, index_list, result);
+  EXPECT_TENSOR_EQ(result, want);
+}
+
+TEST_F(OpIndexTensorOutTest, FastPathUpperBoundOutTensor) {
+  TensorFactory<ScalarType::Float> tf;
+  TensorFactory<ScalarType::Long> tfl;
+  // Input of shape {2, 3, 4}; each row comment names the slice the row holds.
+  // clang-format off
+  Tensor x = tf.make(
+    {2, 3, 4},
+    {
+        // [0, :, :]
+        1.,   2.,   3.,   4., // [0, 0, :]
+        5.,   6.,   7.,   8., // [0, 1, :]
+        9.,  10.,  11.,  12., // [0, 2, :]
+
+        // [1, :, :]
+       -1.,  -2.,  -3.,  -4., // [1, 0, :]
+       -5.,  -6.,  -7.,  -8., // [1, 1, :]
+       -9., -10., -11., -12., // [1, 2, :]
+    });
+  // clang-format on
+  // The only non-null entry indexes the middle dimension (dim 1).
+  std::array<optional<Tensor>, 3> indices = {
+      optional<Tensor>(),
+      optional<Tensor>(tfl.make({5}, {2, 0, 1, 0, 2})),
+      optional<Tensor>()};
+  // `out` starts with bound shape {5, 5, 5}; the expected result is {2, 5, 4}.
+  Tensor out =
+      tf.zeros({5, 5, 5}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND);
+  // clang-format off
+  Tensor expected = tf.make(
+    {2, 5, 4},
+    {
+        // [0, :, :]
+        9.,  10.,  11.,  12., // [0, 2, :]
+        1.,   2.,   3.,   4., // [0, 0, :]
+        5.,   6.,   7.,   8., // [0, 1, :]
+        1.,   2.,   3.,   4., // [0, 0, :]
+        9.,  10.,  11.,  12., // [0, 2, :]
+
+        // [1, :, :]
+       -9., -10., -11., -12., // [1, 2, :]
+       -1.,  -2.,  -3.,  -4., // [1, 0, :]
+       -5.,  -6.,  -7.,  -8., // [1, 1, :]
+       -1.,  -2.,  -3.,  -4., // [1, 0, :]
+       -9., -10., -11., -12., // [1, 2, :]
+    });
+  // clang-format on
+
+  op_index_tensor_out(x, indices, out);
+
+  EXPECT_TENSOR_EQ(out, expected);
+}
+
+TEST_F(OpIndexTensorOutTest, FastPathEmptyInput) {
+  // The input has a zero-length dimension after the indexed one, so it holds
+  // no elements; the comparison effectively checks the resized output shape.
+  TensorFactory<ScalarType::Long> index_factory;
+  TensorFactory<ScalarType::Float> value_factory;
+  Tensor input = value_factory.ones({2, 3, 0, 4});
+  Tensor want = value_factory.ones({2, 5, 0, 4});
+  Tensor result = value_factory.zeros(
+      {5, 5, 5, 5}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND);
+  std::array<optional<Tensor>, 3> index_list = {
+      optional<Tensor>(),
+      optional<Tensor>(index_factory.make({5}, {2, 0, 1, 0, 2})),
+      optional<Tensor>()};
+  op_index_tensor_out(input, index_list, result);
+  EXPECT_TENSOR_EQ(result, want);
+}