From 2d3546ff8c9b13a15b1194ddab4d91bba18784c0 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Tue, 21 Oct 2025 10:23:37 -0700
Subject: [PATCH 1/2] make aoti_torch_empty_strided support creating
 incontiguous tensors

Pull Request resolved: https://github.com/pytorch/executorch/pull/15228

This diff modifies the `aoti_torch_empty_strided` function to support the creation of incontiguous tensors. To achieve this, the diff:
1. updates the memory-size calculation to account for both the tensor sizes and the strides, and
2. bypasses the ETensor stride/contiguity check for the CUDA backend by creating tensors through a new CUDA-specific tensor maker that accepts arbitrary strides.

We will bring the ETensor check back for every backend after migrating to SlimTensor. (Illustrative sketches of both changes follow the CMakeLists.txt portion of this diff.)

ghstack-source-id: 317688814
@exported-using-ghexport

Differential Revision: [D84938258](https://our.internmc.facebook.com/intern/diff/D84938258/)
---
 backends/cuda/CMakeLists.txt                  |  38 ++++-
 backends/cuda/runtime/TARGETS                 |  21 ++-
 backends/cuda/runtime/shims/memory.cpp        |  50 +++++--
 .../cuda/runtime/shims/tensor_attribute.h     |   2 +-
 .../test_aoti_torch__reinterpret_tensor.cpp   |   1 +
 .../tests/test_aoti_torch_empty_strided.cpp   | 135 ++++++++++++++----
 backends/cuda/runtime/tensor/tensor_maker.cpp | 126 ++++++++++++++++
 backends/cuda/runtime/tensor/tensor_maker.h   |  56 ++++++++
 8 files changed, 385 insertions(+), 44 deletions(-)
 create mode 100644 backends/cuda/runtime/tensor/tensor_maker.cpp
 create mode 100644 backends/cuda/runtime/tensor/tensor_maker.h

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 8b94351d469..1db8792e0c0 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -34,6 +34,39 @@ find_package(CUDAToolkit REQUIRED)
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
+# CUDA tensor maker for backends that support incontiguous tensors
+set(_tensor_maker_sources runtime/tensor/tensor_maker.cpp)
+add_library(cuda_tensor_maker STATIC ${_tensor_maker_sources})
+target_include_directories(
+  cuda_tensor_maker
+  PUBLIC $ $
+         $
+)
+target_compile_options(
+  cuda_tensor_maker
+  PUBLIC $<$:/EHsc /GR>
+         $<$>:-fexceptions -frtti -fPIC>
+)
+# Ensure symbols are exported properly
+if(APPLE)
+  target_link_options(cuda_tensor_maker PUBLIC -Wl,-export_dynamic)
+else()
+  target_link_options(
+    cuda_tensor_maker PUBLIC
+    $<$>:-Wl,--export-dynamic>
+  )
+endif()
+
+# Link against ExecuTorch core libraries
+target_link_libraries(cuda_tensor_maker PUBLIC executorch ${CMAKE_DL_LIBS})
+executorch_target_link_options_shared_lib(cuda_tensor_maker)
+
+install(
+  TARGETS cuda_tensor_maker
+  EXPORT ExecuTorchTargets
+  DESTINATION lib
+)
+
 # CUDA-specific AOTI functionality
 set(_aoti_cuda_sources
     runtime/cuda_backend.cpp
@@ -62,9 +95,10 @@ target_link_options(
   aoti_cuda PUBLIC $<$>:-Wl,--export-dynamic>
 )
 
-# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
+# Link against CUDA::cudart, common AOTI library, cuda_tensor_maker, and PyTorch
+# CUDA libraries
 target_link_libraries(
-  aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
+  aoti_cuda PUBLIC aoti_common cuda_tensor_maker CUDA::cudart ${CMAKE_DL_LIBS}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
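Below is a minimal, self-contained sketch of the storage-size rule this change adopts (it mirrors PyTorch's `computeStorageNbytes`). `storage_numel` is an illustrative name, not code from this patch; the loop body matches the one added to `aoti_torch_empty_strided` in `memory.cpp` below.

```cpp
// Sketch only: smallest element count a buffer needs so that every index
// (i_0, ..., i_{n-1}) with i_k < sizes[k] stays in bounds when addressed as
// offset = sum_k strides[k] * i_k.
#include <cstdint>
#include <vector>

int64_t storage_numel(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& strides) {
  int64_t n = 1; // storage offset (0) + 1
  for (size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] == 0) {
      return 0; // any zero-sized dimension means an empty storage
    }
    n += strides[i] * (sizes[i] - 1); // max reachable offset in dimension i
  }
  return n;
}

// Examples matching the new unit tests:
//   sizes {3, 4}, strides {1, 3}     -> 1 + 1*2 + 3*3     = 12 elements
//   sizes {2, 3, 4}, strides {0, 4, 1} -> 1 + 0 + 4*2 + 1*3 = 12 elements,
//   even though numel() is 24 (the stride-0 dimension shares storage).
```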
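A companion sketch of how the new CUDA tensor maker can wrap an existing allocation with arbitrary strides, in the spirit of the updated `aoti_torch_empty_strided`. `wrap_column_major_3x4` and its `device_ptr` argument are illustrative; only `make_tensor` and its argument order come from this patch, and the include path simply follows the repo layout.

```cpp
// Sketch only: wrap an existing buffer in an ETensor with incontiguous
// (column-major) strides via the new CUDA tensor maker. device_ptr is assumed
// to point at at least 12 * sizeof(float) bytes (see storage_numel above).
#include <executorch/backends/cuda/runtime/tensor/tensor_maker.h>

using executorch::backends::cuda::make_tensor;
using executorch::backends::cuda::TensorPtr;

TensorPtr wrap_column_major_3x4(void* device_ptr) {
  return make_tensor(
      {3, 4},     // sizes
      device_ptr, // existing memory; not copied
      {},         // dim_order: empty, derived from the strides
      {1, 3},     // column-major (incontiguous) strides
      executorch::aten::ScalarType::Float);
}
```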
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index d18c0118542..a85f3a7e6a3 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -27,6 +27,25 @@ runtime.cxx_library( ], ) +runtime.cxx_library( + name = "tensor_maker", + srcs = [ + "tensor/tensor_maker.cpp", + ], + headers = [ + "tensor/tensor_maker.h", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/exec_aten/util:tensor_util", + ], +) + runtime.cxx_library( name = "runtime_shims", srcs = [ @@ -52,8 +71,8 @@ runtime.cxx_library( compiler_flags = ["-Wno-global-constructors"], visibility = ["@EXECUTORCH_CLIENTS"], deps = [ + ":tensor_maker", "//executorch/backends/aoti:common_shims", - "//executorch/extension/tensor:tensor", "//executorch/runtime/core:core", "//executorch/runtime/core/exec_aten:lib", "//executorch/runtime/platform:platform", diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp index 5d30d3124d9..46b8d448a3a 100644 --- a/backends/cuda/runtime/shims/memory.cpp +++ b/backends/cuda/runtime/shims/memory.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -163,9 +164,11 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( // Create ExecutorTorch tensor that wraps the existing memory // Note: We're NOT copying the data, just wrapping it - auto tensor = executorch::extension::from_blob( - data, // existing memory (don't copy!) + // Using CUDA-specific tensor maker that supports incontiguous tensors + auto tensor = make_tensor( sizes, // tensor dimensions + data, // existing memory (don't copy!) + {}, // dim_order (empty, will be auto-generated) strides, // tensor strides (allows different strides) dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType ); @@ -210,10 +213,6 @@ AOTITorchError aoti_torch_empty_strided( // This requires us to reserve CUDA memory and put it into a ETensor void* ptr; - int64_t numel = 1; - for (int64_t i = 0; i < ndim; i++) { - numel *= sizes_ptr[i]; - } ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); @@ -223,7 +222,28 @@ AOTITorchError aoti_torch_empty_strided( InvalidArgument, "Invalid element size for dtype: %d", dtype); - int64_t nbytes = numel * element_size; + + // Calculate storage size based on strides, matching PyTorch's behavior + // This is critical when sizes and strides don't match the expected contiguous + // layout Reference: PyTorch's computeStorageNbytes in EmptyTensor.cpp + int64_t storage_size = 1; // storage offset (0) + 1 + for (int64_t i = 0; i < ndim; i++) { + if (sizes_ptr[i] == 0) { + storage_size = 0; + break; + } + // For each dimension, add stride[i] * (size[i] - 1) + // This gives us the maximum offset in that dimension + int64_t stride_i = (strides_ptr != nullptr) ? 
strides_ptr[i] : 1; + if (strides_ptr == nullptr) { + // Calculate contiguous stride if not provided + for (int64_t j = i + 1; j < ndim; j++) { + stride_i *= sizes_ptr[j]; + } + } + storage_size += stride_i * (sizes_ptr[i] - 1); + } + int64_t nbytes = storage_size * element_size; if (device_type == static_cast(SupportedDevices::CUDA)) { ET_CUDA_CHECK_OR_RETURN_ERROR( @@ -250,8 +270,13 @@ AOTITorchError aoti_torch_empty_strided( auto strides = convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); // ETensor creation with dynamic shape support for edge cases - auto tensor = executorch::extension::from_blob( - ptr, sizes, strides, dtype_to_scalar_type(dtype)); + // Using CUDA-specific tensor maker that supports incontiguous tensors + auto tensor = make_tensor( + sizes, + ptr, + {}, // dim_order (empty, will be auto-generated) + strides, + dtype_to_scalar_type(dtype)); // Store the tensor so it doesn't get destroyed tensors.insert(tensor); @@ -259,7 +284,6 @@ AOTITorchError aoti_torch_empty_strided( // This tensor owns the memory it allocated, set reference count to 1 memory_to_n_tensor[ptr] = 1; - return Error::Ok; } @@ -630,9 +654,11 @@ AOTITorchError aoti_torch__reinterpret_tensor( // Create new tensor view that reinterprets the same memory with different // shape/strides This creates a view, not a copy - the data pointer is shared - std::shared_ptr tensor = executorch::extension::from_blob( - data_ptr, // Reuse the same memory from source tensor + // Using CUDA-specific tensor maker that supports incontiguous tensors + std::shared_ptr tensor = make_tensor( sizes, // New sizes with explicit SizesType + data_ptr, // Reuse the same memory from source tensor + {}, // dim_order (empty, will be auto-generated) strides, // New strides with explicit StridesType dtype_to_scalar_type(dtype) // Convert dtype with explicit type casting ); diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h index 15a4e397d24..6b61b5bd3b8 100644 --- a/backends/cuda/runtime/shims/tensor_attribute.h +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -8,8 +8,8 @@ #pragma once -#include #include +#include #include namespace executorch::backends::cuda { diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp index 1cefca99c2a..d3044810b15 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp index da65129f18a..799a8d1221b 100644 --- a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp @@ -278,30 +278,6 @@ TEST_F(AOTITorchEmptyStridedTest, LargeTensor) { EXPECT_EQ(tensor->size(2), 50); } -// Test error handling with memory allocation failures -TEST_F(AOTITorchEmptyStridedTest, MemoryAllocationStress) { - // Try to create a very large tensor that might cause allocation failure - // (This test may pass or fail depending on available memory) - std::vector huge_sizes = {10000, 10000, 100}; // ~38GB for float32 - Tensor* tensor; - - AOTITorchError error = aoti_torch_empty_strided( - huge_sizes.size(), - huge_sizes.data(), - nullptr, 
- 6, // float32 - 1, // CUDA device - 0, // device index - &tensor); - - // Either succeed or fail with memory allocation error - if (error == Error::Ok) { - EXPECT_NE(tensor, nullptr); - } else { - EXPECT_EQ(error, Error::MemoryAllocationFailed); - } -} - // Test aoti_torch_empty_strided with bfloat16 dtype TEST_F(AOTITorchEmptyStridedTest, BFloat16Tensor) { // Test creating bfloat16 tensor on CUDA @@ -509,11 +485,11 @@ TEST_F(AOTITorchEmptyStridedTest, ZeroElementTensor) { EXPECT_EQ(sizes_ptr[2], 3); } -// Test different data types (only float32 is currently supported) +// Test different data types (currently we support bf16, fp32 and int32) TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) { std::vector sizes = {2, 3}; - // Test float32 (dtype 6) - currently the only supported type + // Test float32 (dtype 6) - one of the supported types Tensor* tensor_float32; AOTITorchError error = aoti_torch_empty_strided( sizes.size(), @@ -527,7 +503,7 @@ TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) { EXPECT_EQ(error, Error::Ok); EXPECT_NE(tensor_float32, nullptr); - // Test unsupported data types should return error + // Test int32 (dtype 3) - one of the supported types Tensor* tensor_int32; error = aoti_torch_empty_strided( sizes.size(), @@ -538,7 +514,8 @@ TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) { 0, // device index &tensor_int32); - EXPECT_EQ(error, Error::InvalidArgument); // Should fail for unsupported dtype + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_int32, nullptr); // Test another unsupported data type Tensor* tensor_float64; @@ -586,3 +563,105 @@ TEST_F(AOTITorchEmptyStridedTest, MultiDimensionalTensors) { EXPECT_EQ(tensor_5d->size(3), 4); EXPECT_EQ(tensor_5d->size(4), 5); } + +// Test incontiguous tensor creation - transpose-like layout +TEST_F(AOTITorchEmptyStridedTest, IncontiguousTransposeLayout) { + // Create a tensor with transpose-like strides (column-major) + // For a 3x4 tensor in column-major order, strides should be [1, 3] + // This means each row step is 1, and each column step is 3 + std::vector sizes = {3, 4}; + std::vector strides = {1, 3}; // Column-major (incontiguous) + + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + + // Verify the strides are what we specified + int64_t* strides_ptr; + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); + EXPECT_EQ(strides_ptr[0], 1); // Column-major stride for dimension 0 + EXPECT_EQ(strides_ptr[1], 3); // Column-major stride for dimension 1 + + // Verify that memory was allocated correctly for incontiguous layout + // Storage size should be: stride[0] * (size[0] - 1) + stride[1] * (size[1] - + // 1) + 1 = 1 * (3 - 1) + 3 * (4 - 1) + 1 = 1 * 2 + 3 * 3 + 1 = 2 + 9 + 1 = 12 + // elements Total bytes = 12 * 4 = 48 bytes (for float32) + EXPECT_EQ(tensor->numel(), 12); // numel is still 3*4=12 for logical shape + + // The tensor should be accessible and writable + void* data_ptr = tensor->mutable_data_ptr(); + EXPECT_NE(data_ptr, nullptr); + + // Verify we can use CUDA to write to the memory + std::vector test_data(12, 1.0f); + cudaError_t cuda_err = cudaMemcpy( + data_ptr, test_data.data(), 12 * sizeof(float), 
cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess); +} + +// Test incontiguous tensor creation - expanded/broadcasted stride pattern +TEST_F(AOTITorchEmptyStridedTest, IncontiguousExpandedStrides) { + // Create a tensor with expanded strides (simulating broadcasting) + // A 2x3x4 tensor where the first dimension has stride 0 (expanded) + // This creates a tensor where the first dimension is "broadcasted" + std::vector sizes = {2, 3, 4}; + std::vector strides = {0, 4, 1}; // First dimension has stride 0 + + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 3); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + EXPECT_EQ(tensor->size(2), 4); + + // Verify the strides are what we specified + int64_t* strides_ptr; + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); + EXPECT_EQ(strides_ptr[0], 0); // Expanded dimension stride + EXPECT_EQ(strides_ptr[1], 4); + EXPECT_EQ(strides_ptr[2], 1); + + // Verify that memory was allocated correctly for this incontiguous layout + // Storage size should be: stride[0] * (size[0] - 1) + stride[1] * (size[1] - + // 1) + stride[2] * (size[2] - 1) + 1 = 0 * (2 - 1) + 4 * (3 - 1) + 1 * (4 - + // 1) + 1 = 0 + 8 + 3 + 1 = 12 elements Note: numel() returns logical number + // of elements (2*3*4=24), not storage size + EXPECT_EQ(tensor->numel(), 24); // Logical numel is 2*3*4=24 + + // The tensor should be accessible and writable + void* data_ptr = tensor->mutable_data_ptr(); + EXPECT_NE(data_ptr, nullptr); + + // Verify we can use CUDA to write to the allocated memory + // We only need to allocate 12 elements (storage size), not 24 + std::vector test_data(12, 2.0f); + cudaError_t cuda_err = cudaMemcpy( + data_ptr, test_data.data(), 12 * sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess); +} diff --git a/backends/cuda/runtime/tensor/tensor_maker.cpp b/backends/cuda/runtime/tensor/tensor_maker.cpp new file mode 100644 index 00000000000..01252082bfc --- /dev/null +++ b/backends/cuda/runtime/tensor/tensor_maker.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +namespace executorch::backends::cuda { + +namespace { +#ifndef USE_ATEN_LIB +/** + * A structure that consolidates the metadata (sizes, dim_order, strides) and + * the data buffer associated with a Tensor. Since Tensor does not own + * the memory for these metadata arrays or the data itself, this structure + * ensures that they are managed together and have the same lifetime as the + * Tensor. When the Tensor is destroyed, the Storage structure ensures + * proper cleanup of the associated metadata and data if needed. 
+ */ +struct Storage final { + executorch::aten::TensorImpl tensor_impl; + executorch::aten::Tensor tensor; + std::vector sizes; + std::vector dim_order; + std::vector strides; + std::function deleter; + + Storage( + executorch::aten::TensorImpl&& tensor_impl, + std::vector&& sizes, + std::vector&& dim_order, + std::vector&& strides, + std::function&& deleter) + : tensor_impl(std::move(tensor_impl)), + tensor(&this->tensor_impl), + sizes(std::move(sizes)), + dim_order(std::move(dim_order)), + strides(std::move(strides)), + deleter(std::move(deleter)) {} + + ~Storage() { + if (deleter) { + deleter(tensor_impl.mutable_data()); + } + } +}; +#endif // USE_ATEN_LIB +} // namespace + +TensorPtr make_tensor( + std::vector sizes, + void* data, + std::vector dim_order, + std::vector strides, + executorch::aten::ScalarType type, + executorch::aten::TensorShapeDynamism dynamism, + std::function deleter) { + const auto dim = sizes.size(); + ET_CHECK_MSG( + dim_order.empty() || dim_order.size() == dim, + "dim_order size must match sizes or be empty."); + ET_CHECK_MSG( + strides.empty() || strides.size() == dim, + "strides size must match sizes or be empty."); + + if (dim_order.empty()) { + dim_order.resize(dim); + std::iota(dim_order.begin(), dim_order.end(), 0); + if (!strides.empty()) { + std::sort(dim_order.begin(), dim_order.end(), [&](size_t a, size_t b) { + return strides[a] > strides[b]; + }); + } + } + + // AOTI backends (like AOTI-CUDA) handle both contiguous and incontiguous + // tensors, so we skip stride calculation and incontiguous tensor checks. + // Strides are passed through as-is without validation. + +#ifndef USE_ATEN_LIB + executorch::aten::TensorImpl tensor_impl( + type, + dim, + sizes.data(), + data, + dim_order.data(), + strides.data(), + dim > 0 ? dynamism : executorch::aten::TensorShapeDynamism::STATIC); + auto storage = std::make_shared( + std::move(tensor_impl), + std::move(sizes), + std::move(dim_order), + std::move(strides), + std::move(deleter)); + const auto tensor_ptr = &storage->tensor; + return std::shared_ptr( + std::move(storage), tensor_ptr); +#else + auto options = c10::TensorOptions() + .dtype(c10::scalarTypeToTypeMeta(type)) + .device(c10::kCPU); + auto storage = c10::Storage( + c10::Storage::use_byte_size_t(), + at::detail::computeStorageNbytes( + sizes, strides, options.dtype().itemsize()), + c10::InefficientStdFunctionContext::makeDataPtr( + data, std::move(deleter), options.device()), + nullptr, + false); + auto tensor_impl = c10::make_intrusive( + std::move(storage), + c10::DispatchKeySet(c10::DispatchKey::CPU), + options.dtype()); + tensor_impl->set_sizes_and_strides(sizes, strides); + return std::make_shared(std::move(tensor_impl)); +#endif // USE_ATEN_LIB +} + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/tensor/tensor_maker.h b/backends/cuda/runtime/tensor/tensor_maker.h new file mode 100644 index 00000000000..92cdec60bb4 --- /dev/null +++ b/backends/cuda/runtime/tensor/tensor_maker.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace executorch::backends::cuda { + +/** + * A smart pointer type for managing the lifecycle of a Tensor. + * This is compatible with executorch::extension::TensorPtr. 
+ */
+using TensorPtr = std::shared_ptr;
+
+/**
+ * Creates a TensorPtr for AOTI backends that skips stride calculation and
+ * incontiguous tensor checks. This is specifically designed for AOTI-CUDA
+ * which handles both contiguous and incontiguous tensors.
+ *
+ * This function is similar to executorch::extension::make_tensor_ptr but
+ * bypasses the stride validation that assumes contiguous tensors, making it
+ * suitable for AOTI backends that support arbitrary strides.
+ *
+ * @param sizes A vector specifying the size of each dimension.
+ * @param data A pointer to the data buffer.
+ * @param dim_order A vector specifying the order of dimensions.
+ * @param strides A vector specifying the strides of the tensor.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies the mutability of the tensor's shape.
+ * @param deleter A custom deleter function for managing the lifetime of the
+ * data buffer. If provided, this deleter will be called when the managed Tensor
+ * object is destroyed.
+ * @return A TensorPtr that manages the newly created Tensor.
+ */
+TensorPtr make_tensor(
+    std::vector sizes,
+    void* data,
+    std::vector dim_order,
+    std::vector strides,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND,
+    std::function deleter = nullptr);
+
+} // namespace executorch::backends::cuda

From 5a4c94e53dcbf7d26ae41a83ac3d8dc5b8d1ee2d Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Tue, 21 Oct 2025 10:23:39 -0700
Subject: [PATCH 2/2] add module level benchmark for gemma3 model

Pull Request resolved: https://github.com/pytorch/executorch/pull/15241

This diff adds a module-level benchmark for the Gemma3 model. It also introduces multimodal_benchmark.cpp, which replaces the original voxtral_runner.cpp and benchmarks both the Gemma3 and Voxtral models at the module level.
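A condensed sketch of the per-method measurement pattern the new benchmark uses (method load and execution are timed separately with std::chrono). `time_method` and its bare-bones error handling are illustrative simplifications, not the benchmark's actual structure; see multimodal_benchmark.cpp in the diff below for the real input construction and reporting.

```cpp
// Sketch only: time load_method() and execute() for one method of a Module.
#include <chrono>
#include <iostream>
#include <string>
#include <vector>

#include <executorch/extension/module/module.h>
#include <executorch/runtime/core/evalue.h>

using Clock = std::chrono::steady_clock;
using DurationMs = std::chrono::duration<double, std::milli>;

void time_method(
    executorch::extension::module::Module& module,
    const std::string& name,
    const std::vector<executorch::runtime::EValue>& inputs) {
  const auto load_start = Clock::now();
  const auto load_err = module.load_method(name);
  const double load_ms = DurationMs(Clock::now() - load_start).count();
  (void)load_err; // the real benchmark bails out on any non-Ok error

  const auto run_start = Clock::now();
  auto result = module.execute(name, inputs);
  const double run_ms = DurationMs(Clock::now() - run_start).count();
  (void)result; // the real benchmark prints and optionally dumps outputs

  std::cout << name << ": load " << load_ms << " ms, run " << run_ms << " ms"
            << std::endl;
}
```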
ghstack-source-id: 317688813 Differential Revision: [D84958564](https://our.internmc.facebook.com/intern/diff/D84958564/) --- .github/workflows/cuda.yml | 109 ++++- backends/cuda/CMakeLists.txt | 7 +- backends/cuda/cuda_backend.py | 3 + backends/cuda/tests/multimodal_benchmark.cpp | 465 +++++++++++++++++++ backends/cuda/tests/voxtral_runner.cpp | 264 ----------- 5 files changed, 578 insertions(+), 270 deletions(-) create mode 100644 backends/cuda/tests/multimodal_benchmark.cpp delete mode 100644 backends/cuda/tests/voxtral_runner.cpp diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 9ee72a34ef0..f59a29420d7 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -164,6 +164,61 @@ jobs: ls -al "${RUNNER_ARTIFACT_DIR}" echo "::endgroup::" + export-gemma3-cuda-artifact: + name: export-gemma3-cuda-artifact + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + secrets: inherit + strategy: + fail-fast: false + with: + timeout: 90 + secrets-env: EXECUTORCH_HF_TOKEN + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + upload-artifact: gemma3-cuda-export + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch" + ./install_executorch.sh + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + pip list + echo "::endgroup::" + + echo "::group::Export Gemma3" + optimum-cli export executorch \ + --model "google/gemma-3-4b-it" \ + --task "multimodal-text-to-text" \ + --recipe "cuda" \ + --dtype bfloat16 \ + --device cuda \ + --max_seq_len 64 \ + --output_dir ./ + + test -f model.pte + test -f aoti_cuda_blob.ptd + echo "::endgroup::" + + echo "::group::Store Gemma3 Artifacts" + mkdir -p "${RUNNER_ARTIFACT_DIR}/" + cp model.pte "${RUNNER_ARTIFACT_DIR}/" + cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/" + ls -al "${RUNNER_ARTIFACT_DIR}/" + echo "::endgroup::" + benchmark-voxtral-cuda: name: benchmark-voxtral-cuda needs: export-voxtral-cuda-artifact @@ -204,13 +259,63 @@ jobs: -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_TESTS=ON \ -Bcmake-out . 
- cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner + cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark echo "::endgroup::" echo "::group::Run Voxtral Benchmark" export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH - cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd + cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd + + echo "::endgroup::" + + benchmark-gemma3-cuda: + name: benchmark-gemma3-cuda + needs: export-gemma3-cuda-artifact + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + download-artifact: gemma3-cuda-export + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch Requirements" + CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh + pip list + echo "::endgroup::" + + echo "::group::Prepare Gemma3 Artifacts" + cp "${RUNNER_ARTIFACT_DIR}/model.pte" . + cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" . + ls -al model.pte aoti_cuda_blob.ptd + echo "::endgroup::" + + echo "::group::Build Gemma3 Benchmark" + cmake -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_CUDA=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DEXECUTORCH_BUILD_TESTS=ON \ + -Bcmake-out . + cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark + echo "::endgroup::" + + echo "::group::Run Gemma3 Benchmark" + + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd echo "::endgroup::" diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 1db8792e0c0..af36c89585e 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -105,11 +105,10 @@ target_link_libraries( executorch_target_link_options_shared_lib(aoti_cuda) if(BUILD_TESTING) - # Add runtime - add_executable(voxtral_runner tests/voxtral_runner.cpp) + add_executable(multimodal_benchmark tests/multimodal_benchmark.cpp) target_link_libraries( - voxtral_runner PUBLIC aoti_cuda extension_module_static - extension_flat_tensor portable_ops_lib + multimodal_benchmark PUBLIC aoti_cuda extension_module_static + extension_flat_tensor portable_ops_lib ) endif() diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py index ba6da92b991..f8482835ea5 100644 --- a/backends/cuda/cuda_backend.py +++ b/backends/cuda/cuda_backend.py @@ -140,6 +140,9 @@ def preprocess( user_input_placeholders.append(node.meta["val"]) options: dict[str, typing.Any] = { + # Disable this to support sdpa decomposition + # TODO(gasoonjia): remove it after pin bump to latest pytorch + "loop_ordering_after_fusion": False, # Better model precision "emulate_precision_casts": True, # Embed CUDA kernel binaries directly into the compiled shared object diff --git a/backends/cuda/tests/multimodal_benchmark.cpp b/backends/cuda/tests/multimodal_benchmark.cpp new file mode 100644 index 00000000000..679db889b71 --- /dev/null +++ b/backends/cuda/tests/multimodal_benchmark.cpp @@ -0,0 +1,465 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include + +#include +#include +#include +#include +#include +#include + +namespace { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::extension::make_tensor_ptr; +using executorch::extension::TensorPtr; +using executorch::extension::module::Module; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::Result; +using Clock = std::chrono::steady_clock; +using executorch::aten::TensorShapeDynamism; +using DurationMs = std::chrono::duration; + +enum class ModelType { GEMMA3, VOXTRAL, UNKNOWN }; + +struct ModelConfig { + std::string name; + size_t token_seq_len; + size_t text_embed_dim; + std::vector expected_methods; +}; + +const std::map model_configs = { + {ModelType::GEMMA3, + {"gemma3", + 128, + 2304, + {"vision_encoder", "token_embedding", "text_decoder"}}}, + {ModelType::VOXTRAL, + {"voxtral", + 1138, + 3072, + {"audio_encoder", "token_embedding", "text_decoder"}}}}; + +ModelType parse_model_type(const std::string& model_name) { + std::string lower_name = model_name; + std::transform( + lower_name.begin(), + lower_name.end(), + lower_name.begin(), + [](unsigned char c) { return std::tolower(c); }); + + if (lower_name.find("gemma3") != std::string::npos) { + return ModelType::GEMMA3; + } else if (lower_name.find("voxtral") != std::string::npos) { + return ModelType::VOXTRAL; + } + return ModelType::UNKNOWN; +} + +std::vector to_sizes( + std::initializer_list dims) { + return std::vector(dims.begin(), dims.end()); +} + +std::string format_shape(const Tensor& tensor) { + std::ostringstream oss; + oss << "["; + const auto& sizes = tensor.sizes(); + for (size_t i = 0; i < sizes.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << sizes[i]; + } + oss << "]"; + return oss.str(); +} + +void print_tensor_summary(const std::string& label, const Tensor& tensor) { + std::cout << " " << label + << ": dtype=" << executorch::runtime::toString(tensor.scalar_type()) + << ", shape=" << format_shape(tensor) + << ", numel=" << tensor.numel() << std::endl; +} + +void dump_tensor_to_file(const std::string& filename, const Tensor& tensor) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open file for writing: " << filename << std::endl; + return; + } + + int32_t dtype = static_cast(tensor.scalar_type()); + file.write(reinterpret_cast(&dtype), sizeof(int32_t)); + + int32_t ndim = static_cast(tensor.sizes().size()); + file.write(reinterpret_cast(&ndim), sizeof(int32_t)); + + for (size_t i = 0; i < tensor.sizes().size(); ++i) { + int64_t dim_size = tensor.sizes()[i]; + file.write(reinterpret_cast(&dim_size), sizeof(int64_t)); + } + + const void* data_ptr = tensor.const_data_ptr(); + size_t element_size = 0; + + switch (tensor.scalar_type()) { + case ScalarType::Float: + element_size = sizeof(float); + break; + case ScalarType::BFloat16: + element_size = 2; + break; + case ScalarType::Half: + element_size = 2; + break; + case ScalarType::Long: + element_size = sizeof(int64_t); + break; + case ScalarType::Int: + element_size = sizeof(int32_t); + break; + default: + std::cerr << "Unsupported dtype for dumping: " + << executorch::runtime::toString(tensor.scalar_type()) + << std::endl; + return; + } + + size_t data_size = tensor.numel() * element_size; + file.write(reinterpret_cast(data_ptr), data_size); + file.close(); + + std::cout << "Dumped tensor to: " << filename << std::endl; +} + +TensorPtr create_vision_input() { + const auto sizes = to_sizes({1, 3, 896, 
896}); + const size_t numel = 1ull * 3ull * 896ull * 896ull; + std::vector data(numel); + for (size_t i = 0; i < numel; ++i) { + data[i] = static_cast((i % 255) / 255.0); + } + return make_tensor_ptr( + sizes, + std::move(data), + {}, + {}, + ScalarType::BFloat16, + TensorShapeDynamism::DYNAMIC_UNBOUND); +} + +TensorPtr create_audio_input() { + const auto sizes = to_sizes({3, 128, 3000}); + const size_t numel = 3ull * 128ull * 3000ull; + std::vector data(numel, 0.5f); + return make_tensor_ptr( + sizes, std::move(data), {}, {}, ScalarType::BFloat16); +} + +TensorPtr create_token_ids_input(const ModelConfig& config) { + const auto sizes = to_sizes({1, static_cast(config.token_seq_len)}); + std::vector data(config.token_seq_len); + for (size_t i = 0; i < config.token_seq_len; ++i) { + data[i] = static_cast(i + 1); + } + return make_tensor_ptr(sizes, std::move(data)); +} + +TensorPtr create_positions_input(const ModelConfig& config) { + const auto sizes = to_sizes({static_cast(config.token_seq_len)}); + std::vector data(config.token_seq_len); + for (size_t i = 0; i < config.token_seq_len; ++i) { + data[i] = static_cast(i); + } + return make_tensor_ptr(sizes, std::move(data)); +} + +TensorPtr create_fallback_text_embedding(const ModelConfig& config) { + const auto sizes = to_sizes( + {1, + static_cast(config.token_seq_len), + static_cast(config.text_embed_dim)}); + const size_t numel = 1ull * config.token_seq_len * config.text_embed_dim; + std::vector data(numel, 0.0f); + return make_tensor_ptr( + sizes, std::move(data), {}, {}, ScalarType::BFloat16); +} + +struct MethodTiming { + double load_ms{0.0}; + double run_ms{0.0}; +}; + +enum class MethodCategory { ENCODER, TOKEN_EMBEDDING, TEXT_DECODER, UNKNOWN }; + +MethodCategory categorize_method(const std::string& method_name) { + std::string lower_name = method_name; + std::transform( + lower_name.begin(), + lower_name.end(), + lower_name.begin(), + [](unsigned char c) { return std::tolower(c); }); + + if (lower_name.find("vision") != std::string::npos || + lower_name.find("audio") != std::string::npos || + lower_name.find("encoder") != std::string::npos) { + return MethodCategory::ENCODER; + } else if ( + lower_name.find("token") != std::string::npos && + lower_name.find("embedding") != std::string::npos) { + return MethodCategory::TOKEN_EMBEDDING; + } else if ( + lower_name.find("text") != std::string::npos && + lower_name.find("decoder") != std::string::npos) { + return MethodCategory::TEXT_DECODER; + } + return MethodCategory::UNKNOWN; +} + +std::vector create_inputs_for_method( + const std::string& method_name, + MethodCategory category, + ModelType model_type, + const ModelConfig& config, + const EValue* token_output, + std::vector& owned_inputs) { + std::vector inputs; + + switch (category) { + case MethodCategory::ENCODER: { + if (method_name.find("vision") != std::string::npos) { + auto input = create_vision_input(); + owned_inputs.emplace_back(input); + inputs.emplace_back(*input); + } else if (method_name.find("audio") != std::string::npos) { + auto input = create_audio_input(); + owned_inputs.emplace_back(input); + inputs.emplace_back(*input); + } + break; + } + + case MethodCategory::TOKEN_EMBEDDING: { + auto token_ids = create_token_ids_input(config); + owned_inputs.emplace_back(token_ids); + inputs.emplace_back(*token_ids); + break; + } + + case MethodCategory::TEXT_DECODER: { + if (token_output && token_output->isTensor()) { + inputs.emplace_back(*token_output); + } else { + auto fallback_embedding = 
create_fallback_text_embedding(config); + owned_inputs.emplace_back(fallback_embedding); + inputs.emplace_back(*fallback_embedding); + } + + auto positions = create_positions_input(config); + owned_inputs.emplace_back(positions); + inputs.emplace_back(*positions); + break; + } + + default: + break; + } + + return inputs; +} + +Error execute_method( + Module& module, + const std::string& method_name, + MethodCategory category, + ModelType model_type, + const ModelConfig& config, + const EValue* token_output, + MethodTiming& timing, + EValue* output_storage = nullptr) { + ET_LOG(Info, "Loading %s...", method_name.c_str()); + + const auto load_start = Clock::now(); + const Error load_err = module.load_method(method_name); + const auto load_end = Clock::now(); + if (load_err != Error::Ok) { + std::cerr << "Failed to load method " << method_name << ": error code " + << static_cast(load_err) << std::endl; + return load_err; + } + timing.load_ms = DurationMs(load_end - load_start).count(); + + std::vector owned_inputs; + std::vector inputs = create_inputs_for_method( + method_name, category, model_type, config, token_output, owned_inputs); + + const auto run_start = Clock::now(); + ET_LOG(Info, "%s running", method_name.c_str()); + Result> output_result = + module.execute(method_name, inputs); + ET_LOG(Info, "%s done", method_name.c_str()); + const auto run_end = Clock::now(); + timing.run_ms = DurationMs(run_end - run_start).count(); + + if (output_result.error() != Error::Ok) { + std::cerr << method_name << " execution failed: error code " + << static_cast(output_result.error()) << std::endl; + return output_result.error(); + } + + const auto& outputs = output_result.get(); + if (!outputs.empty() && outputs[0].isTensor()) { + print_tensor_summary(method_name + " output", outputs[0].toTensor()); + + if (category == MethodCategory::ENCODER || + category == MethodCategory::TOKEN_EMBEDDING) { + dump_tensor_to_file(method_name + "_output.bin", outputs[0].toTensor()); + } + + if (output_storage) { + *output_storage = outputs[0]; + } + } + + return Error::Ok; +} + +} // namespace + +int main(int argc, char** argv) { + if (argc != 4) { + std::cerr + << "Usage: " << argv[0] + << " " + << std::endl; + std::cerr << " model_name: gemma3 or voxtral" << std::endl; + return 1; + } + + const std::string model_name = argv[1]; + const std::string program_path = argv[2]; + const std::string data_map_path = argv[3]; + + const ModelType model_type = parse_model_type(model_name); + if (model_type == ModelType::UNKNOWN) { + std::cerr << "Unknown model type: " << model_name << std::endl; + std::cerr << "Supported models: gemma3, voxtral" << std::endl; + return 1; + } + + const ModelConfig& config = model_configs.at(model_type); + std::cout << "Running benchmark for model: " << config.name << std::endl; + + try { + Module module(program_path, data_map_path); + + const auto program_load_start = Clock::now(); + const Error program_load_error = module.load(); + const auto program_load_end = Clock::now(); + if (program_load_error != Error::Ok) { + std::cerr << "Failed to load ExecuTorch program: error code " + << static_cast(program_load_error) << std::endl; + return 1; + } + const DurationMs program_load_latency = + program_load_end - program_load_start; + + auto method_names_result = module.method_names(); + if (method_names_result.error() != Error::Ok) { + std::cerr << "Failed to get method names: error code " + << static_cast(method_names_result.error()) << std::endl; + return 1; + } + + const auto& available_methods = 
method_names_result.get(); + + std::cout << "Checking for expected methods..." << std::endl; + std::vector missing_methods; + for (const auto& expected : config.expected_methods) { + if (available_methods.find(expected) == available_methods.end()) { + missing_methods.push_back(expected); + } else { + std::cout << " ✓ " << expected << std::endl; + } + } + + if (!missing_methods.empty()) { + std::cerr << "\nError: Missing expected methods:" << std::endl; + for (const auto& missing : missing_methods) { + std::cerr << " ✗ " << missing << std::endl; + } + return 1; + } + + std::map timings; + EValue token_output; + bool token_executed = false; + + for (const auto& method_name : config.expected_methods) { + MethodCategory category = categorize_method(method_name); + MethodTiming timing; + + const EValue* input_token_ptr = + (category == MethodCategory::TEXT_DECODER && token_executed) + ? &token_output + : nullptr; + + EValue* output_storage = (category == MethodCategory::TOKEN_EMBEDDING) + ? &token_output + : nullptr; + + Error err = execute_method( + module, + method_name, + category, + model_type, + config, + input_token_ptr, + timing, + output_storage); + + if (err != Error::Ok) { + return 1; + } + + if (category == MethodCategory::TOKEN_EMBEDDING) { + token_executed = true; + } + + timings[method_name] = timing; + } + + std::cout << std::fixed << std::setprecision(3); + std::cout << "\n=== Benchmark Results ===" << std::endl; + std::cout << "Program load latency (ms): " << program_load_latency.count() + << std::endl; + + std::cout << "\nMethod load latency (ms):" << std::endl; + for (const auto& [name, timing] : timings) { + std::cout << " " << name << ": " << timing.load_ms << std::endl; + } + + std::cout << "\nRun latency (ms):" << std::endl; + for (const auto& [name, timing] : timings) { + std::cout << " " << name << ": " << timing.run_ms << std::endl; + } + + return 0; + } catch (const std::exception& ex) { + std::cerr << "Unhandled exception: " << ex.what() << std::endl; + return 1; + } +} diff --git a/backends/cuda/tests/voxtral_runner.cpp b/backends/cuda/tests/voxtral_runner.cpp deleted file mode 100644 index feed458e1f5..00000000000 --- a/backends/cuda/tests/voxtral_runner.cpp +++ /dev/null @@ -1,264 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace { - -using executorch::aten::ScalarType; -using executorch::aten::Tensor; -using executorch::extension::make_tensor_ptr; -using executorch::extension::TensorPtr; -using executorch::extension::module::Module; -using executorch::runtime::Error; -using executorch::runtime::EValue; -using executorch::runtime::Result; -using Clock = std::chrono::steady_clock; -using DurationMs = std::chrono::duration; - -std::vector to_sizes( - std::initializer_list dims) { - return std::vector(dims.begin(), dims.end()); -} - -std::string format_shape(const Tensor& tensor) { - std::ostringstream oss; - oss << "["; - const auto& sizes = tensor.sizes(); - for (size_t i = 0; i < sizes.size(); ++i) { - if (i > 0) { - oss << ", "; - } - oss << sizes[i]; - } - oss << "]"; - return oss.str(); -} - -void print_tensor_summary(const std::string& label, const Tensor& tensor) { - std::cout << " " << label - << ": dtype=" << executorch::runtime::toString(tensor.scalar_type()) - << ", shape=" << format_shape(tensor) - << ", numel=" << tensor.numel() << std::endl; -} - -TensorPtr create_audio_input() { - const auto sizes = to_sizes({3, 128, 3000}); - const size_t numel = 
3ull * 128ull * 3000ull; - std::vector data(numel, 0.5f); - return make_tensor_ptr( - sizes, std::move(data), {}, {}, ScalarType::BFloat16); -} - -TensorPtr create_token_ids_input() { - const auto sizes = to_sizes({1, 1138}); - std::vector data(static_cast(1) * 1138, 0); - return make_tensor_ptr(sizes, std::move(data)); -} - -TensorPtr create_positions_input() { - const auto sizes = to_sizes({1138}); - std::vector data(static_cast(1138), 0); - return make_tensor_ptr(sizes, std::move(data)); -} - -TensorPtr create_fallback_text_embedding() { - const auto sizes = to_sizes({1, 1138, 3072}); - const size_t numel = 1ull * 1138ull * 3072ull; - std::vector data(numel, 0.0f); - return make_tensor_ptr( - sizes, std::move(data), {}, {}, ScalarType::BFloat16); -} - -struct MethodTiming { - double load_ms{0.0}; - double run_ms{0.0}; -}; - -} // namespace - -int main(int argc, char** argv) { - if (argc != 3) { - std::cerr << "Usage: " << argv[0] - << " " - << std::endl; - return 1; - } - - const std::string program_path = argv[1]; - const std::string data_map_path = argv[2]; - - try { - Module module(program_path, data_map_path); - - const auto program_load_start = Clock::now(); - const Error program_load_error = module.load(); - const auto program_load_end = Clock::now(); - if (program_load_error != Error::Ok) { - std::cerr << "Failed to load ExecuTorch program: error code " - << static_cast(program_load_error) << std::endl; - return 1; - } - const DurationMs program_load_latency = - program_load_end - program_load_start; - - MethodTiming audio_timing; - MethodTiming token_timing; - MethodTiming text_timing; - - auto measure_method_load = - [&](const std::string& name) -> std::pair { - const auto start = Clock::now(); - const Error err = module.load_method(name); - const auto end = Clock::now(); - return {err, DurationMs(end - start).count()}; - }; - - // audio_encoder - { - const auto [err, load_ms] = measure_method_load("audio_encoder"); - if (err != Error::Ok) { - std::cerr << "Failed to load method audio_encoder: error code " - << static_cast(err) << std::endl; - return 1; - } - audio_timing.load_ms = load_ms; - - const TensorPtr audio_input = create_audio_input(); - std::vector inputs; - std::vector owned_inputs; - owned_inputs.emplace_back(audio_input); - inputs.emplace_back(*audio_input); - - const auto run_start = Clock::now(); - Result> output_result = - module.execute("audio_encoder", inputs); - const auto run_end = Clock::now(); - audio_timing.run_ms = DurationMs(run_end - run_start).count(); - - if (output_result.error() != Error::Ok) { - std::cerr << "audio_encoder execution failed: error code " - << static_cast(output_result.error()) << std::endl; - return 1; - } - - const auto& outputs = output_result.get(); - if (!outputs.empty() && outputs[0].isTensor()) { - print_tensor_summary("audio_encoder output", outputs[0].toTensor()); - } - } - - EValue token_output; - bool token_executed = false; - - // token_embedding - { - const auto [err, load_ms] = measure_method_load("token_embedding"); - if (err != Error::Ok) { - std::cerr << "Failed to load method token_embedding: error code " - << static_cast(err) << std::endl; - return 1; - } - token_timing.load_ms = load_ms; - - const TensorPtr token_ids = create_token_ids_input(); - std::vector inputs; - std::vector owned_inputs; - owned_inputs.emplace_back(token_ids); - inputs.emplace_back(*token_ids); - - const auto run_start = Clock::now(); - auto token_output_result = module.execute("token_embedding", inputs); - const auto run_end = 
Clock::now(); - token_timing.run_ms = DurationMs(run_end - run_start).count(); - - if (token_output_result.error() != Error::Ok) { - std::cerr << "token_embedding execution failed: error code " - << static_cast(token_output_result.error()) << std::endl; - return 1; - } - - token_executed = true; - const auto& outputs = token_output_result.get(); - if (!outputs.empty() && outputs[0].isTensor()) { - print_tensor_summary("token_embedding output", outputs[0].toTensor()); - token_output = outputs[0]; - } - } - - // text_decoder - { - const auto [err, load_ms] = measure_method_load("text_decoder"); - if (err != Error::Ok) { - std::cerr << "Failed to load method text_decoder: error code " - << static_cast(err) << std::endl; - return 1; - } - text_timing.load_ms = load_ms; - - std::vector inputs; - std::vector owned_inputs; - if (token_executed) { - if (token_output.isTensor()) { - inputs.emplace_back(token_output); - } - } - - if (inputs.empty()) { - auto fallback_embedding = create_fallback_text_embedding(); - owned_inputs.emplace_back(fallback_embedding); - inputs.emplace_back(*fallback_embedding); - } - - auto positions = create_positions_input(); - owned_inputs.emplace_back(positions); - inputs.emplace_back(*positions); - - const auto run_start = Clock::now(); - Result> output_result = - module.execute("text_decoder", inputs); - const auto run_end = Clock::now(); - text_timing.run_ms = DurationMs(run_end - run_start).count(); - - if (output_result.error() != Error::Ok) { - std::cerr << "text_decoder execution failed: error code " - << static_cast(output_result.error()) << std::endl; - return 1; - } - - const auto& outputs = output_result.get(); - if (!outputs.empty() && outputs[0].isTensor()) { - print_tensor_summary("text_decoder output", outputs[0].toTensor()); - } - } - - std::cout << std::fixed << std::setprecision(3); - std::cout << "Program load latency (ms): " << program_load_latency.count() - << std::endl; - - std::cout << "Method load latency (ms):" << std::endl; - std::cout << " audio_encoder: " << audio_timing.load_ms << std::endl; - std::cout << " token_embedding: " << token_timing.load_ms << std::endl; - std::cout << " text_decoder: " << text_timing.load_ms << std::endl; - - std::cout << "Run latency (ms):" << std::endl; - std::cout << " audio_encoder: " << audio_timing.run_ms << std::endl; - std::cout << " token_embedding: " << token_timing.run_ms << std::endl; - std::cout << " text_decoder: " << text_timing.run_ms << std::endl; - - return 0; - } catch (const std::exception& ex) { - std::cerr << "Unhandled exception: " << ex.what() << std::endl; - return 1; - } -}
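For completeness, a minimal reader for the .bin files that dump_tensor_to_file (in multimodal_benchmark.cpp above) produces. The layout is inferred from that writer: int32 dtype, int32 ndim, ndim int64 dims, then the raw element bytes. `read_dumped_tensor` is a hypothetical helper, not part of either patch.

```cpp
// Sketch only: parse the header written by dump_tensor_to_file and report the
// tensor's shape plus the size of the raw payload that follows it.
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

bool read_dumped_tensor(const std::string& path) {
  std::ifstream file(path, std::ios::binary);
  if (!file.is_open()) {
    return false;
  }

  int32_t dtype = 0;
  int32_t ndim = 0;
  file.read(reinterpret_cast<char*>(&dtype), sizeof(dtype));
  file.read(reinterpret_cast<char*>(&ndim), sizeof(ndim));

  std::vector<int64_t> dims(static_cast<size_t>(ndim));
  int64_t numel = 1;
  for (auto& d : dims) {
    file.read(reinterpret_cast<char*>(&d), sizeof(d));
    numel *= d;
  }

  // The remaining bytes are the raw element data (numel * element_size).
  const auto header_end = file.tellg();
  file.seekg(0, std::ios::end);
  const auto payload_bytes = file.tellg() - header_end;

  std::cout << path << ": dtype=" << dtype << ", ndim=" << ndim
            << ", numel=" << numel << ", payload=" << payload_bytes << " bytes"
            << std::endl;
  return file.good();
}
```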