From 74e7ffa41c9735d9b03a102a730f55c90e70c90b Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Tue, 30 Sep 2025 23:36:58 -0700
Subject: [PATCH 01/15] Make it work

---
 backends/cuda/cuda_backend.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index a39065f6a52..aee58deef8a 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -12,9 +12,13 @@
 from typing import Any, Dict, final, List, Optional, Set
 
 import torch
+<<<<<<< HEAD
 from executorch.backends.cuda.replace_slice_copy_with_slice import (
     ReplaceSliceCopyWithSlicePass,
 )
+=======
+from executorch.backends.cuda.replace_slice_copy_with_slice import ReplaceSliceCopyWithSlicePass
+>>>>>>> e5be1a2b85 (Make it work)
 from executorch.exir._serialize._named_data_store import NamedDataStore
 from executorch.exir._warnings import experimental
 from executorch.exir.backend.backend_details import (
@@ -33,6 +37,8 @@
 # required fallback kernels but not supported
 missing_fallback_kernels: Set[str] = set()
 
+class COMPILE_SPEC_KEYS(Enum):
+    METHOD_NAME = "method_name"
 
 class COMPILE_SPEC_KEYS(Enum):
     METHOD_NAME = "method_name"

From 4222fe65fa99130e28009fb58b7aee9379229ec8 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Wed, 1 Oct 2025 15:28:05 -0700
Subject: [PATCH 02/15] ET AOTI CUDA runtime libraries

---
 CMakeLists.txt                         |  15 ++
 backends/aoti/aoti_model_container.h   |   1 +
 backends/cuda/CMakeLists.txt           |  72 ++++++
 backends/cuda/runtime/cuda_backend.cpp | 337 +++++++++++++++++++++++++
 backends/cuda/tests/voxtral_runner.cpp | 255 +++++++++++++++++++
 extension/llm/runner/pybindings.cpp    |   2 +-
 6 files changed, 681 insertions(+), 1 deletion(-)
 create mode 100644 backends/cuda/CMakeLists.txt
 create mode 100644 backends/cuda/runtime/cuda_backend.cpp
 create mode 100644 backends/cuda/tests/voxtral_runner.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7012ec641bf..f023069add6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -587,6 +587,16 @@ endif()
 
 if(EXECUTORCH_BUILD_CORTEX_M)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
+  list(APPEND _executorch_backends coretex_m_backend)
+endif()
+
+if(EXECUTORCH_BUILD_CUDA)
+  # Build common AOTI functionality (required for CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
+  # Build CUDA-specific AOTI functionality
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda)
+  # Add aoti_cuda to backends - it already depends on aoti_common
+  list(APPEND _executorch_backends aoti_cuda)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_APPLE)
@@ -1021,6 +1031,11 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
                             extension_runner_util gflags executorch_backends
   )
 
+  # Add flat tensor extension if it's built
+  if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
+    list(APPEND _executor_runner_libs extension_flat_tensor)
+  endif()
+
   if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
     list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
   elseif(EXECUTORCH_BUILD_CADENCE)
diff --git a/backends/aoti/aoti_model_container.h b/backends/aoti/aoti_model_container.h
index 4b20aefc976..09634f2c1ca 100644
--- a/backends/aoti/aoti_model_container.h
+++ b/backends/aoti/aoti_model_container.h
@@ -21,6 +21,7 @@ using executorch::runtime::etensor::Tensor;
 extern "C" {
 
 // Type definitions
+using AOTITensorHandle = Tensor*;
 using AOTIRuntimeError = Error;
 
 // Forward declarations for AOT Inductor model container
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
new file mode 100644
index 00000000000..7f8266adfe0
--- /dev/null
+++ b/backends/cuda/CMakeLists.txt
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Build AOTI CUDA backend for runtime.
+#
+# ### Editing this file ###
+#
+# This file should be formatted with
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+# It should also be cmake-lint clean.
+#
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+find_package(CUDAToolkit REQUIRED)
+
+# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+find_package_torch()
+
+# CUDA-specific AOTI functionality
+set(_aoti_cuda_sources
+    runtime/cuda_backend.cpp
+    runtime/shims/memory.cpp
+    runtime/shims/tensor_attribute.cpp)
+add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
+target_include_directories(
+  aoti_cuda
+  PUBLIC
+    ${CUDAToolkit_INCLUDE_DIRS}
+    $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
+    $<INSTALL_INTERFACE:include>
+    # PyTorch AOTI headers from ExecutorTorch's torch detection
+    ${TORCH_INCLUDE_DIRS}
+)
+target_compile_options(aoti_cuda PUBLIC -fexceptions -frtti -fPIC)
+# Ensure symbols are exported properly
+target_link_options(aoti_cuda PUBLIC -Wl,--export-dynamic)
+
+# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
+target_link_libraries(
+  aoti_cuda
+  PUBLIC
+    aoti_common
+    CUDA::cudart
+    ${CMAKE_DL_LIBS}
+    # Link PyTorch libraries for AOTI CUDA functions
+    ${TORCH_LIBRARIES}
+)
+# If you need other CUDA libraries, link them similarly:
+# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
+executorch_target_link_options_shared_lib(aoti_cuda)
+
+# Add runtime
+add_executable(voxtral_runner tests/voxtral_runner.cpp)
+target_link_libraries(voxtral_runner PUBLIC aoti_cuda extension_module_static extension_flat_tensor)
+
+install(
+  TARGETS aoti_cuda
+  EXPORT ExecuTorchTargets
+  DESTINATION lib
+)
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
new file mode 100644
index 00000000000..6c03361a153
--- /dev/null
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+
+#include <dlfcn.h>
+#include <unistd.h>
+#include <cstdio>
+
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// Include our shim layer headers
+#include <executorch/backends/aoti/aoti_model_container.h>
+#include <executorch/backends/aoti/common_shims.h>
+#include <executorch/backends/cuda/runtime/shims/memory.h>
+
+namespace executorch {
+namespace backends {
+namespace cuda {
+
+using namespace std;
+using namespace aoti;
+
+using executorch::aten::ScalarType;
+using executorch::runtime::ArrayRef;
+using executorch::runtime::Backend;
+using executorch::runtime::BackendExecutionContext;
+using executorch::runtime::BackendInitContext;
+using executorch::runtime::CompileSpec;
+using executorch::runtime::DelegateHandle;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::FreeableBuffer;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::NamedDataMap;
+using executorch::runtime::Result;
+using executorch::runtime::Span;
+using executorch::runtime::etensor::Tensor;
+
+class CudaBackend final : public ::executorch::runtime::BackendInterface {
+ private:
+  Error register_shared_library_functions(void* so_handle) const {
+    AOTInductorModelContainerCreateWithDevice =
+        reinterpret_cast<AOTInductorModelContainerCreateWithDeviceFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice"));
+    if (AOTInductorModelContainerCreateWithDevice == nullptr) {
+      ET_LOG(Error, "Failed to load AOTInductorModelContainerCreateWithDevice");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerDelete =
+        reinterpret_cast<AOTInductorModelContainerDeleteFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerDelete"));
+    if (AOTInductorModelContainerDelete == nullptr) {
+      ET_LOG(Error, "Failed to load AOTInductorModelContainerDelete");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerGetNumInputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumInputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumInputs"));
+    if (AOTInductorModelContainerGetNumInputs == nullptr) {
+      ET_LOG(Error, "Failed to load AOTInductorModelContainerGetNumInputs");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerGetNumOutputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
+    if (AOTInductorModelContainerGetNumOutputs == nullptr) {
+      ET_LOG(Error, "Failed to load AOTInductorModelContainerGetNumOutputs");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerRun =
+        reinterpret_cast<AOTInductorModelContainerRunFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerRun"));
+    if (AOTInductorModelContainerRun == nullptr) {
+      ET_LOG(Error, "Failed to load AOTInductorModelContainerRun");
+      return Error::AccessFailed;
+    }
+
+    return Error::Ok;
+  }
+
+ public:
+  bool is_available() const override {
+    return 1;
+  }
+
+  // Once per loaded binary blob
+  Result<DelegateHandle*> init(
+      BackendInitContext& context,
+      FreeableBuffer* processed, // This will be a empty buffer
+      ArrayRef<CompileSpec> compile_specs // This will be my empty list
+  ) const override {
+    const NamedDataMap* named_data_map = context.get_named_data_map();
+
+    string so_blob_key = "so_blob";
+
+    Result<FreeableBuffer> aoti_cuda_buffer =
+        named_data_map->get_data(so_blob_key.c_str());
+
+    ET_CHECK_OK_OR_RETURN_ERROR(aoti_cuda_buffer);
+
+    // Generate dynamic temporary file path
+    filesystem::path temp_dir = filesystem::temp_directory_path();
+    filesystem::path so_path =
+        temp_dir / ("aoti_cuda_" + to_string(getpid()) + ".so");
+
+    // Create a temporary file
+    ofstream outfile(so_path.c_str(), ios::binary);
+
+    // Write the ELF buffer to the temporary file
+    outfile.write(
+        (char*)aoti_cuda_buffer->data(),
+        sizeof(void*) * aoti_cuda_buffer->size());
+
+    // Finish writing the file to disk
+    outfile.close();
+
+    // Load the ELF using dlopen
+    void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
+    if (so_handle == nullptr) {
+      ET_LOG(Error, "Failed to load shared library: %s", dlerror());
+      return Error::AccessFailed;
+    }
+
+    processed->Free();
+
+    // Register all shared library functions
+    Error reg_err = register_shared_library_functions(so_handle);
+    if (reg_err != Error::Ok) {
+      return reg_err;
+    }
+
+    AOTInductorModelContainerHandle container_handle = nullptr;
+
+    AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice(
+        &container_handle, 1, "cuda", nullptr);
+    if (err != Error::Ok) {
+      return err;
+    }
+    ET_LOG(Info, "container_handle = %p", container_handle);
+
+    AOTIDelegateHandle* handle = new AOTIDelegateHandle();
+    handle->so_handle = so_handle;
+    handle->container_handle = container_handle;
+    return (DelegateHandle*)handle; // Return the handle post-processing
+  }
+
+  // Once per execution
+  Error execute(
+      BackendExecutionContext& context,
+      DelegateHandle* handle_,
+      Span<EValue*> args) const override {
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+    size_t n_inputs;
+    AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs);
+
+    size_t n_outputs;
+    AOTInductorModelContainerGetNumOutputs(
+        handle->container_handle, &n_outputs);
+
+    if (n_inputs + n_outputs != args.size()) {
+      ET_LOG(
+          Error,
+          "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
+          n_inputs,
+          n_outputs,
+          args.size());
+      return Error::InvalidArgument;
+    }
+
+    // NOTE: ExecutorTorch tensors are always on CPU/host memory
+    // We need to create GPU copies for CUDA kernel execution
+    std::vector<AOTITensorHandle> gpu_inputs(
+        n_inputs); // GPU copies for kernel execution
+    std::vector<AOTITensorHandle> gpu_outputs(
+        n_outputs); // GPU tensors for kernel output
+
+    // Process input tensors: ExecutorTorch provides CPU tensors, create GPU
+    // copies
+    for (int i = 0; i < n_inputs; i++) {
+      // Get tensor dimensions and properties from ExecutorTorch CPU tensor
+      auto cpu_tensor = &(args[i]->toTensor());
+      auto sizes = cpu_tensor->sizes();
+      auto scalar_type = cpu_tensor->scalar_type();
+
+      // Create GPU tensor with same shape
+      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+      AOTITensorHandle gpu_input_handle;
+      Error create_err = aoti_torch_empty_strided(
+          sizes_vec.size(),
+          sizes_vec.data(),
+          nullptr, // use default strides
+          static_cast<int32_t>(scalar_type),
+          1, // device_type = cuda
+          0, // device_index = 0
+          &gpu_input_handle);
+
+      if (create_err != Error::Ok) {
+        ET_LOG(Error, "Failed to create GPU tensor for input %d", i);
+        return Error::Internal;
+      }
+
+      gpu_inputs[i] = gpu_input_handle;
+
+      // Copy data from CPU to GPU
+      Error copy_err = aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0);
+      if (copy_err != Error::Ok) {
+        ET_LOG(Error, "Failed to copy input %d from CPU to GPU", i);
+        return Error::Internal;
+      }
+    }
+
+    // Process output tensors: create GPU counterparts for ExecutorTorch CPU
+    // tensors
+    for (int i = 0; i < n_outputs; i++) {
+      // Get output tensor dimensions from ExecutorTorch CPU tensor
+      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+      auto sizes = cpu_output_tensor->sizes();
+      auto scalar_type = cpu_output_tensor->scalar_type();
+
+      // Create GPU tensor with same shape for kernel output
+      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+      AOTITensorHandle gpu_output_handle;
+      Error create_err = aoti_torch_empty_strided(
+          sizes_vec.size(),
+          sizes_vec.data(),
+          nullptr, // use default strides
+          static_cast<int32_t>(scalar_type),
+          1, // device_type = cuda
+          0, // device_index = 0
+          &gpu_output_handle);
+
+      if (create_err != Error::Ok) {
+        ET_LOG(Error, "Failed to create GPU tensor for output %d", i);
+        return Error::Internal;
+      }
+
+      gpu_outputs[i] = gpu_output_handle;
+    }
+
+    // Run AOTI container with GPU tensors
+    AOTIRuntimeError error = AOTInductorModelContainerRun(
+        handle->container_handle,
+        gpu_inputs.data(), // Use GPU input tensors
+        n_inputs,
+        gpu_outputs.data(), // Use GPU output tensors
+        n_outputs,
+        nullptr, // Pass the actual CUDA stream!
+        nullptr); // proxy_executor_handle can remain nullptr
+
+    if (error != Error::Ok) {
+      ET_LOG(
+          Error,
+          "AOTInductorModelContainerRun failed with error code %d",
+          error);
+      return Error::Internal;
+    }
+
+    // Copy GPU output results back to CPU output tensors
+    for (int i = 0; i < n_outputs; i++) {
+      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+      Error copy_err = aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0);
+      if (copy_err != Error::Ok) {
+        ET_LOG(Error, "Failed to copy GPU output %d back to CPU", i);
+        return Error::Internal;
+      }
+    }
+
+    // Clean up GPU tensors that we created (ExecutorTorch tensors are always
+    // CPU, so all GPU tensors are our copies)
+    for (int i = 0; i < n_inputs; i++) {
+      // All GPU input tensors were created by us, delete them
+      aoti_torch_delete_tensor_object(gpu_inputs[i]);
+    }
+
+    for (int i = 0; i < n_outputs; i++) {
+      // All GPU output tensors were created by us, delete them
+      aoti_torch_delete_tensor_object(gpu_outputs[i]);
+    }
+
+    return Error::Ok;
+  }
+
+  void destroy(DelegateHandle* handle_) const override {
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+    // Delete the container BEFORE closing the shared library
+    if (handle->container_handle != nullptr) {
+      AOTIRuntimeError delete_result =
+          AOTInductorModelContainerDelete(handle->container_handle);
+      if (delete_result != Error::Ok) {
+        ET_LOG(
+            Error,
+            "AOTInductorModelContainerDelete failed with error code %d",
+            delete_result);
+      }
+    }
+
+    // Now close the shared library
+    if (handle->so_handle != nullptr) {
+      dlclose(handle->so_handle);
+    }
+
+    free(handle);
+    clear_all_tensors();
+  }
+};
+
+} // namespace cuda
+
+namespace {
+auto cls = cuda::CudaBackend();
+executorch::runtime::Backend backend{"CudaBackend", &cls};
+static executorch::runtime::Error success_with_compiler =
+    register_backend(backend);
+} // namespace
+
+} // namespace backends
+} // namespace executorch
diff --git a/backends/cuda/tests/voxtral_runner.cpp b/backends/cuda/tests/voxtral_runner.cpp
new file mode 100644
index 00000000000..71775bf3469
--- /dev/null
+++ b/backends/cuda/tests/voxtral_runner.cpp
@@ -0,0 +1,255 @@
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor_ptr.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/core/portable_type/tensor.h>
+
+namespace {
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::extension::make_tensor_ptr;
+using executorch::extension::TensorPtr;
+using executorch::extension::module::Module;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::Result;
+using Clock = std::chrono::steady_clock;
+using DurationMs = std::chrono::duration<double, std::milli>;
+
+std::vector<executorch::aten::SizesType> to_sizes(
+    std::initializer_list<int64_t> dims) {
+  return std::vector<executorch::aten::SizesType>(dims.begin(), dims.end());
+}
+
+std::string format_shape(const Tensor& tensor) {
+  std::ostringstream oss;
+  oss << "[";
+  const auto& sizes = tensor.sizes();
+  for (size_t i = 0; i < sizes.size(); ++i) {
+    if (i > 0) {
+      oss << ", ";
+    }
+    oss << sizes[i];
+  }
+  oss << "]";
+  return oss.str();
+}
+
+void print_tensor_summary(const std::string& label, const Tensor& tensor) {
+  std::cout << "    " << label
+            << ": dtype=" << executorch::runtime::toString(tensor.scalar_type())
+            << ", shape=" << format_shape(tensor)
+            << ", numel=" << tensor.numel() << std::endl;
+}
+
+TensorPtr create_audio_input() {
+  const auto sizes = to_sizes({3, 128, 3000});
+  const size_t numel = 3ull * 128ull * 3000ull;
+  std::vector<float> data(numel, 0.5f);
+  return make_tensor_ptr<float>(
+      sizes, std::move(data), {}, {}, ScalarType::BFloat16);
+}
+
+TensorPtr create_token_ids_input() {
+  const auto sizes = to_sizes({1, 1138});
+  std::vector<int64_t> data(static_cast<size_t>(1) * 1138, 0);
+  return make_tensor_ptr<int64_t>(sizes, std::move(data));
+}
+
+TensorPtr create_positions_input() {
+  const auto sizes = to_sizes({1138});
+  std::vector<int64_t> data(static_cast<size_t>(1138), 0);
+  return make_tensor_ptr<int64_t>(sizes, std::move(data));
+}
+
+TensorPtr create_fallback_text_embedding() {
+  const auto sizes = to_sizes({1, 1138, 3072});
+  const size_t numel = 1ull * 1138ull * 3072ull;
+  std::vector<float> data(numel, 0.0f);
+  return make_tensor_ptr<float>(
+      sizes, std::move(data), {}, {}, ScalarType::BFloat16);
+}
+
+struct MethodTiming {
+  double load_ms{0.0};
+  double run_ms{0.0};
+};
+
+} // namespace
+
+int main(int argc, char** argv) {
+  if (argc != 3) {
+    std::cerr << "Usage: " << argv[0]
+              << " <path/to/model.pte> <path/to/aoti_cuda_blob.ptd>"
+              << std::endl;
+    return 1;
+  }
+
+  const std::string program_path = argv[1];
+  const std::string data_map_path = argv[2];
+
+  try {
+    Module module(program_path, data_map_path);
+
+    const auto program_load_start = Clock::now();
+    const Error program_load_error = module.load();
+    const auto program_load_end = Clock::now();
+    if (program_load_error != Error::Ok) {
+      std::cerr << "Failed to load ExecuTorch program: error code "
+                << static_cast<int>(program_load_error) << std::endl;
+      return 1;
+    }
+    const DurationMs program_load_latency =
+        program_load_end - program_load_start;
+
+    MethodTiming audio_timing;
+    MethodTiming token_timing;
+    MethodTiming text_timing;
+
+    auto measure_method_load =
+        [&](const std::string& name) -> std::pair<Error, double> {
+      const auto start = Clock::now();
+      const Error err = module.load_method(name);
+      const auto end = Clock::now();
+      return {err, DurationMs(end - start).count()};
+    };
+
+    // audio_encoder
+    {
+      const auto [err, load_ms] = measure_method_load("audio_encoder");
+      if (err != Error::Ok) {
+        std::cerr << "Failed to load method audio_encoder: error code "
+                  << static_cast<int>(err) << std::endl;
+        return 1;
+      }
+      audio_timing.load_ms = load_ms;
+
+      const TensorPtr audio_input = create_audio_input();
+      std::vector<EValue> inputs;
+      inputs.emplace_back(audio_input);
+
+      const auto run_start = Clock::now();
+      Result<std::vector<EValue>> output_result =
+          module.execute("audio_encoder", inputs);
+      const auto run_end = Clock::now();
+      audio_timing.run_ms = DurationMs(run_end - run_start).count();
+
+      if (output_result.error() != Error::Ok) {
+        std::cerr << "audio_encoder execution failed: error code "
+                  << static_cast<int>(output_result.error()) << std::endl;
+        return 1;
+      }
+
+      const auto& outputs = output_result.get();
+      if (!outputs.empty() && outputs[0].isTensor()) {
+        print_tensor_summary("audio_encoder output", outputs[0].toTensor());
+      }
+    }
+
+    EValue token_output;
+    bool token_executed = false;
+
+    // token_embedding
+    {
+      const auto [err, load_ms] = measure_method_load("token_embedding");
+      if (err != Error::Ok) {
+        std::cerr << "Failed to load method token_embedding: error code "
+                  << static_cast<int>(err) << std::endl;
+        return 1;
+      }
+      token_timing.load_ms = load_ms;
+
+      const TensorPtr token_ids = create_token_ids_input();
+      std::vector<EValue> inputs;
+      inputs.emplace_back(token_ids);
+
+      const auto run_start = Clock::now();
+      auto token_output_result = module.execute("token_embedding", inputs);
+      const auto run_end = Clock::now();
+      token_timing.run_ms = DurationMs(run_end - run_start).count();
+
+      if (token_output_result.error() != Error::Ok) {
+        std::cerr << "token_embedding execution failed: error code "
+                  << static_cast<int>(token_output_result.error()) << std::endl;
+        return 1;
+      }
+
+      token_executed = true;
+      const auto& outputs = token_output_result.get();
+      if (!outputs.empty() && outputs[0].isTensor()) {
+        print_tensor_summary("token_embedding output", outputs[0].toTensor());
+        token_output = outputs[0];
+      }
+    }
+
+    // text_decoder
+    {
+      const auto [err, load_ms] = measure_method_load("text_decoder");
+      if (err != Error::Ok) {
+        std::cerr << "Failed to load method text_decoder: error code "
+                  << static_cast<int>(err) << std::endl;
+        return 1;
+      }
+      text_timing.load_ms = load_ms;
+
+      std::vector<EValue> inputs;
+      if (token_executed) {
+        if (token_output.isTensor()) {
+          inputs.emplace_back(token_output);
+        }
+      }
+
+      if (inputs.empty()) {
+        inputs.emplace_back(create_fallback_text_embedding());
+      }
+
+      inputs.emplace_back(create_positions_input());
+
+      const auto run_start = Clock::now();
+      Result<std::vector<EValue>> output_result =
+          module.execute("text_decoder", inputs);
+      const auto run_end = Clock::now();
+      text_timing.run_ms = DurationMs(run_end - run_start).count();
+
+      if (output_result.error() != Error::Ok) {
+        std::cerr << "text_decoder execution failed: error code "
+                  << static_cast<int>(output_result.error()) << std::endl;
+        return 1;
+      }
+
+      const auto& outputs = output_result.get();
+      if (!outputs.empty() && outputs[0].isTensor()) {
+        print_tensor_summary("text_decoder output", outputs[0].toTensor());
+      }
+    }
+
+    std::cout << std::fixed << std::setprecision(3);
+    std::cout << "Program load latency (ms): " << program_load_latency.count()
+              << std::endl;
+
+    std::cout << "Method load latency (ms):" << std::endl;
+    std::cout << "  audio_encoder: " << audio_timing.load_ms << std::endl;
+    std::cout << "  token_embedding: " << token_timing.load_ms << std::endl;
+    std::cout << "  text_decoder: " << text_timing.load_ms << std::endl;
+
+    std::cout << "Run latency (ms):" << std::endl;
+    std::cout << "  audio_encoder: " << audio_timing.run_ms << std::endl;
+    std::cout << "  token_embedding: " << token_timing.run_ms << std::endl;
+    std::cout << "  text_decoder: " << text_timing.run_ms << std::endl;
+
+    return 0;
+  } catch (const std::exception& ex) {
+    std::cerr << "Unhandled exception: " << ex.what() << std::endl;
+    return 1;
+  }
+}
diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp
index bcc6aba0f8e..08051515d8d 100644
--- a/extension/llm/runner/pybindings.cpp
+++ b/extension/llm/runner/pybindings.cpp
@@ -644,4 +644,4 @@ PYBIND11_MODULE(_llm_runner, m) {
       .def("__repr__", [](const PyMultimodalRunner& runner) {
         return "<MultimodalRunner>";
       });
-}
\ No newline at end of file
+}

From c838eee3f63a5416467b6b413f7bc346d4759b4a Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Wed, 1 Oct 2025 22:39:42 -0700
Subject: [PATCH 03/15] Resize tensor

---
 backends/cuda/runtime/cuda_backend.cpp | 50 +++++++++++++++++---------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 6c03361a153..1501eace28c 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -6,11 +6,11 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <dlfcn.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
-
-#include <dlfcn.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 #include <unistd.h>
 #include <cstdio>
 
@@ -105,15 +105,29 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
       FreeableBuffer* processed, // This will be a empty buffer
       ArrayRef<CompileSpec> compile_specs // This will be my empty list
   ) const override {
-    const NamedDataMap* named_data_map = context.get_named_data_map();
-
-    string so_blob_key = "so_blob";
-
-    Result<FreeableBuffer> aoti_cuda_buffer =
-        named_data_map->get_data(so_blob_key.c_str());
+    std::string method_name;
+    for (const CompileSpec& spec : compile_specs) {
+      if (std::strcmp(spec.key, "method_name") == 0) {
+        method_name.assign(
+            static_cast<const char*>(spec.value.buffer),
+            spec.value.nbytes); // no nullptr guarantee, so pass size
+        break;
+      }
+    }
 
-    ET_CHECK_OK_OR_RETURN_ERROR(aoti_cuda_buffer);
+    std::string so_blob_key =
+        method_name.empty() ? "so_blob" : method_name + "_so_blob";
 
+    const NamedDataMap* named_data_map = context.get_named_data_map();
+    auto aoti_cuda_buffer = named_data_map->get_data(so_blob_key.c_str());
+    if (!aoti_cuda_buffer.ok()) {
+      ET_LOG(
+          Error,
+          "Failed to get data for key %s: 0x%x",
+          so_blob_key.c_str(),
+          aoti_cuda_buffer.error());
+      return aoti_cuda_buffer.error();
+    }
     // Generate dynamic temporary file path
     filesystem::path temp_dir = filesystem::temp_directory_path();
     filesystem::path so_path =
@@ -226,7 +240,7 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
         return Error::Internal;
       }
     }
-
+    ET_LOG(Info, "Inputs copied to GPU");
     // Process output tensors: create GPU counterparts for ExecutorTorch CPU
     // tensors
     for (int i = 0; i < n_outputs; i++) {
@@ -255,7 +269,7 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
 
       gpu_outputs[i] = gpu_output_handle;
     }
-
+    ET_LOG(Info, "Outputs created on GPU");
     // Run AOTI container with GPU tensors
     AOTIRuntimeError error = AOTInductorModelContainerRun(
         handle->container_handle,
@@ -277,11 +291,15 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
     // Copy GPU output results back to CPU output tensors
     for (int i = 0; i < n_outputs; i++) {
       auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
-      Error copy_err = aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0);
-      if (copy_err != Error::Ok) {
-        ET_LOG(Error, "Failed to copy GPU output %d back to CPU", i);
-        return Error::Internal;
-      }
+      // For DYNAMIC_BOUND tensors we try to resize
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
+          "Error resizing tensor at output index %d",
+          i);
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
+          "Failed to copy GPU output %d back to CPU",
+          i);
     }
 
     // Clean up GPU tensors that we created (ExecutorTorch tensors are always

From 366c7631375fe98e443381fb062825f2ed2a9773 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Thu, 2 Oct 2025 15:11:20 -0700
Subject: [PATCH 04/15] Make Voxtral work

---
 backends/aoti/utils.h                  |  2 ++
 backends/cuda/CMakeLists.txt           |  2 +-
 backends/cuda/runtime/cuda_backend.cpp | 11 ++++++++---
 backends/cuda/runtime/shims/utils.h    |  5 ++++-
 backends/cuda/tests/voxtral_runner.cpp | 17 +++++++++++++----
 5 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h
index 1c872e08648..78c07bcea6e 100644
--- a/backends/aoti/utils.h
+++ b/backends/aoti/utils.h
@@ -34,6 +34,8 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
   // Convert based on known PyTorch dtype codes (without CUDA-specific
   // dependency)
   switch (dtype) {
+    case 4: // PyTorch's int64 dtype code
+      return executorch::aten::ScalarType::Long;
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
     case 15: // PyTorch's bfloat16 dtype code
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 7f8266adfe0..2d08b142605 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -63,7 +63,7 @@ executorch_target_link_options_shared_lib(aoti_cuda)
 
 # Add runtime
 add_executable(voxtral_runner tests/voxtral_runner.cpp)
-target_link_libraries(voxtral_runner PUBLIC aoti_cuda extension_module_static extension_flat_tensor)
+target_link_libraries(voxtral_runner PUBLIC aoti_cuda extension_module_static extension_flat_tensor portable_ops_lib)
 
 install(
   TARGETS aoti_cuda
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 1501eace28c..52bc133c658 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -131,15 +131,20 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
     // Generate dynamic temporary file path
     filesystem::path temp_dir = filesystem::temp_directory_path();
     filesystem::path so_path =
-        temp_dir / ("aoti_cuda_" + to_string(getpid()) + ".so");
+        temp_dir / (so_blob_key + to_string(getpid()) + ".so");
 
     // Create a temporary file
     ofstream outfile(so_path.c_str(), ios::binary);
 
     // Write the ELF buffer to the temporary file
+    ET_LOG(
+        Info,
+        "Writing %zu bytes to %s",
+        aoti_cuda_buffer->size(),
+        so_path.c_str());
     outfile.write(
-        (char*)aoti_cuda_buffer->data(),
-        sizeof(void*) * aoti_cuda_buffer->size());
+        static_cast<const char*>(aoti_cuda_buffer->data()),
+        aoti_cuda_buffer->size());
 
     // Finish writing the file to disk
     outfile.close();
diff --git a/backends/cuda/runtime/shims/utils.h b/backends/cuda/runtime/shims/utils.h
index 99d2bc102f5..02c3abfc83f 100644
--- a/backends/cuda/runtime/shims/utils.h
+++ b/backends/cuda/runtime/shims/utils.h
@@ -40,6 +40,7 @@ namespace cuda {
 
 // Enum for supported data types in et-cuda backend
 enum class SupportedDTypes : int32_t {
+  INT64 = 4, // PyTorch's int64 dtype code
   FLOAT32 = 6, // PyTorch's float32 dtype code
   BFLOAT16 = 15, // PyTorch's bfloat16 dtype code
 };
@@ -100,6 +101,7 @@ using AOTITorchError = Error;
 // Helper function to check if a dtype is supported in ET CUDA backend
 inline bool is_dtype_supported_in_et_cuda(int32_t dtype) {
   switch (dtype) {
+    case static_cast<int32_t>(SupportedDTypes::INT64):
     case static_cast<int32_t>(SupportedDTypes::FLOAT32):
     case static_cast<int32_t>(SupportedDTypes::BFLOAT16):
       return true;
@@ -113,8 +115,9 @@ inline AOTITorchError validate_dtype(int32_t dtype) {
   ET_CHECK_OR_RETURN_ERROR(
       is_dtype_supported_in_et_cuda(dtype),
       InvalidArgument,
-      "Unsupported dtype: %d. Supported dtypes: %d (float32), %d (bfloat16)",
+      "Unsupported dtype: %d. Supported dtypes: %d (int64), %d (float32), %d (bfloat16)",
       dtype,
+      static_cast<int32_t>(SupportedDTypes::INT64),
       static_cast<int32_t>(SupportedDTypes::FLOAT32),
       static_cast<int32_t>(SupportedDTypes::BFLOAT16));
 
diff --git a/backends/cuda/tests/voxtral_runner.cpp b/backends/cuda/tests/voxtral_runner.cpp
index 71775bf3469..feed458e1f5 100644
--- a/backends/cuda/tests/voxtral_runner.cpp
+++ b/backends/cuda/tests/voxtral_runner.cpp
@@ -136,7 +136,9 @@ int main(int argc, char** argv) {
 
       const TensorPtr audio_input = create_audio_input();
       std::vector<EValue> inputs;
-      inputs.emplace_back(audio_input);
+      std::vector<TensorPtr> owned_inputs;
+      owned_inputs.emplace_back(audio_input);
+      inputs.emplace_back(*audio_input);
 
       const auto run_start = Clock::now();
       Result<std::vector<EValue>> output_result =
@@ -171,7 +173,9 @@ int main(int argc, char** argv) {
 
       const TensorPtr token_ids = create_token_ids_input();
       std::vector<EValue> inputs;
-      inputs.emplace_back(token_ids);
+      std::vector<TensorPtr> owned_inputs;
+      owned_inputs.emplace_back(token_ids);
+      inputs.emplace_back(*token_ids);
 
       const auto run_start = Clock::now();
       auto token_output_result = module.execute("token_embedding", inputs);
@@ -203,6 +207,7 @@ int main(int argc, char** argv) {
       text_timing.load_ms = load_ms;
 
       std::vector<EValue> inputs;
+      std::vector<TensorPtr> owned_inputs;
       if (token_executed) {
         if (token_output.isTensor()) {
           inputs.emplace_back(token_output);
@@ -210,10 +215,14 @@ int main(int argc, char** argv) {
       }
 
       if (inputs.empty()) {
-        inputs.emplace_back(create_fallback_text_embedding());
+        auto fallback_embedding = create_fallback_text_embedding();
+        owned_inputs.emplace_back(fallback_embedding);
+        inputs.emplace_back(*fallback_embedding);
       }
 
-      inputs.emplace_back(create_positions_input());
+      auto positions = create_positions_input();
+      owned_inputs.emplace_back(positions);
+      inputs.emplace_back(*positions);
 
       const auto run_start = Clock::now();
       Result<std::vector<EValue>> output_result =

From a7ecb6fbaf3011e7572926e9f5cf58251e9fb2ec Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Thu, 2 Oct 2025 15:12:03 -0700
Subject: [PATCH 05/15] Fix merge conflict

---
 backends/cuda/cuda_backend.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index aee58deef8a..b5f6de1f151 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -12,13 +12,9 @@
 from typing import Any, Dict, final, List, Optional, Set
 
 import torch
-<<<<<<< HEAD
 from executorch.backends.cuda.replace_slice_copy_with_slice import (
     ReplaceSliceCopyWithSlicePass,
 )
-=======
-from executorch.backends.cuda.replace_slice_copy_with_slice import ReplaceSliceCopyWithSlicePass
->>>>>>> e5be1a2b85 (Make it work)
 from executorch.exir._serialize._named_data_store import NamedDataStore
 from executorch.exir._warnings import experimental
 from executorch.exir.backend.backend_details import (

From d1b21a0c8d77709b3d6f0ab898f679cb74e170fe Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Fri, 3 Oct 2025 10:32:33 -0700
Subject: [PATCH 06/15] Update

---
 backends/cuda/CMakeLists.txt      | 32 +++++++++++++++----------------
 backends/cuda/cuda_backend.py     |  4 ----
 backends/cuda/cuda_partitioner.py |  3 ++-
 3 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 2d08b142605..77b270cf5ab 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -29,19 +29,17 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
 # CUDA-specific AOTI functionality
-set(_aoti_cuda_sources
-    runtime/cuda_backend.cpp
-    runtime/shims/memory.cpp
-    runtime/shims/tensor_attribute.cpp)
+set(_aoti_cuda_sources runtime/cuda_backend.cpp runtime/shims/memory.cpp
+                       runtime/shims/tensor_attribute.cpp
+)
 add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
 target_include_directories(
   aoti_cuda
-  PUBLIC
-    ${CUDAToolkit_INCLUDE_DIRS}
-    $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
-    $<INSTALL_INTERFACE:include>
-    # PyTorch AOTI headers from ExecutorTorch's torch detection
-    ${TORCH_INCLUDE_DIRS}
+  PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
+         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
+         $<INSTALL_INTERFACE:include>
+         # PyTorch AOTI headers from ExecutorTorch's torch detection
+         ${TORCH_INCLUDE_DIRS}
 )
 target_compile_options(aoti_cuda PUBLIC -fexceptions -frtti -fPIC)
 # Ensure symbols are exported properly
@@ -50,12 +48,9 @@ target_link_options(aoti_cuda PUBLIC -Wl,--export-dynamic)
 # Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
 target_link_libraries(
   aoti_cuda
-  PUBLIC
-    aoti_common
-    CUDA::cudart
-    ${CMAKE_DL_LIBS}
-    # Link PyTorch libraries for AOTI CUDA functions
-    ${TORCH_LIBRARIES}
+  PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
+         # Link PyTorch libraries for AOTI CUDA functions
+         ${TORCH_LIBRARIES}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
@@ -63,7 +58,10 @@ executorch_target_link_options_shared_lib(aoti_cuda)
 
 # Add runtime
 add_executable(voxtral_runner tests/voxtral_runner.cpp)
-target_link_libraries(voxtral_runner PUBLIC aoti_cuda extension_module_static extension_flat_tensor portable_ops_lib)
+target_link_libraries(
+  voxtral_runner PUBLIC aoti_cuda extension_module_static extension_flat_tensor
+                        portable_ops_lib
+)
 
 install(
   TARGETS aoti_cuda
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index b5f6de1f151..936a917fbc2 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -36,10 +36,6 @@
 class COMPILE_SPEC_KEYS(Enum):
     METHOD_NAME = "method_name"
 
-class COMPILE_SPEC_KEYS(Enum):
-    METHOD_NAME = "method_name"
-
-
 # context manager for non-fallback guarantee
 # it will raise exception when generating fallback kernels during aoti compile
 @contextlib.contextmanager
diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
index 14c75bdb937..64df7b7dcb2 100644
--- a/backends/cuda/cuda_partitioner.py
+++ b/backends/cuda/cuda_partitioner.py
@@ -15,7 +15,7 @@
     Partitioner,
     PartitionResult,
 )
-from executorch.exir.backend.utils import tag_constant_data
+from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
 from torch.export.exported_program import ExportedProgram
 
 
@@ -54,6 +54,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         partition_tags[tag] = self.delegation_spec
 
         tag_constant_data(exported_program)
+        tag_mutated_buffer(exported_program)
 
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags

From 514832a29fc3ebd194d359ff725dca15d16c128a Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Tue, 30 Sep 2025 23:36:58 -0700
Subject: [PATCH 07/15] Make it work

---
 backends/cuda/cuda_backend.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index 8ed8cdefbb1..b3920e2fe06 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -12,9 +12,13 @@
 from typing import Any, Dict, final, List, Optional, Set
 
 import torch
+<<<<<<< HEAD
 from executorch.backends.cuda.replace_slice_copy_with_slice import (
     ReplaceSliceCopyWithSlicePass,
 )
+=======
+from executorch.backends.cuda.replace_slice_copy_with_slice import ReplaceSliceCopyWithSlicePass
+>>>>>>> e5be1a2b85 (Make it work)
 from executorch.exir._serialize._named_data_store import NamedDataStore
 from executorch.exir._warnings import experimental
 from executorch.exir.backend.backend_details import (
@@ -33,6 +37,8 @@
 # required fallback kernels but not supported
 missing_fallback_kernels: Set[str] = set()
 
+class COMPILE_SPEC_KEYS(Enum):
+    METHOD_NAME = "method_name"
 
 class COMPILE_SPEC_KEYS(Enum):
     METHOD_NAME = "method_name"

From ab58b488453b2532a729296d3af4d267a6ec74d7 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Wed, 1 Oct 2025 15:28:05 -0700
Subject: [PATCH 08/15] ET AOTI CUDA runtime libraries

---
 CMakeLists.txt                         |  15 ++
 backends/aoti/aoti_model_container.h   |   1 +
 backends/cuda/CMakeLists.txt           |  72 ++++++
 backends/cuda/runtime/cuda_backend.cpp | 337 +++++++++++++++++++++++++
 backends/cuda/tests/voxtral_runner.cpp | 255 +++++++++++++++++++
 extension/llm/runner/pybindings.cpp    |   2 +-
 6 files changed, 681 insertions(+), 1 deletion(-)
 create mode 100644 backends/cuda/CMakeLists.txt
 create mode 100644 backends/cuda/runtime/cuda_backend.cpp
 create mode 100644 backends/cuda/tests/voxtral_runner.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7012ec641bf..f023069add6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -587,6 +587,16 @@ endif()
 
 if(EXECUTORCH_BUILD_CORTEX_M)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
+  list(APPEND _executorch_backends coretex_m_backend)
+endif()
+
+if(EXECUTORCH_BUILD_CUDA)
+  # Build common AOTI functionality (required for CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
+  # Build CUDA-specific AOTI functionality
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda)
+  # Add aoti_cuda to backends - it already depends on aoti_common
+  list(APPEND _executorch_backends aoti_cuda)
 endif()
 
 if(EXECUTORCH_BUILD_EXTENSION_APPLE)
@@ -1021,6 +1031,11 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
                             extension_runner_util gflags executorch_backends
   )
 
+  # Add flat tensor extension if it's built
+  if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
+    list(APPEND _executor_runner_libs extension_flat_tensor)
+  endif()
+
   if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
     list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
   elseif(EXECUTORCH_BUILD_CADENCE)
diff --git a/backends/aoti/aoti_model_container.h b/backends/aoti/aoti_model_container.h
index 4b20aefc976..09634f2c1ca 100644
--- a/backends/aoti/aoti_model_container.h
+++ b/backends/aoti/aoti_model_container.h
@@ -21,6 +21,7 @@ using executorch::runtime::etensor::Tensor;
 extern "C" {
 
 // Type definitions
+using AOTITensorHandle = Tensor*;
 using AOTIRuntimeError = Error;
 
 // Forward declarations for AOT Inductor model container
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
new file mode 100644
index 00000000000..7f8266adfe0
--- /dev/null
+++ b/backends/cuda/CMakeLists.txt
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Build AOTI CUDA backend for runtime.
+#
+# ### Editing this file ###
+#
+# This file should be formatted with
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+# It should also be cmake-lint clean.
+#
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+
+find_package(CUDAToolkit REQUIRED)
+
+# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+find_package_torch()
+
+# CUDA-specific AOTI functionality
+set(_aoti_cuda_sources
+    runtime/cuda_backend.cpp
+    runtime/shims/memory.cpp
+    runtime/shims/tensor_attribute.cpp)
+add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
+target_include_directories(
+  aoti_cuda
+  PUBLIC
+    ${CUDAToolkit_INCLUDE_DIRS}
+    $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
+    $<INSTALL_INTERFACE:include>
+    # PyTorch AOTI headers from ExecutorTorch's torch detection
+    ${TORCH_INCLUDE_DIRS}
+)
+target_compile_options(aoti_cuda PUBLIC -fexceptions -frtti -fPIC)
+# Ensure symbols are exported properly
+target_link_options(aoti_cuda PUBLIC -Wl,--export-dynamic)
+
+# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
+target_link_libraries(
+  aoti_cuda
+  PUBLIC
+    aoti_common
+    CUDA::cudart
+    ${CMAKE_DL_LIBS}
+    # Link PyTorch libraries for AOTI CUDA functions
+    ${TORCH_LIBRARIES}
+)
+# If you need other CUDA libraries, link them similarly:
+# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
+executorch_target_link_options_shared_lib(aoti_cuda)
+
+# Add runtime
+add_executable(voxtral_runner tests/voxtral_runner.cpp)
+target_link_libraries(voxtral_runner PUBLIC aoti_cuda extension_module_static extension_flat_tensor)
+
+install(
+  TARGETS aoti_cuda
+  EXPORT ExecuTorchTargets
+  DESTINATION lib
+)
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
new file mode 100644
index 00000000000..6c03361a153
--- /dev/null
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+
+#include <dlfcn.h>
+#include <unistd.h>
+#include <cstdio>
+
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// Include our shim layer headers
+#include <executorch/backends/aoti/aoti_model_container.h>
+#include <executorch/backends/aoti/common_shims.h>
+#include <executorch/backends/cuda/runtime/shims/memory.h>
+
+namespace executorch {
+namespace backends {
+namespace cuda {
+
+using namespace std;
+using namespace aoti;
+
+using executorch::aten::ScalarType;
+using executorch::runtime::ArrayRef;
+using executorch::runtime::Backend;
+using executorch::runtime::BackendExecutionContext;
+using executorch::runtime::BackendInitContext;
+using executorch::runtime::CompileSpec;
+using executorch::runtime::DelegateHandle;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::FreeableBuffer;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::NamedDataMap;
+using executorch::runtime::Result;
+using executorch::runtime::Span;
+using executorch::runtime::etensor::Tensor;
+
+class CudaBackend final : public ::executorch::runtime::BackendInterface {
+ private:
+  Error register_shared_library_functions(void* so_handle) const {
+    AOTInductorModelContainerCreateWithDevice =
+        reinterpret_cast<AOTInductorModelContainerCreateWithDeviceFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice"));
+    if (AOTInductorModelContainerCreateWithDevice == nullptr) {
+      ET_LOG(Error, "Failed to load AOTInductorModelContainerCreateWithDevice");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerDelete =
+        reinterpret_cast<AOTInductorModelContainerDeleteFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerDelete"));
+    if (AOTInductorModelContainerDelete == nullptr) {
+      ET_LOG(Error, "Failed to load AOTInductorModelContainerDelete");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerGetNumInputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumInputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumInputs"));
+    if (AOTInductorModelContainerGetNumInputs == nullptr) {
+      ET_LOG(Error, "Failed to load AOTInductorModelContainerGetNumInputs");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerGetNumOutputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
+    if (AOTInductorModelContainerGetNumOutputs == nullptr) {
+      ET_LOG(Error, "Failed to load AOTInductorModelContainerGetNumOutputs");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerRun =
+        reinterpret_cast<AOTInductorModelContainerRunFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerRun"));
+    if (AOTInductorModelContainerRun == nullptr) {
+      ET_LOG(Error, "Failed to load AOTInductorModelContainerRun");
+      return Error::AccessFailed;
+    }
+
+    return Error::Ok;
+  }
+
+ public:
+  bool is_available() const override {
+    return 1;
+  }
+
+  // Once per loaded binary blob
+  Result<DelegateHandle*> init(
+      BackendInitContext& context,
+      FreeableBuffer* processed, // This will be a empty buffer
+      ArrayRef<CompileSpec> compile_specs // This will be my empty list
+  ) const override {
+    const NamedDataMap* named_data_map = context.get_named_data_map();
+
+    string so_blob_key = "so_blob";
+
+    Result<FreeableBuffer> aoti_cuda_buffer =
+        named_data_map->get_data(so_blob_key.c_str());
+
+    ET_CHECK_OK_OR_RETURN_ERROR(aoti_cuda_buffer);
+
+    // Generate dynamic temporary file path
+    filesystem::path temp_dir = filesystem::temp_directory_path();
+    filesystem::path so_path =
+        temp_dir / ("aoti_cuda_" + to_string(getpid()) + ".so");
+
+    // Create a temporary file
+    ofstream outfile(so_path.c_str(), ios::binary);
+
+    // Write the ELF buffer to the temporary file
+    outfile.write(
+        (char*)aoti_cuda_buffer->data(),
+        sizeof(void*) * aoti_cuda_buffer->size());
+
+    // Finish writing the file to disk
+    outfile.close();
+
+    // Load the ELF using dlopen
+    void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
+    if (so_handle == nullptr) {
+      ET_LOG(Error, "Failed to load shared library: %s", dlerror());
+      return Error::AccessFailed;
+    }
+
+    processed->Free();
+
+    // Register all shared library functions
+    Error reg_err = register_shared_library_functions(so_handle);
+    if (reg_err != Error::Ok) {
+      return reg_err;
+    }
+
+    AOTInductorModelContainerHandle container_handle = nullptr;
+
+    AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice(
+        &container_handle, 1, "cuda", nullptr);
+    if (err != Error::Ok) {
+      return err;
+    }
+    ET_LOG(Info, "container_handle = %p", container_handle);
+
+    AOTIDelegateHandle* handle = new AOTIDelegateHandle();
+    handle->so_handle = so_handle;
+    handle->container_handle = container_handle;
+    return (DelegateHandle*)handle; // Return the handle post-processing
+  }
+
+  // Once per execution
+  Error execute(
+      BackendExecutionContext& context,
+      DelegateHandle* handle_,
+      Span<EValue*> args) const override {
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+    size_t n_inputs;
+    AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs);
+
+    size_t n_outputs;
+    AOTInductorModelContainerGetNumOutputs(
+        handle->container_handle, &n_outputs);
+
+    if (n_inputs + n_outputs != args.size()) {
+      ET_LOG(
+          Error,
+          "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
+          n_inputs,
+          n_outputs,
+          args.size());
+      return Error::InvalidArgument;
+    }
+
+    // NOTE: ExecutorTorch tensors are always on CPU/host memory
+    // We need to create GPU copies for CUDA kernel execution
+    std::vector<AOTITensorHandle> gpu_inputs(
+        n_inputs); // GPU copies for kernel execution
+    std::vector<AOTITensorHandle> gpu_outputs(
+        n_outputs); // GPU tensors for kernel output
+
+    // Process input tensors: ExecutorTorch provides CPU tensors, create GPU
+    // copies
+    for (int i = 0; i < n_inputs; i++) {
+      // Get tensor dimensions and properties from ExecutorTorch CPU tensor
+      auto cpu_tensor = &(args[i]->toTensor());
+      auto sizes = cpu_tensor->sizes();
+      auto scalar_type = cpu_tensor->scalar_type();
+
+      // Create GPU tensor with same shape
+      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+      AOTITensorHandle gpu_input_handle;
+      Error create_err = aoti_torch_empty_strided(
+          sizes_vec.size(),
+          sizes_vec.data(),
+          nullptr, // use default strides
+          static_cast<int32_t>(scalar_type),
+          1, // device_type = cuda
+          0, // device_index = 0
+          &gpu_input_handle);
+
+      if (create_err != Error::Ok) {
+        ET_LOG(Error, "Failed to create GPU tensor for input %d", i);
+        return Error::Internal;
+      }
+
+      gpu_inputs[i] = gpu_input_handle;
+
+      // Copy data from CPU to GPU
+      Error copy_err = aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0);
+      if (copy_err != Error::Ok) {
+        ET_LOG(Error, "Failed to copy input %d from CPU to GPU", i);
+        return Error::Internal;
+      }
+    }
+
+    // Process output tensors: create GPU counterparts for ExecutorTorch CPU
+    // tensors
+    for (int i = 0; i < n_outputs; i++) {
+      // Get output tensor dimensions from ExecutorTorch CPU tensor
+      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+      auto sizes = cpu_output_tensor->sizes();
+      auto scalar_type = cpu_output_tensor->scalar_type();
+
+      // Create GPU tensor with same shape for kernel output
+      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+      AOTITensorHandle gpu_output_handle;
+      Error create_err = aoti_torch_empty_strided(
+          sizes_vec.size(),
+          sizes_vec.data(),
+          nullptr, // use default strides
+          static_cast<int32_t>(scalar_type),
+          1, // device_type = cuda
+          0, // device_index = 0
+          &gpu_output_handle);
+
+      if (create_err != Error::Ok) {
+        ET_LOG(Error, "Failed to create GPU tensor for output %d", i);
+        return Error::Internal;
+      }
+
+      gpu_outputs[i] = gpu_output_handle;
+    }
+
+    // Run AOTI container with GPU tensors
+    AOTIRuntimeError error = AOTInductorModelContainerRun(
+        handle->container_handle,
+        gpu_inputs.data(), // Use GPU input tensors
+        n_inputs,
+        gpu_outputs.data(), // Use GPU output tensors
+        n_outputs,
+        nullptr, // Pass the actual CUDA stream!
+        nullptr); // proxy_executor_handle can remain nullptr
+
+    if (error != Error::Ok) {
+      ET_LOG(
+          Error,
+          "AOTInductorModelContainerRun failed with error code %d",
+          error);
+      return Error::Internal;
+    }
+
+    // Copy GPU output results back to CPU output tensors
+    for (int i = 0; i < n_outputs; i++) {
+      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+      Error copy_err = aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0);
+      if (copy_err != Error::Ok) {
+        ET_LOG(Error, "Failed to copy GPU output %d back to CPU", i);
+        return Error::Internal;
+      }
+    }
+
+    // Clean up GPU tensors that we created (ExecutorTorch tensors are always
+    // CPU, so all GPU tensors are our copies)
+    for (int i = 0; i < n_inputs; i++) {
+      // All GPU input tensors were created by us, delete them
+      aoti_torch_delete_tensor_object(gpu_inputs[i]);
+    }
+
+    for (int i = 0; i < n_outputs; i++) {
+      // All GPU output tensors were created by us, delete them
+      aoti_torch_delete_tensor_object(gpu_outputs[i]);
+    }
+
+    return Error::Ok;
+  }
+
+  void destroy(DelegateHandle* handle_) const override {
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+    // Delete the container BEFORE closing the shared library
+    if (handle->container_handle != nullptr) {
+      AOTIRuntimeError delete_result =
+          AOTInductorModelContainerDelete(handle->container_handle);
+      if (delete_result != Error::Ok) {
+        ET_LOG(
+            Error,
+            "AOTInductorModelContainerDelete failed with error code %d",
+            delete_result);
+      }
+    }
+
+    // Now close the shared library
+    if (handle->so_handle != nullptr) {
+      dlclose(handle->so_handle);
+    }
+
+    free(handle);
+    clear_all_tensors();
+  }
+};
+
+} // namespace cuda
+
+namespace {
+auto cls = cuda::CudaBackend();
+executorch::runtime::Backend backend{"CudaBackend", &cls};
+static executorch::runtime::Error success_with_compiler =
+    register_backend(backend);
+} // namespace
+
+} // namespace backends
+} // namespace executorch
diff --git a/backends/cuda/tests/voxtral_runner.cpp b/backends/cuda/tests/voxtral_runner.cpp
new file mode 100644
index 00000000000..71775bf3469
--- /dev/null
+++ b/backends/cuda/tests/voxtral_runner.cpp
@@ -0,0 +1,255 @@
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor_ptr.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+#include <executorch/runtime/core/portable_type/tensor.h>
+
+namespace {
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::extension::make_tensor_ptr;
+using executorch::extension::TensorPtr;
+using executorch::extension::module::Module;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::Result;
+using Clock = std::chrono::steady_clock;
+using DurationMs = std::chrono::duration<double, std::milli>;
+
+std::vector<executorch::aten::SizesType> to_sizes(
+    std::initializer_list<int64_t> dims) {
+  return std::vector<executorch::aten::SizesType>(dims.begin(), dims.end());
+}
+
+std::string format_shape(const Tensor& tensor) {
+  std::ostringstream oss;
+  oss << "[";
+  const auto& sizes = tensor.sizes();
+  for (size_t i = 0; i < sizes.size(); ++i) {
+    if (i > 0) {
+      oss << ", ";
+    }
+    oss << sizes[i];
+  }
+  oss << "]";
+  return oss.str();
+}
+
+void print_tensor_summary(const std::string& label, const Tensor& tensor) {
+  std::cout << "    " << label
+            << ": dtype=" << executorch::runtime::toString(tensor.scalar_type())
+            << ", shape=" << format_shape(tensor)
+            << ", numel=" << tensor.numel() << std::endl;
+}
+
+TensorPtr create_audio_input() {
+  const auto sizes = to_sizes({3, 128, 3000});
+  const size_t numel = 3ull * 128ull * 3000ull;
+  std::vector<float> data(numel, 0.5f);
+  return make_tensor_ptr<float>(
+      sizes, std::move(data), {}, {}, ScalarType::BFloat16);
+}
+
+TensorPtr create_token_ids_input() {
+  const auto sizes = to_sizes({1, 1138});
+  std::vector<int64_t> data(static_cast<size_t>(1) * 1138, 0);
+  return make_tensor_ptr<int64_t>(sizes, std::move(data));
+}
+
+TensorPtr create_positions_input() {
+  const auto sizes = to_sizes({1138});
+  std::vector<int64_t> data(static_cast<size_t>(1138), 0);
+  return make_tensor_ptr<int64_t>(sizes, std::move(data));
+}
+
+TensorPtr create_fallback_text_embedding() {
+  const auto sizes = to_sizes({1, 1138, 3072});
+  const size_t numel = 1ull * 1138ull * 3072ull;
+  std::vector<float> data(numel, 0.0f);
+  return make_tensor_ptr<float>(
+      sizes, std::move(data), {}, {}, ScalarType::BFloat16);
+}
+
+struct MethodTiming {
+  double load_ms{0.0};
+  double run_ms{0.0};
+};
+
+} // namespace
+
+int main(int argc, char** argv) {
+  if (argc != 3) {
+    std::cerr << "Usage: " << argv[0]
+              << " <path/to/model.pte> <path/to/aoti_cuda_blob.ptd>"
+              << std::endl;
+    return 1;
+  }
+
+  const std::string program_path = argv[1];
+  const std::string data_map_path = argv[2];
+
+  try {
+    Module module(program_path, data_map_path);
+
+    const auto program_load_start = Clock::now();
+    const Error program_load_error = module.load();
+    const auto program_load_end = Clock::now();
+    if (program_load_error != Error::Ok) {
+      std::cerr << "Failed to load ExecuTorch program: error code "
+                << static_cast<int>(program_load_error) << std::endl;
+      return 1;
+    }
+    const DurationMs program_load_latency =
+        program_load_end - program_load_start;
+
+    MethodTiming audio_timing;
+    MethodTiming token_timing;
+    MethodTiming text_timing;
+
+    auto measure_method_load =
+        [&](const std::string& name) -> std::pair<Error, double> {
+      const auto start = Clock::now();
+      const Error err = module.load_method(name);
+      const auto end = Clock::now();
+      return {err, DurationMs(end - start).count()};
+    };
+
+    // audio_encoder
+    {
+      const auto [err, load_ms] = measure_method_load("audio_encoder");
+      if (err != Error::Ok) {
+        std::cerr << "Failed to load method audio_encoder: error code "
+                  << static_cast<int>(err) << std::endl;
+        return 1;
+      }
+      audio_timing.load_ms = load_ms;
+
+      const TensorPtr audio_input = create_audio_input();
+      std::vector<EValue> inputs;
+      inputs.emplace_back(audio_input);
+
+      const auto run_start = Clock::now();
+      Result<std::vector<EValue>> output_result =
+          module.execute("audio_encoder", inputs);
+      const auto run_end = Clock::now();
+      audio_timing.run_ms = DurationMs(run_end - run_start).count();
+
+      if (output_result.error() != Error::Ok) {
+        std::cerr << "audio_encoder execution failed: error code "
+                  << static_cast<int>(output_result.error()) << std::endl;
+        return 1;
+      }
+
+      const auto& outputs = output_result.get();
+      if (!outputs.empty() && outputs[0].isTensor()) {
+        print_tensor_summary("audio_encoder output", outputs[0].toTensor());
+      }
+    }
+
+    EValue token_output;
+    bool token_executed = false;
+
+    // token_embedding
+    {
+      const auto [err, load_ms] = measure_method_load("token_embedding");
+      if (err != Error::Ok) {
+        std::cerr << "Failed to load method token_embedding: error code "
+                  << static_cast<int>(err) << std::endl;
+        return 1;
+      }
+      token_timing.load_ms = load_ms;
+
+      const TensorPtr token_ids = create_token_ids_input();
+      std::vector<EValue> inputs;
+      inputs.emplace_back(token_ids);
+
+      const auto run_start = Clock::now();
+      auto token_output_result = module.execute("token_embedding", inputs);
+      const auto run_end = Clock::now();
+      token_timing.run_ms = DurationMs(run_end - run_start).count();
+
+      if (token_output_result.error() != Error::Ok) {
+        std::cerr << "token_embedding execution failed: error code "
+                  << static_cast<int>(token_output_result.error()) << std::endl;
+        return 1;
+      }
+
+      token_executed = true;
+      const auto& outputs = token_output_result.get();
+      if (!outputs.empty() && outputs[0].isTensor()) {
+        print_tensor_summary("token_embedding output", outputs[0].toTensor());
+        token_output = outputs[0];
+      }
+    }
+
+    // text_decoder
+    {
+      const auto [err, load_ms] = measure_method_load("text_decoder");
+      if (err != Error::Ok) {
+        std::cerr << "Failed to load method text_decoder: error code "
+                  << static_cast<int>(err) << std::endl;
+        return 1;
+      }
+      text_timing.load_ms = load_ms;
+
+      std::vector<EValue> inputs;
+      if (token_executed) {
+        if (token_output.isTensor()) {
+          inputs.emplace_back(token_output);
+        }
+      }
+
+      if (inputs.empty()) {
+        inputs.emplace_back(create_fallback_text_embedding());
+      }
+
+      inputs.emplace_back(create_positions_input());
+
+      const auto run_start = Clock::now();
+      Result<std::vector<EValue>> output_result =
+          module.execute("text_decoder", inputs);
+      const auto run_end = Clock::now();
+      text_timing.run_ms = DurationMs(run_end - run_start).count();
+
+      if (output_result.error() != Error::Ok) {
+        std::cerr << "text_decoder execution failed: error code "
+                  << static_cast<int>(output_result.error()) << std::endl;
+        return 1;
+      }
+
+      const auto& outputs = output_result.get();
+      if (!outputs.empty() && outputs[0].isTensor()) {
+        print_tensor_summary("text_decoder output", outputs[0].toTensor());
+      }
+    }
+
+    std::cout << std::fixed << std::setprecision(3);
+    std::cout << "Program load latency (ms): " << program_load_latency.count()
+              << std::endl;
+
+    std::cout << "Method load latency (ms):" << std::endl;
+    std::cout << "  audio_encoder: " << audio_timing.load_ms << std::endl;
+    std::cout << "  token_embedding: " << token_timing.load_ms << std::endl;
+    std::cout << "  text_decoder: " << text_timing.load_ms << std::endl;
+
+    std::cout << "Run latency (ms):" << std::endl;
+    std::cout << "  audio_encoder: " << audio_timing.run_ms << std::endl;
+    std::cout << "  token_embedding: " << token_timing.run_ms << std::endl;
+    std::cout << "  text_decoder: " << text_timing.run_ms << std::endl;
+
+    return 0;
+  } catch (const std::exception& ex) {
+    std::cerr << "Unhandled exception: " << ex.what() << std::endl;
+    return 1;
+  }
+}
diff --git a/extension/llm/runner/pybindings.cpp b/extension/llm/runner/pybindings.cpp
index bcc6aba0f8e..08051515d8d 100644
--- a/extension/llm/runner/pybindings.cpp
+++ b/extension/llm/runner/pybindings.cpp
@@ -644,4 +644,4 @@ PYBIND11_MODULE(_llm_runner, m) {
       .def("__repr__", [](const PyMultimodalRunner& runner) {
         return "<MultimodalRunner>";
       });
-}
\ No newline at end of file
+}

From 5152cf9d8d575e09deef4cb9b8461f3fa42ec048 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Wed, 1 Oct 2025 22:39:42 -0700
Subject: [PATCH 09/15] Resize tensor

---
 backends/cuda/runtime/cuda_backend.cpp | 50 +++++++++++++++++---------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 6c03361a153..1501eace28c 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -6,11 +6,11 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <dlfcn.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
-
-#include <dlfcn.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 #include <unistd.h>
 #include <cstdio>
 
@@ -105,15 +105,29 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
       FreeableBuffer* processed, // This will be a empty buffer
       ArrayRef<CompileSpec> compile_specs // This will be my empty list
   ) const override {
-    const NamedDataMap* named_data_map = context.get_named_data_map();
-
-    string so_blob_key = "so_blob";
-
-    Result<FreeableBuffer> aoti_cuda_buffer =
-        named_data_map->get_data(so_blob_key.c_str());
+    std::string method_name;
+    for (const CompileSpec& spec : compile_specs) {
+      if (std::strcmp(spec.key, "method_name") == 0) {
+        method_name.assign(
+            static_cast<const char*>(spec.value.buffer),
+            spec.value.nbytes); // no nullptr guarantee, so pass size
+        break;
+      }
+    }
 
-    ET_CHECK_OK_OR_RETURN_ERROR(aoti_cuda_buffer);
+    std::string so_blob_key =
+        method_name.empty() ? "so_blob" : method_name + "_so_blob";
 
+    const NamedDataMap* named_data_map = context.get_named_data_map();
+    auto aoti_cuda_buffer = named_data_map->get_data(so_blob_key.c_str());
+    if (!aoti_cuda_buffer.ok()) {
+      ET_LOG(
+          Error,
+          "Failed to get data for key %s: 0x%x",
+          so_blob_key.c_str(),
+          aoti_cuda_buffer.error());
+      return aoti_cuda_buffer.error();
+    }
     // Generate dynamic temporary file path
     filesystem::path temp_dir = filesystem::temp_directory_path();
     filesystem::path so_path =
@@ -226,7 +240,7 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
         return Error::Internal;
       }
     }
-
+    ET_LOG(Info, "Inputs copied to GPU");
     // Process output tensors: create GPU counterparts for ExecutorTorch CPU
     // tensors
     for (int i = 0; i < n_outputs; i++) {
@@ -255,7 +269,7 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
 
       gpu_outputs[i] = gpu_output_handle;
     }
-
+    ET_LOG(Info, "Outputs created on GPU");
     // Run AOTI container with GPU tensors
     AOTIRuntimeError error = AOTInductorModelContainerRun(
         handle->container_handle,
@@ -277,11 +291,15 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
     // Copy GPU output results back to CPU output tensors
     for (int i = 0; i < n_outputs; i++) {
       auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
-      Error copy_err = aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0);
-      if (copy_err != Error::Ok) {
-        ET_LOG(Error, "Failed to copy GPU output %d back to CPU", i);
-        return Error::Internal;
-      }
+      // For DYNAMIC_BOUND tensors we try to resize
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
+          "Error resizing tensor at output index %d",
+          i);
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
+          "Failed to copy GPU output %d back to CPU",
+          i);
     }
 
     // Clean up GPU tensors that we created (ExecutorTorch tensors are always

From 1cdbd616dc5940b8e79274b8591c0e6d4daebaf7 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Thu, 2 Oct 2025 15:11:20 -0700
Subject: [PATCH 10/15] Make Voxtral work

---
 backends/aoti/utils.h                  |  2 ++
 backends/cuda/CMakeLists.txt           |  2 +-
 backends/cuda/runtime/cuda_backend.cpp | 11 ++++++++---
 backends/cuda/runtime/shims/utils.h    |  5 ++++-
 backends/cuda/tests/voxtral_runner.cpp | 17 +++++++++++++----
 5 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h
index 1c872e08648..78c07bcea6e 100644
--- a/backends/aoti/utils.h
+++ b/backends/aoti/utils.h
@@ -34,6 +34,8 @@ inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
   // Convert based on known PyTorch dtype codes (without CUDA-specific
   // dependency)
   switch (dtype) {
+    case 4: // PyTorch's int64 dtype code
+      return executorch::aten::ScalarType::Long;
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
     case 15: // PyTorch's bfloat16 dtype code
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 7f8266adfe0..2d08b142605 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -63,7 +63,7 @@ executorch_target_link_options_shared_lib(aoti_cuda)
 
 # Add runtime
 add_executable(voxtral_runner tests/voxtral_runner.cpp)
-target_link_libraries(voxtral_runner PUBLIC aoti_cuda extension_module_static extension_flat_tensor)
+target_link_libraries(voxtral_runner PUBLIC aoti_cuda extension_module_static extension_flat_tensor portable_ops_lib)
 
 install(
   TARGETS aoti_cuda
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 1501eace28c..52bc133c658 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -131,15 +131,20 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
     // Generate dynamic temporary file path
     filesystem::path temp_dir = filesystem::temp_directory_path();
     filesystem::path so_path =
-        temp_dir / ("aoti_cuda_" + to_string(getpid()) + ".so");
+        temp_dir / (so_blob_key + to_string(getpid()) + ".so");
 
     // Create a temporary file
     ofstream outfile(so_path.c_str(), ios::binary);
 
     // Write the ELF buffer to the temporary file
+    ET_LOG(
+        Info,
+        "Writing %zu bytes to %s",
+        aoti_cuda_buffer->size(),
+        so_path.c_str());
     outfile.write(
-        (char*)aoti_cuda_buffer->data(),
-        sizeof(void*) * aoti_cuda_buffer->size());
+        static_cast<const char*>(aoti_cuda_buffer->data()),
+        aoti_cuda_buffer->size());
 
     // Finish writing the file to disk
     outfile.close();
diff --git a/backends/cuda/runtime/shims/utils.h b/backends/cuda/runtime/shims/utils.h
index 99d2bc102f5..02c3abfc83f 100644
--- a/backends/cuda/runtime/shims/utils.h
+++ b/backends/cuda/runtime/shims/utils.h
@@ -40,6 +40,7 @@ namespace cuda {
 
 // Enum for supported data types in et-cuda backend
 enum class SupportedDTypes : int32_t {
+  INT64 = 4, // PyTorch's int64 dtype code
   FLOAT32 = 6, // PyTorch's float32 dtype code
   BFLOAT16 = 15, // PyTorch's bfloat16 dtype code
 };
@@ -100,6 +101,7 @@ using AOTITorchError = Error;
 // Helper function to check if a dtype is supported in ET CUDA backend
 inline bool is_dtype_supported_in_et_cuda(int32_t dtype) {
   switch (dtype) {
+    case static_cast<int32_t>(SupportedDTypes::INT64):
     case static_cast<int32_t>(SupportedDTypes::FLOAT32):
     case static_cast<int32_t>(SupportedDTypes::BFLOAT16):
       return true;
@@ -113,8 +115,9 @@ inline AOTITorchError validate_dtype(int32_t dtype) {
   ET_CHECK_OR_RETURN_ERROR(
       is_dtype_supported_in_et_cuda(dtype),
       InvalidArgument,
-      "Unsupported dtype: %d. Supported dtypes: %d (float32), %d (bfloat16)",
+      "Unsupported dtype: %d. Supported dtypes: %d (int64), %d (float32), %d (bfloat16)",
       dtype,
+      static_cast<int32_t>(SupportedDTypes::INT64),
       static_cast<int32_t>(SupportedDTypes::FLOAT32),
       static_cast<int32_t>(SupportedDTypes::BFLOAT16));
 
diff --git a/backends/cuda/tests/voxtral_runner.cpp b/backends/cuda/tests/voxtral_runner.cpp
index 71775bf3469..feed458e1f5 100644
--- a/backends/cuda/tests/voxtral_runner.cpp
+++ b/backends/cuda/tests/voxtral_runner.cpp
@@ -136,7 +136,9 @@ int main(int argc, char** argv) {
 
       const TensorPtr audio_input = create_audio_input();
       std::vector<EValue> inputs;
-      inputs.emplace_back(audio_input);
+      std::vector<TensorPtr> owned_inputs;
+      owned_inputs.emplace_back(audio_input);
+      inputs.emplace_back(*audio_input);
 
       const auto run_start = Clock::now();
       Result<std::vector<EValue>> output_result =
@@ -171,7 +173,9 @@ int main(int argc, char** argv) {
 
       const TensorPtr token_ids = create_token_ids_input();
       std::vector<EValue> inputs;
-      inputs.emplace_back(token_ids);
+      std::vector<TensorPtr> owned_inputs;
+      owned_inputs.emplace_back(token_ids);
+      inputs.emplace_back(*token_ids);
 
       const auto run_start = Clock::now();
       auto token_output_result = module.execute("token_embedding", inputs);
@@ -203,6 +207,7 @@ int main(int argc, char** argv) {
       text_timing.load_ms = load_ms;
 
       std::vector<EValue> inputs;
+      std::vector<TensorPtr> owned_inputs;
       if (token_executed) {
         if (token_output.isTensor()) {
           inputs.emplace_back(token_output);
@@ -210,10 +215,14 @@ int main(int argc, char** argv) {
       }
 
       if (inputs.empty()) {
-        inputs.emplace_back(create_fallback_text_embedding());
+        auto fallback_embedding = create_fallback_text_embedding();
+        owned_inputs.emplace_back(fallback_embedding);
+        inputs.emplace_back(*fallback_embedding);
       }
 
-      inputs.emplace_back(create_positions_input());
+      auto positions = create_positions_input();
+      owned_inputs.emplace_back(positions);
+      inputs.emplace_back(*positions);
 
       const auto run_start = Clock::now();
       Result<std::vector<EValue>> output_result =

From a6ee5757d892e5338979624f4bb9c4b0f4afbb25 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Thu, 2 Oct 2025 15:12:03 -0700
Subject: [PATCH 11/15] Fix merge conflict

---
 backends/cuda/cuda_backend.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index b3920e2fe06..abc8aec48f6 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -12,13 +12,9 @@
 from typing import Any, Dict, final, List, Optional, Set
 
 import torch
-<<<<<<< HEAD
 from executorch.backends.cuda.replace_slice_copy_with_slice import (
     ReplaceSliceCopyWithSlicePass,
 )
-=======
-from executorch.backends.cuda.replace_slice_copy_with_slice import ReplaceSliceCopyWithSlicePass
->>>>>>> e5be1a2b85 (Make it work)
 from executorch.exir._serialize._named_data_store import NamedDataStore
 from executorch.exir._warnings import experimental
 from executorch.exir.backend.backend_details import (

From c73b059440b2c4ac5ecb3c2b7d6270c423f1231e Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Fri, 3 Oct 2025 10:32:33 -0700
Subject: [PATCH 12/15] Update

---
 backends/cuda/CMakeLists.txt  | 32 +++++++++++++++-----------------
 backends/cuda/cuda_backend.py |  4 ----
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 2d08b142605..77b270cf5ab 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -29,19 +29,17 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
 # CUDA-specific AOTI functionality
-set(_aoti_cuda_sources
-    runtime/cuda_backend.cpp
-    runtime/shims/memory.cpp
-    runtime/shims/tensor_attribute.cpp)
+set(_aoti_cuda_sources runtime/cuda_backend.cpp runtime/shims/memory.cpp
+                       runtime/shims/tensor_attribute.cpp
+)
 add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
 target_include_directories(
   aoti_cuda
-  PUBLIC
-    ${CUDAToolkit_INCLUDE_DIRS}
-    $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
-    $<INSTALL_INTERFACE:include>
-    # PyTorch AOTI headers from ExecutorTorch's torch detection
-    ${TORCH_INCLUDE_DIRS}
+  PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
+         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
+         $<INSTALL_INTERFACE:include>
+         # PyTorch AOTI headers from ExecutorTorch's torch detection
+         ${TORCH_INCLUDE_DIRS}
 )
 target_compile_options(aoti_cuda PUBLIC -fexceptions -frtti -fPIC)
 # Ensure symbols are exported properly
@@ -50,12 +48,9 @@ target_link_options(aoti_cuda PUBLIC -Wl,--export-dynamic)
 # Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
 target_link_libraries(
   aoti_cuda
-  PUBLIC
-    aoti_common
-    CUDA::cudart
-    ${CMAKE_DL_LIBS}
-    # Link PyTorch libraries for AOTI CUDA functions
-    ${TORCH_LIBRARIES}
+  PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
+         # Link PyTorch libraries for AOTI CUDA functions
+         ${TORCH_LIBRARIES}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
@@ -63,7 +58,10 @@ executorch_target_link_options_shared_lib(aoti_cuda)
 
 # Add runtime
 add_executable(voxtral_runner tests/voxtral_runner.cpp)
-target_link_libraries(voxtral_runner PUBLIC aoti_cuda extension_module_static extension_flat_tensor portable_ops_lib)
+target_link_libraries(
+  voxtral_runner PUBLIC aoti_cuda extension_module_static extension_flat_tensor
+                        portable_ops_lib
+)
 
 install(
   TARGETS aoti_cuda
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
index abc8aec48f6..a72538d3471 100644
--- a/backends/cuda/cuda_backend.py
+++ b/backends/cuda/cuda_backend.py
@@ -36,10 +36,6 @@
 class COMPILE_SPEC_KEYS(Enum):
     METHOD_NAME = "method_name"
 
-class COMPILE_SPEC_KEYS(Enum):
-    METHOD_NAME = "method_name"
-
-
 # context manager for non-fallback guarantee
 # it will raise exception when generating fallback kernels during aoti compile
 @contextlib.contextmanager

From 9605b9d89024f57eb3fd69f052ddbed2aef89662 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Mon, 6 Oct 2025 16:07:01 -0700
Subject: [PATCH 13/15] add cudagurad and cudastreamguard support

---
 aoti_debug_data/label_output.txt              |   1 +
 backends/aoti/aoti_model_container.h          |   1 +
 backends/cuda/CMakeLists.txt                  |   8 +-
 backends/cuda/runtime/TARGETS                 |   6 +-
 backends/cuda/runtime/cuda_backend.cpp        |  25 ++-
 backends/cuda/runtime/guard.cpp               | 146 +++++++++++++
 backends/cuda/runtime/guard.h                 | 200 ++++++++++++++++++
 backends/cuda/runtime/shims/cuda_guard.cpp    | 109 ++++++++++
 backends/cuda/runtime/shims/cuda_guard.h      | 104 +++++++++
 backends/cuda/runtime/shims/memory.cpp        |  77 +++----
 .../test_aoti_torch__reinterpret_tensor.cpp   |   2 +-
 .../shims/tests/test_aoti_torch_copy_.cpp     |   2 +-
 ..._aoti_torch_create_tensor_from_blob_v2.cpp |   2 +-
 .../test_aoti_torch_delete_tensor_object.cpp  |   2 +-
 .../tests/test_aoti_torch_empty_strided.cpp   |   2 +-
 backends/cuda/runtime/{shims => }/utils.h     |   0
 tools/cmake/preset/default.cmake              |   3 +
 17 files changed, 645 insertions(+), 45 deletions(-)
 create mode 100644 aoti_debug_data/label_output.txt
 create mode 100644 backends/cuda/runtime/guard.cpp
 create mode 100644 backends/cuda/runtime/guard.h
 create mode 100644 backends/cuda/runtime/shims/cuda_guard.cpp
 create mode 100644 backends/cuda/runtime/shims/cuda_guard.h
 rename backends/cuda/runtime/{shims => }/utils.h (100%)

diff --git a/aoti_debug_data/label_output.txt b/aoti_debug_data/label_output.txt
new file mode 100644
index 00000000000..b6e6319c5d2
--- /dev/null
+++ b/aoti_debug_data/label_output.txt
@@ -0,0 +1 @@
+-0.03913404420018196,0.11446283012628555,-1.7967588901519775,-1.2342960834503174,-0.8190052509307861,0.3239615559577942,-2.186603307723999,-1.2876651287078857,-1.9019219875335693,-0.7314844727516174,0.7164335250854492,-1.6698509454727173,-1.4514909982681274,-1.2658661603927612,-1.5797510147094727,-1.038211464881897,-0.2147778570652008,-2.0712575912475586,-1.5538352727890015,-1.2831263542175293,-0.5831859707832336,1.619336724281311,-0.030490178614854813,-0.4813898503780365,-1.1297798156738281,-0.369297593832016,0.388177752494812,0.057438768446445465,0.4631587564945221,-0.2705245912075043,-1.4319391250610352,-0.7513930797576904,-0.41541463136672974,-1.8500491380691528,-0.4206249713897705,-1.191244125366211,-0.519303023815155,-1.9623936414718628,1.366169810295105,-1.105899453163147,-0.7772476673126221,-0.020080728456377983,0.1334831416606903,1.3196574449539185,-0.22508525848388672,0.6348890662193298,-1.1425437927246094,0.4581128656864166,-0.8908204436302185,-1.1983755826950073,-1.0953805446624756,1.4282540082931519,0.46135658025741577,-0.43548062443733215,-0.3356531858444214,-1.5134204626083374,-0.923163890838623,-1.6103588342666626,-1.0705418586730957,1.3484578132629395,0.22440192103385925,-0.8475747108459473,1.326701045036316,0.9815365076065063,0.6489527821540833,-0.22182974219322205,0.8341851234436035,-1.0138921737670898,1.4669286012649536,-0.4402707815170288,-1.0489991903305054,0.36616742610931396,-0.7941051721572876,2.1867175102233887,-1.7013881206512451,0.38229987025260925,-1.0632861852645874,-0.6783405542373657,1.7636947631835938,0.10571509599685669,1.959584355354309,-0.495066374540329,-0.32849109172821045,0.42277011275291443,-1.8629132509231567,-0.9533417224884033,-0.08654581010341644,-0.41913020610809326,-0.7976489067077637,-0.6835601329803467,-1.6655486822128296,0.20317625999450684,0.6550376415252686,-0.8757399320602417,-1.2040992975234985,0.31358838081359863,-0.5407125353813171,-2.0862929821014404,1.310542106628418,-0.24725094437599182,-1.001437783241272,0.25003859400749207,-0.35498347878456116,2.1051602363586426,-0.8274843692779541,-2.2429769039154053,-0.9414843320846558,-1.8201124668121338,-2.0493545532226562,-1.4745506048202515,-1.219736933708191,3.529479503631592,-0.6299042105674744,0.09119724482297897,-0.3092268109321594,-1.460767149925232,-0.7344509363174438,1.3739006519317627,-1.0220916271209717,-1.8681803941726685,-0.36143067479133606,-1.2824289798736572,-0.6740354299545288,-1.5281565189361572,-1.2834731340408325,-1.421437382698059,0.7150492072105408,-0.31045711040496826,1.7923942804336548,-0.5272325873374939,-1.3796250820159912,-1.1029645204544067,-0.8410789370536804,-0.4111415445804596,-0.7273851037025452,0.8324657678604126,-1.6222048997879028,-0.8601832389831543,-0.4216291904449463,-1.2872693538665771,-0.71281498670578,-0.336731493473053,-0.009143831208348274,-0.07268509268760681,-0.9919785857200623,-0.5055995583534241,0.49048081040382385,1.40537691116333,-0.3487783968448639,0.37215569615364075,-0.7167516350746155,-0.15457889437675476,0.8864662647247314,-0.0710245817899704,-0.1462513506412506,-0.8821230530738831,-0.4791715741157532,-0.3290942311286926,0.4341500997543335,-0.38155701756477356,-0.3562145531177521,-0.4755186438560486,-0.6587932109832764,0.19298143684864044,0.5636974573135376,0.5926137566566467,0.6501497626304626,0.3383529484272003,0.4471110701560974,-0.4032756984233856,-0.5753277540206909,-0.7808830142021179,-0.14634644985198975,-0.4119536578655243,-0.6965261697769165,0.4276757538318634,-0.5369023084640503,-0.4773354232311249,-0.46046513319015503,-0.3265089690685272,-0.6540096402168274,0.20136262476444244,-0.8214278221130371,0.27484744787216187,-0.3489644229412079,-0.11341311782598495,-0.8741213083267212,0.2733824849128723,0.22086980938911438,-0.23494890332221985,1.1783021688461304,-0.2696937024593353,-0.8441450595855713,-0.7695454955101013,0.6256869435310364,0.2200777679681778,-0.7516354918479919,-0.6925065517425537,-0.7915982007980347,-0.06825603544712067,-0.29485490918159485,-1.769292950630188,0.06531324982643127,-0.8329363465309143,-0.8318166136741638,-0.38882702589035034,1.031890630722046,0.11254701763391495,0.5263988971710205,0.13890302181243896,0.8830300569534302,-0.757967472076416,-0.8113813400268555,-0.9798831343650818,-0.29218554496765137,-0.15626144409179688,0.3520606756210327,-0.8749599456787109,-1.105175256729126,-1.0363872051239014,0.7826569080352783,0.4742776155471802,-0.10010265558958054,-0.09958705306053162,0.10428497940301895,-0.7374796271324158,-0.269676148891449,-0.40000659227371216,0.2580138146877289,-0.5170254111289978,-1.2067655324935913,-0.7478824257850647,0.1923155039548874,-0.8107642531394958,-0.10571761429309845,-0.22538013756275177,0.11943108588457108,-0.7519386410713196,-0.05797750502824783,-0.07632803171873093,-0.022560788318514824,0.7565377950668335,-1.030219554901123,-0.4312279522418976,-0.70326167345047,-0.35997289419174194,-0.19848346710205078,-0.04439990222454071,0.04658837616443634,-0.06574892997741699,-0.2762015461921692,-1.3065524101257324,1.096301555633545,-0.6393932104110718,-0.44383108615875244,-1.0458232164382935,-0.07400067150592804,-0.20143833756446838,0.4280424416065216,-0.4929370582103729,-0.49234431982040405,-0.5363075733184814,0.5846503376960754,-0.34943896532058716,0.24590173363685608,-0.480078786611557,-0.5614951848983765,-0.3571852743625641,-0.4173789620399475,-0.18978659808635712,-0.14421017467975616,-0.04716923087835312,0.48622041940689087,0.9027771949768066,1.0469368696212769,1.3819180727005005,0.5635288953781128,1.21073579788208,1.0013066530227661,0.9923291206359863,0.564082145690918,0.9527998566627502,0.02868816815316677,1.6312838792800903,0.09990690648555756,1.1655672788619995,-0.9204019904136658,-0.38474252820014954,0.44546398520469666,-1.1219033002853394,-0.055179186165332794,0.1642802357673645,-0.17935679852962494,1.1197086572647095,0.29721322655677795,-0.40458935499191284,-0.29283538460731506,0.5524163842201233,0.6195399165153503,0.9029663801193237,-0.9659137725830078,-0.46226486563682556,0.5841659903526306,-0.33232733607292175,-1.402040958404541,-0.9897168278694153,-0.2813713252544403,-0.35019201040267944,-0.7215508818626404,-2.4211864471435547,0.6913756728172302,-0.24344231188297272,-0.249970942735672,-0.4972870349884033,0.18016159534454346,-0.7020791172981262,-0.7097585201263428,-1.1062068939208984,0.5809486508369446,0.30144134163856506,-0.1745019257068634,-2.0151519775390625,-1.7760765552520752,-1.3494994640350342,-1.3984824419021606,-1.7419989109039307,0.01414218544960022,-0.8015289306640625,-1.6386479139328003,-1.6633285284042358,0.09696164727210999,0.6919465065002441,-0.16852454841136932,-2.2988927364349365,-1.4693069458007812,0.1326369196176529,-1.3813385963439941,-0.08798029273748398,-1.0108819007873535,-0.19211843609809875,-1.291641116142273,-0.2874743938446045,-0.3788374364376068,-1.0629549026489258,-1.2921936511993408,-0.4923967719078064,-0.7385656237602234,-0.6891710758209229,-2.165138006210327,-0.6286326050758362,0.06201285868883133,0.8001828789710999,-0.6078219413757324,0.4206283688545227,0.4165705740451813,-1.5791865587234497,1.269034743309021,1.0095205307006836,1.3991831541061401,1.3111152648925781,0.4261591136455536,0.6746622323989868,0.19116273522377014,-0.4464258849620819,0.3325245678424835,-0.9348013997077942,-1.3907827138900757,-0.2390788048505783,-0.37911713123321533,-0.2881055176258087,-0.3421049118041992,0.16614066064357758,-0.7697164416313171,-0.39411890506744385,-0.48169296979904175,-0.6111875772476196,-0.37043431401252747,0.027377670630812645,-0.3591470718383789,-1.211280345916748,-0.8917186856269836,-0.7129324674606323,-0.5818975567817688,0.19753530621528625,-0.6446995139122009,0.033339742571115494,-0.32940128445625305,-0.015642445534467697,-0.804474949836731,0.5313539505004883,-0.30171895027160645,-0.9753963947296143,0.6266018748283386,-2.8364689350128174,-1.0067397356033325,0.6884109973907471,-2.5539331436157227,-1.7143417596817017,-0.46985048055648804,2.501777410507202,0.5844317674636841,-0.7308326959609985,0.5766181349754333,-0.5962865352630615,1.0317273139953613,0.834532618522644,-1.1740212440490723,-1.293134093284607,-0.7926090359687805,2.19234299659729,-1.6712441444396973,-1.1683443784713745,0.09241408854722977,2.465853452682495,-0.6847071647644043,-2.500447988510132,1.8858778476715088,-0.7839821577072144,3.335675001144409,2.2426083087921143,0.22914011776447296,0.23041927814483643,-0.014630085788667202,0.731175422668457,-1.4038903713226318,-0.7349004149436951,1.289272427558899,0.9014060497283936,-0.7824952006340027,0.4509941637516022,-0.5313635468482971,-0.15091997385025024,1.9248706102371216,-0.6429073214530945,-0.6499042510986328,0.3431451916694641,-1.293764591217041,-0.447552353143692,1.2358976602554321,1.205706238746643,-0.06825567781925201,-0.08236853778362274,-0.8660196661949158,-0.5339680314064026,-1.0942151546478271,-0.7697574496269226,3.067143678665161,0.4294276833534241,-0.777762234210968,-0.002645888365805149,-0.5003012418746948,2.5627009868621826,-0.8476738333702087,0.4549064636230469,-2.102252244949341,-0.8092312216758728,1.1497148275375366,0.15704959630966187,-1.199030876159668,0.42982253432273865,-0.7956292033195496,0.646968424320221,2.2124745845794678,0.34438228607177734,1.2520134449005127,1.6449414491653442,-1.4007563591003418,-1.9928207397460938,-1.4737995862960815,0.6366598010063171,1.1514497995376587,-0.6591994166374207,-0.22759179770946503,3.276012659072876,-0.884784460067749,-1.6824296712875366,-2.6889488697052,0.5569191575050354,0.5331900715827942,-0.5247107148170471,-0.8498672246932983,2.040318489074707,1.3180416822433472,-1.0281294584274292,0.019382581114768982,0.30637645721435547,0.5861398577690125,0.8791986703872681,1.771469235420227,-2.106912851333618,0.6799858808517456,2.2856736183166504,0.20724669098854065,1.7107316255569458,1.9191973209381104,-0.5829983353614807,-0.18784354627132416,-1.9702904224395752,-1.4302761554718018,3.83213210105896,-0.7085038423538208,0.7749373912811279,1.2931575775146484,1.6124844551086426,0.10646260529756546,-0.2813243567943573,-0.9420999884605408,2.3512871265411377,-0.1446448266506195,-2.674955129623413,0.7504120469093323,-1.1596482992172241,3.023696184158325,0.5688168406486511,1.3560736179351807,-0.3119358718395233,0.9887985587120056,0.3230132758617401,-0.2484995275735855,0.9285800457000732,-0.6535630822181702,0.9466335773468018,0.9305082559585571,0.5683583617210388,0.14294113218784332,-1.4507019519805908,0.4145489037036896,1.6770821809768677,-0.09353156387805939,0.5606762766838074,2.2269623279571533,0.8668820858001709,-0.007117525674402714,-0.34680721163749695,2.2545907497406006,-1.2005592584609985,-1.2632337808609009,-0.41279536485671997,-0.42132940888404846,0.9835960865020752,0.8272786736488342,0.6241384744644165,1.0821422338485718,0.7622082233428955,-1.1228857040405273,0.9723652005195618,0.36126333475112915,-1.9292453527450562,0.37655675411224365,2.0753841400146484,1.0763671398162842,2.3848659992218018,-0.4731055796146393,1.5599720478057861,-0.7774162292480469,-1.6710920333862305,2.6560840606689453,-0.2484363168478012,0.9694924354553223,2.255502939224243,-1.8118550777435303,-0.5005996823310852,-1.6837778091430664,1.2990540266036987,-0.6730730533599854,-2.072417736053467,-0.18276265263557434,1.9738023281097412,-0.6781049966812134,-1.818922996520996,0.4480375647544861,-2.28879714012146,-0.00908045656979084,-0.6347148418426514,1.3799055814743042,-1.6420687437057495,-0.6599327325820923,2.2314329147338867,-0.4459935426712036,-0.9550604820251465,-2.1789066791534424,-2.745248794555664,-1.8957849740982056,2.5002524852752686,3.3978559970855713,1.7930076122283936,-0.26551035046577454,2.6639387607574463,1.8267580270767212,1.2778221368789673,1.3806127309799194,0.3047349452972412,0.3620739281177521,2.1399831771850586,0.08918695151805878,-1.0576759576797485,3.6191647052764893,-0.251950204372406,-0.0005357395857572556,-0.012165731750428677,2.5895073413848877,1.1865198612213135,1.6630536317825317,-1.9085454940795898,2.6172845363616943,-0.29602330923080444,1.0623855590820312,-0.25587570667266846,-0.11752499639987946,-1.7097687721252441,1.3697950839996338,-0.5039303302764893,-0.9196146726608276,1.0531651973724365,-0.38556379079818726,1.9614968299865723,1.2981394529342651,-0.0001594768837094307,1.159398078918457,1.3718421459197998,1.1871726512908936,-0.5226776599884033,0.41393616795539856,4.422908782958984,-1.563923716545105,-0.8469040989875793,2.03295636177063,-1.443242073059082,-0.6985523700714111,2.3754632472991943,1.0496516227722168,1.845125675201416,1.3324394226074219,2.4543023109436035,-1.714128851890564,1.9265140295028687,1.5339205265045166,-1.0431455373764038,-0.46131494641304016,0.46357810497283936,-1.936722993850708,2.5352182388305664,-0.4409525692462921,-0.22268688678741455,2.714348793029785,-1.147615671157837,-0.19659310579299927,0.8975319266319275,0.3840653598308563,-0.05605197697877884,0.9425553679466248,1.1385877132415771,-1.0380603075027466,1.365006923675537,-1.0930871963500977,-0.9596260190010071,-1.3788464069366455,-0.19989871978759766,-1.0799193382263184,-0.35174721479415894,0.1095135435461998,-1.9181528091430664,2.105410575866699,-0.31668996810913086,0.5644630193710327,-0.4833284020423889,1.4285516738891602,0.09488546848297119,-1.6459810733795166,-0.8980455994606018,-0.7912274599075317,-0.9761782288551331,1.1608631610870361,1.026419997215271,2.551722526550293,0.0494694747030735,1.9670685529708862,2.528393030166626,1.3381658792495728,-0.3226938545703888,2.3120529651641846,0.9690368175506592,-0.29134538769721985,2.7646093368530273,2.443957805633545,-1.7742195129394531,1.7002654075622559,-2.246009349822998,0.2429421842098236,0.5971425175666809,-1.1687904596328735,-0.041350968182086945,1.0711336135864258,0.8389820456504822,-0.8339453339576721,1.0551726818084717,1.8039215803146362,-0.37036386132240295,-1.4500930309295654,2.4588708877563477,1.5581408739089966,0.9164568185806274,1.3622764348983765,-0.6905568242073059,-2.599440813064575,-0.7772306203842163,-1.1390200853347778,-1.3564249277114868,2.162898302078247,-0.691100537776947,1.137965202331543,1.0822535753250122,0.8270388841629028,1.0938518047332764,2.3357534408569336,1.1950891017913818,-1.9673025608062744,-1.742155909538269,-0.3494393527507782,0.839860737323761,1.457176685333252,-1.7225475311279297,1.19826078414917,0.8133675456047058,0.4413813352584839,-0.6027377843856812,2.343456506729126,-0.09194470196962357,1.2105724811553955,1.895613431930542,0.1128043457865715,2.6773862838745117,-0.38433119654655457,0.44614654779434204,-2.458550453186035,-0.7954540252685547,0.6709498763084412,-0.2605575621128082,0.36024948954582214,0.2298925518989563,1.9240407943725586,1.1841048002243042,0.23768608272075653,0.22980962693691254,1.3572033643722534,0.6833814978599548,1.4841841459274292,2.023432731628418,0.49778687953948975,2.1852598190307617,-0.23032735288143158,-1.4224259853363037,-0.821727991104126,1.8110042810440063,0.7662848830223083,-0.31287911534309387,-0.9253693222999573,-2.3290951251983643,1.7022662162780762,0.6221295595169067,0.3600935637950897,1.6092052459716797,-2.481472969055176,3.1183104515075684,1.0247596502304077,0.602994441986084,0.5404156446456909,1.927536964416504,-1.2524707317352295,2.5788543224334717,0.08457116782665253,1.5262539386749268,2.325206995010376,-0.17360524833202362,0.5796419978141785,-0.5706197023391724,0.6842635273933411,1.509108543395996,0.270402729511261,-1.5574960708618164,0.1139160618185997,-0.3381737172603607,2.004261016845703,2.116377592086792,2.354534387588501,-1.7666290998458862,-0.3159435987472534,-0.13131661713123322,-2.567397117614746,1.8718032836914062,0.9907698631286621,-0.4175260663032532,1.781286358833313,0.035102955996990204,-0.4113989472389221,1.5520180463790894,0.2067878693342209,0.9118133187294006,1.884096384048462,-0.32759973406791687,-1.1003206968307495,-0.3034488260746002,-0.6599835753440857,-0.19405540823936462,1.701459527015686,-0.0491909384727478,-0.4178646206855774,-0.5576868653297424,-0.4071851074695587,-0.7226784825325012,1.1936362981796265,1.3203784227371216,1.7629250288009644,3.4847731590270996,-0.02172619290649891,-1.1755605936050415,-1.3610471487045288,-0.7484937310218811,2.2494332790374756,-1.63542640209198,-1.8949089050292969,0.689197301864624,0.6020944118499756,0.7739917039871216,-0.534808337688446,-1.028560996055603,1.5340405702590942,0.9423946738243103,1.248828649520874,-2.4921460151672363,0.8327887058258057,2.676758289337158,-1.0139472484588623,0.758230447769165,0.7645471692085266,-1.7233291864395142,1.2735943794250488,0.14504152536392212,1.8338035345077515,-1.1551995277404785,0.6325278878211975,1.216366171836853,0.621868908405304,-0.5008939504623413,2.465022563934326,1.242301106452942,3.2744531631469727,-0.6666258573532104,0.29124701023101807,-1.3240326642990112,-0.7237968444824219,0.9768669009208679,0.9748231768608093,-0.879506528377533,0.5778324604034424,1.0620003938674927,0.4258502721786499,0.37866470217704773,-0.7223685383796692,0.46414291858673096,0.778156042098999,1.4069161415100098,1.3072452545166016,-0.953467607498169,-1.2602347135543823,-2.1280739307403564,-0.9248873591423035,-0.5448404550552368,-0.3515288829803467,1.365014910697937,-0.2839147746562958,0.7390356063842773,2.0879857540130615,-1.0144470930099487,-2.2068302631378174,0.6423531770706177,0.2230144590139389,-0.24207915365695953,0.5139026641845703,-0.3370460271835327,-0.30347582697868347,1.148444652557373,0.39707741141319275,1.2999221086502075,-0.9705429673194885,2.6386616230010986,-1.9616032838821411,-0.5380308032035828,-0.5805827379226685,-0.289351224899292,-1.2450693845748901,1.5960633754730225,3.0975544452667236,1.6709198951721191,0.5844289660453796,0.17185071110725403,0.7383623719215393,0.7273778915405273,0.00386139377951622,0.835677444934845,-1.0136258602142334,2.543048620223999,3.237602710723877,0.5719550251960754,0.34810662269592285,0.4487835168838501,2.230459451675415,0.29298052191734314,-0.2576022744178772,0.49370115995407104,0.4814186096191406,0.2736748158931732,-0.19374693930149078,-0.16451512277126312,0.8118324875831604,-1.6326663494110107,1.0103230476379395,-0.22402316331863403,-0.5972715616226196,-0.027429714798927307,0.313273549079895,0.9720879793167114,-0.5924954414367676,-0.17367282509803772,-1.6531838178634644,1.0889884233474731,0.39624372124671936,-0.12865540385246277,-1.903689980506897,-0.7770265340805054,0.7412757873535156,-0.5807806849479675,-0.37048661708831787,-2.816236972808838,-1.3003090620040894,-1.2620137929916382,0.1466657519340515,-0.3549872934818268,0.2544991075992584,-1.1414480209350586,1.1943894624710083,-1.1381458044052124,-0.384520947933197,-0.8148205876350403,-1.0100181102752686,0.45429202914237976,-0.4273971617221832,-1.9397039413452148,0.7417730093002319,1.187584638595581,-0.44543829560279846,0.10715680569410324,0.18230198323726654,-1.1871659755706787,0.12491435557603836,-1.2263728380203247,0.014668535441160202,-0.7243265509605408,-1.0471806526184082,-0.1910388022661209,0.29107558727264404,-1.0062795877456665,0.1262378990650177,-1.733699083328247,-0.5085047483444214,-1.6183843612670898,0.6961213946342468,-0.45301946997642517,0.42451682686805725,1.8650354146957397,-0.9007144570350647,-0.630558967590332,-0.8635000586509705,-2.23286509513855,0.23365262150764465,0.22274832427501678,0.43871062994003296,0.4224851727485657,0.814043402671814,-0.25820058584213257,0.4357171058654785,-0.48871245980262756,-1.081297516822815,-1.363301396369934,-0.7314466834068298,-0.10283815115690231,-1.1898349523544312,-0.018862567842006683,-0.7326535582542419,-1.0590225458145142,0.24528144299983978,-2.1076948642730713,-2.1839776039123535,-0.6882885098457336,-0.7975447773933411,-0.18375058472156525,-0.5964457988739014,-1.5151821374893188,0.17243099212646484,0.18252255022525787
\ No newline at end of file
diff --git a/backends/aoti/aoti_model_container.h b/backends/aoti/aoti_model_container.h
index 09634f2c1ca..f7e42e2e58a 100644
--- a/backends/aoti/aoti_model_container.h
+++ b/backends/aoti/aoti_model_container.h
@@ -76,6 +76,7 @@ extern AOTInductorModelContainerRunFunc AOTInductorModelContainerRun;
 struct AOTIDelegateHandle {
   void* so_handle;
   AOTInductorModelContainerHandle container_handle;
+  void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header dependency
 };
 
 } // namespace aoti
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index 77b270cf5ab..30e307bba99 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -29,8 +29,12 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
 # CUDA-specific AOTI functionality
-set(_aoti_cuda_sources runtime/cuda_backend.cpp runtime/shims/memory.cpp
-                       runtime/shims/tensor_attribute.cpp
+set(_aoti_cuda_sources
+    runtime/cuda_backend.cpp
+    runtime/guard.cpp
+    runtime/shims/cuda_guard.cpp
+    runtime/shims/memory.cpp
+    runtime/shims/tensor_attribute.cpp
 )
 add_library(aoti_cuda STATIC ${_aoti_cuda_sources})
 target_include_directories(
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
index 1aa38760e5a..0386b5a008d 100644
--- a/backends/cuda/runtime/TARGETS
+++ b/backends/cuda/runtime/TARGETS
@@ -5,13 +5,17 @@ oncall("executorch")
 runtime.cxx_library(
     name = "runtime_shims",
     srcs = [
+        "guard.cpp",
+        "shims/cuda_guard.cpp",
         "shims/memory.cpp",
         "shims/tensor_attribute.cpp",
     ],
     headers = [
+        "guard.h",
+        "shims/cuda_guard.h",
         "shims/memory.h",
         "shims/tensor_attribute.h",
-        "shims/utils.h",
+        "utils.h",
     ],
     # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
     link_whole = True,
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 52bc133c658..49f3fabe527 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <cuda_runtime.h>
 #include <dlfcn.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
@@ -24,6 +25,7 @@
 #include <executorch/backends/aoti/aoti_model_container.h>
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 
 namespace executorch {
 namespace backends {
@@ -176,6 +178,13 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
     AOTIDelegateHandle* handle = new AOTIDelegateHandle();
     handle->so_handle = so_handle;
     handle->container_handle = container_handle;
+
+    // Create a CUDA stream for asynchronous execution
+    cudaStream_t cuda_stream;
+    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream));
+    handle->cuda_stream = static_cast<void*>(cuda_stream);
+    ET_LOG(Info, "Created CUDA stream: %p", handle->cuda_stream);
+
     return (DelegateHandle*)handle; // Return the handle post-processing
   }
 
@@ -282,7 +291,7 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
         n_inputs,
         gpu_outputs.data(), // Use GPU output tensors
         n_outputs,
-        nullptr, // Pass the actual CUDA stream!
+        handle->cuda_stream, // Pass the actual CUDA stream
         nullptr); // proxy_executor_handle can remain nullptr
 
     if (error != Error::Ok) {
@@ -325,6 +334,20 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
   void destroy(DelegateHandle* handle_) const override {
     AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
 
+    // Destroy the CUDA stream if it exists
+    if (handle->cuda_stream != nullptr) {
+      cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
+      cudaError_t stream_err = cudaStreamDestroy(cuda_stream);
+      if (stream_err != cudaSuccess) {
+        ET_LOG(
+            Error,
+            "Failed to destroy CUDA stream: %s",
+            cudaGetErrorString(stream_err));
+      } else {
+        ET_LOG(Info, "Destroyed CUDA stream: %p", handle->cuda_stream);
+      }
+    }
+
     // Delete the container BEFORE closing the shared library
     if (handle->container_handle != nullptr) {
       AOTIRuntimeError delete_result =
diff --git a/backends/cuda/runtime/guard.cpp b/backends/cuda/runtime/guard.cpp
new file mode 100644
index 00000000000..36c541e1770
--- /dev/null
+++ b/backends/cuda/runtime/guard.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cuda/runtime/guard.h>
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch {
+namespace backends {
+namespace cuda {
+
+namespace {
+// Thread-local stream storage (private to this file)
+thread_local std::unordered_map<DeviceIndex, cudaStream_t> current_streams_;
+} // namespace
+
+Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index) {
+  if (device_index == -1) {
+    // Get current device if not specified
+    int current_device;
+    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&current_device));
+    device_index = current_device;
+  }
+
+  current_streams_[device_index] = stream;
+  return Error::Ok;
+}
+
+Result<cudaStream_t> getCurrentCUDAStream(DeviceIndex device_index) {
+  if (device_index == -1) {
+    int current_device;
+    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&current_device));
+    device_index = current_device;
+  }
+
+  auto it = current_streams_.find(device_index);
+  if (it != current_streams_.end()) {
+    return it->second;
+  }
+
+  cudaStream_t stream;
+  ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&stream));
+  setCurrentCUDAStream(stream, device_index);
+  return stream;
+}
+
+CUDAGuard::CUDAGuard(CUDAGuard&& other) noexcept
+    : original_device_index_(other.original_device_index_),
+      current_device_index_(other.current_device_index_) {
+  // Mark the moved-from object as "already restored" so its destructor doesn't
+  // try to restore the device
+  other.original_device_index_ = other.current_device_index_;
+}
+
+CUDAGuard::~CUDAGuard() {
+  if (original_device_index_ != current_device_index_) {
+    cudaError_t err = cudaSetDevice(original_device_index_);
+    if (err != cudaSuccess) {
+      ET_LOG(
+          Error,
+          "~CUDAGuard: Failed to restore device to %d: %s",
+          original_device_index_,
+          cudaGetErrorString(err));
+    }
+  }
+}
+
+Error CUDAGuard::set_index(DeviceIndex device_index) {
+  int orig_index = -1;
+  ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&orig_index));
+
+  original_device_index_ = orig_index;
+  current_device_index_ = device_index;
+
+  if (current_device_index_ != original_device_index_) {
+    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaSetDevice(current_device_index_));
+  }
+
+  return Error::Ok;
+}
+
+Result<CUDAGuard> CUDAGuard::create(DeviceIndex device_index) {
+  CUDAGuard guard; // Fixed: Removed () to create a variable, not a function
+  ET_CHECK_OK_OR_RETURN_ERROR(guard.set_index(device_index));
+  return guard;
+}
+
+CUDAStreamGuard::CUDAStreamGuard(CUDAStreamGuard&& other) noexcept
+    : device_guard_(std::move(other.device_guard_)),
+      original_stream_(other.original_stream_),
+      current_stream_(other.current_stream_),
+      device_index_(other.device_index_) {
+  // Mark the moved-from object as "already restored" so its destructor doesn't
+  // try to restore the stream
+  other.original_stream_ = other.current_stream_;
+}
+
+CUDAStreamGuard::~CUDAStreamGuard() {
+  if (original_stream_ != nullptr) {
+    Error err = setCurrentCUDAStream(original_stream_, device_index_);
+    if (err != Error::Ok) {
+      ET_LOG(
+          Error,
+          "~CUDAStreamGuard: Failed to restore stream for device %d",
+          device_index_);
+    }
+  }
+}
+
+Error CUDAStreamGuard::set_stream(
+    cudaStream_t stream,
+    DeviceIndex device_index) {
+  auto result = getCurrentCUDAStream(device_index);
+  if (!result.ok()) {
+    ET_LOG(Error, "Failed to get current stream for device %d", device_index);
+    return result.error();
+  }
+
+  original_stream_ = result.get();
+  current_stream_ = stream;
+  device_index_ = device_index;
+
+  ET_CHECK_OK_OR_RETURN_ERROR(setCurrentCUDAStream(stream, device_index));
+
+  return Error::Ok;
+}
+
+Result<CUDAStreamGuard> CUDAStreamGuard::create(
+    cudaStream_t stream,
+    DeviceIndex device_index) {
+  auto guard_result = CUDAGuard::create(device_index);
+  ET_CHECK_OK_OR_RETURN_ERROR(guard_result.error());
+
+  CUDAStreamGuard stream_guard(std::move(guard_result.get()));
+  ET_CHECK_OK_OR_RETURN_ERROR(stream_guard.set_stream(stream, device_index));
+
+  return stream_guard;
+}
+
+} // namespace cuda
+} // namespace backends
+} // namespace executorch
diff --git a/backends/cuda/runtime/guard.h b/backends/cuda/runtime/guard.h
new file mode 100644
index 00000000000..d421315ac1d
--- /dev/null
+++ b/backends/cuda/runtime/guard.h
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <executorch/backends/cuda/runtime/utils.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/result.h>
+#include <cstdint>
+#include <unordered_map>
+
+namespace executorch {
+namespace backends {
+namespace cuda {
+
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+
+// Type alias for device index
+using DeviceIndex = int32_t;
+
+/**
+ * Set the current CUDA stream for the specified device.
+ *
+ * @param stream The CUDA stream to set as current
+ * @param device_index The device index (-1 to use current device)
+ * @return Error code indicating success or failure
+ */
+Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index = -1);
+
+/**
+ * Get the current CUDA stream for the specified device.
+ * If no stream has been set, creates a new stream and sets it as current.
+ *
+ * @param device_index The device index (-1 to use current device)
+ * @return Result containing the current stream on success, or an error code on
+ * failure
+ */
+Result<cudaStream_t> getCurrentCUDAStream(DeviceIndex device_index = -1);
+
+/**
+ * RAII guard that sets the current CUDA device and restores it on destruction.
+ * This ensures that the device is properly restored even if an exception
+ * occurs.
+ *
+ * NOTE: Do not use constructors directly. Use the create() factory method
+ * instead.
+ */
+class CUDAGuard {
+ private:
+  /**
+   * Private constructor - use create() factory method instead.
+   */
+  explicit CUDAGuard()
+      : original_device_index_(-1), current_device_index_(-1){};
+
+ public:
+  /**
+   * Factory method to create a CUDAGuard.
+   *
+   * @param device_index The device index to set as current
+   * @return Result containing the guard on success, or an error code on failure
+   */
+  static Result<CUDAGuard> create(DeviceIndex device_index);
+
+  // Copy is not allowed
+  CUDAGuard(const CUDAGuard&) = delete;
+  CUDAGuard& operator=(const CUDAGuard&) = delete;
+
+  // Move constructor and assignment
+  CUDAGuard(CUDAGuard&& other) noexcept;
+  CUDAGuard& operator=(CUDAGuard&& other) = delete;
+
+  /**
+   * Destructor that restores the original device if necessary.
+   */
+  ~CUDAGuard();
+
+  /**
+   * Sets the CUDA device to the given device index.
+   *
+   * @param device_index The device index to set as current
+   * @return Error code indicating success or failure
+   */
+  Error set_index(DeviceIndex device_index);
+
+  /**
+   * Get the original device index before the guard was created.
+   *
+   * @return The original device index
+   */
+  DeviceIndex original_device() const {
+    return original_device_index_;
+  }
+
+  /**
+   * Get the current device index.
+   *
+   * @return The current device index
+   */
+  DeviceIndex current_device() const {
+    return current_device_index_;
+  }
+
+ private:
+  /// The original device before this guard was created
+  DeviceIndex original_device_index_;
+  /// The current device managed by this guard
+  DeviceIndex current_device_index_;
+};
+
+/**
+ * RAII guard that sets the current CUDA device and stream, restoring both on
+ * destruction. This is useful for temporarily switching to a different device
+ * and stream.
+ *
+ * NOTE: Do not use constructors directly. Use the create() factory method
+ * instead.
+ */
+class CUDAStreamGuard {
+ private:
+  // Private constructor that takes a CUDAGuard
+  explicit CUDAStreamGuard(CUDAGuard&& guard)
+      : device_guard_(std::move(guard)),
+        original_stream_(nullptr),
+        current_stream_(nullptr),
+        device_index_(-1) {}
+
+ public:
+  /**
+   * Factory method to create a CUDAStreamGuard.
+   *
+   * @param stream The CUDA stream to set as current
+   * @param device_index The device index for the stream
+   * @return Result containing the guard on success, or an error code on failure
+   */
+  static Result<CUDAStreamGuard> create(
+      cudaStream_t stream,
+      DeviceIndex device_index);
+
+  // Copy is not allowed
+  CUDAStreamGuard(const CUDAStreamGuard&) = delete;
+  CUDAStreamGuard& operator=(const CUDAStreamGuard&) = delete;
+
+  // Move constructor and assignment
+  CUDAStreamGuard(CUDAStreamGuard&& other) noexcept;
+  CUDAStreamGuard& operator=(CUDAStreamGuard&& other) noexcept = delete;
+
+  /**
+   * Destructor that restores the original stream and device.
+   */
+  ~CUDAStreamGuard();
+
+  /**
+   * Sets the CUDA stream to the given stream on the specified device.
+   *
+   * @param stream The CUDA stream to set as current
+   * @param device_index The device index for the stream
+   * @return Error code indicating success or failure
+   */
+  Error set_stream(cudaStream_t stream, DeviceIndex device_index);
+
+  /**
+   * Get the current guarded stream.
+   *
+   * @return The current stream
+   */
+  cudaStream_t stream() const {
+    return current_stream_;
+  }
+
+  /**
+   * Get the device index being guarded.
+   *
+   * @return The device index
+   */
+  DeviceIndex device_index() const {
+    return device_index_;
+  }
+
+ private:
+  /// The device guard that handles device switching
+  CUDAGuard device_guard_;
+  /// The original stream that was current before this guard
+  cudaStream_t original_stream_ = nullptr;
+  /// The current stream being guarded
+  cudaStream_t current_stream_ = nullptr;
+  /// The device index for this stream guard
+  DeviceIndex device_index_;
+};
+
+} // namespace cuda
+} // namespace backends
+} // namespace executorch
diff --git a/backends/cuda/runtime/shims/cuda_guard.cpp b/backends/cuda/runtime/shims/cuda_guard.cpp
new file mode 100644
index 00000000000..5740d0bf654
--- /dev/null
+++ b/backends/cuda/runtime/shims/cuda_guard.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cuda/runtime/shims/cuda_guard.h>
+
+namespace executorch {
+namespace backends {
+namespace cuda {
+
+extern "C" {
+
+AOTITorchError aoti_torch_create_cuda_guard(
+    int32_t device_index,
+    CUDAGuardHandle* ret_guard) {
+  ET_CHECK_OR_RETURN_ERROR(
+      ret_guard != nullptr,
+      InvalidArgument,
+      "aoti_torch_create_cuda_guard failed: ret_guard is null");
+
+  auto result = CUDAGuard::create(device_index);
+  if (!result.ok()) {
+    return result.error();
+  }
+  *ret_guard = new CUDAGuard(std::move(result.get()));
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard) {
+  ET_CHECK_OR_RETURN_ERROR(
+      guard != nullptr,
+      InvalidArgument,
+      "aoti_torch_delete_cuda_guard failed: guard is null");
+
+  delete guard;
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_cuda_guard_set_index(
+    CUDAGuardHandle guard,
+    int32_t device_index) {
+  ET_CHECK_OR_RETURN_ERROR(
+      guard != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda_guard_set_index failed: guard is null");
+
+  ET_CHECK_OK_OR_RETURN_ERROR(guard->set_index(device_index));
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_create_cuda_stream_guard(
+    void* stream,
+    int32_t device_index,
+    CUDAStreamGuardHandle* ret_guard) {
+  ET_CHECK_OR_RETURN_ERROR(
+      ret_guard != nullptr,
+      InvalidArgument,
+      "aoti_torch_create_cuda_stream_guard failed: ret_guard is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      stream != nullptr,
+      InvalidArgument,
+      "aoti_torch_create_cuda_stream_guard failed: stream is null");
+
+  auto result =
+      CUDAStreamGuard::create(static_cast<cudaStream_t>(stream), device_index);
+  if (!result.ok()) {
+    return result.error();
+  }
+  *ret_guard = new CUDAStreamGuard(std::move(result.get()));
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_delete_cuda_stream_guard(
+    CUDAStreamGuardHandle guard) {
+  ET_CHECK_OR_RETURN_ERROR(
+      guard != nullptr,
+      InvalidArgument,
+      "aoti_torch_delete_cuda_stream_guard failed: guard is null");
+
+  delete guard;
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_current_cuda_stream(
+    int32_t device_index,
+    void** ret_stream) {
+  ET_CHECK_OR_RETURN_ERROR(
+      ret_stream != nullptr,
+      InvalidArgument,
+      "aoti_torch_get_current_cuda_stream failed: ret_stream is null");
+
+  auto result = getCurrentCUDAStream(device_index);
+  if (!result.ok()) {
+    return result.error();
+  }
+  *ret_stream = static_cast<void*>(result.get());
+  return Error::Ok;
+}
+
+} // extern "C"
+
+} // namespace cuda
+} // namespace backends
+} // namespace executorch
diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h
new file mode 100644
index 00000000000..6da869064a7
--- /dev/null
+++ b/backends/cuda/runtime/shims/cuda_guard.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <executorch/backends/aoti/common_shims.h>
+#include <executorch/backends/cuda/runtime/guard.h>
+#include <cstdint>
+
+namespace executorch {
+namespace backends {
+namespace cuda {
+
+using executorch::backends::aoti::AOTITorchError;
+
+extern "C" {
+
+// Handle types for CUDA guards
+using CUDAGuardHandle = CUDAGuard*;
+using CUDAStreamGuardHandle = CUDAStreamGuard*;
+
+/**
+ * Creates a CUDA device guard that sets the current device and restores it
+ * upon destruction.
+ *
+ * @param device_index The device index to set as current
+ * @param ret_guard Output parameter for the created guard handle (must not be
+ * null)
+ * @return AOTITorchError error code (Error::Ok on success, or an error code on
+ * failure)
+ */
+AOTITorchError aoti_torch_create_cuda_guard(
+    int32_t device_index,
+    CUDAGuardHandle* ret_guard);
+
+/**
+ * Deletes a CUDA device guard and frees its associated resources.
+ *
+ * @param guard Handle to the guard to be deleted
+ * @return AOTITorchError error code (Error::Ok on success, or an error code on
+ * failure)
+ */
+AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard);
+
+/**
+ * Sets the CUDA device to a new index for an existing guard.
+ *
+ * @param guard Handle to the guard
+ * @param device_index The device index to set as current
+ * @return AOTITorchError error code (Error::Ok on success, or an error code on
+ * failure)
+ */
+AOTITorchError aoti_torch_cuda_guard_set_index(
+    CUDAGuardHandle guard,
+    int32_t device_index);
+
+/**
+ * Creates a CUDA stream guard that sets the current device and stream,
+ * restoring both upon destruction.
+ *
+ * @param stream The CUDA stream to set as current
+ * @param device_index The device index for the stream
+ * @param ret_guard Output parameter for the created guard handle (must not be
+ * null)
+ * @return AOTITorchError error code (Error::Ok on success, or an error code on
+ * failure)
+ */
+AOTITorchError aoti_torch_create_cuda_stream_guard(
+    void* stream,
+    int32_t device_index,
+    CUDAStreamGuardHandle* ret_guard);
+
+/**
+ * Deletes a CUDA stream guard and frees its associated resources.
+ *
+ * @param guard Handle to the stream guard to be deleted
+ * @return AOTITorchError error code (Error::Ok on success, or an error code on
+ * failure)
+ */
+AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard);
+
+/**
+ * Gets the current CUDA stream for a specified device.
+ *
+ * @param device_index The device index (-1 to use current device)
+ * @param ret_stream Output parameter for the current stream (must not be null)
+ * @return AOTITorchError error code (Error::Ok on success, or an error code on
+ * failure)
+ */
+AOTITorchError aoti_torch_get_current_cuda_stream(
+    int32_t device_index,
+    void** ret_stream);
+
+} // extern "C"
+
+} // namespace cuda
+} // namespace backends
+} // namespace executorch
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index 2b32d820301..716800d2629 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -10,7 +10,7 @@
 #include <executorch/backends/aoti/utils.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
-#include <executorch/backends/cuda/runtime/shims/utils.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/platform/log.h>
 #include <cstdint>
 #include <cstdlib> // For posix_memalign
@@ -308,42 +308,48 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {
 
       // Find the reference count for this memory address
       auto memory_it = memory_to_n_tensor.find(data_ptr);
-      if (memory_it != memory_to_n_tensor.end()) {
-        int32_t ref_count = memory_it->second;
-
-        if (ref_count == NOT_OWN) {
-          // Tensor never owned the memory, skip freeing
-          // Just remove tensor from tracking
-          tensors.erase(it);
-          return Error::Ok;
-        } else if (ref_count == 1) {
-          // Only current tensor using this memory, free it
-          // Determine if it's GPU memory
-          cudaPointerAttributes attributes{};
-          ET_CUDA_CHECK_OR_RETURN_ERROR(
-              cudaPointerGetAttributes(&attributes, data_ptr));
-
-          if (attributes.type == cudaMemoryTypeManaged) {
-            // This is CUDA managed memory - free with proper synchronization
-            ET_CUDA_CHECK_OR_RETURN_ERROR(cudaDeviceSynchronize());
-            ET_CUDA_CHECK_OR_RETURN_ERROR(cudaFree(data_ptr));
-          } else {
-            // This is CPU memory - free immediately
-            free(data_ptr);
-            data_ptr = nullptr;
-          }
-
-          // Remove from memory tracking
-          memory_to_n_tensor.erase(memory_it);
-        } else if (ref_count > 1) {
-          // Other tensors still using this memory, just decrement count
-          memory_to_n_tensor[data_ptr] = ref_count - 1;
+
+      ET_CHECK_OR_RETURN_ERROR(
+          memory_it != memory_to_n_tensor.end(),
+          Internal,
+          "Internal error: memory not found during deletion");
+
+      int32_t ref_count = memory_it->second;
+
+      ET_CHECK_OR_RETURN_ERROR(
+          ref_count >= 0 || ref_count == NOT_OWN,
+          Internal,
+          "Internal error: invalid ref count %d",
+          ref_count)
+
+      if (ref_count == NOT_OWN) {
+        // Tensor never owned the memory, skip freeing
+        // Just remove tensor from tracking
+        tensors.erase(it);
+        return Error::Ok;
+      } else if (ref_count == 1) {
+        // Only current tensor using this memory, free it
+        // Determine if it's GPU memory
+        cudaPointerAttributes attributes{};
+        ET_CUDA_CHECK_OR_RETURN_ERROR(
+            cudaPointerGetAttributes(&attributes, data_ptr));
+
+        if (attributes.type == cudaMemoryTypeManaged) {
+          // This is CUDA managed memory - free with proper synchronization
+          ET_CUDA_CHECK_OR_RETURN_ERROR(cudaDeviceSynchronize());
+          ET_CUDA_CHECK_OR_RETURN_ERROR(cudaFree(data_ptr));
+        } else {
+          // This is CPU memory - free immediately
+          free(data_ptr);
+          data_ptr = nullptr;
         }
+
+        // Remove from memory tracking
+        memory_to_n_tensor.erase(memory_it);
       } else {
-        ET_CHECK_OR_RETURN_ERROR(
-            false,
-            Internal,
-            "Internal error: memory not found during deletion");
+        // ref_count > 1
+        // Other tensors still using this memory, just decrement count
+        memory_to_n_tensor[data_ptr] = ref_count - 1;
       }
 
       // Remove tensor from set (this will call the destructor if it's the last
@@ -379,7 +385,6 @@ aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking) {
   aoti_torch_get_dtype(src, &src_dtype);
 
   ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(self_dtype));
-
   ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(src_dtype));
 
   // Check dtype compatibility - both tensors must have the same dtype
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
index ef00ecff656..e18bf142b5c 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp
@@ -10,7 +10,7 @@
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
-#include <executorch/backends/cuda/runtime/shims/utils.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp
index 7579eaef039..9fca0f92cf8 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp
@@ -10,7 +10,7 @@
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
-#include <executorch/backends/cuda/runtime/shims/utils.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
index 2cb12719782..d9b785a5a78 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp
@@ -10,7 +10,7 @@
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
-#include <executorch/backends/cuda/runtime/shims/utils.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp
index eceb141e9ca..10c8d8c1a31 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp
@@ -10,7 +10,7 @@
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
-#include <executorch/backends/cuda/runtime/shims/utils.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp
index 8e6998f457c..da65129f18a 100644
--- a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp
+++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp
@@ -10,7 +10,7 @@
 #include <executorch/backends/aoti/common_shims.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/shims/tensor_attribute.h>
-#include <executorch/backends/cuda/runtime/shims/utils.h>
+#include <executorch/backends/cuda/runtime/utils.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/platform.h>
 #include <gtest/gtest.h>
diff --git a/backends/cuda/runtime/shims/utils.h b/backends/cuda/runtime/utils.h
similarity index 100%
rename from backends/cuda/runtime/shims/utils.h
rename to backends/cuda/runtime/utils.h
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
index fb0dc0a4ade..32043d4d427 100644
--- a/tools/cmake/preset/default.cmake
+++ b/tools/cmake/preset/default.cmake
@@ -129,6 +129,9 @@ define_overridable_option(
 define_overridable_option(
   EXECUTORCH_BUILD_SIZE_TEST "Build the size test" BOOL OFF
 )
+define_overridable_option(
+  EXECUTORCH_BUILD_CUDA "Build the CUDA backend" BOOL OFF
+)
 define_overridable_option(
   EXECUTORCH_BUILD_XNNPACK "Build the XNNPACK backend" BOOL OFF
 )

From e349cce87f4a8abe4c33bb7d876360500cf24903 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Mon, 6 Oct 2025 16:07:29 -0700
Subject: [PATCH 14/15] add cudagurad and cudastreamguard support

---
 aoti_debug_data/label_output.txt | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 aoti_debug_data/label_output.txt

diff --git a/aoti_debug_data/label_output.txt b/aoti_debug_data/label_output.txt
deleted file mode 100644
index b6e6319c5d2..00000000000
--- a/aoti_debug_data/label_output.txt
+++ /dev/null
@@ -1 +0,0 @@
--0.03913404420018196,0.11446283012628555,-1.7967588901519775,-1.2342960834503174,-0.8190052509307861,0.3239615559577942,-2.186603307723999,-1.2876651287078857,-1.9019219875335693,-0.7314844727516174,0.7164335250854492,-1.6698509454727173,-1.4514909982681274,-1.2658661603927612,-1.5797510147094727,-1.038211464881897,-0.2147778570652008,-2.0712575912475586,-1.5538352727890015,-1.2831263542175293,-0.5831859707832336,1.619336724281311,-0.030490178614854813,-0.4813898503780365,-1.1297798156738281,-0.369297593832016,0.388177752494812,0.057438768446445465,0.4631587564945221,-0.2705245912075043,-1.4319391250610352,-0.7513930797576904,-0.41541463136672974,-1.8500491380691528,-0.4206249713897705,-1.191244125366211,-0.519303023815155,-1.9623936414718628,1.366169810295105,-1.105899453163147,-0.7772476673126221,-0.020080728456377983,0.1334831416606903,1.3196574449539185,-0.22508525848388672,0.6348890662193298,-1.1425437927246094,0.4581128656864166,-0.8908204436302185,-1.1983755826950073,-1.0953805446624756,1.4282540082931519,0.46135658025741577,-0.43548062443733215,-0.3356531858444214,-1.5134204626083374,-0.923163890838623,-1.6103588342666626,-1.0705418586730957,1.3484578132629395,0.22440192103385925,-0.8475747108459473,1.326701045036316,0.9815365076065063,0.6489527821540833,-0.22182974219322205,0.8341851234436035,-1.0138921737670898,1.4669286012649536,-0.4402707815170288,-1.0489991903305054,0.36616742610931396,-0.7941051721572876,2.1867175102233887,-1.7013881206512451,0.38229987025260925,-1.0632861852645874,-0.6783405542373657,1.7636947631835938,0.10571509599685669,1.959584355354309,-0.495066374540329,-0.32849109172821045,0.42277011275291443,-1.8629132509231567,-0.9533417224884033,-0.08654581010341644,-0.41913020610809326,-0.7976489067077637,-0.6835601329803467,-1.6655486822128296,0.20317625999450684,0.6550376415252686,-0.8757399320602417,-1.2040992975234985,0.31358838081359863,-0.5407125353813171,-2.0862929821014404,1.310542106628418,-0.24725094437599182,-1.001437783241272,0.25003859400749207,-0.35498347878456116,2.1051602363586426,-0.8274843692779541,-2.2429769039154053,-0.9414843320846558,-1.8201124668121338,-2.0493545532226562,-1.4745506048202515,-1.219736933708191,3.529479503631592,-0.6299042105674744,0.09119724482297897,-0.3092268109321594,-1.460767149925232,-0.7344509363174438,1.3739006519317627,-1.0220916271209717,-1.8681803941726685,-0.36143067479133606,-1.2824289798736572,-0.6740354299545288,-1.5281565189361572,-1.2834731340408325,-1.421437382698059,0.7150492072105408,-0.31045711040496826,1.7923942804336548,-0.5272325873374939,-1.3796250820159912,-1.1029645204544067,-0.8410789370536804,-0.4111415445804596,-0.7273851037025452,0.8324657678604126,-1.6222048997879028,-0.8601832389831543,-0.4216291904449463,-1.2872693538665771,-0.71281498670578,-0.336731493473053,-0.009143831208348274,-0.07268509268760681,-0.9919785857200623,-0.5055995583534241,0.49048081040382385,1.40537691116333,-0.3487783968448639,0.37215569615364075,-0.7167516350746155,-0.15457889437675476,0.8864662647247314,-0.0710245817899704,-0.1462513506412506,-0.8821230530738831,-0.4791715741157532,-0.3290942311286926,0.4341500997543335,-0.38155701756477356,-0.3562145531177521,-0.4755186438560486,-0.6587932109832764,0.19298143684864044,0.5636974573135376,0.5926137566566467,0.6501497626304626,0.3383529484272003,0.4471110701560974,-0.4032756984233856,-0.5753277540206909,-0.7808830142021179,-0.14634644985198975,-0.4119536578655243,-0.6965261697769165,0.4276757538318634,-0.5369023084640503,-0.4773354232311249,-0.46046513319015503,-0.3265089690685272,-0.6540096402168274,0.20136262476444244,-0.8214278221130371,0.27484744787216187,-0.3489644229412079,-0.11341311782598495,-0.8741213083267212,0.2733824849128723,0.22086980938911438,-0.23494890332221985,1.1783021688461304,-0.2696937024593353,-0.8441450595855713,-0.7695454955101013,0.6256869435310364,0.2200777679681778,-0.7516354918479919,-0.6925065517425537,-0.7915982007980347,-0.06825603544712067,-0.29485490918159485,-1.769292950630188,0.06531324982643127,-0.8329363465309143,-0.8318166136741638,-0.38882702589035034,1.031890630722046,0.11254701763391495,0.5263988971710205,0.13890302181243896,0.8830300569534302,-0.757967472076416,-0.8113813400268555,-0.9798831343650818,-0.29218554496765137,-0.15626144409179688,0.3520606756210327,-0.8749599456787109,-1.105175256729126,-1.0363872051239014,0.7826569080352783,0.4742776155471802,-0.10010265558958054,-0.09958705306053162,0.10428497940301895,-0.7374796271324158,-0.269676148891449,-0.40000659227371216,0.2580138146877289,-0.5170254111289978,-1.2067655324935913,-0.7478824257850647,0.1923155039548874,-0.8107642531394958,-0.10571761429309845,-0.22538013756275177,0.11943108588457108,-0.7519386410713196,-0.05797750502824783,-0.07632803171873093,-0.022560788318514824,0.7565377950668335,-1.030219554901123,-0.4312279522418976,-0.70326167345047,-0.35997289419174194,-0.19848346710205078,-0.04439990222454071,0.04658837616443634,-0.06574892997741699,-0.2762015461921692,-1.3065524101257324,1.096301555633545,-0.6393932104110718,-0.44383108615875244,-1.0458232164382935,-0.07400067150592804,-0.20143833756446838,0.4280424416065216,-0.4929370582103729,-0.49234431982040405,-0.5363075733184814,0.5846503376960754,-0.34943896532058716,0.24590173363685608,-0.480078786611557,-0.5614951848983765,-0.3571852743625641,-0.4173789620399475,-0.18978659808635712,-0.14421017467975616,-0.04716923087835312,0.48622041940689087,0.9027771949768066,1.0469368696212769,1.3819180727005005,0.5635288953781128,1.21073579788208,1.0013066530227661,0.9923291206359863,0.564082145690918,0.9527998566627502,0.02868816815316677,1.6312838792800903,0.09990690648555756,1.1655672788619995,-0.9204019904136658,-0.38474252820014954,0.44546398520469666,-1.1219033002853394,-0.055179186165332794,0.1642802357673645,-0.17935679852962494,1.1197086572647095,0.29721322655677795,-0.40458935499191284,-0.29283538460731506,0.5524163842201233,0.6195399165153503,0.9029663801193237,-0.9659137725830078,-0.46226486563682556,0.5841659903526306,-0.33232733607292175,-1.402040958404541,-0.9897168278694153,-0.2813713252544403,-0.35019201040267944,-0.7215508818626404,-2.4211864471435547,0.6913756728172302,-0.24344231188297272,-0.249970942735672,-0.4972870349884033,0.18016159534454346,-0.7020791172981262,-0.7097585201263428,-1.1062068939208984,0.5809486508369446,0.30144134163856506,-0.1745019257068634,-2.0151519775390625,-1.7760765552520752,-1.3494994640350342,-1.3984824419021606,-1.7419989109039307,0.01414218544960022,-0.8015289306640625,-1.6386479139328003,-1.6633285284042358,0.09696164727210999,0.6919465065002441,-0.16852454841136932,-2.2988927364349365,-1.4693069458007812,0.1326369196176529,-1.3813385963439941,-0.08798029273748398,-1.0108819007873535,-0.19211843609809875,-1.291641116142273,-0.2874743938446045,-0.3788374364376068,-1.0629549026489258,-1.2921936511993408,-0.4923967719078064,-0.7385656237602234,-0.6891710758209229,-2.165138006210327,-0.6286326050758362,0.06201285868883133,0.8001828789710999,-0.6078219413757324,0.4206283688545227,0.4165705740451813,-1.5791865587234497,1.269034743309021,1.0095205307006836,1.3991831541061401,1.3111152648925781,0.4261591136455536,0.6746622323989868,0.19116273522377014,-0.4464258849620819,0.3325245678424835,-0.9348013997077942,-1.3907827138900757,-0.2390788048505783,-0.37911713123321533,-0.2881055176258087,-0.3421049118041992,0.16614066064357758,-0.7697164416313171,-0.39411890506744385,-0.48169296979904175,-0.6111875772476196,-0.37043431401252747,0.027377670630812645,-0.3591470718383789,-1.211280345916748,-0.8917186856269836,-0.7129324674606323,-0.5818975567817688,0.19753530621528625,-0.6446995139122009,0.033339742571115494,-0.32940128445625305,-0.015642445534467697,-0.804474949836731,0.5313539505004883,-0.30171895027160645,-0.9753963947296143,0.6266018748283386,-2.8364689350128174,-1.0067397356033325,0.6884109973907471,-2.5539331436157227,-1.7143417596817017,-0.46985048055648804,2.501777410507202,0.5844317674636841,-0.7308326959609985,0.5766181349754333,-0.5962865352630615,1.0317273139953613,0.834532618522644,-1.1740212440490723,-1.293134093284607,-0.7926090359687805,2.19234299659729,-1.6712441444396973,-1.1683443784713745,0.09241408854722977,2.465853452682495,-0.6847071647644043,-2.500447988510132,1.8858778476715088,-0.7839821577072144,3.335675001144409,2.2426083087921143,0.22914011776447296,0.23041927814483643,-0.014630085788667202,0.731175422668457,-1.4038903713226318,-0.7349004149436951,1.289272427558899,0.9014060497283936,-0.7824952006340027,0.4509941637516022,-0.5313635468482971,-0.15091997385025024,1.9248706102371216,-0.6429073214530945,-0.6499042510986328,0.3431451916694641,-1.293764591217041,-0.447552353143692,1.2358976602554321,1.205706238746643,-0.06825567781925201,-0.08236853778362274,-0.8660196661949158,-0.5339680314064026,-1.0942151546478271,-0.7697574496269226,3.067143678665161,0.4294276833534241,-0.777762234210968,-0.002645888365805149,-0.5003012418746948,2.5627009868621826,-0.8476738333702087,0.4549064636230469,-2.102252244949341,-0.8092312216758728,1.1497148275375366,0.15704959630966187,-1.199030876159668,0.42982253432273865,-0.7956292033195496,0.646968424320221,2.2124745845794678,0.34438228607177734,1.2520134449005127,1.6449414491653442,-1.4007563591003418,-1.9928207397460938,-1.4737995862960815,0.6366598010063171,1.1514497995376587,-0.6591994166374207,-0.22759179770946503,3.276012659072876,-0.884784460067749,-1.6824296712875366,-2.6889488697052,0.5569191575050354,0.5331900715827942,-0.5247107148170471,-0.8498672246932983,2.040318489074707,1.3180416822433472,-1.0281294584274292,0.019382581114768982,0.30637645721435547,0.5861398577690125,0.8791986703872681,1.771469235420227,-2.106912851333618,0.6799858808517456,2.2856736183166504,0.20724669098854065,1.7107316255569458,1.9191973209381104,-0.5829983353614807,-0.18784354627132416,-1.9702904224395752,-1.4302761554718018,3.83213210105896,-0.7085038423538208,0.7749373912811279,1.2931575775146484,1.6124844551086426,0.10646260529756546,-0.2813243567943573,-0.9420999884605408,2.3512871265411377,-0.1446448266506195,-2.674955129623413,0.7504120469093323,-1.1596482992172241,3.023696184158325,0.5688168406486511,1.3560736179351807,-0.3119358718395233,0.9887985587120056,0.3230132758617401,-0.2484995275735855,0.9285800457000732,-0.6535630822181702,0.9466335773468018,0.9305082559585571,0.5683583617210388,0.14294113218784332,-1.4507019519805908,0.4145489037036896,1.6770821809768677,-0.09353156387805939,0.5606762766838074,2.2269623279571533,0.8668820858001709,-0.007117525674402714,-0.34680721163749695,2.2545907497406006,-1.2005592584609985,-1.2632337808609009,-0.41279536485671997,-0.42132940888404846,0.9835960865020752,0.8272786736488342,0.6241384744644165,1.0821422338485718,0.7622082233428955,-1.1228857040405273,0.9723652005195618,0.36126333475112915,-1.9292453527450562,0.37655675411224365,2.0753841400146484,1.0763671398162842,2.3848659992218018,-0.4731055796146393,1.5599720478057861,-0.7774162292480469,-1.6710920333862305,2.6560840606689453,-0.2484363168478012,0.9694924354553223,2.255502939224243,-1.8118550777435303,-0.5005996823310852,-1.6837778091430664,1.2990540266036987,-0.6730730533599854,-2.072417736053467,-0.18276265263557434,1.9738023281097412,-0.6781049966812134,-1.818922996520996,0.4480375647544861,-2.28879714012146,-0.00908045656979084,-0.6347148418426514,1.3799055814743042,-1.6420687437057495,-0.6599327325820923,2.2314329147338867,-0.4459935426712036,-0.9550604820251465,-2.1789066791534424,-2.745248794555664,-1.8957849740982056,2.5002524852752686,3.3978559970855713,1.7930076122283936,-0.26551035046577454,2.6639387607574463,1.8267580270767212,1.2778221368789673,1.3806127309799194,0.3047349452972412,0.3620739281177521,2.1399831771850586,0.08918695151805878,-1.0576759576797485,3.6191647052764893,-0.251950204372406,-0.0005357395857572556,-0.012165731750428677,2.5895073413848877,1.1865198612213135,1.6630536317825317,-1.9085454940795898,2.6172845363616943,-0.29602330923080444,1.0623855590820312,-0.25587570667266846,-0.11752499639987946,-1.7097687721252441,1.3697950839996338,-0.5039303302764893,-0.9196146726608276,1.0531651973724365,-0.38556379079818726,1.9614968299865723,1.2981394529342651,-0.0001594768837094307,1.159398078918457,1.3718421459197998,1.1871726512908936,-0.5226776599884033,0.41393616795539856,4.422908782958984,-1.563923716545105,-0.8469040989875793,2.03295636177063,-1.443242073059082,-0.6985523700714111,2.3754632472991943,1.0496516227722168,1.845125675201416,1.3324394226074219,2.4543023109436035,-1.714128851890564,1.9265140295028687,1.5339205265045166,-1.0431455373764038,-0.46131494641304016,0.46357810497283936,-1.936722993850708,2.5352182388305664,-0.4409525692462921,-0.22268688678741455,2.714348793029785,-1.147615671157837,-0.19659310579299927,0.8975319266319275,0.3840653598308563,-0.05605197697877884,0.9425553679466248,1.1385877132415771,-1.0380603075027466,1.365006923675537,-1.0930871963500977,-0.9596260190010071,-1.3788464069366455,-0.19989871978759766,-1.0799193382263184,-0.35174721479415894,0.1095135435461998,-1.9181528091430664,2.105410575866699,-0.31668996810913086,0.5644630193710327,-0.4833284020423889,1.4285516738891602,0.09488546848297119,-1.6459810733795166,-0.8980455994606018,-0.7912274599075317,-0.9761782288551331,1.1608631610870361,1.026419997215271,2.551722526550293,0.0494694747030735,1.9670685529708862,2.528393030166626,1.3381658792495728,-0.3226938545703888,2.3120529651641846,0.9690368175506592,-0.29134538769721985,2.7646093368530273,2.443957805633545,-1.7742195129394531,1.7002654075622559,-2.246009349822998,0.2429421842098236,0.5971425175666809,-1.1687904596328735,-0.041350968182086945,1.0711336135864258,0.8389820456504822,-0.8339453339576721,1.0551726818084717,1.8039215803146362,-0.37036386132240295,-1.4500930309295654,2.4588708877563477,1.5581408739089966,0.9164568185806274,1.3622764348983765,-0.6905568242073059,-2.599440813064575,-0.7772306203842163,-1.1390200853347778,-1.3564249277114868,2.162898302078247,-0.691100537776947,1.137965202331543,1.0822535753250122,0.8270388841629028,1.0938518047332764,2.3357534408569336,1.1950891017913818,-1.9673025608062744,-1.742155909538269,-0.3494393527507782,0.839860737323761,1.457176685333252,-1.7225475311279297,1.19826078414917,0.8133675456047058,0.4413813352584839,-0.6027377843856812,2.343456506729126,-0.09194470196962357,1.2105724811553955,1.895613431930542,0.1128043457865715,2.6773862838745117,-0.38433119654655457,0.44614654779434204,-2.458550453186035,-0.7954540252685547,0.6709498763084412,-0.2605575621128082,0.36024948954582214,0.2298925518989563,1.9240407943725586,1.1841048002243042,0.23768608272075653,0.22980962693691254,1.3572033643722534,0.6833814978599548,1.4841841459274292,2.023432731628418,0.49778687953948975,2.1852598190307617,-0.23032735288143158,-1.4224259853363037,-0.821727991104126,1.8110042810440063,0.7662848830223083,-0.31287911534309387,-0.9253693222999573,-2.3290951251983643,1.7022662162780762,0.6221295595169067,0.3600935637950897,1.6092052459716797,-2.481472969055176,3.1183104515075684,1.0247596502304077,0.602994441986084,0.5404156446456909,1.927536964416504,-1.2524707317352295,2.5788543224334717,0.08457116782665253,1.5262539386749268,2.325206995010376,-0.17360524833202362,0.5796419978141785,-0.5706197023391724,0.6842635273933411,1.509108543395996,0.270402729511261,-1.5574960708618164,0.1139160618185997,-0.3381737172603607,2.004261016845703,2.116377592086792,2.354534387588501,-1.7666290998458862,-0.3159435987472534,-0.13131661713123322,-2.567397117614746,1.8718032836914062,0.9907698631286621,-0.4175260663032532,1.781286358833313,0.035102955996990204,-0.4113989472389221,1.5520180463790894,0.2067878693342209,0.9118133187294006,1.884096384048462,-0.32759973406791687,-1.1003206968307495,-0.3034488260746002,-0.6599835753440857,-0.19405540823936462,1.701459527015686,-0.0491909384727478,-0.4178646206855774,-0.5576868653297424,-0.4071851074695587,-0.7226784825325012,1.1936362981796265,1.3203784227371216,1.7629250288009644,3.4847731590270996,-0.02172619290649891,-1.1755605936050415,-1.3610471487045288,-0.7484937310218811,2.2494332790374756,-1.63542640209198,-1.8949089050292969,0.689197301864624,0.6020944118499756,0.7739917039871216,-0.534808337688446,-1.028560996055603,1.5340405702590942,0.9423946738243103,1.248828649520874,-2.4921460151672363,0.8327887058258057,2.676758289337158,-1.0139472484588623,0.758230447769165,0.7645471692085266,-1.7233291864395142,1.2735943794250488,0.14504152536392212,1.8338035345077515,-1.1551995277404785,0.6325278878211975,1.216366171836853,0.621868908405304,-0.5008939504623413,2.465022563934326,1.242301106452942,3.2744531631469727,-0.6666258573532104,0.29124701023101807,-1.3240326642990112,-0.7237968444824219,0.9768669009208679,0.9748231768608093,-0.879506528377533,0.5778324604034424,1.0620003938674927,0.4258502721786499,0.37866470217704773,-0.7223685383796692,0.46414291858673096,0.778156042098999,1.4069161415100098,1.3072452545166016,-0.953467607498169,-1.2602347135543823,-2.1280739307403564,-0.9248873591423035,-0.5448404550552368,-0.3515288829803467,1.365014910697937,-0.2839147746562958,0.7390356063842773,2.0879857540130615,-1.0144470930099487,-2.2068302631378174,0.6423531770706177,0.2230144590139389,-0.24207915365695953,0.5139026641845703,-0.3370460271835327,-0.30347582697868347,1.148444652557373,0.39707741141319275,1.2999221086502075,-0.9705429673194885,2.6386616230010986,-1.9616032838821411,-0.5380308032035828,-0.5805827379226685,-0.289351224899292,-1.2450693845748901,1.5960633754730225,3.0975544452667236,1.6709198951721191,0.5844289660453796,0.17185071110725403,0.7383623719215393,0.7273778915405273,0.00386139377951622,0.835677444934845,-1.0136258602142334,2.543048620223999,3.237602710723877,0.5719550251960754,0.34810662269592285,0.4487835168838501,2.230459451675415,0.29298052191734314,-0.2576022744178772,0.49370115995407104,0.4814186096191406,0.2736748158931732,-0.19374693930149078,-0.16451512277126312,0.8118324875831604,-1.6326663494110107,1.0103230476379395,-0.22402316331863403,-0.5972715616226196,-0.027429714798927307,0.313273549079895,0.9720879793167114,-0.5924954414367676,-0.17367282509803772,-1.6531838178634644,1.0889884233474731,0.39624372124671936,-0.12865540385246277,-1.903689980506897,-0.7770265340805054,0.7412757873535156,-0.5807806849479675,-0.37048661708831787,-2.816236972808838,-1.3003090620040894,-1.2620137929916382,0.1466657519340515,-0.3549872934818268,0.2544991075992584,-1.1414480209350586,1.1943894624710083,-1.1381458044052124,-0.384520947933197,-0.8148205876350403,-1.0100181102752686,0.45429202914237976,-0.4273971617221832,-1.9397039413452148,0.7417730093002319,1.187584638595581,-0.44543829560279846,0.10715680569410324,0.18230198323726654,-1.1871659755706787,0.12491435557603836,-1.2263728380203247,0.014668535441160202,-0.7243265509605408,-1.0471806526184082,-0.1910388022661209,0.29107558727264404,-1.0062795877456665,0.1262378990650177,-1.733699083328247,-0.5085047483444214,-1.6183843612670898,0.6961213946342468,-0.45301946997642517,0.42451682686805725,1.8650354146957397,-0.9007144570350647,-0.630558967590332,-0.8635000586509705,-2.23286509513855,0.23365262150764465,0.22274832427501678,0.43871062994003296,0.4224851727485657,0.814043402671814,-0.25820058584213257,0.4357171058654785,-0.48871245980262756,-1.081297516822815,-1.363301396369934,-0.7314466834068298,-0.10283815115690231,-1.1898349523544312,-0.018862567842006683,-0.7326535582542419,-1.0590225458145142,0.24528144299983978,-2.1076948642730713,-2.1839776039123535,-0.6882885098457336,-0.7975447773933411,-0.18375058472156525,-0.5964457988739014,-1.5151821374893188,0.17243099212646484,0.18252255022525787
\ No newline at end of file

From 57060e91837337ccaa3aa5fa3fa5b06249c7a286 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Tue, 7 Oct 2025 09:26:20 -0700
Subject: [PATCH 15/15] make voxtral runner exits nice and neat

---
 backends/cuda/runtime/cuda_backend.cpp | 47 +++++++++++++++++---------
 backends/cuda/runtime/shims/memory.cpp | 14 +++++---
 2 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 49f3fabe527..680923fa590 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -316,14 +316,16 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
           i);
     }
 
-    // Clean up GPU tensors that we created (ExecutorTorch tensors are always
-    // CPU, so all GPU tensors are our copies)
-    for (int i = 0; i < n_inputs; i++) {
-      // All GPU input tensors were created by us, delete them
-      aoti_torch_delete_tensor_object(gpu_inputs[i]);
-    }
+    // // Clean up GPU tensors that we created (ExecutorTorch tensors are always
+    // // CPU, so all GPU tensors are our copies)
+    // for (int i = 0; i < n_inputs; i++) {
+    //   ET_LOG(Info, "Deleting GPU input tensor %d", i);
+    //   // All GPU input tensors were created by us, delete them
+    //   aoti_torch_delete_tensor_object(gpu_inputs[i]);
+    // }
 
     for (int i = 0; i < n_outputs; i++) {
+      ET_LOG(Info, "Deleting GPU output tensor %d", i);
       // All GPU output tensors were created by us, delete them
       aoti_torch_delete_tensor_object(gpu_outputs[i]);
     }
@@ -346,27 +348,40 @@ class CudaBackend final : public ::executorch::runtime::BackendInterface {
       } else {
         ET_LOG(Info, "Destroyed CUDA stream: %p", handle->cuda_stream);
       }
+      handle->cuda_stream = nullptr;
     }
 
     // Delete the container BEFORE closing the shared library
-    if (handle->container_handle != nullptr) {
-      AOTIRuntimeError delete_result =
-          AOTInductorModelContainerDelete(handle->container_handle);
-      if (delete_result != Error::Ok) {
-        ET_LOG(
-            Error,
-            "AOTInductorModelContainerDelete failed with error code %d",
-            delete_result);
-      }
-    }
+    // if (handle->container_handle != nullptr) {
+    //   ET_LOG(Info, "Deleting container_handle: %p",handle->container_handle);
+    //   AOTIRuntimeError delete_result =
+    //       AOTInductorModelContainerDelete(handle->container_handle);
+    //   if (delete_result != Error::Ok) {
+    //     ET_LOG(
+    //         Error,
+    //         "AOTInductorModelContainerDelete failed with error code %d",
+    //         delete_result);
+    //   }
+    //   handle->container_handle = nullptr;
+    // }
+
+    ET_LOG(Info, "Deleted container_handle: %p", handle->container_handle);
 
     // Now close the shared library
     if (handle->so_handle != nullptr) {
       dlclose(handle->so_handle);
+      handle->so_handle = nullptr;
     }
 
+    ET_LOG(Info, "Deleted so_handle: %p", handle->so_handle);
+
     free(handle);
+
+    ET_LOG(Info, "Deleted AOTI delegate handle: %p", handle);
+
     clear_all_tensors();
+
+    ET_LOG(Info, "Deleted all tensors");
   }
 };
 
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index 716800d2629..4350cac0ff8 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -271,10 +271,16 @@ void clear_all_tensors() {
   // Use aoti_torch_delete_tensor_object to properly delete each tensor
   // Note: We need to collect tensor pointers first since deletion modifies the
   // set
-  auto old_tensors =
-      std::move(tensors); // tensors is now empty and no need to copy
-  for (const auto& tensor_shared : old_tensors) {
-    aoti_torch_delete_tensor_object(tensor_shared.get());
+  ET_LOG(Info, "Clearing all tensors...");
+  std::vector<Tensor*> tensor_ptrs;
+  tensor_ptrs.reserve(tensors.size());
+  for (const auto& tensor_shared : tensors) {
+    tensor_ptrs.push_back(tensor_shared.get());
+  }
+
+  // Now delete each tensor - this will modify the global tensors set
+  for (Tensor* tensor_ptr : tensor_ptrs) {
+    aoti_torch_delete_tensor_object(tensor_ptr);
   }
 
   // tensors set should now be empty, but ensure it's cleared