Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backends/aoti/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ target_compile_options(aoti_common PUBLIC -fexceptions -frtti -fPIC)
# Ensure symbols are exported properly
target_link_options(aoti_common PUBLIC -Wl,--export-dynamic)

# Link against PyTorch libraries and standard libraries
# Link against ExecuTorch libraries and standard libraries
target_link_libraries(aoti_common PUBLIC extension_tensor ${CMAKE_DL_LIBS})
executorch_target_link_options_shared_lib(aoti_common)

Expand Down
9 changes: 8 additions & 1 deletion backends/aoti/common_shims.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,18 @@ int32_t aoti_torch_layout_strided() {
}

// Dtype constants - these return the PyTorch dtype codes
// Currently float32, bfloat16, and int64 are supported, using a robust enum-based approach
// Returns the integer dtype code PyTorch uses for float32
// (c10::ScalarType::Float). AOTI-generated code compares against this value.
int32_t aoti_torch_dtype_float32() {
  constexpr int32_t kFloat32DtypeCode = 6;
  return kFloat32DtypeCode;
}

// Returns the integer dtype code PyTorch uses for bfloat16
// (c10::ScalarType::BFloat16). AOTI-generated code compares against this
// value.
int32_t aoti_torch_dtype_bfloat16() {
  constexpr int32_t kBFloat16DtypeCode = 15;
  return kBFloat16DtypeCode;
}

// Returns the integer dtype code PyTorch uses for int64
// (c10::ScalarType::Long). AOTI-generated code compares against this value.
int32_t aoti_torch_dtype_int64() {
  constexpr int32_t kInt64DtypeCode = 4;
  return kInt64DtypeCode;
}

// Cleanup functions
void cleanup_tensor_metadata() {
internal::tensor_to_sizes.clear();
Expand Down
2 changes: 2 additions & 0 deletions backends/aoti/common_shims.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
int32_t aoti_torch_device_type_cpu();
int32_t aoti_torch_layout_strided();
int32_t aoti_torch_dtype_float32();
int32_t aoti_torch_dtype_bfloat16();
int32_t aoti_torch_dtype_int64();

// Autograd mode functions
int32_t aoti_torch_grad_mode_is_enabled();
Expand Down
2 changes: 1 addition & 1 deletion backends/aoti/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def define_common_targets():
link_whole = True,
supports_python_dlopen = True,
visibility = ["@EXECUTORCH_CLIENTS"],
deps = [
exported_deps = [
":common_shims",
":model_container",
],
Expand Down
5 changes: 1 addition & 4 deletions backends/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,7 @@ target_link_options(aoti_cuda PUBLIC -Wl,--export-dynamic)

# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
target_link_libraries(
aoti_cuda
PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
# Link PyTorch libraries for AOTI CUDA functions
${TORCH_LIBRARIES}
aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
)
# If you need other CUDA libraries, link them similarly:
# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
Expand Down
22 changes: 22 additions & 0 deletions backends/cuda/runtime/TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,25 @@ runtime.cxx_library(
("cuda", None, "cuda-lazy"),
],
)

runtime.cxx_library(
name = "cuda_backend",
srcs = [
"cuda_backend.cpp",
],
# @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
link_whole = True,
supports_python_dlopen = True,
# Constructor needed for backend registration.
compiler_flags = ["-Wno-global-constructors"],
visibility = ["@EXECUTORCH_CLIENTS"],
deps = [
":runtime_shims",
"//executorch/backends/aoti:aoti_common",
"//executorch/runtime/backend:interface",
"//executorch/runtime/core/exec_aten/util:tensor_util",
],
external_deps = [
("cuda", None, "cuda-lazy"),
],
)
207 changes: 76 additions & 131 deletions backends/cuda/runtime/cuda_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,14 @@
#include <executorch/backends/cuda/runtime/shims/memory.h>
#include <executorch/backends/cuda/runtime/utils.h>

namespace executorch {
namespace backends {
namespace cuda {
namespace executorch::backends::cuda {

// Resolves the exported symbol `name` from the dlopen()-ed library `handle`
// via dlsym() and stores it in the function pointer variable of the same
// name (whose type is expected to be declared as `name##Func`).
//
// NOTE(review): ET_CHECK_OR_RETURN_ERROR presumably expands to an early
// `return` with Error::AccessFailed when the symbol is missing — confirm
// against its definition. If so, this macro may only be used inside
// functions whose return type accepts an Error, and it returns without
// dlclose()-ing the handle (cleanup is the caller's responsibility).
#define LOAD_SYMBOL(name, handle)                                  \
  do {                                                             \
    name = reinterpret_cast<name##Func>(dlsym(handle, #name));     \
    ET_CHECK_OR_RETURN_ERROR(                                      \
        name != nullptr, AccessFailed, "Failed to load " #name);   \
  } while (0)

using namespace std;
using namespace aoti;
Expand All @@ -53,45 +58,11 @@ class ET_EXPERIMENTAL CudaBackend final
: public ::executorch::runtime::BackendInterface {
private:
Error register_shared_library_functions(void* so_handle) const {
AOTInductorModelContainerCreateWithDevice =
reinterpret_cast<AOTInductorModelContainerCreateWithDeviceFunc>(
dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice"));
if (AOTInductorModelContainerCreateWithDevice == nullptr) {
ET_LOG(Error, "Failed to load AOTInductorModelContainerCreateWithDevice");
return Error::AccessFailed;
}

AOTInductorModelContainerDelete =
reinterpret_cast<AOTInductorModelContainerDeleteFunc>(
dlsym(so_handle, "AOTInductorModelContainerDelete"));
if (AOTInductorModelContainerDelete == nullptr) {
ET_LOG(Error, "Failed to load AOTInductorModelContainerDelete");
return Error::AccessFailed;
}

AOTInductorModelContainerGetNumInputs =
reinterpret_cast<AOTInductorModelContainerGetNumInputsFunc>(
dlsym(so_handle, "AOTInductorModelContainerGetNumInputs"));
if (AOTInductorModelContainerGetNumInputs == nullptr) {
ET_LOG(Error, "Failed to load AOTInductorModelContainerGetNumInputs");
return Error::AccessFailed;
}

AOTInductorModelContainerGetNumOutputs =
reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
if (AOTInductorModelContainerGetNumOutputs == nullptr) {
ET_LOG(Error, "Failed to load AOTInductorModelContainerGetNumOutputs");
return Error::AccessFailed;
}

AOTInductorModelContainerRun =
reinterpret_cast<AOTInductorModelContainerRunFunc>(
dlsym(so_handle, "AOTInductorModelContainerRun"));
if (AOTInductorModelContainerRun == nullptr) {
ET_LOG(Error, "Failed to load AOTInductorModelContainerRun");
return Error::AccessFailed;
}
LOAD_SYMBOL(AOTInductorModelContainerCreateWithDevice, so_handle);
LOAD_SYMBOL(AOTInductorModelContainerDelete, so_handle);
LOAD_SYMBOL(AOTInductorModelContainerGetNumInputs, so_handle);
LOAD_SYMBOL(AOTInductorModelContainerGetNumOutputs, so_handle);
LOAD_SYMBOL(AOTInductorModelContainerRun, so_handle);

return Error::Ok;
}
Expand Down Expand Up @@ -122,14 +93,13 @@ class ET_EXPERIMENTAL CudaBackend final

const NamedDataMap* named_data_map = context.get_named_data_map();
auto aoti_cuda_buffer = named_data_map->get_data(so_blob_key.c_str());
if (!aoti_cuda_buffer.ok()) {
ET_LOG(
Error,
"Failed to get data for key %s: 0x%x",
so_blob_key.c_str(),
aoti_cuda_buffer.error());
return aoti_cuda_buffer.error();
}
ET_CHECK_OR_RETURN_ERROR(
aoti_cuda_buffer.ok(),
Internal,
"Failed to get data for key %s: 0x%x",
so_blob_key.c_str(),
static_cast<uint32_t>(aoti_cuda_buffer.error()));

// Generate dynamic temporary file path
filesystem::path temp_dir = filesystem::temp_directory_path();
filesystem::path so_path =
Expand All @@ -144,39 +114,35 @@ class ET_EXPERIMENTAL CudaBackend final
"Writing %zu bytes to %s",
aoti_cuda_buffer->size(),
so_path.c_str());

outfile.write(
static_cast<const char*>(aoti_cuda_buffer->data()),
aoti_cuda_buffer->size());

if (!outfile) {
ET_LOG(Error, "Failed to write to file %s", so_path.c_str());
return Error::AccessFailed;
}
ET_CHECK_OR_RETURN_ERROR(
outfile, AccessFailed, "Failed to write to file %s", so_path.c_str());

// Finish writing the file to disk
outfile.close();

// Load the ELF using dlopen
void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
if (so_handle == nullptr) {
ET_LOG(Error, "Failed to load shared library: %s", dlerror());
return Error::AccessFailed;
}
ET_CHECK_OR_RETURN_ERROR(
so_handle != nullptr,
AccessFailed,
"Failed to load shared library: %s",
dlerror());

processed->Free();

// Register all shared library functions
Error reg_err = register_shared_library_functions(so_handle);
if (reg_err != Error::Ok) {
return reg_err;
}
ET_CHECK_OK_OR_RETURN_ERROR(register_shared_library_functions(so_handle));

AOTInductorModelContainerHandle container_handle = nullptr;

AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice(
&container_handle, 1, "cuda", nullptr);
if (err != Error::Ok) {
return err;
}
ET_CHECK_OK_OR_RETURN_ERROR(AOTInductorModelContainerCreateWithDevice(
&container_handle, 1, "cuda", nullptr));

ET_LOG(Info, "container_handle = %p", container_handle);

AOTIDelegateHandle* handle = new AOTIDelegateHandle();
Expand Down Expand Up @@ -206,15 +172,13 @@ class ET_EXPERIMENTAL CudaBackend final
AOTInductorModelContainerGetNumOutputs(
handle->container_handle, &n_outputs);

if (n_inputs + n_outputs != args.size()) {
ET_LOG(
Error,
"number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
n_inputs,
n_outputs,
args.size());
return Error::InvalidArgument;
}
ET_CHECK_OR_RETURN_ERROR(
n_inputs + n_outputs == args.size(),
InvalidArgument,
"number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
n_inputs,
n_outputs,
args.size())

// NOTE: ExecuTorch tensors are always on CPU/host memory
// We need to create GPU copies for CUDA kernel execution
Expand Down Expand Up @@ -244,19 +208,20 @@ class ET_EXPERIMENTAL CudaBackend final
0, // device_index = 0
&gpu_input_handle);

if (create_err != Error::Ok) {
ET_LOG(Error, "Failed to create GPU tensor for input %d", i);
return Error::Internal;
}
ET_CHECK_OR_RETURN_ERROR(
create_err == Error::Ok,
Internal,
"Failed to create GPU tensor for input %d",
i);

gpu_inputs[i] = gpu_input_handle;

// Copy data from CPU to GPU
Error copy_err = aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0);
if (copy_err != Error::Ok) {
ET_LOG(Error, "Failed to copy input %d from CPU to GPU", i);
return Error::Internal;
}
ET_CHECK_OR_RETURN_ERROR(
aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
Internal,
"Failed to copy input %d from CPU to GPU",
i);
}
ET_LOG(Info, "Inputs copied to GPU");
// Process output tensors: create GPU counterparts for ExecuTorch CPU
Expand All @@ -280,10 +245,11 @@ class ET_EXPERIMENTAL CudaBackend final
0, // device_index = 0
&gpu_output_handle);

if (create_err != Error::Ok) {
ET_LOG(Error, "Failed to create GPU tensor for output %d", i);
return Error::Internal;
}
ET_CHECK_OR_RETURN_ERROR(
create_err == Error::Ok,
Internal,
"Failed to create GPU tensor for output %d",
i);

gpu_outputs[i] = gpu_output_handle;
}
Expand All @@ -298,13 +264,11 @@ class ET_EXPERIMENTAL CudaBackend final
handle->cuda_stream, // Pass the actual CUDA stream
nullptr); // proxy_executor_handle can remain nullptr

if (error != Error::Ok) {
ET_LOG(
Error,
"AOTInductorModelContainerRun failed with error code %d",
error);
return Error::Internal;
}
ET_CHECK_OR_RETURN_ERROR(
error == Error::Ok,
Internal,
"AOTInductorModelContainerRun failed with error code %d",
error);

// Copy GPU output results back to CPU output tensors
for (int i = 0; i < n_outputs; i++) {
Expand All @@ -320,18 +284,6 @@ class ET_EXPERIMENTAL CudaBackend final
i);
}

// Clean up GPU tensors that we created (ExecuTorch tensors are always
// CPU, so all GPU tensors are our copies)
for (int i = 0; i < n_inputs; i++) {
// All GPU input tensors were created by us, delete them
aoti_torch_delete_tensor_object(gpu_inputs[i]);
}

for (int i = 0; i < n_outputs; i++) {
// All GPU output tensors were created by us, delete them
aoti_torch_delete_tensor_object(gpu_outputs[i]);
}

return Error::Ok;
}

Expand All @@ -352,18 +304,13 @@ class ET_EXPERIMENTAL CudaBackend final
handle->cuda_stream = nullptr;
}

// Delete the container BEFORE closing the shared library
if (handle->container_handle != nullptr) {
AOTIRuntimeError delete_result =
AOTInductorModelContainerDelete(handle->container_handle);
if (delete_result != Error::Ok) {
ET_LOG(
Error,
"AOTInductorModelContainerDelete failed with error code %d",
delete_result);
}
handle->container_handle = nullptr;
}
// NOTE: AOTInductorModelContainerDelete does not work correctly with
// multiple .so files. Deleting one container frees shared resources,
// which causes segmentation faults when attempting to delete other
// containers. As a workaround, we skip explicit container deletion
// and defer cleanup to the OS.
// TODO(gasoonjia): Find a proper solution for safe container deletion.
// AOTInductorModelContainerDelete(handle->container_handle);

// Now close the shared library
if (handle->so_handle != nullptr) {
Expand All @@ -374,27 +321,25 @@ class ET_EXPERIMENTAL CudaBackend final
if (!handle->so_path.empty()) {
std::error_code remove_error;
std::filesystem::remove(handle->so_path, remove_error);
if (remove_error) {
ET_LOG(
Error,
"Failed to remove temporary shared library %s: %s",
handle->so_path.c_str(),
remove_error.message().c_str());
}
ET_CHECK_OR_LOG_ERROR(
!remove_error,
"Failed to remove temporary shared library %s: %s",
handle->so_path.c_str(),
remove_error.message().c_str());
}

delete handle;
clear_all_tensors();
}
};

} // namespace cuda
} // namespace executorch::backends::cuda

namespace executorch::backends {
namespace {
auto cls = cuda::CudaBackend();
executorch::runtime::Backend backend{"CudaBackend", &cls};
static executorch::runtime::Error success_with_compiler =
register_backend(backend);
} // namespace

} // namespace backends
} // namespace executorch
} // namespace executorch::backends
Loading
Loading