From 59a20ff9afda2a47a6835b912fb68d1ac7a01f2e Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 8 Oct 2025 10:09:50 -0700
Subject: [PATCH 1/3] refactor cuda_backend.cpp

This diff is a comprehensive refactor of cuda_backend.cpp. Two main points:
1. Reuse the standard ExecuTorch macros (ET_CHECK_OR_RETURN_ERROR and others) to replace the existing if/else + ET_LOG branches.
2. Introduce a LOAD_SYMBOL macro to consolidate the repeated symbol-loading boilerplate.

Differential Revision: [D84135844](https://our.internmc.facebook.com/intern/diff/D84135844/)

[ghstack-poisoned]
---
 backends/cuda/runtime/cuda_backend.cpp | 175 ++++++++++---------------
 1 file changed, 68 insertions(+), 107 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index b25bbef6b04..4e87ff1b566 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -30,6 +30,13 @@ namespace executorch {
 namespace backends {
 namespace cuda {
 
+#define LOAD_SYMBOL(name, handle)                                \
+  do {                                                           \
+    name = reinterpret_cast<name##Func>(dlsym(handle, #name));   \
+    ET_CHECK_OR_RETURN_ERROR(                                    \
+        name != nullptr, AccessFailed, "Failed to load " #name); \
+  } while (0)
+
 using namespace std;
 using namespace aoti;
 
@@ -53,45 +60,11 @@ class ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
  private:
   Error register_shared_library_functions(void* so_handle) const {
-    AOTInductorModelContainerCreateWithDevice =
-        reinterpret_cast<AOTInductorModelContainerCreateWithDeviceFunc>(
-            dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice"));
-    if (AOTInductorModelContainerCreateWithDevice == nullptr) {
-      ET_LOG(Error, "Failed to load AOTInductorModelContainerCreateWithDevice");
-      return Error::AccessFailed;
-    }
-
-    AOTInductorModelContainerDelete =
-        reinterpret_cast<AOTInductorModelContainerDeleteFunc>(
-            dlsym(so_handle, "AOTInductorModelContainerDelete"));
-    if (AOTInductorModelContainerDelete == nullptr) {
-      ET_LOG(Error, "Failed to load AOTInductorModelContainerDelete");
-      return Error::AccessFailed;
-    }
-
-    AOTInductorModelContainerGetNumInputs =
-        reinterpret_cast<AOTInductorModelContainerGetNumInputsFunc>(
-            dlsym(so_handle, "AOTInductorModelContainerGetNumInputs"));
-    if (AOTInductorModelContainerGetNumInputs == nullptr) {
-      ET_LOG(Error, "Failed to load AOTInductorModelContainerGetNumInputs");
-      return Error::AccessFailed;
-    }
-
-    AOTInductorModelContainerGetNumOutputs =
-        reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
-            dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
-    if (AOTInductorModelContainerGetNumOutputs == nullptr) {
-      ET_LOG(Error, "Failed to load AOTInductorModelContainerGetNumOutputs");
-      return Error::AccessFailed;
-    }
-
-    AOTInductorModelContainerRun =
-        reinterpret_cast<AOTInductorModelContainerRunFunc>(
-            dlsym(so_handle, "AOTInductorModelContainerRun"));
-    if (AOTInductorModelContainerRun == nullptr) {
-      ET_LOG(Error, "Failed to load AOTInductorModelContainerRun");
-      return Error::AccessFailed;
-    }
+    LOAD_SYMBOL(AOTInductorModelContainerCreateWithDevice, so_handle);
+    LOAD_SYMBOL(AOTInductorModelContainerDelete, so_handle);
+    LOAD_SYMBOL(AOTInductorModelContainerGetNumInputs, so_handle);
+    LOAD_SYMBOL(AOTInductorModelContainerGetNumOutputs, so_handle);
+    LOAD_SYMBOL(AOTInductorModelContainerRun, so_handle);
 
     return Error::Ok;
   }
@@ -122,14 +95,13 @@ class ET_EXPERIMENTAL CudaBackend final
 
     const NamedDataMap* named_data_map = context.get_named_data_map();
     auto aoti_cuda_buffer = named_data_map->get_data(so_blob_key.c_str());
-    if (!aoti_cuda_buffer.ok()) {
-      ET_LOG(
-          Error,
-          "Failed to get data for key %s: 0x%x",
-          so_blob_key.c_str(),
-          aoti_cuda_buffer.error());
-      return aoti_cuda_buffer.error();
-    }
+    ET_CHECK_OR_RETURN_ERROR(
+        aoti_cuda_buffer.ok(),
+        aoti_cuda_buffer.error(),
+        "Failed to get data for key %s: 0x%x",
+        so_blob_key.c_str(),
+        aoti_cuda_buffer.error());
+
     // Generate dynamic temporary file path
     filesystem::path temp_dir = filesystem::temp_directory_path();
     filesystem::path so_path =
@@ -144,39 +116,35 @@ class ET_EXPERIMENTAL CudaBackend final
         "Writing %zu bytes to %s",
         aoti_cuda_buffer->size(),
         so_path.c_str());
+
     outfile.write(
         static_cast<const char*>(aoti_cuda_buffer->data()),
         aoti_cuda_buffer->size());
 
-    if (!outfile) {
-      ET_LOG(Error, "Failed to write to file %s", so_path.c_str());
-      return Error::AccessFailed;
-    }
+    ET_CHECK_OR_RETURN_ERROR(
+        outfile, AccessFailed, "Failed to write to file %s", so_path.c_str());
+
     // Finish writing the file to disk
     outfile.close();
 
     // Load the ELF using dlopen
     void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
-    if (so_handle == nullptr) {
-      ET_LOG(Error, "Failed to load shared library: %s", dlerror());
-      return Error::AccessFailed;
-    }
+    ET_CHECK_OR_RETURN_ERROR(
+        so_handle != nullptr,
+        AccessFailed,
+        "Failed to load shared library: %s",
+        dlerror());
 
     processed->Free();
 
     // Register all shared library functions
-    Error reg_err = register_shared_library_functions(so_handle);
-    if (reg_err != Error::Ok) {
-      return reg_err;
-    }
+    ET_CHECK_OK_OR_RETURN_ERROR(register_shared_library_functions(so_handle));
 
     AOTInductorModelContainerHandle container_handle = nullptr;
 
-    AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice(
-        &container_handle, 1, "cuda", nullptr);
-    if (err != Error::Ok) {
-      return err;
-    }
+    ET_CHECK_OK_OR_RETURN_ERROR(AOTInductorModelContainerCreateWithDevice(
+        &container_handle, 1, "cuda", nullptr));
+
     ET_LOG(Info, "container_handle = %p", container_handle);
 
     AOTIDelegateHandle* handle = new AOTIDelegateHandle();
@@ -206,15 +174,13 @@ class ET_EXPERIMENTAL CudaBackend final
     AOTInductorModelContainerGetNumOutputs(
         handle->container_handle, &n_outputs);
 
-    if (n_inputs + n_outputs != args.size()) {
-      ET_LOG(
-          Error,
-          "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
-          n_inputs,
-          n_outputs,
-          args.size());
-      return Error::InvalidArgument;
-    }
+    ET_CHECK_OR_RETURN_ERROR(
+        n_inputs + n_outputs == args.size(),
+        InvalidArgument,
+        "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
+        n_inputs,
+        n_outputs,
+        args.size());
 
     // NOTE: ExecuTorch tensors are always on CPU/host memory
     // We need to create GPU copies for CUDA kernel execution
@@ -244,19 +210,20 @@ class ET_EXPERIMENTAL CudaBackend final
           0, // device_index = 0
           &gpu_input_handle);
 
-      if (create_err != Error::Ok) {
-        ET_LOG(Error, "Failed to create GPU tensor for input %d", i);
-        return Error::Internal;
-      }
+      ET_CHECK_OR_RETURN_ERROR(
+          create_err == Error::Ok,
+          Internal,
+          "Failed to create GPU tensor for input %d",
+          i);
 
       gpu_inputs[i] = gpu_input_handle;
 
       // Copy data from CPU to GPU
-      Error copy_err = aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0);
-      if (copy_err != Error::Ok) {
-        ET_LOG(Error, "Failed to copy input %d from CPU to GPU", i);
-        return Error::Internal;
-      }
+      ET_CHECK_OR_RETURN_ERROR(
+          aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
+          Internal,
+          "Failed to copy input %d from CPU to GPU",
+          i);
     }
     ET_LOG(Info, "Inputs copied to GPU");
     // Process output tensors: create GPU counterparts for ExecuTorch CPU
@@ -280,10 +247,11 @@ class ET_EXPERIMENTAL CudaBackend final
           0, // device_index = 0
           &gpu_output_handle);
 
-      if (create_err != Error::Ok) {
-        ET_LOG(Error, "Failed to create GPU tensor for output %d", i);
-        return Error::Internal;
-      }
+      ET_CHECK_OR_RETURN_ERROR(
+          create_err == Error::Ok,
+          Internal,
+          "Failed to create GPU tensor for output %d",
+          i);
 
       gpu_outputs[i] = gpu_output_handle;
     }
@@ -298,13 +266,11 @@ class ET_EXPERIMENTAL CudaBackend final
         handle->cuda_stream, // Pass the actual CUDA stream
         nullptr); // proxy_executor_handle can remain nullptr
 
-    if (error != Error::Ok) {
-      ET_LOG(
-          Error,
-          "AOTInductorModelContainerRun failed with error code %d",
-          error);
-      return Error::Internal;
-    }
+    ET_CHECK_OR_RETURN_ERROR(
+        error == Error::Ok,
+        Internal,
+        "AOTInductorModelContainerRun failed with error code %d",
+        error);
 
     // Copy GPU output results back to CPU output tensors
     for (int i = 0; i < n_outputs; i++) {
@@ -356,12 +322,10 @@ class ET_EXPERIMENTAL CudaBackend final
     if (handle->container_handle != nullptr) {
       AOTIRuntimeError delete_result =
           AOTInductorModelContainerDelete(handle->container_handle);
-      if (delete_result != Error::Ok) {
-        ET_LOG(
-            Error,
-            "AOTInductorModelContainerDelete failed with error code %d",
-            delete_result);
-      }
+      ET_CHECK_OR_LOG(
+          delete_result == Error::Ok,
+          "Failed to delete AOTInductorModelContainer with error code %d",
+          delete_result);
       handle->container_handle = nullptr;
     }
 
@@ -373,14 +337,11 @@ class ET_EXPERIMENTAL CudaBackend final
     // Remove the temporary shared library file
     if (!handle->so_path.empty()) {
       std::error_code remove_error;
-      std::filesystem::remove(handle->so_path, remove_error);
-      if (remove_error) {
-        ET_LOG(
-            Error,
-            "Failed to remove temporary shared library %s: %s",
-            handle->so_path.c_str(),
-            remove_error.message().c_str());
-      }
+      ET_CHECK_OR_LOG(
+          !remove_error,
+          "Failed to remove temporary shared library %s: %s",
+          handle->so_path.c_str(),
+          remove_error.message().c_str())
     }
 
     delete handle;

From f7292fee58b8e6f611c71e52ab8b18ddc541f28c Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 8 Oct 2025 10:35:50 -0700
Subject: [PATCH 2/3] Update on "refactor cuda_backend.cpp"

This diff is a comprehensive refactor of cuda_backend.cpp. Two main points:
1. Reuse the standard ExecuTorch macros (ET_CHECK_OR_RETURN_ERROR and others) to replace the existing if/else + ET_LOG branches.
2. Introduce a LOAD_SYMBOL macro to consolidate the repeated symbol-loading boilerplate.

Differential Revision: [D84135844](https://our.internmc.facebook.com/intern/diff/D84135844/)

[ghstack-poisoned]
---
 backends/cuda/runtime/cuda_backend.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 4e87ff1b566..782b2898077 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -337,6 +337,7 @@ class ET_EXPERIMENTAL CudaBackend final
     // Remove the temporary shared library file
     if (!handle->so_path.empty()) {
       std::error_code remove_error;
+      std::filesystem::remove(handle->so_path, remove_error);
       ET_CHECK_OR_LOG(
           !remove_error,
           "Failed to remove temporary shared library %s: %s",

From 8361cebb2b51fca4c7299bb096894749cc764f67 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@icloud.com>
Date: Wed, 8 Oct 2025 10:44:49 -0700
Subject: [PATCH 3/3] Update on "refactor cuda_backend.cpp"

This diff is a comprehensive refactor of cuda_backend.cpp. Two main points:
1. Reuse the standard ExecuTorch macros (ET_CHECK_OR_RETURN_ERROR and others) to replace the existing if/else + ET_LOG branches.
2. Introduce a LOAD_SYMBOL macro to consolidate the repeated symbol-loading boilerplate.

Differential Revision: [D84135844](https://our.internmc.facebook.com/intern/diff/D84135844/)

[ghstack-poisoned]
---
 backends/cuda/runtime/cuda_backend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 782b2898077..2d3b72b5f55 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -342,7 +342,7 @@ class ET_EXPERIMENTAL CudaBackend final
           !remove_error,
           "Failed to remove temporary shared library %s: %s",
           handle->so_path.c_str(),
-          remove_error.message().c_str())
+          remove_error.message().c_str());
     }
 
     delete handle;