From 4703a085570a4b62e222d42302de5c7fead19fae Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 8 Oct 2025 10:09:55 -0700
Subject: [PATCH] update cuda delegate resource free pipeline to be safe and
 segfault-free

This diff updates the `clear_all_tensors()` function and enables it during
the backend destroy stage. Furthermore, we defer the container handle
deletion to the OS to avoid a potential segfault when more than one .so file
is loaded.

Differential Revision: [D84135792](https://our.internmc.facebook.com/intern/diff/D84135792/)

[ghstack-poisoned]
---
 backends/cuda/runtime/cuda_backend.cpp | 31 ++++++++------------------
 backends/cuda/runtime/shims/memory.cpp | 15 +++++++++----
 2 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 4e87ff1b566..3c1e8e5dbb5 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -286,18 +286,6 @@ class ET_EXPERIMENTAL CudaBackend final
           i);
     }

-    // Clean up GPU tensors that we created (ExecuTorch tensors are always
-    // CPU, so all GPU tensors are our copies)
-    for (int i = 0; i < n_inputs; i++) {
-      // All GPU input tensors were created by us, delete them
-      aoti_torch_delete_tensor_object(gpu_inputs[i]);
-    }
-
-    for (int i = 0; i < n_outputs; i++) {
-      // All GPU output tensors were created by us, delete them
-      aoti_torch_delete_tensor_object(gpu_outputs[i]);
-    }
-
     return Error::Ok;
   }

@@ -318,16 +306,14 @@ class ET_EXPERIMENTAL CudaBackend final
       handle->cuda_stream = nullptr;
     }

-    // Delete the container BEFORE closing the shared library
-    if (handle->container_handle != nullptr) {
-      AOTIRuntimeError delete_result =
-          AOTInductorModelContainerDelete(handle->container_handle);
-      ET_CHECK_OR_LOG(
-          delete_result == Error::Ok,
-          "Failed to delete AOTInductorModelContainer with error code %d",
-          delete_result);
-      handle->container_handle = nullptr;
-    }
+    // We noticed that AOTInductorModelContainerDelete does not work well with
+    // multiple .so files: freeing one container also frees resources shared
+    // with the others, which leads to a segfault when the remaining .so files
+    // are freed. For now we do not explicitly delete the container and defer
+    // its cleanup to the OS.
+    // TODO(gasoonjia): find a better and safer solution to delete the
+    // container.
+    // AOTInductorModelContainerDelete(handle->container_handle);

     // Now close the shared library
     if (handle->so_handle != nullptr) {
@@ -345,6 +331,7 @@ class ET_EXPERIMENTAL CudaBackend final
     }

     delete handle;
+    clear_all_tensors();
   }
 };

diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index cbaca68576e..a054169330b 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -271,14 +271,21 @@ void clear_all_tensors() {
   // Use aoti_torch_delete_tensor_object to properly delete each tensor
   // Note: We need to collect tensor pointers first since deletion modifies the
   // set
-  auto old_tensors =
-      std::move(tensors); // tensors is now empty and no need to copy
-  for (const auto& tensor_shared : old_tensors) {
-    aoti_torch_delete_tensor_object(tensor_shared.get());
+  std::vector<Tensor*> tensor_ptrs;
+  tensor_ptrs.reserve(tensors.size());
+  for (const auto& tensor_shared : tensors) {
+    tensor_ptrs.push_back(tensor_shared.get());
+  }
+
+  // Now delete each tensor - this will modify the global tensors set
+  for (Tensor* tensor_ptr : tensor_ptrs) {
+    aoti_torch_delete_tensor_object(tensor_ptr);
   }

   // tensors set should now be empty, but ensure it's cleared
   tensors.clear();
+
+  ET_LOG(Info, "Cleared all tensors");
 }

 AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {
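
For reference, the memory.cpp hunk switches `clear_all_tensors()` to a snapshot-then-delete pattern: `aoti_torch_delete_tensor_object()` erases entries from the global `tensors` set, so the loop first copies the raw pointers out and only then deletes through that copy. Below is a minimal standalone sketch of the same pattern; `Tensor`, `live_tensors`, `delete_tensor`, and `clear_all` are illustrative stand-ins, not the real ExecuTorch types or shim APIs.

```cpp
#include <memory>
#include <unordered_set>
#include <vector>

// Illustrative stand-ins for the real ExecuTorch types and shim functions.
struct Tensor {};
std::unordered_set<std::shared_ptr<Tensor>> live_tensors;

// Stand-in for aoti_torch_delete_tensor_object: erases the matching entry
// from the global set, which invalidates any iterator pointing at it.
void delete_tensor(Tensor* ptr) {
  for (auto it = live_tensors.begin(); it != live_tensors.end(); ++it) {
    if (it->get() == ptr) {
      live_tensors.erase(it);
      return;
    }
  }
}

void clear_all() {
  // Snapshot the raw pointers first so the deletion loop never iterates the
  // set that delete_tensor() is mutating.
  std::vector<Tensor*> snapshot;
  snapshot.reserve(live_tensors.size());
  for (const auto& t : live_tensors) {
    snapshot.push_back(t.get());
  }
  for (Tensor* ptr : snapshot) {
    delete_tensor(ptr);
  }
  live_tensors.clear();  // should already be empty; clear defensively
}

int main() {
  live_tensors.insert(std::make_shared<Tensor>());
  live_tensors.insert(std::make_shared<Tensor>());
  clear_all();  // live_tensors is empty afterwards
  return 0;
}
```

Taking a snapshot keeps the iteration independent of the set that the deletes mutate, which avoids iterator invalidation during teardown.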