From 4703a085570a4b62e222d42302de5c7fead19fae Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 8 Oct 2025 10:09:55 -0700
Subject: [PATCH] update cuda delegate resource free pipeline to be safe and
 segfault-free

This diff updates the `clear_all_tensors()` function and enables it during
the backend destroy stage. Furthermore, we defer the container handle
deletion to the OS to avoid a potential segfault when more than one .so file
is loaded.

Differential Revision: [D84135792](https://our.internmc.facebook.com/intern/diff/D84135792/)

[ghstack-poisoned]
---
 backends/cuda/runtime/cuda_backend.cpp | 31 ++++++++------------------
 backends/cuda/runtime/shims/memory.cpp | 15 +++++++++----
 2 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 4e87ff1b566..3c1e8e5dbb5 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -286,18 +286,6 @@ class ET_EXPERIMENTAL CudaBackend final
           i);
     }

-    // Clean up GPU tensors that we created (ExecuTorch tensors are always
-    // CPU, so all GPU tensors are our copies)
-    for (int i = 0; i < n_inputs; i++) {
-      // All GPU input tensors were created by us, delete them
-      aoti_torch_delete_tensor_object(gpu_inputs[i]);
-    }
-
-    for (int i = 0; i < n_outputs; i++) {
-      // All GPU output tensors were created by us, delete them
-      aoti_torch_delete_tensor_object(gpu_outputs[i]);
-    }
-
     return Error::Ok;
   }

@@ -318,16 +306,14 @@ class ET_EXPERIMENTAL CudaBackend final
       handle->cuda_stream = nullptr;
     }

-    // Delete the container BEFORE closing the shared library
-    if (handle->container_handle != nullptr) {
-      AOTIRuntimeError delete_result =
-          AOTInductorModelContainerDelete(handle->container_handle);
-      ET_CHECK_OR_LOG(
-          delete_result == Error::Ok,
-          "Failed to delete AOTInductorModelContainer with error code %d",
-          delete_result);
-      handle->container_handle = nullptr;
-    }
+    // We noticed that AOTInductorModelContainerDelete does not work well with
+    // multiple .so files: freeing one container also frees resources shared
+    // with the others, which leads to a segfault when the remaining .so files
+    // are freed. For now we do not explicitly delete the container and defer
+    // its cleanup to the OS.
+    // TODO(gasoonjia): find a better and safer solution to delete the
+    // container.
+    // AOTInductorModelContainerDelete(handle->container_handle);

     // Now close the shared library
     if (handle->so_handle != nullptr) {
@@ -345,6 +331,7 @@ class ET_EXPERIMENTAL CudaBackend final
     }

     delete handle;
+    clear_all_tensors();
   }
 };

diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index cbaca68576e..a054169330b 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -271,14 +271,21 @@ void clear_all_tensors() {
   // Use aoti_torch_delete_tensor_object to properly delete each tensor
   // Note: We need to collect tensor pointers first since deletion modifies the
   // set
-  auto old_tensors =
-      std::move(tensors); // tensors is now empty and no need to copy
-  for (const auto& tensor_shared : old_tensors) {
-    aoti_torch_delete_tensor_object(tensor_shared.get());
+  std::vector<Tensor*> tensor_ptrs;
+  tensor_ptrs.reserve(tensors.size());
+  for (const auto& tensor_shared : tensors) {
+    tensor_ptrs.push_back(tensor_shared.get());
+  }
+
+  // Now delete each tensor - this will modify the global tensors set
+  for (Tensor* tensor_ptr : tensor_ptrs) {
+    aoti_torch_delete_tensor_object(tensor_ptr);
   }

   // tensors set should now be empty, but ensure it's cleared
   tensors.clear();
+
+  ET_LOG(Info, "Cleared all tensors");
 }

 AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {
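
For reference, the memory.cpp hunk switches `clear_all_tensors()` to a snapshot-then-delete pattern: `aoti_torch_delete_tensor_object()` erases entries from the global `tensors` set, so the loop first copies the raw pointers out and only then deletes through that copy. Below is a minimal standalone sketch of the same pattern; `Tensor`, `live_tensors`, `delete_tensor`, and `clear_all` are illustrative stand-ins, not the real ExecuTorch types or shim APIs.

```cpp
#include <memory>
#include <unordered_set>
#include <vector>

// Illustrative stand-ins for the real ExecuTorch types and shim functions.
struct Tensor {};
std::unordered_set<std::shared_ptr<Tensor>> live_tensors;

// Stand-in for aoti_torch_delete_tensor_object: erases the matching entry
// from the global set, which invalidates any iterator pointing at it.
void delete_tensor(Tensor* ptr) {
  for (auto it = live_tensors.begin(); it != live_tensors.end(); ++it) {
    if (it->get() == ptr) {
      live_tensors.erase(it);
      return;
    }
  }
}

void clear_all() {
  // Snapshot the raw pointers first so the deletion loop never iterates the
  // set that delete_tensor() is mutating.
  std::vector<Tensor*> snapshot;
  snapshot.reserve(live_tensors.size());
  for (const auto& t : live_tensors) {
    snapshot.push_back(t.get());
  }
  for (Tensor* ptr : snapshot) {
    delete_tensor(ptr);
  }
  live_tensors.clear();  // should already be empty; clear defensively
}

int main() {
  live_tensors.insert(std::make_shared<Tensor>());
  live_tensors.insert(std::make_shared<Tensor>());
  clear_all();  // live_tensors is empty afterwards
  return 0;
}
```

Taking a snapshot keeps the iteration independent of the set that the deletes mutate, which avoids iterator invalidation during teardown.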