Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backends/aoti/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ target_compile_options(aoti_common PUBLIC -fexceptions -frtti -fPIC)
# Ensure symbols are exported properly
target_link_options(aoti_common PUBLIC -Wl,--export-dynamic)

# Link against PyTorch libraries and standard libraries
# Link against ExecuTorch libraries and standard libraries
target_link_libraries(aoti_common PUBLIC extension_tensor ${CMAKE_DL_LIBS})
executorch_target_link_options_shared_lib(aoti_common)

Expand Down
9 changes: 8 additions & 1 deletion backends/aoti/common_shims.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,18 @@ int32_t aoti_torch_layout_strided() {
}

// Dtype constants - these return the PyTorch dtype codes
// Currently float32, bfloat16, and int64 are supported, using a robust enum-based approach
// Returns the integer dtype code PyTorch uses for float32
// (c10::ScalarType::Float). AOTI-generated code compares against this value.
int32_t aoti_torch_dtype_float32() {
  constexpr int32_t kFloat32DtypeCode = 6;
  return kFloat32DtypeCode;
}

// Returns the integer dtype code PyTorch uses for bfloat16
// (c10::ScalarType::BFloat16). AOTI-generated code compares against this
// value.
int32_t aoti_torch_dtype_bfloat16() {
  constexpr int32_t kBFloat16DtypeCode = 15;
  return kBFloat16DtypeCode;
}

// Returns the integer dtype code PyTorch uses for int64
// (c10::ScalarType::Long). AOTI-generated code compares against this value.
int32_t aoti_torch_dtype_int64() {
  constexpr int32_t kInt64DtypeCode = 4;
  return kInt64DtypeCode;
}

// Cleanup functions
void cleanup_tensor_metadata() {
internal::tensor_to_sizes.clear();
Expand Down
2 changes: 2 additions & 0 deletions backends/aoti/common_shims.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
int32_t aoti_torch_device_type_cpu();
int32_t aoti_torch_layout_strided();
int32_t aoti_torch_dtype_float32();
int32_t aoti_torch_dtype_bfloat16();
int32_t aoti_torch_dtype_int64();

// Autograd mode functions
int32_t aoti_torch_grad_mode_is_enabled();
Expand Down
2 changes: 1 addition & 1 deletion backends/aoti/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def define_common_targets():
link_whole = True,
supports_python_dlopen = True,
visibility = ["@EXECUTORCH_CLIENTS"],
deps = [
exported_deps = [
":common_shims",
":model_container",
],
Expand Down
5 changes: 1 addition & 4 deletions backends/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,7 @@ target_link_options(aoti_cuda PUBLIC -Wl,--export-dynamic)

# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
target_link_libraries(
aoti_cuda
PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
# Link PyTorch libraries for AOTI CUDA functions
${TORCH_LIBRARIES}
aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
)
# If you need other CUDA libraries, link them similarly:
# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)
Expand Down
22 changes: 22 additions & 0 deletions backends/cuda/runtime/TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,25 @@ runtime.cxx_library(
("cuda", None, "cuda-lazy"),
],
)

runtime.cxx_library(
name = "cuda_backend",
srcs = [
"cuda_backend.cpp",
],
# @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
link_whole = True,
supports_python_dlopen = True,
# Constructor needed for backend registration.
compiler_flags = ["-Wno-global-constructors"],
visibility = ["@EXECUTORCH_CLIENTS"],
deps = [
":runtime_shims",
"//executorch/backends/aoti:aoti_common",
"//executorch/runtime/backend:interface",
"//executorch/runtime/core/exec_aten/util:tensor_util",
],
external_deps = [
("cuda", None, "cuda-lazy"),
],
)
207 changes: 76 additions & 131 deletions backends/cuda/runtime/cuda_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,14 @@
#include <executorch/backends/cuda/runtime/shims/memory.h>
#include <executorch/backends/cuda/runtime/utils.h>

namespace executorch {
namespace backends {
namespace cuda {
namespace executorch::backends::cuda {

// Resolves the exported symbol `name` from the dlopen()-ed library `handle`
// via dlsym() and stores it in the function pointer variable of the same
// name (whose type is expected to be declared as `name##Func`).
//
// NOTE(review): ET_CHECK_OR_RETURN_ERROR presumably expands to an early
// `return` with Error::AccessFailed when the symbol is missing — confirm
// against its definition. If so, this macro may only be used inside
// functions whose return type accepts an Error, and it returns without
// dlclose()-ing the handle (cleanup is the caller's responsibility).
#define LOAD_SYMBOL(name, handle)                                  \
  do {                                                             \
    name = reinterpret_cast<name##Func>(dlsym(handle, #name));     \
    ET_CHECK_OR_RETURN_ERROR(                                      \
        name != nullptr, AccessFailed, "Failed to load " #name);   \
  } while (0)

using namespace std;
using namespace aoti;
Expand All @@ -53,45 +58,11 @@ class ET_EXPERIMENTAL CudaBackend final
: public ::executorch::runtime::BackendInterface {
private:
Error register_shared_library_functions(void* so_handle) const {
AOTInductorModelContainerCreateWithDevice =
reinterpret_cast<AOTInductorModelContainerCreateWithDeviceFunc>(
dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice"));
if (AOTInductorModelContainerCreateWithDevice == nullptr) {
ET_LOG(Error, "Failed to load AOTInductorModelContainerCreateWithDevice");
return Error::AccessFailed;
}

AOTInductorModelContainerDelete =
reinterpret_cast<AOTInductorModelContainerDeleteFunc>(
dlsym(so_handle, "AOTInductorModelContainerDelete"));
if (AOTInductorModelContainerDelete == nullptr) {
ET_LOG(Error, "Failed to load AOTInductorModelContainerDelete");
return Error::AccessFailed;
}

AOTInductorModelContainerGetNumInputs =
reinterpret_cast<AOTInductorModelContainerGetNumInputsFunc>(
dlsym(so_handle, "AOTInductorModelContainerGetNumInputs"));
if (AOTInductorModelContainerGetNumInputs == nullptr) {
ET_LOG(Error, "Failed to load AOTInductorModelContainerGetNumInputs");
return Error::AccessFailed;
}

AOTInductorModelContainerGetNumOutputs =
reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
if (AOTInductorModelContainerGetNumOutputs == nullptr) {
ET_LOG(Error, "Failed to load AOTInductorModelContainerGetNumOutputs");
return Error::AccessFailed;
}

AOTInductorModelContainerRun =
reinterpret_cast<AOTInductorModelContainerRunFunc>(
dlsym(so_handle, "AOTInductorModelContainerRun"));
if (AOTInductorModelContainerRun == nullptr) {
ET_LOG(Error, "Failed to load AOTInductorModelContainerRun");
return Error::AccessFailed;
}
LOAD_SYMBOL(AOTInductorModelContainerCreateWithDevice, so_handle);
LOAD_SYMBOL(AOTInductorModelContainerDelete, so_handle);
LOAD_SYMBOL(AOTInductorModelContainerGetNumInputs, so_handle);
LOAD_SYMBOL(AOTInductorModelContainerGetNumOutputs, so_handle);
LOAD_SYMBOL(AOTInductorModelContainerRun, so_handle);

return Error::Ok;
}
Expand Down Expand Up @@ -122,14 +93,13 @@ class ET_EXPERIMENTAL CudaBackend final

const NamedDataMap* named_data_map = context.get_named_data_map();
auto aoti_cuda_buffer = named_data_map->get_data(so_blob_key.c_str());
if (!aoti_cuda_buffer.ok()) {
ET_LOG(
Error,
"Failed to get data for key %s: 0x%x",
so_blob_key.c_str(),
aoti_cuda_buffer.error());
return aoti_cuda_buffer.error();
}
ET_CHECK_OR_RETURN_ERROR(
aoti_cuda_buffer.ok(),
Internal,
"Failed to get data for key %s: 0x%x",
so_blob_key.c_str(),
static_cast<uint32_t>(aoti_cuda_buffer.error()));

// Generate dynamic temporary file path
filesystem::path temp_dir = filesystem::temp_directory_path();
filesystem::path so_path =
Expand All @@ -144,39 +114,35 @@ class ET_EXPERIMENTAL CudaBackend final
"Writing %zu bytes to %s",
aoti_cuda_buffer->size(),
so_path.c_str());

outfile.write(
static_cast<const char*>(aoti_cuda_buffer->data()),
aoti_cuda_buffer->size());

if (!outfile) {
ET_LOG(Error, "Failed to write to file %s", so_path.c_str());
return Error::AccessFailed;
}
ET_CHECK_OR_RETURN_ERROR(
outfile, AccessFailed, "Failed to write to file %s", so_path.c_str());

// Finish writing the file to disk
outfile.close();

// Load the ELF using dlopen
void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
if (so_handle == nullptr) {
ET_LOG(Error, "Failed to load shared library: %s", dlerror());
return Error::AccessFailed;
}
ET_CHECK_OR_RETURN_ERROR(
so_handle != nullptr,
AccessFailed,
"Failed to load shared library: %s",
dlerror());

processed->Free();

// Register all shared library functions
Error reg_err = register_shared_library_functions(so_handle);
if (reg_err != Error::Ok) {
return reg_err;
}
ET_CHECK_OK_OR_RETURN_ERROR(register_shared_library_functions(so_handle));

AOTInductorModelContainerHandle container_handle = nullptr;

AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice(
&container_handle, 1, "cuda", nullptr);
if (err != Error::Ok) {
return err;
}
ET_CHECK_OK_OR_RETURN_ERROR(AOTInductorModelContainerCreateWithDevice(
&container_handle, 1, "cuda", nullptr));

ET_LOG(Info, "container_handle = %p", container_handle);

AOTIDelegateHandle* handle = new AOTIDelegateHandle();
Expand Down Expand Up @@ -206,15 +172,13 @@ class ET_EXPERIMENTAL CudaBackend final
AOTInductorModelContainerGetNumOutputs(
handle->container_handle, &n_outputs);

if (n_inputs + n_outputs != args.size()) {
ET_LOG(
Error,
"number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
n_inputs,
n_outputs,
args.size());
return Error::InvalidArgument;
}
ET_CHECK_OR_RETURN_ERROR(
n_inputs + n_outputs == args.size(),
InvalidArgument,
"number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. Exit.",
n_inputs,
n_outputs,
args.size())

// NOTE: ExecuTorch tensors are always on CPU/host memory
// We need to create GPU copies for CUDA kernel execution
Expand Down Expand Up @@ -244,19 +208,20 @@ class ET_EXPERIMENTAL CudaBackend final
0, // device_index = 0
&gpu_input_handle);

if (create_err != Error::Ok) {
ET_LOG(Error, "Failed to create GPU tensor for input %d", i);
return Error::Internal;
}
ET_CHECK_OR_RETURN_ERROR(
create_err == Error::Ok,
Internal,
"Failed to create GPU tensor for input %d",
i);

gpu_inputs[i] = gpu_input_handle;

// Copy data from CPU to GPU
Error copy_err = aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0);
if (copy_err != Error::Ok) {
ET_LOG(Error, "Failed to copy input %d from CPU to GPU", i);
return Error::Internal;
}
ET_CHECK_OR_RETURN_ERROR(
aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
Internal,
"Failed to copy input %d from CPU to GPU",
i);
}
ET_LOG(Info, "Inputs copied to GPU");
// Process output tensors: create GPU counterparts for ExecuTorch CPU
Expand All @@ -280,10 +245,11 @@ class ET_EXPERIMENTAL CudaBackend final
0, // device_index = 0
&gpu_output_handle);

if (create_err != Error::Ok) {
ET_LOG(Error, "Failed to create GPU tensor for output %d", i);
return Error::Internal;
}
ET_CHECK_OR_RETURN_ERROR(
create_err == Error::Ok,
Internal,
"Failed to create GPU tensor for output %d",
i);

gpu_outputs[i] = gpu_output_handle;
}
Expand All @@ -298,13 +264,11 @@ class ET_EXPERIMENTAL CudaBackend final
handle->cuda_stream, // Pass the actual CUDA stream
nullptr); // proxy_executor_handle can remain nullptr

if (error != Error::Ok) {
ET_LOG(
Error,
"AOTInductorModelContainerRun failed with error code %d",
error);
return Error::Internal;
}
ET_CHECK_OR_RETURN_ERROR(
error == Error::Ok,
Internal,
"AOTInductorModelContainerRun failed with error code %d",
error);

// Copy GPU output results back to CPU output tensors
for (int i = 0; i < n_outputs; i++) {
Expand All @@ -320,18 +284,6 @@ class ET_EXPERIMENTAL CudaBackend final
i);
}

// Clean up GPU tensors that we created (ExecuTorch tensors are always
// CPU, so all GPU tensors are our copies)
for (int i = 0; i < n_inputs; i++) {
// All GPU input tensors were created by us, delete them
aoti_torch_delete_tensor_object(gpu_inputs[i]);
}

for (int i = 0; i < n_outputs; i++) {
// All GPU output tensors were created by us, delete them
aoti_torch_delete_tensor_object(gpu_outputs[i]);
}

return Error::Ok;
}

Expand All @@ -352,18 +304,13 @@ class ET_EXPERIMENTAL CudaBackend final
handle->cuda_stream = nullptr;
}

// Delete the container BEFORE closing the shared library
if (handle->container_handle != nullptr) {
AOTIRuntimeError delete_result =
AOTInductorModelContainerDelete(handle->container_handle);
if (delete_result != Error::Ok) {
ET_LOG(
Error,
"AOTInductorModelContainerDelete failed with error code %d",
delete_result);
}
handle->container_handle = nullptr;
}
// NOTE: AOTInductorModelContainerDelete does not work correctly with
// multiple .so files. Deleting one container frees shared resources,
// which causes segmentation faults when attempting to delete other
// containers. As a workaround, we skip explicit container deletion
// and defer cleanup to the OS.
// TODO(gasoonjia): Find a proper solution for safe container deletion.
// AOTInductorModelContainerDelete(handle->container_handle);

// Now close the shared library
if (handle->so_handle != nullptr) {
Expand All @@ -374,27 +321,25 @@ class ET_EXPERIMENTAL CudaBackend final
if (!handle->so_path.empty()) {
std::error_code remove_error;
std::filesystem::remove(handle->so_path, remove_error);
if (remove_error) {
ET_LOG(
Error,
"Failed to remove temporary shared library %s: %s",
handle->so_path.c_str(),
remove_error.message().c_str());
}
ET_CHECK_OR_LOG_ERROR(
!remove_error,
"Failed to remove temporary shared library %s: %s",
handle->so_path.c_str(),
remove_error.message().c_str());
}

delete handle;
clear_all_tensors();
}
};

} // namespace cuda
} // namespace executorch::backends::cuda

namespace executorch::backends {
namespace {
auto cls = cuda::CudaBackend();
executorch::runtime::Backend backend{"CudaBackend", &cls};
static executorch::runtime::Error success_with_compiler =
register_backend(backend);
} // namespace

} // namespace backends
} // namespace executorch
} // namespace executorch::backends
Loading
Loading