From f9d02b053b9e93a93c294bbbd96b85d599f9d50e Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 8 Oct 2025 16:04:35 -0700
Subject: [PATCH 1/2] remove extra libtorch dependency

This diff removes the extra libtorch dependency from the CUDA delegate and
introduces the missing bf16 and int64 shim layers.

Differential Revision: [D84207378](https://our.internmc.facebook.com/intern/diff/D84207378/)

ghstack-source-id: 314984331
Pull Request resolved: https://github.com/pytorch/executorch/pull/14919
---
 backends/aoti/CMakeLists.txt   | 2 +-
 backends/aoti/common_shims.cpp | 9 ++++++++-
 backends/aoti/common_shims.h   | 2 ++
 backends/cuda/CMakeLists.txt   | 5 +----
 4 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt
index ce364f2c4b0..845144af50f 100644
--- a/backends/aoti/CMakeLists.txt
+++ b/backends/aoti/CMakeLists.txt
@@ -40,7 +40,7 @@ target_compile_options(aoti_common PUBLIC -fexceptions -frtti -fPIC)
 # Ensure symbols are exported properly
 target_link_options(aoti_common PUBLIC -Wl,--export-dynamic)
 
-# Link against PyTorch libraries and standard libraries
+# Link against ExecuTorch libraries and standard libraries
 target_link_libraries(aoti_common PUBLIC extension_tensor ${CMAKE_DL_LIBS})
 
 executorch_target_link_options_shared_lib(aoti_common)
diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index 2f9b36e3c4f..abc83779443 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -127,11 +127,18 @@ int32_t aoti_torch_layout_strided() {
 }
 
 // Dtype constants - these return the PyTorch dtype codes
-// Currently only float32 is supported, but using robust enum-based approach
 int32_t aoti_torch_dtype_float32() {
   return 6; // PyTorch's float32 dtype code
 }
 
+int32_t aoti_torch_dtype_bfloat16() {
+  return 15; // PyTorch's bfloat16 dtype code
+}
+
+int32_t aoti_torch_dtype_int64() {
+  return 4; // PyTorch's int64 dtype code
+}
+
 // Cleanup functions
 void cleanup_tensor_metadata() {
   internal::tensor_to_sizes.clear();
diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h
index ffcbaa11a08..5f54cd1c878 100644
--- a/backends/aoti/common_shims.h
+++ b/backends/aoti/common_shims.h
@@ -58,6 +58,8 @@ AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim);
 int32_t aoti_torch_device_type_cpu();
 int32_t aoti_torch_layout_strided();
 int32_t aoti_torch_dtype_float32();
+int32_t aoti_torch_dtype_bfloat16();
+int32_t aoti_torch_dtype_int64();
 
 // Autograd mode functions
 int32_t aoti_torch_grad_mode_is_enabled();
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
index dc5b1b786f8..575f676e4cc 100644
--- a/backends/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -55,10 +55,7 @@ target_link_options(aoti_cuda PUBLIC -Wl,--export-dynamic)
 
 # Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries
 target_link_libraries(
-  aoti_cuda
-  PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
-         # Link PyTorch libraries for AOTI CUDA functions
-         ${TORCH_LIBRARIES}
+  aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...)

From 81f1fdd186060003baf3df1e14ec6a0cbb8e2889 Mon Sep 17 00:00:00 2001
From: pytorchbot
Date: Thu, 9 Oct 2025 01:16:33 -0400
Subject: [PATCH 2/2] nested namespace in cuda runtime (#14929)

This PR was created by the merge bot to help merge the original PR into the main branch.
ghstack PR number: https://github.com/pytorch/executorch/pull/14920 by @Gasoonjia
^ Please use this as the source of truth for the PR details, comments, and reviews
ghstack PR base: https://github.com/pytorch/executorch/tree/gh/gasoonjia/52/base
ghstack PR head: https://github.com/pytorch/executorch/tree/gh/gasoonjia/52/head
Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/gasoonjia/51/orig
Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/gasoonjia/52/orig

Differential Revision: [D84201518](https://our.internmc.facebook.com/intern/diff/D84201518/)

@diff-train-skip-merge

Co-authored-by: gasoonjia
---
 backends/cuda/runtime/cuda_backend.cpp           | 11 ++++-------
 backends/cuda/runtime/guard.cpp                  |  8 ++------
 backends/cuda/runtime/guard.h                    |  8 ++------
 backends/cuda/runtime/shims/cuda_guard.cpp       |  8 ++------
 backends/cuda/runtime/shims/cuda_guard.h         |  8 ++------
 backends/cuda/runtime/shims/memory.cpp           |  8 ++------
 backends/cuda/runtime/shims/memory.h             |  8 ++------
 backends/cuda/runtime/shims/tensor_attribute.cpp |  8 ++------
 backends/cuda/runtime/shims/tensor_attribute.h   |  8 ++------
 backends/cuda/runtime/utils.h                    |  8 ++------
 10 files changed, 22 insertions(+), 61 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index e10322ad40c..58ab54e1aac 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -26,9 +26,7 @@
 #include
 #include
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 #define LOAD_SYMBOL(name, handle) \
   do { \
@@ -335,14 +333,13 @@ class ET_EXPERIMENTAL CudaBackend final
   }
 };
 
-} // namespace cuda
+} // namespace executorch::backends::cuda
 
+namespace executorch::backends {
 namespace {
 auto cls = cuda::CudaBackend();
 executorch::runtime::Backend backend{"CudaBackend", &cls};
 static executorch::runtime::Error success_with_compiler =
     register_backend(backend);
 } // namespace
-
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends
diff --git a/backends/cuda/runtime/guard.cpp b/backends/cuda/runtime/guard.cpp
index 885efc7670d..674cc6387b3 100644
--- a/backends/cuda/runtime/guard.cpp
+++ b/backends/cuda/runtime/guard.cpp
@@ -9,9 +9,7 @@
 #include
 #include
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 namespace {
 // Thread-local stream storage (private to this file)
@@ -146,6 +144,4 @@ Result CUDAStreamGuard::create(
   return stream_guard;
 }
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/guard.h b/backends/cuda/runtime/guard.h
index 4e5a18a4c0f..3f187000f90 100644
--- a/backends/cuda/runtime/guard.h
+++ b/backends/cuda/runtime/guard.h
@@ -14,9 +14,7 @@
 #include
 #include
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 using executorch::runtime::Error;
 using executorch::runtime::Result;
@@ -190,6 +188,4 @@ class CUDAStreamGuard {
   DeviceIndex device_index_;
 };
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/cuda_guard.cpp b/backends/cuda/runtime/shims/cuda_guard.cpp
index 5740d0bf654..bb07acc7ffa 100644
--- a/backends/cuda/runtime/shims/cuda_guard.cpp
+++ b/backends/cuda/runtime/shims/cuda_guard.cpp
@@ -8,9 +8,7 @@
 
 #include
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 extern "C" {
 
@@ -104,6 +102,4 @@ AOTITorchError aoti_torch_get_current_cuda_stream(
 
 } // extern "C"
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h
index 6da869064a7..f930f3df643 100644
--- a/backends/cuda/runtime/shims/cuda_guard.h
+++ b/backends/cuda/runtime/shims/cuda_guard.h
@@ -13,9 +13,7 @@
 #include
 #include
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 using executorch::backends::aoti::AOTITorchError;
 
@@ -99,6 +97,4 @@ AOTITorchError aoti_torch_get_current_cuda_stream(
 
 } // extern "C"
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
index a054169330b..b8e3dc8e21b 100644
--- a/backends/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -19,9 +19,7 @@
 #include
 #include
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 using executorch::aten::SizesType;
 using executorch::aten::StridesType;
@@ -659,6 +657,4 @@ AOTITorchError aoti_torch__reinterpret_tensor(
 
 } // extern "C"
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h
index bcec6621285..7a8d4c3609b 100644
--- a/backends/cuda/runtime/shims/memory.h
+++ b/backends/cuda/runtime/shims/memory.h
@@ -12,9 +12,7 @@
 #include
 #include
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 using executorch::backends::aoti::AOTITorchError;
 using executorch::backends::aoti::Tensor;
@@ -145,6 +143,4 @@ aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking);
 void clear_all_tensors();
 } // extern "C"
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/tensor_attribute.cpp b/backends/cuda/runtime/shims/tensor_attribute.cpp
index 5b640b7a9e8..1a14c79f9f2 100644
--- a/backends/cuda/runtime/shims/tensor_attribute.cpp
+++ b/backends/cuda/runtime/shims/tensor_attribute.cpp
@@ -8,9 +8,7 @@
 
 #include
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 extern "C" {
 
@@ -31,6 +29,4 @@ int32_t aoti_torch_device_type_cuda() {
 
 } // extern "C"
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h
index e99958b4f0c..15a4e397d24 100644
--- a/backends/cuda/runtime/shims/tensor_attribute.h
+++ b/backends/cuda/runtime/shims/tensor_attribute.h
@@ -12,9 +12,7 @@
 #include
 #include
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 // Common using declarations for ExecutorTorch types
 using executorch::runtime::Error;
@@ -35,6 +33,4 @@ int32_t aoti_torch_device_type_cuda();
 
 } // extern "C"
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/utils.h b/backends/cuda/runtime/utils.h
index 02c3abfc83f..2d805724090 100644
--- a/backends/cuda/runtime/utils.h
+++ b/backends/cuda/runtime/utils.h
@@ -34,9 +34,7 @@
 #define ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR() \
   ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetLastError())
 
-namespace executorch {
-namespace backends {
-namespace cuda {
+namespace executorch::backends::cuda {
 
 // Enum for supported data types in et-cuda backend
 enum class SupportedDTypes : int32_t {
@@ -125,6 +123,4 @@ inline AOTITorchError validate_dtype(int32_t dtype) {
 }
 } // extern "C"
 
-} // namespace cuda
-} // namespace backends
-} // namespace executorch
+} // namespace executorch::backends::cuda
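
Note for reviewers (illustration only, not part of the patch): the dtype shims added in the first commit simply return PyTorch's integer ScalarType codes, so AOTI-generated code can name float32, bfloat16, and int64 without going through libtorch. A minimal standalone sketch of that contract follows; the stub bodies and the main() check are assumptions for illustration, not ExecuTorch code.

// Hypothetical standalone check, not part of the ExecuTorch tree.
// The stubs mirror the values added in backends/aoti/common_shims.cpp above.
#include <cassert>
#include <cstdint>

int32_t aoti_torch_dtype_float32() {
  return 6; // PyTorch's float32 dtype code
}
int32_t aoti_torch_dtype_bfloat16() {
  return 15; // PyTorch's bfloat16 dtype code
}
int32_t aoti_torch_dtype_int64() {
  return 4; // PyTorch's int64 dtype code
}

int main() {
  // Callers pass these integer codes around as plain int32_t values;
  // here we only verify that the constants match PyTorch's ScalarType codes.
  assert(aoti_torch_dtype_float32() == 6);
  assert(aoti_torch_dtype_bfloat16() == 15);
  assert(aoti_torch_dtype_int64() == 4);
  return 0;
}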