From d6e44b62b326adb48f41675564090b7bb0a5d039 Mon Sep 17 00:00:00 2001 From: Yukio Siraichi Date: Tue, 2 Sep 2025 12:48:37 -0300 Subject: [PATCH 1/6] Remove CUDA from build. --- .bazelrc | 6 +++--- BUILD | 7 ------- WORKSPACE | 17 ----------------- torch_xla/csrc/runtime/BUILD | 9 --------- 4 files changed, 3 insertions(+), 36 deletions(-) diff --git a/.bazelrc b/.bazelrc index 3dec0dc4064..e790453d11a 100644 --- a/.bazelrc +++ b/.bazelrc @@ -91,7 +91,7 @@ build:short_logs --output_filter=DONT_MATCH_ANYTHING #build:tpu --@xla//xla/python:enable_tpu=true build:tpu --define=with_tpu_support=true -# Run tests serially with TPU and GPU (only 1 device is available). +# Run tests serially with TPU (only 1 device is available). test:tpu --local_test_jobs=1 ######################################################################### @@ -100,11 +100,11 @@ test:tpu --local_test_jobs=1 common --experimental_repo_remote_exec # Inherit environmental variables that are used in testing. -test --test_env=TPU_NUM_DEVICES --test_env=GPU_NUM_DEVICES --test_env=CPU_NUM_DEVICES --test_env=XRT_LOCAL_WORKER +test --test_env=TPU_NUM_DEVICES --test_env=XRT_LOCAL_WORKER test --test_env=XRT_TPU_CONFIG --test_env=XRT_DEVICE_MAP --test_env=XRT_WORKERS --test_env=XRT_MESH_SERVICE_ADDRESS test --test_env=XRT_SHARD_WORLD_SIZE --test_env=XRT_MULTI_PROCESSING_DEVICE --test_env=XRT_HOST_ORDINAL --test_env=XRT_SHARD_ORDINAL test --test_env=XRT_START_LOCAL_SERVER --test_env=TPUVM_MODE --test_env=PJRT_DEVICE --test_env=PJRT_TPU_MAX_INFLIGHT_COMPUTATIONS -test --test_env=PJRT_CPU_ASYNC_CLIENT --test_env=PJRT_GPU_ASYNC_CLIENT --test_env=TPU_LIBRARY_PATH --test_env=PJRT_DIST_SERVICE_ADDR +test --test_env=PJRT_CPU_ASYNC_CLIENT --test_env=TPU_LIBRARY_PATH --test_env=PJRT_DIST_SERVICE_ADDR test --test_env=PJRT_LOCAL_PROCESS_RANK # This environmental variable is important for properly integrating with XLA. 
diff --git a/BUILD b/BUILD index 1b82e9d4b97..3d56a635e8b 100644 --- a/BUILD +++ b/BUILD @@ -1,8 +1,3 @@ -load( - "@xla//xla/tsl/platform/default:cuda_build_defs.bzl", - "if_cuda_is_configured", -) - load("@python//:defs.bzl", "compile_pip_requirements") load("@python_version_repo//:py_version.bzl", "REQUIREMENTS") @@ -41,8 +36,6 @@ cc_binary( "@torch//:libtorch", "@torch//:libtorch_cpu", "@torch//:libtorch_python", - ] + if_cuda_is_configured([ - "@xla//xla/stream_executor:cuda_platform", ]), ) diff --git a/WORKSPACE b/WORKSPACE index 8222c5797bb..d058b53082d 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -56,8 +56,6 @@ http_archive( ], patch_tool = "patch", patches = [ - "//openxla_patches:gpu_nvml.diff", - "//openxla_patches:gpu_race_condition.diff", "//openxla_patches:no_fortify.diff", ], strip_prefix = "xla-" + xla_hash, @@ -140,18 +138,3 @@ xla_workspace1() load("@xla//:workspace0.bzl", "xla_workspace0") xla_workspace0() - - -load( - "@xla//third_party/gpus:cuda_configure.bzl", - "cuda_configure", -) - -cuda_configure(name = "local_config_cuda") - -load( - "@xla//third_party/nccl:nccl_configure.bzl", - "nccl_configure", -) - -nccl_configure(name = "local_config_nccl") diff --git a/torch_xla/csrc/runtime/BUILD b/torch_xla/csrc/runtime/BUILD index b381d3feff7..4076b29858d 100644 --- a/torch_xla/csrc/runtime/BUILD +++ b/torch_xla/csrc/runtime/BUILD @@ -2,10 +2,6 @@ load( "//bazel:rules_def.bzl", "ptxla_cc_test", ) -load( - "@xla//xla/tsl/platform/default:cuda_build_defs.bzl", - "if_cuda_is_configured", -) licenses(["notice"]) # Apache 2.0 @@ -134,7 +130,6 @@ cc_library( "@xla//xla:shape_util", "@xla//xla/hlo/builder:xla_computation", "@xla//xla/pjrt:pjrt_client", - "@xla//xla/pjrt/c:pjrt_c_api_gpu_extension_hdrs", "@xla//xla/pjrt/c:pjrt_c_api_hdrs", "@xla//xla/pjrt/c:pjrt_c_api_wrapper_impl", "@xla//xla/pjrt:pjrt_c_api_client", @@ -218,8 +213,6 @@ cc_library( "@com_google_absl//absl/log:initialize", "@xla//xla/pjrt:pjrt_c_api_client", 
"@xla//xla/pjrt:tfrt_cpu_pjrt_client", - "@xla//xla/pjrt/gpu:se_gpu_pjrt_client", - "@xla//xla/service:gpu_plugin", ], ) @@ -295,8 +288,6 @@ cc_library( deps = [ "@xla//xla/backends/profiler/cpu:host_tracer", "@xla//xla/backends/profiler/cpu:metadata_collector", - ] + if_cuda_is_configured([ - "@xla//xla/backends/profiler/gpu:device_tracer", ]), alwayslink = True, ) From f984ef91c40963920854c211647f1cac99aa7392 Mon Sep 17 00:00:00 2001 From: Yukio Siraichi Date: Tue, 2 Sep 2025 12:53:53 -0300 Subject: [PATCH 2/6] Remove OpenXLA patches. --- openxla_patches/gpu_nvml.diff | 26 ------------------------- openxla_patches/gpu_race_condition.diff | 14 ------------- 2 files changed, 40 deletions(-) delete mode 100644 openxla_patches/gpu_nvml.diff delete mode 100644 openxla_patches/gpu_race_condition.diff diff --git a/openxla_patches/gpu_nvml.diff b/openxla_patches/gpu_nvml.diff deleted file mode 100644 index fd38807775a..00000000000 --- a/openxla_patches/gpu_nvml.diff +++ /dev/null @@ -1,26 +0,0 @@ -iff --git a/xla/service/gpu/model/gpu_collective_performance_model.cc b/xla/service/gpu/model/gpu_collective_performance_model.cc -index 496969f545..2d9f73ee36 100644 ---- a/xla/service/gpu/model/gpu_collective_performance_model.cc -+++ b/xla/service/gpu/model/gpu_collective_performance_model.cc -@@ -34,7 +34,7 @@ limitations under the License. - - #if GOOGLE_CUDA - #include "third_party/gpus/cuda/include/cuda.h" --#include "third_party/gpus/cuda/nvml/include/nvml.h" -+#include "third_party/gpus/cuda/include/nvml.h" - #endif // GOOGLE_CUDA - namespace xla { - namespace gpu { -diff --git a/xla/service/gpu/model/gpu_collective_performance_model.h b/xla/service/gpu/model/gpu_collective_performance_model.h -index 01c3f3eb45..f44057602b 100644 ---- a/xla/service/gpu/model/gpu_collective_performance_model.h -+++ b/xla/service/gpu/model/gpu_collective_performance_model.h -@@ -32,7 +32,7 @@ limitations under the License. 
- #include - #endif - --#include "third_party/gpus/cuda/nvml/include/nvml.h" -+#include "third_party/gpus/cuda/include/nvml.h" - // Below is a list of function pointers to be used - // for querying device properties through nvml library. - #define NVML_FUNCTOR(name, rettype, args) \ \ No newline at end of file diff --git a/openxla_patches/gpu_race_condition.diff b/openxla_patches/gpu_race_condition.diff deleted file mode 100644 index 082376116a3..00000000000 --- a/openxla_patches/gpu_race_condition.diff +++ /dev/null @@ -1,14 +0,0 @@ -diff --git a/xla/service/gpu/gpu_executable.cc b/xla/service/gpu/gpu_executable.cc -index 9279bd877..fab926a7c 100644 ---- a/xla/service/gpu/gpu_executable.cc -+++ b/xla/service/gpu/gpu_executable.cc -@@ -669,8 +669,7 @@ absl::StatusOr GpuExecutable::ExecuteAsyncOnStreamImpl( - #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM - - // Force synchronous execution if the allocator requires it. -- const bool block_host_until_done = -- !memory_allocator->AllowsAsynchronousDeallocation(); -+ const bool block_host_until_done = true; - - // Lock the GPU with a shared lock so that we don't interfere with autotuning - // that may be running during JIT compilation while allowing multiple XLA \ No newline at end of file From e180193acce6ee38662043708c2788eeb85b94b6 Mon Sep 17 00:00:00 2001 From: Yukio Siraichi Date: Tue, 2 Sep 2025 13:11:24 -0300 Subject: [PATCH 3/6] Fix build --- BUILD | 2 +- torch_xla/csrc/runtime/BUILD | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/BUILD b/BUILD index 3d56a635e8b..128f83dcd56 100644 --- a/BUILD +++ b/BUILD @@ -36,7 +36,7 @@ cc_binary( "@torch//:libtorch", "@torch//:libtorch_cpu", "@torch//:libtorch_python", - ]), + ], ) test_suite( diff --git a/torch_xla/csrc/runtime/BUILD b/torch_xla/csrc/runtime/BUILD index 4076b29858d..79a814f258e 100644 --- a/torch_xla/csrc/runtime/BUILD +++ b/torch_xla/csrc/runtime/BUILD @@ -288,7 +288,7 @@ cc_library( deps = [ 
"@xla//xla/backends/profiler/cpu:host_tracer", "@xla//xla/backends/profiler/cpu:metadata_collector", - ]), + ), alwayslink = True, ) From 3bebe1caf0ed774a5c0953a138af857ac2befedd Mon Sep 17 00:00:00 2001 From: Yukio Siraichi Date: Tue, 2 Sep 2025 15:13:34 -0300 Subject: [PATCH 4/6] Fix build. --- torch_xla/csrc/runtime/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_xla/csrc/runtime/BUILD b/torch_xla/csrc/runtime/BUILD index 79a814f258e..40d8390debc 100644 --- a/torch_xla/csrc/runtime/BUILD +++ b/torch_xla/csrc/runtime/BUILD @@ -288,7 +288,7 @@ cc_library( deps = [ "@xla//xla/backends/profiler/cpu:host_tracer", "@xla//xla/backends/profiler/cpu:metadata_collector", - ), + ], alwayslink = True, ) From 542ea19be83797f017f5534cf2959c9ba9faa7b8 Mon Sep 17 00:00:00 2001 From: Yukio Siraichi Date: Tue, 2 Sep 2025 18:48:16 -0300 Subject: [PATCH 5/6] Fix build. --- WORKSPACE | 15 +++++++++++++++ torch_xla/csrc/runtime/BUILD | 1 + 2 files changed, 16 insertions(+) diff --git a/WORKSPACE b/WORKSPACE index d058b53082d..70b7d9cc098 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -138,3 +138,18 @@ xla_workspace1() load("@xla//:workspace0.bzl", "xla_workspace0") xla_workspace0() + + +# Even though we don't support XLA:CUDA anymore, we still need to keep the +# following. 
The reason is that `pjrt_computation_client_test` depends on +# `@xla//xla/tools`, which calls: +# +# ``` +# load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +# ``` +load( + "@xla//third_party/gpus:cuda_configure.bzl", + "cuda_configure", +) + +cuda_configure(name = "local_config_cuda") diff --git a/torch_xla/csrc/runtime/BUILD b/torch_xla/csrc/runtime/BUILD index 40d8390debc..4f0f3bf384e 100644 --- a/torch_xla/csrc/runtime/BUILD +++ b/torch_xla/csrc/runtime/BUILD @@ -211,6 +211,7 @@ cc_library( "@torch//:headers", "@com_google_absl//absl/log:absl_check", "@com_google_absl//absl/log:initialize", + "@xla//xla/pjrt/distributed:in_memory_key_value_store", "@xla//xla/pjrt:pjrt_c_api_client", "@xla//xla/pjrt:tfrt_cpu_pjrt_client", ], From cd2627e9da64f0a6511cb4363b545aa23e87a978 Mon Sep 17 00:00:00 2001 From: Yukio Siraichi Date: Wed, 3 Sep 2025 10:48:05 -0300 Subject: [PATCH 6/6] Revert pass through of CPU environment variable. --- .bazelrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.bazelrc b/.bazelrc index e790453d11a..9c2667a8ac1 100644 --- a/.bazelrc +++ b/.bazelrc @@ -100,7 +100,7 @@ test:tpu --local_test_jobs=1 common --experimental_repo_remote_exec # Inherit environmental variables that are used in testing. -test --test_env=TPU_NUM_DEVICES --test_env=XRT_LOCAL_WORKER +test --test_env=TPU_NUM_DEVICES --test_env=CPU_NUM_DEVICES --test_env=XRT_LOCAL_WORKER test --test_env=XRT_TPU_CONFIG --test_env=XRT_DEVICE_MAP --test_env=XRT_WORKERS --test_env=XRT_MESH_SERVICE_ADDRESS test --test_env=XRT_SHARD_WORLD_SIZE --test_env=XRT_MULTI_PROCESSING_DEVICE --test_env=XRT_HOST_ORDINAL --test_env=XRT_SHARD_ORDINAL test --test_env=XRT_START_LOCAL_SERVER --test_env=TPUVM_MODE --test_env=PJRT_DEVICE --test_env=PJRT_TPU_MAX_INFLIGHT_COMPUTATIONS