From d6e44b62b326adb48f41675564090b7bb0a5d039 Mon Sep 17 00:00:00 2001
From: Yukio Siraichi <yukio.siraichi@gmail.com>
Date: Tue, 2 Sep 2025 12:48:37 -0300
Subject: [PATCH 1/6] Remove CUDA from build.

---
 .bazelrc                     |  6 +++---
 BUILD                        |  7 -------
 WORKSPACE                    | 17 -----------------
 torch_xla/csrc/runtime/BUILD |  9 ---------
 4 files changed, 3 insertions(+), 36 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 3dec0dc4064..e790453d11a 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -91,7 +91,7 @@ build:short_logs --output_filter=DONT_MATCH_ANYTHING
 #build:tpu --@xla//xla/python:enable_tpu=true
 build:tpu --define=with_tpu_support=true
 
-# Run tests serially with TPU and GPU (only 1 device is available).
+# Run tests serially with TPU (only 1 device is available).
 test:tpu --local_test_jobs=1
 
 #########################################################################
@@ -100,11 +100,11 @@ test:tpu --local_test_jobs=1
 common --experimental_repo_remote_exec
 
 # Inherit environmental variables that are used in testing.
-test --test_env=TPU_NUM_DEVICES --test_env=GPU_NUM_DEVICES --test_env=CPU_NUM_DEVICES --test_env=XRT_LOCAL_WORKER
+test --test_env=TPU_NUM_DEVICES --test_env=XRT_LOCAL_WORKER
 test --test_env=XRT_TPU_CONFIG --test_env=XRT_DEVICE_MAP --test_env=XRT_WORKERS --test_env=XRT_MESH_SERVICE_ADDRESS
 test --test_env=XRT_SHARD_WORLD_SIZE --test_env=XRT_MULTI_PROCESSING_DEVICE --test_env=XRT_HOST_ORDINAL --test_env=XRT_SHARD_ORDINAL
 test --test_env=XRT_START_LOCAL_SERVER --test_env=TPUVM_MODE --test_env=PJRT_DEVICE --test_env=PJRT_TPU_MAX_INFLIGHT_COMPUTATIONS
-test --test_env=PJRT_CPU_ASYNC_CLIENT --test_env=PJRT_GPU_ASYNC_CLIENT --test_env=TPU_LIBRARY_PATH --test_env=PJRT_DIST_SERVICE_ADDR
+test --test_env=PJRT_CPU_ASYNC_CLIENT --test_env=TPU_LIBRARY_PATH --test_env=PJRT_DIST_SERVICE_ADDR
 test --test_env=PJRT_LOCAL_PROCESS_RANK
 
 # This environmental variable is important for properly integrating with XLA.
diff --git a/BUILD b/BUILD
index 1b82e9d4b97..3d56a635e8b 100644
--- a/BUILD
+++ b/BUILD
@@ -1,8 +1,3 @@
-load(
-    "@xla//xla/tsl/platform/default:cuda_build_defs.bzl",
-    "if_cuda_is_configured",
-)
-
 load("@python//:defs.bzl", "compile_pip_requirements")
 load("@python_version_repo//:py_version.bzl", "REQUIREMENTS")
 
@@ -41,8 +36,6 @@ cc_binary(
         "@torch//:libtorch",
         "@torch//:libtorch_cpu",
         "@torch//:libtorch_python",
-    ] + if_cuda_is_configured([
-        "@xla//xla/stream_executor:cuda_platform",
     ]),
 )
 
diff --git a/WORKSPACE b/WORKSPACE
index 8222c5797bb..d058b53082d 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -56,8 +56,6 @@ http_archive(
     ],
     patch_tool = "patch",
     patches = [
-        "//openxla_patches:gpu_nvml.diff",
-        "//openxla_patches:gpu_race_condition.diff",
         "//openxla_patches:no_fortify.diff",
     ],
     strip_prefix = "xla-" + xla_hash,
@@ -140,18 +138,3 @@ xla_workspace1()
 load("@xla//:workspace0.bzl", "xla_workspace0")
 
 xla_workspace0()
-
-
-load(
-    "@xla//third_party/gpus:cuda_configure.bzl",
-    "cuda_configure",
-)
-
-cuda_configure(name = "local_config_cuda")
-
-load(
-    "@xla//third_party/nccl:nccl_configure.bzl",
-    "nccl_configure",
-)
-
-nccl_configure(name = "local_config_nccl")
diff --git a/torch_xla/csrc/runtime/BUILD b/torch_xla/csrc/runtime/BUILD
index b381d3feff7..4076b29858d 100644
--- a/torch_xla/csrc/runtime/BUILD
+++ b/torch_xla/csrc/runtime/BUILD
@@ -2,10 +2,6 @@ load(
     "//bazel:rules_def.bzl",
     "ptxla_cc_test",
 )
-load(
-    "@xla//xla/tsl/platform/default:cuda_build_defs.bzl",
-    "if_cuda_is_configured",
-)
 
 licenses(["notice"])  # Apache 2.0
 
@@ -134,7 +130,6 @@ cc_library(
         "@xla//xla:shape_util",
         "@xla//xla/hlo/builder:xla_computation",
         "@xla//xla/pjrt:pjrt_client",
-        "@xla//xla/pjrt/c:pjrt_c_api_gpu_extension_hdrs",
         "@xla//xla/pjrt/c:pjrt_c_api_hdrs",
         "@xla//xla/pjrt/c:pjrt_c_api_wrapper_impl",
         "@xla//xla/pjrt:pjrt_c_api_client",
@@ -218,8 +213,6 @@ cc_library(
         "@com_google_absl//absl/log:initialize",
         "@xla//xla/pjrt:pjrt_c_api_client",
         "@xla//xla/pjrt:tfrt_cpu_pjrt_client",
-        "@xla//xla/pjrt/gpu:se_gpu_pjrt_client",
-        "@xla//xla/service:gpu_plugin",
     ],
 )
 
@@ -295,8 +288,6 @@ cc_library(
     deps = [
         "@xla//xla/backends/profiler/cpu:host_tracer",
         "@xla//xla/backends/profiler/cpu:metadata_collector",
-    ] + if_cuda_is_configured([
-        "@xla//xla/backends/profiler/gpu:device_tracer",
     ]),
     alwayslink = True,
 )

From f984ef91c40963920854c211647f1cac99aa7392 Mon Sep 17 00:00:00 2001
From: Yukio Siraichi <yukio.siraichi@gmail.com>
Date: Tue, 2 Sep 2025 12:53:53 -0300
Subject: [PATCH 2/6] Remove GPU-specific OpenXLA patches.

---
 openxla_patches/gpu_nvml.diff           | 26 -------------------------
 openxla_patches/gpu_race_condition.diff | 14 -------------
 2 files changed, 40 deletions(-)
 delete mode 100644 openxla_patches/gpu_nvml.diff
 delete mode 100644 openxla_patches/gpu_race_condition.diff

diff --git a/openxla_patches/gpu_nvml.diff b/openxla_patches/gpu_nvml.diff
deleted file mode 100644
index fd38807775a..00000000000
--- a/openxla_patches/gpu_nvml.diff
+++ /dev/null
@@ -1,26 +0,0 @@
-iff --git a/xla/service/gpu/model/gpu_collective_performance_model.cc b/xla/service/gpu/model/gpu_collective_performance_model.cc
-index 496969f545..2d9f73ee36 100644
---- a/xla/service/gpu/model/gpu_collective_performance_model.cc
-+++ b/xla/service/gpu/model/gpu_collective_performance_model.cc
-@@ -34,7 +34,7 @@ limitations under the License.
-
- #if GOOGLE_CUDA
- #include "third_party/gpus/cuda/include/cuda.h"
--#include "third_party/gpus/cuda/nvml/include/nvml.h"
-+#include "third_party/gpus/cuda/include/nvml.h"
- #endif  // GOOGLE_CUDA
- namespace xla {
- namespace gpu {
-diff --git a/xla/service/gpu/model/gpu_collective_performance_model.h b/xla/service/gpu/model/gpu_collective_performance_model.h
-index 01c3f3eb45..f44057602b 100644
---- a/xla/service/gpu/model/gpu_collective_performance_model.h
-+++ b/xla/service/gpu/model/gpu_collective_performance_model.h
-@@ -32,7 +32,7 @@ limitations under the License.
- #include <dlfcn.h>
- #endif
-
--#include "third_party/gpus/cuda/nvml/include/nvml.h"
-+#include "third_party/gpus/cuda/include/nvml.h"
- // Below is a list of function pointers to be used
- // for querying device properties through nvml library.
- #define NVML_FUNCTOR(name, rettype, args) \
\ No newline at end of file
diff --git a/openxla_patches/gpu_race_condition.diff b/openxla_patches/gpu_race_condition.diff
deleted file mode 100644
index 082376116a3..00000000000
--- a/openxla_patches/gpu_race_condition.diff
+++ /dev/null
@@ -1,14 +0,0 @@
-diff --git a/xla/service/gpu/gpu_executable.cc b/xla/service/gpu/gpu_executable.cc
-index 9279bd877..fab926a7c 100644
---- a/xla/service/gpu/gpu_executable.cc
-+++ b/xla/service/gpu/gpu_executable.cc
-@@ -669,8 +669,7 @@ absl::StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
- #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-
-   // Force synchronous execution if the allocator requires it.
--  const bool block_host_until_done =
--      !memory_allocator->AllowsAsynchronousDeallocation();
-+  const bool block_host_until_done = true;
-
-   // Lock the GPU with a shared lock so that we don't interfere with autotuning
-   // that may be running during JIT compilation while allowing multiple XLA
\ No newline at end of file

From e180193acce6ee38662043708c2788eeb85b94b6 Mon Sep 17 00:00:00 2001
From: Yukio Siraichi <yukio.siraichi@gmail.com>
Date: Tue, 2 Sep 2025 13:11:24 -0300
Subject: [PATCH 3/6] Fix build: drop stray `)` left by if_cuda_is_configured removal.

---
 BUILD                        | 2 +-
 torch_xla/csrc/runtime/BUILD | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/BUILD b/BUILD
index 3d56a635e8b..128f83dcd56 100644
--- a/BUILD
+++ b/BUILD
@@ -36,7 +36,7 @@ cc_binary(
         "@torch//:libtorch",
         "@torch//:libtorch_cpu",
         "@torch//:libtorch_python",
-    ]),
+    ],
 )
 
 test_suite(
diff --git a/torch_xla/csrc/runtime/BUILD b/torch_xla/csrc/runtime/BUILD
index 4076b29858d..79a814f258e 100644
--- a/torch_xla/csrc/runtime/BUILD
+++ b/torch_xla/csrc/runtime/BUILD
@@ -288,7 +288,7 @@ cc_library(
     deps = [
         "@xla//xla/backends/profiler/cpu:host_tracer",
         "@xla//xla/backends/profiler/cpu:metadata_collector",
-    ]),
+    ),
     alwayslink = True,
 )
 

From 3bebe1caf0ed774a5c0953a138af857ac2befedd Mon Sep 17 00:00:00 2001
From: Yukio Siraichi <yukio.siraichi@gmail.com>
Date: Tue, 2 Sep 2025 15:13:34 -0300
Subject: [PATCH 4/6] Fix build: close profiler deps list with `]` instead of `)`.

---
 torch_xla/csrc/runtime/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch_xla/csrc/runtime/BUILD b/torch_xla/csrc/runtime/BUILD
index 79a814f258e..40d8390debc 100644
--- a/torch_xla/csrc/runtime/BUILD
+++ b/torch_xla/csrc/runtime/BUILD
@@ -288,7 +288,7 @@ cc_library(
     deps = [
         "@xla//xla/backends/profiler/cpu:host_tracer",
         "@xla//xla/backends/profiler/cpu:metadata_collector",
-    ),
+    ],
     alwayslink = True,
 )
 

From 542ea19be83797f017f5534cf2959c9ba9faa7b8 Mon Sep 17 00:00:00 2001
From: Yukio Siraichi <yukio.siraichi@gmail.com>
Date: Tue, 2 Sep 2025 18:48:16 -0300
Subject: [PATCH 5/6] Fix build: restore cuda_configure; add in-memory KV store dep.

---
 WORKSPACE                    | 15 +++++++++++++++
 torch_xla/csrc/runtime/BUILD |  1 +
 2 files changed, 16 insertions(+)

diff --git a/WORKSPACE b/WORKSPACE
index d058b53082d..70b7d9cc098 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -138,3 +138,18 @@ xla_workspace1()
 load("@xla//:workspace0.bzl", "xla_workspace0")
 
 xla_workspace0()
+
+
+# Even though we don't support XLA:CUDA anymore, we still need to keep the
+# following, because `pjrt_computation_client_test` depends on
+# `@xla//xla/tools`, which calls:
+#
+# ```
+# load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+# ```
+load(
+    "@xla//third_party/gpus:cuda_configure.bzl",
+    "cuda_configure",
+)
+
+cuda_configure(name = "local_config_cuda")
diff --git a/torch_xla/csrc/runtime/BUILD b/torch_xla/csrc/runtime/BUILD
index 40d8390debc..4f0f3bf384e 100644
--- a/torch_xla/csrc/runtime/BUILD
+++ b/torch_xla/csrc/runtime/BUILD
@@ -211,6 +211,7 @@ cc_library(
         "@torch//:headers",
         "@com_google_absl//absl/log:absl_check",
         "@com_google_absl//absl/log:initialize",
+        "@xla//xla/pjrt/distributed:in_memory_key_value_store",
         "@xla//xla/pjrt:pjrt_c_api_client",
         "@xla//xla/pjrt:tfrt_cpu_pjrt_client",
     ],

From cd2627e9da64f0a6511cb4363b545aa23e87a978 Mon Sep 17 00:00:00 2001
From: Yukio Siraichi <yukio.siraichi@gmail.com>
Date: Wed, 3 Sep 2025 10:48:05 -0300
Subject: [PATCH 6/6] Restore pass-through of CPU_NUM_DEVICES to tests.

---
 .bazelrc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.bazelrc b/.bazelrc
index e790453d11a..9c2667a8ac1 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -100,7 +100,7 @@ test:tpu --local_test_jobs=1
 common --experimental_repo_remote_exec
 
 # Inherit environmental variables that are used in testing.
-test --test_env=TPU_NUM_DEVICES --test_env=XRT_LOCAL_WORKER
+test --test_env=TPU_NUM_DEVICES --test_env=CPU_NUM_DEVICES --test_env=XRT_LOCAL_WORKER
 test --test_env=XRT_TPU_CONFIG --test_env=XRT_DEVICE_MAP --test_env=XRT_WORKERS --test_env=XRT_MESH_SERVICE_ADDRESS
 test --test_env=XRT_SHARD_WORLD_SIZE --test_env=XRT_MULTI_PROCESSING_DEVICE --test_env=XRT_HOST_ORDINAL --test_env=XRT_SHARD_ORDINAL
 test --test_env=XRT_START_LOCAL_SERVER --test_env=TPUVM_MODE --test_env=PJRT_DEVICE --test_env=PJRT_TPU_MAX_INFLIGHT_COMPUTATIONS