13 changes: 0 additions & 13 deletions .bazelrc
@@ -79,18 +79,6 @@ build:native_arch_posix --host_copt=-march=native

build:mkl_open_source_only --define=tensorflow_mkldnn_contraction_kernel=1

-build:cuda --repo_env TF_NEED_CUDA=1
-# "sm" means we emit only cubin, which is forward compatible within a GPU generation.
-# "compute" means we emit both cubin and PTX, which is larger but also forward compatible to future GPU generations.
-build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
-build:cuda --@local_config_cuda//:enable_cuda
-build:cuda --define=xla_python_enable_gpu=true
-build:cuda --cxxopt=-DXLA_CUDA=1

-# Coverage with cuda/gcc/nvcc requires manually setting coverage flags.
-coverage:cuda --per_file_copt=third_party/.*,torch_xla/.*@--coverage
-coverage:cuda --linkopt=-lgcov

build:acl --define==build_with_acl=true

build:nonccl --define=no_nccl_support=true
@@ -105,7 +93,6 @@ build:tpu --define=with_tpu_support=true

# Run tests serially with TPU and GPU (only 1 device is available).
test:tpu --local_test_jobs=1
-test:cuda --local_test_jobs=1

#########################################################################
# RBE config options below.
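The build:cuda group deleted above is what the XLA_CUDA=1 environment flag used to select (see the build_util.py hunk later in this diff). A minimal sketch of the old CUDA build invocation, assembled from flags that appear elsewhere in this diff; the exact combination is illustrative, not a documented recipe:

# Sketch of the pre-removal CUDA build flow (flags taken from this diff; illustrative only).
export XLA_CUDA=1                                                   # build_util.py used to map this to --config=cuda
export TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_70,sm_75,compute_80"  # same format as .circleci/build.sh below
# sm_XX      -> emit cubin only: forward compatible within one GPU generation
# compute_XX -> emit cubin + PTX: larger, but forward compatible with future generations
python setup.py develop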
1 change: 0 additions & 1 deletion .circleci/build.sh
@@ -50,7 +50,6 @@ source $XLA_DIR/xla_env
export GCLOUD_SERVICE_KEY_FILE="$XLA_DIR/default_credentials.json"
export SILO_NAME='cache-silo-ci-dev-3.8_cuda_12.1' # cache bucket for CI
export BUILD_CPP_TESTS='1'
-export TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_70,sm_75,compute_80,$TF_CUDA_COMPUTE_CAPABILITIES"
build_torch_xla $XLA_DIR

popd
5 changes: 0 additions & 5 deletions .github/upstream/Dockerfile
@@ -15,11 +15,6 @@ ARG tpuvm=""
# Disable CUDA for PyTorch
ENV USE_CUDA "0"

-# Enable CUDA for XLA
-ENV XLA_CUDA "${cuda}"
-ENV TF_CUDA_COMPUTE_CAPABILITIES "${cuda_compute}"
-ENV TF_CUDA_PATHS "/usr/local/cuda,/usr/include,/usr"

# CUDA build guidance
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
2 changes: 1 addition & 1 deletion benchmarks/nightly.sh
@@ -99,7 +99,7 @@ if [[ ${IS_FRESH_RUN?} ]]; then
# Query local compute capability. If that fails, assign a sane default.
LOCAL_CAP=compute_$(nvidia-smi --query-gpu=compute_cap --format=csv | \
tail -1 | sed 's/\.//g' | grep -E '^[0-9]{2}$' || echo '80')
-XLA_CUDA=1 TF_CUDA_COMPUTE_CAPABILITIES=${LOCAL_CAP:?} python setup.py develop
+python setup.py develop
cd ../..

# Set up torchbench deps.
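The compute-capability probe kept above can be sanity-checked on its own. A brief sketch of what the pipeline does; the 8.0 sample value is an assumption (an A100-class GPU), not something this diff states:

nvidia-smi --query-gpu=compute_cap --format=csv
# Typical output (assumed):
#   compute_cap
#   8.0
# tail -1 keeps "8.0", sed 's/\.//g' strips the dot to "80", and grep -E '^[0-9]{2}$'
# accepts only exactly two digits; if any step fails, `|| echo '80'` supplies the
# sane default, so LOCAL_CAP becomes compute_80 either way.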
2 changes: 0 additions & 2 deletions build_util.py
@@ -43,8 +43,6 @@ def bazel_options_from_env() -> Iterable[str]:
# Build configuration.
if check_env_flag('BAZEL_VERBOSE'):
bazel_flags.append('-s')
-if check_env_flag('XLA_CUDA'):
-bazel_flags.append('--config=cuda')
if check_env_flag('XLA_CPU_USE_ACL'):
bazel_flags.append('--config=acl')

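With the XLA_CUDA branch gone, bazel_options_from_env no longer emits --config=cuda; of the checks visible in this hunk, BAZEL_VERBOSE and XLA_CPU_USE_ACL still translate into Bazel options. A small sketch under the assumption that python setup.py develop (used in benchmarks/nightly.sh above) is the build entry point:

BAZEL_VERBOSE=1 XLA_CPU_USE_ACL=1 python setup.py develop   # still adds -s and --config=acl via bazel_options_from_env
XLA_CUDA=1 python setup.py develop                          # after this change, no --config=cuda is appended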
7 changes: 1 addition & 6 deletions configuration.yaml
@@ -4,7 +4,7 @@ variables:
PJRT_DEVICE:
description:
- Indicates which device is being used with PJRT. It can be either CPU,
-TPU, or CUDA
+or TPU
type: string
PJRT_SELECT_DEFAULT_DEVICE:
description:
@@ -36,11 +36,6 @@ variables:
- Verbosity level for GRPC, e.g. INFO, ERROR, etc.
type: string
default_value: "ERROR"
-XLA_CUDA:
-description:
-- Build the xla client with CUDA enabled.
-type: bool
-default_value: false
GIT_VERSIONED_XLA_BUILD:
description:
- Creates a versioned build. In particular, appends a git sha to the
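A minimal usage sketch for the two PJRT_DEVICE values now documented above; the test paths assume $_TEST_DIR resolves to test/ and are illustrative only:

PJRT_DEVICE=CPU python test/pjrt/test_ddp.py   # run a PJRT test on CPU
PJRT_DEVICE=TPU python test/test_ops.py        # or on a TPU VM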
4 changes: 0 additions & 4 deletions docker/Dockerfile
@@ -29,10 +29,6 @@ RUN git clone https://github.com/pytorch/pytorch
ENV USE_CUDA "0"
ENV USE_MPI "0"

-# Enable CUDA for XLA
-ENV XLA_CUDA "${cuda}"
-ENV TF_CUDA_COMPUTE_CAPABILITIES "${cuda_compute}"

# Whether to build for TPUVM mode
ENV TPUVM_MODE "${tpuvm}"
ENV BUNDLE_LIBTPU "${tpuvm}"
8 changes: 0 additions & 8 deletions infra/ansible/config/env.yaml
@@ -13,10 +13,6 @@ release_env:
ACCELERATOR: tpu
TPUVM_MODE: 1

-cuda:
-TF_CUDA_COMPUTE_CAPABILITIES: "{{ cuda_compute_capabilities }}"
-XLA_CUDA: 1

# Variables that will be passed to shell environment only for building PyTorch and XLA libs.
build_env:
common:
@@ -41,10 +37,6 @@ build_env:

aarch64:

-cuda:
-TF_CUDA_COMPUTE_CAPABILITIES: "{{ cuda_compute_capabilities }}"
-XLA_CUDA: 1

tpu:
ACCELERATOR: tpu
TPUVM_MODE: 1
23 changes: 0 additions & 23 deletions scripts/build_torch_wheels.sh
@@ -56,28 +56,6 @@ function install_cudnn {
rm -f "$CUDNN_FILE"
}

-function maybe_install_cuda {
-if [ "$XLA_CUDA" == "1" ]; then
-if [ ! -d "/usr/local/cuda" ]; then
-local CUDA_VER="10.2"
-local CUDA_SUBVER="89_440.33.01"
-local CUDA_FILE="cuda_${CUDA_VER}.${CUDA_SUBVER}_linux.run"
-wget "http://developer.download.nvidia.com/compute/cuda/${CUDA_VER}/Prod/local_installers/${CUDA_FILE}"
-sudo sh "${CUDA_FILE}" --silent --toolkit
-rm -f "${CUDA_FILE}"
-fi
-if [ ! -f "/usr/local/cuda/include/cudnn.h" ] && [ ! -f "/usr/include/cudnn.h" ]; then
-install_cudnn
-fi
-export TF_CUDA_PATHS="/usr/local/cuda,/usr/include,/usr"
-maybe_append 'export TF_CUDA_PATHS="/usr/local/cuda,/usr/include,/usr"' ~/.bashrc
-if [ "$TF_CUDA_COMPUTE_CAPABILITIES" == "" ]; then
-export TF_CUDA_COMPUTE_CAPABILITIES="7.0"
-fi
-maybe_append "export TF_CUDA_COMPUTE_CAPABILITIES=\"$TF_CUDA_COMPUTE_CAPABILITIES\"" ~/.bashrc
-fi
-}

function maybe_install_sources {
if [[ $(uname -m) == "aarch64" && ! -d "$HOME/ComputeLibrary" ]]; then
# install arm compute library
@@ -148,7 +126,6 @@ function install_gcc() {

function install_req_packages() {
sudo apt-get -y install python3-pip git curl libopenblas-dev vim apt-transport-https ca-certificates wget procps
-maybe_install_cuda
install_bazel
install_ninja
}
3 changes: 0 additions & 3 deletions setup.py
@@ -19,9 +19,6 @@
# BAZEL_VERBOSE=0
# turn on verbose messages during the bazel build of the xla/xrt client
#
-# XLA_CUDA=0
-# build the xla/xrt client with CUDA enabled
-#
# XLA_CPU_USE_ACL=0
# whether to use ACL
#
3 changes: 0 additions & 3 deletions test/cpp/run_tests.sh
@@ -78,9 +78,6 @@ if [[ "$BAZEL_REMOTE_CACHE" == "1" ]]; then
EXTRA_FLAGS="$EXTRA_FLAGS --remote_default_exec_properties=cache-silo-key=$SILO_NAME"
fi
fi
if [[ "$XLA_CUDA" == "1" ]]; then
EXTRA_FLAGS="$EXTRA_FLAGS --config=cuda"
fi
if [[ "$BAZEL_VERB" == "coverage" ]]; then
EXTRA_FLAGS="$EXTRA_FLAGS --remote_download_outputs=all" # for lcov symlink
fi
8 changes: 4 additions & 4 deletions test/run_tests.sh
@@ -164,8 +164,8 @@ function run_xla_op_tests1 {
run_test "$_TEST_DIR/pjrt/test_runtime_multi_cpu.py"
run_test "$_TEST_DIR/pjrt/test_internal_tpu.py"

-PJRT_DEVICE=CPU XLA_CUDA=0 run_test "$_TEST_DIR/pjrt/test_ddp.py"
-PJRT_DEVICE=CPU XLA_CUDA=0 run_test "$_TEST_DIR/pjrt/test_mesh_service.py"
+PJRT_DEVICE=CPU run_test "$_TEST_DIR/pjrt/test_ddp.py"
+PJRT_DEVICE=CPU run_test "$_TEST_DIR/pjrt/test_mesh_service.py"

run_test "$_TEST_DIR/test_python_ops.py"
run_test "$_TEST_DIR/test_ops.py"
@@ -199,7 +199,7 @@ function run_xla_op_tests2 {
run_test "$_TEST_DIR/eager/test_eager_with_xla_compile.py"
run_test "$_TEST_DIR/eager/test_eager_with_torch_compile.py"

-PJRT_DEVICE=CPU XLA_CUDA=0 run_test "$_TEST_DIR/eager/test_eager_all_reduce_in_place.py"
+PJRT_DEVICE=CPU run_test "$_TEST_DIR/eager/test_eager_all_reduce_in_place.py"

run_test "$_TEST_DIR/eager/test_eager_spmd.py"
run_test "$_TEST_DIR/test_callback.py"
@@ -332,7 +332,7 @@ function run_tests {
elif [[ "$RUN_TORCH_MP_OP_TESTS" == "torch_mp_op" ]]; then
echo "Running torch op tests..."

-PJRT_DEVICE=CPU XLA_CUDA=0 run_mp_op_tests
+PJRT_DEVICE=CPU run_mp_op_tests
else
# Run full tests without sharding, respects XLA_SKIP_*
if [[ "$XLA_SKIP_XLA_OP_TESTS" != "1" ]]; then