From 1ccce715dc5340ae39e1bfc0a24f4d5106f6a19e Mon Sep 17 00:00:00 2001 From: Yukio Siraichi Date: Wed, 3 Sep 2025 16:09:22 -0300 Subject: [PATCH] Remove other CUDA usage in the repository. --- .circleci/common.sh | 25 +----- .devcontainer/gpu-internal/devcontainer.json | 30 ------- .github/ISSUE_TEMPLATE.md | 2 +- .github/ISSUE_TEMPLATE/bug-report.md | 2 +- .github/ci.md | 37 +++----- .github/scripts/run_tests.sh | 5 -- .github/workflows/_test.yml | 53 ++++------- .github/workflows/setup/action.yml | 33 ------- CONTRIBUTING.md | 4 - README.md | 88 +++---------------- ...ributed-pytorch-xla-basics-with-pjrt.ipynb | 2 +- docs/source/accelerators/gpu.md | 6 -- docs/source/contribute/bazel.md | 6 +- docs/source/contribute/plugins.md | 3 +- docs/source/learn/_pjrt.md | 8 +- docs/source/perf/amp.md | 4 +- docs/source/perf/spmd_advanced.md | 4 +- docs/source/perf/spmd_gpu.md | 48 ---------- examples/train_resnet_amp.py | 3 +- infra/ansible/README.md | 4 +- infra/ansible/config/apt.yaml | 17 ---- infra/ansible/config/cuda_deps.yaml | 24 ----- infra/ansible/config/vars.yaml | 7 +- infra/ansible/playbook.yaml | 6 +- infra/tpu-pytorch-releases/README.md | 25 +++--- .../tpu-pytorch-releases/artifacts_builds.tf | 27 ------ .../dev_images.auto.tfvars | 12 --- infra/tpu-pytorch-releases/dev_images.tf | 6 +- 28 files changed, 74 insertions(+), 417 deletions(-) delete mode 100644 .devcontainer/gpu-internal/devcontainer.json delete mode 100644 docs/source/accelerators/gpu.md delete mode 100644 docs/source/perf/spmd_gpu.md delete mode 100644 infra/ansible/config/cuda_deps.yaml diff --git a/.circleci/common.sh b/.circleci/common.sh index 3093a8006942..50ec8eae1ade 100755 --- a/.circleci/common.sh +++ b/.circleci/common.sh @@ -158,26 +158,12 @@ function run_torch_xla_cpp_tests() { fi if [ "$USE_COVERAGE" != "0" ]; then - if [ -x "$(command -v nvidia-smi)" ]; then - PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L"" - cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov1.dat - PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS - cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov2.dat - lcov --add-tracefile /tmp/cov1.dat -a /tmp/cov2.dat -o /tmp/merged.dat - else - PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L"" - cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat - fi + PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L"" + cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat genhtml /tmp/merged.dat -o ~/htmlcov/cpp/cpp_lcov.info mv /tmp/merged.dat ~/htmlcov/cpp_lcov.info else - # Shard GPU testing - if [ -x "$(command -v nvidia-smi)" ]; then - PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L"" - PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS - else - PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L"" - fi + PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L"" fi popd } @@ -196,11 +182,6 @@ function run_torch_xla_tests() { RUN_CPP="${RUN_CPP_TESTS:0}" RUN_PYTHON="${RUN_PYTHON_TESTS:0}" - if [ -x "$(command -v nvidia-smi)" ]; then - num_devices=$(nvidia-smi --list-gpus | wc -l) - echo "Found $num_devices GPU devices..." 
- export GPU_NUM_DEVICES=$num_devices - fi export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla" export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))") diff --git a/.devcontainer/gpu-internal/devcontainer.json b/.devcontainer/gpu-internal/devcontainer.json deleted file mode 100644 index ce06bab9e2e7..000000000000 --- a/.devcontainer/gpu-internal/devcontainer.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "name": "gpu-internal", - "image": "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1", - "runArgs": [ - "--gpus=all", - "--net=host", - "--shm-size=16G" - ], - "containerEnv": { - "BAZEL_REMOTE_CACHE": "1", - "SILO_NAME": "cache-silo-${localEnv:USER}-gpuvm" - }, - "initializeCommand": "docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1", - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "ms-vscode.cpptools-themes", - "BazelBuild.vscode-bazel", - "DevonDCarew.bazel-code", - "StackBuild.bazel-stack-vscode", - "StackBuild.bazel-stack-vscode-cc", - "xaver.clang-format", - "ryanluker.vscode-coverage-gutters", - "ms-azuretools.vscode-docker", - "ms-python.python" - ] - } - } -} \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 6c37920bd137..b44f8dca7ad2 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -13,5 +13,5 @@ Error messages and stack traces are also helpful. ## System Info -- reproducible on XLA backend [CPU/TPU/CUDA]: +- reproducible on XLA backend [CPU/TPU]: - torch_xla version: diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md index 54f785623a50..3c10b58bfe5a 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.md +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -46,7 +46,7 @@ Steps to reproduce the behavior: ## Environment - - Reproducible on XLA backend [CPU/TPU/CUDA]: + - Reproducible on XLA backend [CPU/TPU]: - torch_xla version: diff --git a/.github/ci.md b/.github/ci.md index 2cc72b5abf50..cc3994c884e7 100644 --- a/.github/ci.md +++ b/.github/ci.md @@ -44,20 +44,20 @@ fail. Steps for fixing and merging such breaking PyTorch change is as following: ### Running TPU tests on PRs -The `build_and_test.yml` workflow runs tests on the TPU in addition to CPU and -GPU. The set of tests run on the TPU is defined in `test/tpu/run_tests.sh`. +The `build_and_test.yml` workflow runs tests on the TPU in addition to CPU. +The set of tests run on the TPU is defined in `test/tpu/run_tests.sh`. ## CI Environment Before the CI in this repository runs, we build a base dev image. These are the same images we recommend in our VSCode `.devcontainer` setup and nightly build -to ensure consistency between environments. We produce variants with and without -CUDA, configured in `infra/ansible` (build config) and -`infra/tpu-pytorch-releases/dev_images.tf` (build triggers). +to ensure consistency between environments. We produce variants configured in +`infra/ansible` (build config) and `infra/tpu-pytorch-releases/dev_images.tf` +(build triggers). The CI runs in two environments: -1. Organization self-hosted runners for CPU and GPU: used for almost every step +1. Organization self-hosted runners for CPU: used for almost every step of the CI. These runners are managed by PyTorch and have access to the shared ECR repository. 1. 
TPU self-hosted runners: these are managed by us and are only available in
@@ -68,24 +68,18 @@ The CI runs in two environments:
 
 We have two build paths for each CI run:
 
-- `torch_xla`: we build the main package to support both TPU and GPU[^1], along
+- `torch_xla`: we build the main package to support TPU, along
   with a CPU build of `torch` from HEAD. This build step exports the
   `torch-xla-wheels` artifact for downstream use in tests.
   - Some CI tests also require `torchvision`. To reduce flakiness, we compile
    `torchvision` from [`torch`'s CI pin][pytorch-vision-pin].
  - C++ tests are piggybacked onto the same build and uploaded in the
    `cpp-test-bin` artifact.
-- `torch_xla_cuda_plugin`: the XLA CUDA runtime can be built independently of
-  either `torch` or `torch_xla` -- it depends only on our pinned OpenXLA. Thus,
-  this build should be almost entirely cached, unless your PR changes the XLA
-  pin or adds a patch.
 
-Both the main package build and plugin build are configured with ansible at
-`infra/ansible`, although they run in separate stages (`stage=build_srcs` vs
-`stage=build_plugin`). This is the same configuration we use for our nightly and
-release builds.
+The main package build is configured with ansible at `infra/ansible`. This is
+the same configuration we use for our nightly and release builds.
 
-The CPU and GPU test configs are defined in the same file, `_test.yml`. Since
+The CPU test config is defined in the file `_test.yml`. Since
 some of the tests come from the upstream PyTorch repository, we check out
 PyTorch at the same git rev as the `build` step (taken from
 `torch_xla.version.__torch_gitrev__`). The tests are split up into multiple
@@ -93,23 +87,16 @@ groups that run in parallel; the `matrix` section of `_test.yml` corresponds to
 in `.github/scripts/run_tests.sh`.
 
 CPU tests run immediately after the `torch_xla` build completes. This will
-likely be the first test feedback on your commit. GPU tests will launch when
-both the `torch_xla` and `torch_xla_cuda_plugin` complete. GPU compilation is
-much slower due to the number of possible optimizations, and the GPU chips
-themselves are quite outdated, so these tests will take longer to run than the
-CPU tests.
+likely be the first test feedback on your commit.
 
 ![CPU tests launch when `torch_xla` is
 complete](../docs/assets/ci_test_dependency.png)
 
-![GPU tests also depend on CUDA
-plugin](../docs/assets/ci_test_dependency_gpu.png)
-
 For the C++ test groups in either case, the test binaries are pre-built during
 the build phase and packaged in `cpp-test-bin`. This will only be downloaded if
 necessary.
 
-[^1]: Note: both GPU and TPU support require their respective plugins to be
-    installed. This package will _not_ work on either out of the box.
+[^1]: Note: TPU support requires its respective plugin to be
+    installed. This package will _not_ work on TPU out of the box.
 
 ### TPU CI
diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh
index 7ae422c47953..65f46f9cf48c 100755
--- a/.github/scripts/run_tests.sh
+++ b/.github/scripts/run_tests.sh
@@ -77,11 +77,6 @@ PYTORCH_DIR=$1
 XLA_DIR=$2
 USE_COVERAGE="${3:-0}"
 
-if [ -x "$(command -v nvidia-smi)" ]; then
-  num_devices=$(nvidia-smi --list-gpus | wc -l)
-  echo "Found $num_devices GPU devices..." 
- export GPU_NUM_DEVICES=$num_devices -fi export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla" export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))") diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml index 4ef00dcedaed..23ffe34f8a46 100644 --- a/.github/workflows/_test.yml +++ b/.github/workflows/_test.yml @@ -23,11 +23,6 @@ on: description: | Set the maximum (in minutes) how long the workflow should take to finish timeout-minutes: - install-cuda-plugin: - required: false - type: boolean - default: false - description: Whether to install CUDA plugin package torch-commit: required: true type: string @@ -46,7 +41,7 @@ jobs: runs-on: ${{ inputs.runner }} container: image: ${{ inputs.dev-image }} - options: "${{ inputs.install-cuda-plugin == true && '--gpus all' || '' }} --shm-size 16g" + options: "--shm-size 16g" strategy: fail-fast: false matrix: @@ -95,9 +90,7 @@ jobs: uses: ./.actions/.github/workflows/setup with: torch-commit: ${{ inputs.torch-commit }} - cuda: ${{ inputs.install-cuda-plugin && true || false }} wheels-artifact: torch-xla-wheels - cuda-plugin-artifact: ${{ inputs.install-cuda-plugin && 'cuda-plugin' || null }} - name: Fetch CPP test binaries if: inputs.has_code_changes == 'true' && matrix.run_cpp_tests uses: actions/download-artifact@v4 @@ -111,9 +104,6 @@ jobs: run: | chmod +x /tmp/test/bin/* ls -l /tmp/test/bin - - name: Check GPU - if: inputs.has_code_changes == 'true' && inputs.install-cuda-plugin - run: nvidia-smi - name: Install test deps if: inputs.has_code_changes == 'true' shell: bash @@ -164,35 +154,24 @@ jobs: exit 0 fi docker cp "${pid}":/home/jenkins/htmlcov "${GITHUB_WORKSPACE}" - if [ -n "${GPU_FLAG:-}" ]; then - if [ -n "${PYTHON_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out - fi - if [ -n "${CPP_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out - fi - else - if [ -n "${PYTHON_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out - fi + if [ -n "${PYTHON_TEST_NAME}" ]; then + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out + fi - if [ -n "${CPP_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out - gsutil cp 
${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out - fi + if [ -n "${CPP_TEST_NAME}" ]; then + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out + fi - if [ "${CPP_TEST_NAME}" == "cpp_tests" ]; then - ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' - echo $ABS_METADATA > abs_metadata.json - gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json + if [ "${CPP_TEST_NAME}" == "cpp_tests" ]; then + ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' + echo $ABS_METADATA > abs_metadata.json + gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json - INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' - echo $INC_METADATA > inc_metadata.json - gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json - fi + INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' + echo $INC_METADATA > inc_metadata.json + gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json fi - name: Report no code changes if: inputs.has_code_changes == 'false' diff --git a/.github/workflows/setup/action.yml b/.github/workflows/setup/action.yml index 574b85e5b0d5..e1d6fdb8599d 100644 --- a/.github/workflows/setup/action.yml +++ b/.github/workflows/setup/action.yml @@ -3,20 +3,10 @@ inputs: torch-commit: type: string description: PyTorch commit to check out, if provided - cuda: - type: boolean - description: Whether to set up CUDA library paths - default: false wheels-artifact: type: string description: | Artifact containing `torch` (cpu) and `torch-xla` wheels to install - cuda-plugin-artifact: - type: string - description: Artifact containing `torch-xla-cuda-plugin` to install - cuda-torch-artifact: - type: string - description: Artifact containing CUDA build of `torch` runs: using: "composite" steps: @@ -26,12 +16,6 @@ runs: run: | ls -la rm -rvf ${GITHUB_WORKSPACE}/* - - name: Setup CUDA environment - shell: bash - run: | - echo "PATH=$PATH:/usr/local/cuda-12.3/bin" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.3/lib64" >> $GITHUB_ENV - if: ${{ inputs.cuda }} - name: Setup gcloud shell: bash run: | @@ -59,23 +43,6 @@ runs: name: ${{ inputs.wheels-artifact }} path: /tmp/wheels/ if: ${{ inputs.wheels-artifact }} - - name: Fetch CUDA plugin - uses: actions/download-artifact@v4 - with: - 
name: ${{ inputs.cuda-plugin-artifact }} - path: /tmp/wheels/ - if: ${{ inputs.cuda-plugin-artifact }} - - name: Remove CPU `torch` build - shell: bash - run: | - rm -rf /tmp/wheels/torch-* - if: ${{ inputs.cuda-torch-artifact }} - - name: Fetch CUDA `torch` build - uses: actions/download-artifact@v4 - with: - name: ${{ inputs.cuda-torch-artifact }} - path: /tmp/wheels/ - if: ${{ inputs.cuda-torch-artifact }} - name: Install wheels shell: bash run: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6c05fd88f747..b8d233c87002 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -238,10 +238,6 @@ first time, you may need to build everything again, for example, after a python setup.py develop ``` -### Additional steps for GPU - -Please refer to this [guide](https://github.com/pytorch/xla/blob/master/plugins/cuda/README.md). - ## Before Creating a Pull Request In `pytorch/xla` repo we enforce coding style for both C++ and Python files. diff --git a/README.md b/README.md index 989858ef16fd..d02ae1a0968e 100644 --- a/README.md +++ b/README.md @@ -95,24 +95,23 @@ batch size 1024: Our github contains many useful docs on working with different aspects of PyTorch XLA, here is a list of useful docs spread around our repository: - [docs/source/learn](https://github.com/pytorch/xla/tree/master/docs/source/learn): docs for learning concepts associated with XLA, troubleshooting, pjrt, eager mode, and dynamic shape. -- [docs/source/accelerators](https://github.com/pytorch/xla/tree/master/docs/source/accelerators): references to `GPU` and `TPU` accelerator documents. +- [docs/source/accelerators](https://github.com/pytorch/xla/tree/master/docs/source/accelerators): references to `TPU` accelerator documents. - [docs/source/perf](https://github.com/pytorch/xla/tree/master/docs/source/perf): documentation about performance specific aspects of PyTorch/XLA such as: `AMP`, `DDP`, `Dynamo`, Fori loop, `FSDP`, quantization, recompilation, and `SPMD` - [docs/source/features](https://github.com/pytorch/xla/tree/master/docs/source/features): documentation on distributed torch, pallas, scan, and stable hlo. - [docs/source/contribute](https://github.com/pytorch/xla/tree/master/docs/source/contribute): documents on setting up PyTorch for development, and guides for lowering operations. - PJRT plugins: - [CPU](https://github.com/pytorch/xla/blob/master/plugins/cpu/README.md) - - [CUDA](https://github.com/pytorch/xla/blob/master/plugins/cuda/README.md) - [torchax/docs](https://github.com/pytorch/xla/tree/master/torchax/docs): torchax documents - [torchax/examples](https://github.com/pytorch/xla/tree/master/torchax/examples): torchax examples ## Getting Started Following here are guides for two modes: -- Single process: one Python interpreter controlling a single GPU/TPU at a time -- Multi process: N Python interpreters are launched, corresponding to N GPU/TPUs +- Single process: one Python interpreter controlling a single TPU at a time +- Multi process: N Python interpreters are launched, corresponding to N TPUs found on the system -Another mode is SPMD, where one Python interpreter controls all N GPU/TPUs found on +Another mode is SPMD, where one Python interpreter controls all N TPUs found on the system. Multi processing is more complex, and is not compatible with SPMD. This tutorial does not dive into SPMD. For more on that, check our [SPMD guide](https://github.com/pytorch/xla/blob/master/docs/source/perf/spmd_basic.md). 
@@ -223,7 +222,7 @@ If you're using `DistributedDataParallel`, make the following changes: Additional information on PyTorch/XLA, including a description of its semantics and functions, is available at [PyTorch.org](http://pytorch.org/xla/). See the [API Guide](API_GUIDE.md) for best practices when writing networks that run on -XLA devices (TPU, CUDA, CPU and...). +XLA devices (TPU, CPU and...). Our comprehensive user guides are available at: @@ -234,13 +233,9 @@ Our comprehensive user guides are available at: ## PyTorch/XLA tutorials -* [Cloud TPU VM - quickstart](https://cloud.google.com/tpu/docs/run-calculation-pytorch) -* [Cloud TPU Pod slice - quickstart](https://cloud.google.com/tpu/docs/pytorch-pods) -* [Profiling on TPU - VM](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm) -* [GPU guide](docs/gpu.md) +* [Cloud TPU VM quickstart](https://cloud.google.com/tpu/docs/run-calculation-pytorch) +* [Cloud TPU Pod slice quickstart](https://cloud.google.com/tpu/docs/pytorch-pods) +* [Profiling on TPU VM](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm) ## Reference implementations @@ -259,12 +254,10 @@ Cloud TPU plugin corresponding to your installed `torch_xla`, install the option pip install torch_xla[tpu] ``` -GPU release builds and GPU/TPU nightly builds are available in our public GCS bucket. +TPU nightly builds are available in our public GCS bucket. -| Version | Cloud GPU VM Wheels | +| Version | Cloud TPU Nightly Wheels | | --- | ----------- | -| 2.7 (CUDA 12.6 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.6/torch_xla-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl` | -| 2.7 (CUDA 12.6 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.6/torch_xla-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl` | | nightly (Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev-cp311-cp311-linux_x86_64.whl` | | nightly (Python 3.12) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev-cp312-cp312-linux_x86_64.whl` | | nightly (Python 3.13) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev-cp312-cp312-linux_x86_64.whl` | @@ -296,27 +289,6 @@ The torch wheel version `2.9.0.dev20250423+cpu` can be found at https://download | 2.1 (XRT + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/xrt/tpuvm/torch_xla-2.1.0%2Bxrt-cp310-cp310-manylinux_2_28_x86_64.whl` | | 2.1 (Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.1.0-cp38-cp38-linux_x86_64.whl` | -
- -| Version | GPU Wheel | -| --- | ----------- | -| 2.5 (CUDA 12.1 + Python 3.9) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.5.0-cp39-cp39-manylinux_2_28_x86_64.whl` | -| 2.5 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.5.0-cp310-cp310-manylinux_2_28_x86_64.whl` | -| 2.5 (CUDA 12.1 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.5.0-cp311-cp311-manylinux_2_28_x86_64.whl` | -| 2.5 (CUDA 12.4 + Python 3.9) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.4/torch_xla-2.5.0-cp39-cp39-manylinux_2_28_x86_64.whl` | -| 2.5 (CUDA 12.4 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.4/torch_xla-2.5.0-cp310-cp310-manylinux_2_28_x86_64.whl` | -| 2.5 (CUDA 12.4 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.4/torch_xla-2.5.0-cp311-cp311-manylinux_2_28_x86_64.whl` | -| 2.4 (CUDA 12.1 + Python 3.9) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.4.0-cp39-cp39-manylinux_2_28_x86_64.whl` | -| 2.4 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.4.0-cp310-cp310-manylinux_2_28_x86_64.whl` | -| 2.4 (CUDA 12.1 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.4.0-cp311-cp311-manylinux_2_28_x86_64.whl` | -| 2.3 (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp38-cp38-manylinux_2_28_x86_64.whl` | -| 2.3 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp310-cp310-manylinux_2_28_x86_64.whl` | -| 2.3 (CUDA 12.1 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp311-cp311-manylinux_2_28_x86_64.whl` | -| 2.2 (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp38-cp38-manylinux_2_28_x86_64.whl` | -| 2.2 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp310-cp310-manylinux_2_28_x86_64.whl` | -| 2.1 + CUDA 11.8 | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/11.8/torch_xla-2.1.0-cp38-cp38-manylinux_2_28_x86_64.whl` | -| nightly + CUDA 12.0 >= 2023/06/27| `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.0/torch_xla-nightly-cp38-cp38-linux_x86_64.whl` | - ### Docker @@ -337,46 +309,6 @@ To use the above dockers, please pass `--privileged --net host --shm-size=16G` a ```bash docker run --privileged --net host --shm-size=16G -it us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm /bin/bash ``` -
- -| Version | GPU CUDA 12.6 Docker | -| --- | ----------- | -| 2.7 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.7.0_3.10_cuda_12.6` | - - -
- - -| Version | GPU CUDA 12.4 Docker | -| --- | ----------- | -| 2.5 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.5.0_3.10_cuda_12.4` | -| 2.4 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_cuda_12.4` | - -
- - -| Version | GPU CUDA 12.1 Docker | -| --- | ----------- | -| 2.5 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.5.0_3.10_cuda_12.1` | -| 2.4 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_cuda_12.1` | -| 2.3 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.3.0_3.10_cuda_12.1` | -| 2.2 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.2.0_3.10_cuda_12.1` | -| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_12.1` | -| nightly | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1` | -| nightly at date | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1_YYYYMMDD` | - -
- -| Version | GPU CUDA 11.8 + Docker | -| --- | ----------- | -| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_11.8` | -| 2.0 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.0_3.8_cuda_11.8` | - -
- - -To run on [compute instances with -GPUs](https://cloud.google.com/compute/docs/gpus/create-vm-with-gpus). ## Troubleshooting diff --git a/contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb b/contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb index 8d4fbd95bff7..f06e4a9b9f03 100644 --- a/contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb +++ b/contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb @@ -461,7 +461,7 @@ " torch.manual_seed(42)\n", " model = nn.Linear(128, 10).to(device)\n", "\n", - " # Optional for TPU v4 and GPU\n", + " # Optional for TPU v4\n", " xm.broadcast_master_param(model)\n", "\n", " model = DDP(model, gradient_as_bucket_view=True)\n", diff --git a/docs/source/accelerators/gpu.md b/docs/source/accelerators/gpu.md deleted file mode 100644 index 56abb192a704..000000000000 --- a/docs/source/accelerators/gpu.md +++ /dev/null @@ -1,6 +0,0 @@ -# Learn about GPUs - -For information on GPUs on Google Cloud, see: - -- [About GPUs on Google Cloud](https://cloud.google.com/compute/docs/gpus/overview) -- [GPU machine types](https://cloud.google.com/compute/docs/gpus) diff --git a/docs/source/contribute/bazel.md b/docs/source/contribute/bazel.md index 0e41ec837057..69e1d5954c82 100644 --- a/docs/source/contribute/bazel.md +++ b/docs/source/contribute/bazel.md @@ -22,9 +22,7 @@ http_archive( ], patch_tool = "patch", patches = [ - "//openxla_patches:gpu_nvml.diff", - "//openxla_patches:gpu_race_condition.diff", - "//openxla_patches:count_down.diff", + "//openxla_patches:no_fortify.diff", ], strip_prefix = "xla-" + xla_hash, urls = [ @@ -223,7 +221,7 @@ The `xla_client` tests are pure hermetic tests that can be easily executed. The `torch_xla` plugin tests are more complex: they require `torch` and `torch_xla` to be installed, and they cannot run in parallel, since they are using either XRT server/client on the same -port, or because they use a GPU or TPU device and there's only one +port, or because they use a TPU device and there's only one available at the time. For that reason, all tests under `torch_xla/csrc/` are bundled into a single target `:main` that runs them all sequentially. diff --git a/docs/source/contribute/plugins.md b/docs/source/contribute/plugins.md index 40ae841e8d7b..84ca6fe1c9ea 100644 --- a/docs/source/contribute/plugins.md +++ b/docs/source/contribute/plugins.md @@ -1,8 +1,7 @@ # Custom Hardware Plugins PyTorch/XLA supports custom hardware through OpenXLA's PJRT C API. The -PyTorch/XLA team directly supports plugins for Cloud TPU (`libtpu`) and -GPU ([OpenXLA](https://github.com/openxla/xla/tree/main/xla/pjrt/gpu)). +PyTorch/XLA team directly supports plugins for Cloud TPU (`libtpu`). The same plugins may also be used by JAX and TF. ## Implementing a PJRT Plugin diff --git a/docs/source/learn/_pjrt.md b/docs/source/learn/_pjrt.md index 16300239353a..2f4f446991de 100644 --- a/docs/source/learn/_pjrt.md +++ b/docs/source/learn/_pjrt.md @@ -38,7 +38,7 @@ the `runtime` tag. ## TL;DR - To use the PJRT preview runtime, set the `PJRT_DEVICE` environment - variable to `CPU`, `TPU`, or `CUDA` + variable to `CPU`, or `TPU` - In XRT, all distributed workloads are multiprocess, with one process per device. On TPU v2 and v3 in PJRT, workloads are multiprocess and multithreaded (4 processes with 2 threads each), so your workload @@ -57,7 +57,7 @@ the `runtime` tag. - To use `torch.distributed`, import `torch_xla.experimental.pjrt_backend` and use the `xla://` `init_method`. - - These steps are optional for GPU and TPU v4. 
+ - These steps are optional for TPU v4. Sample diff from XRT to PJRT: @@ -84,7 +84,7 @@ def _mp_fn(index): torch.manual_seed(42) model = nn.Linear(128, 10).to(device) -+ # Optional for TPU v4 and GPU ++ # Optional for TPU v4 + xm.broadcast_master_param(model) model = DDP(model, gradient_as_bucket_view=True) @@ -119,7 +119,7 @@ if __name__ == '__main__': ## Benefits - Simple runtime configuration: just set `PJRT_DEVICE` to `TPU`, - `CPU`, or `CUDA` and start using XLA! Or, let PJRT select a device + or `CPU` and start using XLA! Or, let PJRT select a device automatically based on your environment. - Improved performance: reduced overhead from gRPC means faster end-to-end execution. On TorchBench 2.0, we observed a \>35% diff --git a/docs/source/perf/amp.md b/docs/source/perf/amp.md index 36d777fd865f..223e338f2135 100644 --- a/docs/source/perf/amp.md +++ b/docs/source/perf/amp.md @@ -2,7 +2,7 @@ Pytorch/XLA's AMP extends [Pytorch's AMP package](https://pytorch.org/docs/stable/amp.html) with support for -automatic mixed precision on `XLA:GPU` and `XLA:TPU` devices. AMP is +automatic mixed precision on `XLA:TPU` devices. AMP is used to accelerate training and inference by executing certain operations in `float32` and other operations in a lower precision datatype (`float16` or `bfloat16` depending on hardware support). This @@ -99,4 +99,4 @@ unlisted ops run if they're downstream from autocasted ops. Our [mnist training script](https://github.com/pytorch/xla/blob/master/test/test_train_mp_mnist_amp.py) and [imagenet training script](https://github.com/pytorch/xla/blob/master/test/test_train_mp_imagenet_amp.py) -demonstrate how AMP is used on both TPUs and GPUs. +demonstrate how AMP is used on TPUs. diff --git a/docs/source/perf/spmd_advanced.md b/docs/source/perf/spmd_advanced.md index 7005ee5dd4c0..2a056dc3d693 100644 --- a/docs/source/perf/spmd_advanced.md +++ b/docs/source/perf/spmd_advanced.md @@ -110,7 +110,7 @@ torch.ops.xla.dynamo_mark_sharding(output, device_ids, mesh_shape, axis_names, p ### SPMD Debugging Tool -We provide a `shard placement visualization debug tool` for PyTorch/XLA SPMD user on TPU/GPU/CPU with single-host/multi-host: you could use `visualize_tensor_sharding` to visualize sharded tensor, or you could use `visualize_sharding` to visualize sharing string. Here are two code examples on TPU single-host(v4-8) with `visualize_tensor_sharding` or `visualize_sharding`: +We provide a `shard placement visualization debug tool` for PyTorch/XLA SPMD user on TPU/CPU with single-host/multi-host: you could use `visualize_tensor_sharding` to visualize sharded tensor, or you could use `visualize_sharding` to visualize sharing string. Here are two code examples on TPU single-host(v4-8) with `visualize_tensor_sharding` or `visualize_sharding`: - Code snippet used `visualize_tensor_sharding` and visualization result: ```python @@ -141,7 +141,7 @@ generated_table = visualize_sharding(sharding, use_color=False) visualize_sharding example on TPU v4-8(single-host) -You could use these examples on TPU/GPU/CPU single-host and modify it to run on multi-host. And you could modify it to sharding-style `tiled`, `partial_replication` and `replicated`. +You could use these examples on TPU/CPU single-host and modify it to run on multi-host. And you could modify it to sharding-style `tiled`, `partial_replication` and `replicated`. ### Auto-Sharding We are introducing a new PyTorch/XLA SPMD feature, called ``auto-sharding``, [RFC](https://github.com/pytorch/xla/issues/6322). 
This is an experimental feature in `r2.3` and `nightly`, that supports `XLA:TPU` and a single TPUVM host. diff --git a/docs/source/perf/spmd_gpu.md b/docs/source/perf/spmd_gpu.md deleted file mode 100644 index cda25723aaad..000000000000 --- a/docs/source/perf/spmd_gpu.md +++ /dev/null @@ -1,48 +0,0 @@ -# Running SPMD on GPU - -PyTorch/XLA supports SPMD on NVIDIA GPU (single-node or multi-nodes). -The training/inference script remains the same as the one used for TPU, -such as this [ResNet -script](https://github.com/pytorch/xla/blob/1dc78948c0c9d018d8d0d2b4cce912552ab27083/test/spmd/test_train_spmd_imagenet.py). -To execute the script using SPMD, we leverage `torchrun`: - - PJRT_DEVICE=CUDA \ - torchrun \ - --nnodes=${NUM_GPU_MACHINES} \ - --node_rank=${RANK_OF_CURRENT_MACHINE} \ - --nproc_per_node=1 \ - --rdzv_endpoint=":" \ - training_or_inference_script_using_spmd.py - -- `--nnodes`: how many GPU machines to be used. -- `--node_rank`: the index of the current GPU machines. The value can - be 0, 1, ..., \${NUMBER_GPU_VM}-1. -- `--nproc_per_node`: the value must be 1 due to the SPMD requirement. -- `--rdzv_endpoint`: the endpoint of the GPU machine with - node_rank==0, in the form `host:port`. The host will be the internal - IP address. The `port` can be any available port on the machine. For - single-node training/inference, this parameter can be omitted. - -For example, if you want to train a ResNet model on 2 GPU machines using -SPMD, you can run the script below on the first machine: - - XLA_USE_SPMD=1 PJRT_DEVICE=CUDA \ - torchrun \ - --nnodes=2 \ - --node_rank=0 \ - --nproc_per_node=1 \ - --rdzv_endpoint=":12355" \ - pytorch/xla/test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 128 - -and run the following on the second machine: - - XLA_USE_SPMD=1 PJRT_DEVICE=CUDA \ - torchrun \ - --nnodes=2 \ - --node_rank=1 \ - --nproc_per_node=1 \ - --rdzv_endpoint=":12355" \ - pytorch/xla/test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 128 - -For more information, please refer to the [SPMD support on GPU -RFC](https://github.com/pytorch/xla/issues/6256). diff --git a/examples/train_resnet_amp.py b/examples/train_resnet_amp.py index f5ca308bed75..f63c3cad8544 100644 --- a/examples/train_resnet_amp.py +++ b/examples/train_resnet_amp.py @@ -22,8 +22,7 @@ def train_loop_fn(self, loader, epoch): with autocast(torch_xla.device()): output = self.model(data) loss = self.loss_fn(output, target) - # TPU amp uses bf16 hence gradient scaling is not necessary. If runnign with XLA:GPU - # check https://github.com/pytorch/xla/blob/master/docs/amp.md#amp-for-xlagpu. + # TPU amp uses bf16 hence gradient scaling is not necessary. loss.backward() self.run_optimizer() tracker.add(self.batch_size) diff --git a/infra/ansible/README.md b/infra/ansible/README.md index 9094f645de30..9ce34d962cff 100644 --- a/infra/ansible/README.md +++ b/infra/ansible/README.md @@ -23,11 +23,11 @@ behavior (installed pip/apt packages and set environment variables): * `stage`: build or release. Different packages are installed depending on the chosen stage. * `arch`: aarch64 or amd64. Architecture of the built image and wheels. -* `accelerator`: tpu or cuda. Available accelerator. +* `accelerator`: tpu. Available accelerator. The variables can be passed through `-e` flag: `-e "="`. 
-Example: `ansible-playbook playbook.yaml -e "stage=build arch=amd64 accelerator=tpu"` +Example: `ansible-playbook playbook.yaml -e "stage=build arch=amd64"` ## Config structure diff --git a/infra/ansible/config/apt.yaml b/infra/ansible/config/apt.yaml index d026fea3e037..ae3d95468344 100644 --- a/infra/ansible/config/apt.yaml +++ b/infra/ansible/config/apt.yaml @@ -20,13 +20,6 @@ apt: - lcov - less - build_cuda: - - "cuda-libraries-{{ cuda_version | replace('.', '-') }}" - - "cuda-toolkit-{{ cuda_version | replace('.', '-') }}" - - "cuda-minimal-build-{{ cuda_version | replace('.', '-') }}" - - "{{ cuda_deps['libcudnn'][cuda_version] }}" - - "{{ cuda_deps['libcudnn-dev'][cuda_version] }}" - build_aarch64: - scons @@ -39,23 +32,13 @@ apt: - patch - vim - release_cuda: - - "cuda-libraries-{{ cuda_version | replace('.', '-') }}" - - "cuda-minimal-build-{{ cuda_version | replace('.', '-') }}" - - "{{ cuda_deps['libcudnn'][cuda_version] }}" - # Specify objects with string fields `url` and `keyring`. # The keyring path should start with /usr/share/keyrings/ for debian and ubuntu. signing_keys: - url: https://apt.llvm.org/llvm-snapshot.gpg.key keyring: /usr/share/keyrings/llvm.pgp - # Get the recent key version from - # https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-debian. - - url: "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub" - keyring: /usr/share/keyrings/cuda.pgp repos: # signed-by path should match the corresponding keyring path above. - "deb [signed-by=/usr/share/keyrings/llvm.pgp] http://apt.llvm.org/{{ llvm_debian_repo }}/ llvm-toolchain-{{ llvm_debian_repo }}-{{ clang_version }} main" - "deb-src [signed-by=/usr/share/keyrings/llvm.pgp] http://apt.llvm.org/{{ llvm_debian_repo }}/ llvm-toolchain-{{ llvm_debian_repo }}-{{ clang_version }} main" - - "deb [signed-by=/usr/share/keyrings/cuda.pgp] https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_repo }}/x86_64/ /" diff --git a/infra/ansible/config/cuda_deps.yaml b/infra/ansible/config/cuda_deps.yaml deleted file mode 100644 index 3732bb0f93ec..000000000000 --- a/infra/ansible/config/cuda_deps.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Versions of cuda dependencies for given cuda versions. -# Note: wrap version in quotes to ensure they're treated as strings. -cuda_deps: - # List all libcudnn8 versions with `apt list -a libcudnn8` - libcudnn: - "12.8": libcudnn9-cuda-12=9.1.1.17-1 - "12.6": libcudnn9-cuda-12=9.1.1.17-1 - "12.4": libcudnn9-cuda-12=9.1.1.17-1 - "12.3": libcudnn9-cuda-12=9.1.1.17-1 - "12.1": libcudnn8=8.9.2.26-1+cuda12.1 - "12.0": libcudnn8=8.8.0.121-1+cuda12.0 - "11.8": libcudnn8=8.7.0.84-1+cuda11.8 - "11.7": libcudnn8=8.5.0.96-1+cuda11.7 - "11.2": libcudnn8=8.1.1.33-1+cuda11.2 - libcudnn-dev: - "12.8": libcudnn9-dev-cuda-12=9.1.1.17-1 - "12.6": libcudnn9-dev-cuda-12=9.1.1.17-1 - "12.4": libcudnn9-dev-cuda-12=9.1.1.17-1 - "12.3": libcudnn9-dev-cuda-12=9.1.1.17-1 - "12.1": libcudnn8-dev=8.9.2.26-1+cuda12.1 - "12.0": libcudnn8-dev=8.8.0.121-1+cuda12.0 - "11.8": libcudnn8-dev=8.7.0.84-1+cuda11.8 - "11.7": libcudnn8-dev=8.5.0.96-1+cuda11.7 - "11.2": libcudnn8-dev=8.1.1.33-1+cuda11.2 diff --git a/infra/ansible/config/vars.yaml b/infra/ansible/config/vars.yaml index f34e2c3cb632..c336e7754f46 100644 --- a/infra/ansible/config/vars.yaml +++ b/infra/ansible/config/vars.yaml @@ -1,8 +1,3 @@ -# Used for fetching cuda from the right repo, see apt.yaml. -cuda_repo: debian11 -cuda_version: "11.8" -# Determines supported GPUs. 
See https://developer.nvidia.com/cuda-gpus -cuda_compute_capabilities: 5.2,7.0,7.5,8.0,9.0 # Used for fetching clang from the right repo, see apt.yaml. llvm_debian_repo: bullseye clang_version: 17 @@ -10,7 +5,7 @@ clang_version: 17 package_version: 2.9.0 # If set to true, wheels will be renamed to $WHEEL_NAME-nightly-cp38-cp38-linux_x86_64.whl. nightly_release: false -# Whether to preinstall libtpu in the PyTorch/XLA wheel. Ignored for GPU build. +# Whether to preinstall libtpu in the PyTorch/XLA wheel. bundle_libtpu: 1 # Suffix for bazel remote cache key cache_suffix: "" diff --git a/infra/ansible/playbook.yaml b/infra/ansible/playbook.yaml index 7626714e8d18..85153a43d3a2 100644 --- a/infra/ansible/playbook.yaml +++ b/infra/ansible/playbook.yaml @@ -6,7 +6,7 @@ # - stage: build or release. Different packages are installed depending on # the chosen stage. # - arch: aarch64 or amd64. Architecture of the built image and wheels. - # - accelerator: tpu or cuda. Available accelerator. + # - accelerator: tpu. pre_tasks: - name: "Validate required variables" ansible.builtin.assert: @@ -20,7 +20,7 @@ - name: arch pattern: ^(aarch64|amd64)$ - name: accelerator - pattern: ^(tpu|cuda)$ + pattern: ^tpu$ - name: "Include vars from config files" ansible.builtin.include_vars: @@ -28,8 +28,6 @@ loop: # vars.yaml should be the first as other config files depend on it. - vars.yaml - # cuda_deps should be loaded before apt, since apt depends on it. - - cuda_deps.yaml - apt.yaml - pip.yaml - env.yaml diff --git a/infra/tpu-pytorch-releases/README.md b/infra/tpu-pytorch-releases/README.md index f173b3ee8575..a70e0b064a6e 100644 --- a/infra/tpu-pytorch-releases/README.md +++ b/infra/tpu-pytorch-releases/README.md @@ -39,13 +39,11 @@ consists of the following fields. sources when building image and wheels. * `package_version` (string) - Version of the built wheels. Passed to the build steps. -* `accelerator` ("tpu"|"cuda") - Supported accelerator. Affects build +* `accelerator` ("tpu") - Supported accelerator. Affects build process and installed dependencies, see [apt.yaml](../ansible/config/apt.yaml) and [pip.yaml](../ansible/config/pip.yaml). * `python_version` (optional, string, default = "3.8") - Python version used for the docker image base and build process. -* `cuda_version` (optional, string, default = "11.8") - CUDA version to install. - Used only if `accelerator` is set to "cuda" * `arch` (optional, "amd64"|"aarch64", default = "amd64") - Architecture affects installed dependencies and build process, see [apt.yaml](../ansible/config/apt.yaml) and [pip.yaml](../ansible/config/pip.yaml). @@ -71,7 +69,6 @@ unset properties of existing triggers. git_tag = "v3.0.0" package_version = "3.0" accelerator = "tpu" - cuda_version = "11.8" # optional python_version = "3.8" # optional arch = "amd64" # optional }, @@ -95,12 +92,10 @@ at midnight (`America/Los_Angeles` time zone). Nightly builds in the `nightly_builds` variable in [artifacts.auto.tfvars](./artifacts.auto.tfvars) consists of the following fields. -* `accelerator` ("tpu"|"cuda") - Supported accelerator. Impacts build +* `accelerator` ("tpu") - Supported accelerator. Impacts build process and installed dependencies. * `python_version` (optional, string, default = "3.8") - Python version used for the docker images base and build process. -* `cuda_version` (optional, string, default = "11.8") - CUDA version to install. 
- Used only if `accelerator` is set to "cuda" * `arch` (optional, "amd64"|"aarch64", default = "amd64") - Architecture influences installed dependencies and build process. * `cxx11_abi` (optional, "0"|"1", default = "0") - Whether to use C++11 ABI or @@ -115,9 +110,8 @@ unset properties of existing triggers. #### Modify or add a new nightly release -1. Modify or add an entry with specific `accelerator`, `python_version` and (optionally) - `cuda_version` to the `nightly_builds` variable in the - [artifacts.auto.tfvars](./artifacts.auto.tfvars) file. +1. Modify or add an entry with specific `accelerator`, and `python_version` + to the `nightly_builds` variable in the [artifacts.auto.tfvars](./artifacts.auto.tfvars) file. See all variables in the section above. **Example** @@ -125,10 +119,13 @@ unset properties of existing triggers. ```hcl nightly_builds = [ { - accelerator = "cuda" - cuda_version = "11.8" # optional - python_version = "3.8" # optional - arch = "amd64" # optional + git_tag = "v2.8.0" + package_version = "2.8.0" + pytorch_git_rev = "v2.8.0" + accelerator = "tpu" + python_version = "3.10" + bundle_libtpu = "0" + cxx11_abi = "1" }, # ... ] diff --git a/infra/tpu-pytorch-releases/artifacts_builds.tf b/infra/tpu-pytorch-releases/artifacts_builds.tf index 099a2402afe9..b4e469b617be 100644 --- a/infra/tpu-pytorch-releases/artifacts_builds.tf +++ b/infra/tpu-pytorch-releases/artifacts_builds.tf @@ -6,10 +6,6 @@ locals { release_package_version = "2.8.0-rc5" release_pytorch_git_rev = "v2.8.0-rc8" nightly_package_version = "2.9.0" - cuda_versions = { - "nightly": [], - "r2.8": [] # Note: PyTorch 2.8 release doesn't have CUDA support - } # Built once a day from master generated_nightly_builds = concat( @@ -22,16 +18,6 @@ locals { cxx11_abi = "1" } ], - # CUDA builds - [ - for pair in setproduct(local.tpu_python_versions, local.cuda_versions["nightly"]) : { - accelerator = "cuda" - cuda_version = pair[1] - python_version = pair[0] - bundle_libtpu = "0" - cxx11_abi = "1" - } - ] ) # Built on push to specific tag. 
@@ -59,19 +45,6 @@ locals { bundle_libtpu = "1" } ], - - # cuda build for latest release - [ - for pair in setproduct(local.tpu_python_versions, local.cuda_versions["r2.8"]) : { - git_tag = local.release_git_tag - package_version = local.release_package_version - pytorch_git_rev = local.release_pytorch_git_rev - accelerator = "cuda" - cuda_version = pair[1] - python_version = pair[0] - bundle_libtpu = "0" - } - ] ) versioned_builds = concat(local.generated_versioned_builds, var.manual_versioned_builds) nightly_builds = concat(local.generated_nightly_builds, var.manual_nightly_builds) diff --git a/infra/tpu-pytorch-releases/dev_images.auto.tfvars b/infra/tpu-pytorch-releases/dev_images.auto.tfvars index e1618f2a80c2..aee461990fd4 100644 --- a/infra/tpu-pytorch-releases/dev_images.auto.tfvars +++ b/infra/tpu-pytorch-releases/dev_images.auto.tfvars @@ -7,17 +7,5 @@ dev_images = [ accelerator = "tpu" extra_tags = ["tpu"] python_version = "3.12" - }, - { - accelerator = "cuda" - cuda_version = "12.1" - extra_tags = ["cuda"] - python_version = "3.10" - }, - { - accelerator = "cuda" - cuda_version = "12.3" - extra_tags = ["cuda"] - python_version = "3.10" } ] diff --git a/infra/tpu-pytorch-releases/dev_images.tf b/infra/tpu-pytorch-releases/dev_images.tf index 54c340809efb..03798c9dbefb 100644 --- a/infra/tpu-pytorch-releases/dev_images.tf +++ b/infra/tpu-pytorch-releases/dev_images.tf @@ -3,10 +3,9 @@ variable "dev_images" { accelerator = string arch = optional(string, "amd64") python_version = optional(string, "3.8") - cuda_version = optional(string, "11.8") # Additional tags on top of uniquely generated tag based on accelerator, - # python and cuda versions. + # python versions. extra_tags = optional(list(string), []) })) } @@ -16,7 +15,7 @@ locals { for di in var.dev_images : format("%s_%s", di.python_version, - di.accelerator == "tpu" ? "tpuvm" : format("cuda_%s", di.cuda_version) + "tpuvm" ) => di } } @@ -55,7 +54,6 @@ module "dev_images" { accelerator = each.value.accelerator arch = each.value.arch python_version = each.value.python_version - cuda_version = each.value.cuda_version } docker_repo_url = module.docker_registry.url