diff --git a/.circleci/common.sh b/.circleci/common.sh
index 3093a8006942..50ec8eae1ade 100755
--- a/.circleci/common.sh
+++ b/.circleci/common.sh
@@ -158,26 +158,12 @@ function run_torch_xla_cpp_tests() {
fi
if [ "$USE_COVERAGE" != "0" ]; then
- if [ -x "$(command -v nvidia-smi)" ]; then
- PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L""
- cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov1.dat
- PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
- cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov2.dat
- lcov --add-tracefile /tmp/cov1.dat -a /tmp/cov2.dat -o /tmp/merged.dat
- else
- PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
- cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat
- fi
+ PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
+ cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat
genhtml /tmp/merged.dat -o ~/htmlcov/cpp/cpp_lcov.info
mv /tmp/merged.dat ~/htmlcov/cpp_lcov.info
else
- # Shard GPU testing
- if [ -x "$(command -v nvidia-smi)" ]; then
- PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L""
- PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
- else
- PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
- fi
+ PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
fi
popd
}
@@ -196,11 +182,6 @@ function run_torch_xla_tests() {
RUN_CPP="${RUN_CPP_TESTS:0}"
RUN_PYTHON="${RUN_PYTHON_TESTS:0}"
- if [ -x "$(command -v nvidia-smi)" ]; then
- num_devices=$(nvidia-smi --list-gpus | wc -l)
- echo "Found $num_devices GPU devices..."
- export GPU_NUM_DEVICES=$num_devices
- fi
export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")
diff --git a/.devcontainer/gpu-internal/devcontainer.json b/.devcontainer/gpu-internal/devcontainer.json
deleted file mode 100644
index ce06bab9e2e7..000000000000
--- a/.devcontainer/gpu-internal/devcontainer.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "name": "gpu-internal",
- "image": "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1",
- "runArgs": [
- "--gpus=all",
- "--net=host",
- "--shm-size=16G"
- ],
- "containerEnv": {
- "BAZEL_REMOTE_CACHE": "1",
- "SILO_NAME": "cache-silo-${localEnv:USER}-gpuvm"
- },
- "initializeCommand": "docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1",
- "customizations": {
- "vscode": {
- "extensions": [
- "llvm-vs-code-extensions.vscode-clangd",
- "ms-vscode.cpptools-themes",
- "BazelBuild.vscode-bazel",
- "DevonDCarew.bazel-code",
- "StackBuild.bazel-stack-vscode",
- "StackBuild.bazel-stack-vscode-cc",
- "xaver.clang-format",
- "ryanluker.vscode-coverage-gutters",
- "ms-azuretools.vscode-docker",
- "ms-python.python"
- ]
- }
- }
-}
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index 6c37920bd137..b44f8dca7ad2 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -13,5 +13,5 @@ Error messages and stack traces are also helpful.
## System Info
-- reproducible on XLA backend [CPU/TPU/CUDA]:
+- reproducible on XLA backend [CPU/TPU]:
- torch_xla version:
diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
index 54f785623a50..3c10b58bfe5a 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -46,7 +46,7 @@ Steps to reproduce the behavior:
## Environment
- - Reproducible on XLA backend [CPU/TPU/CUDA]:
+ - Reproducible on XLA backend [CPU/TPU]:
- torch_xla version:
diff --git a/.github/ci.md b/.github/ci.md
index 2cc72b5abf50..cc3994c884e7 100644
--- a/.github/ci.md
+++ b/.github/ci.md
@@ -44,20 +44,20 @@ fail. Steps for fixing and merging such breaking PyTorch change is as following:
### Running TPU tests on PRs
-The `build_and_test.yml` workflow runs tests on the TPU in addition to CPU and
-GPU. The set of tests run on the TPU is defined in `test/tpu/run_tests.sh`.
+The `build_and_test.yml` workflow runs tests on the TPU in addition to CPU.
+The set of tests run on the TPU is defined in `test/tpu/run_tests.sh`.
## CI Environment
Before the CI in this repository runs, we build a base dev image. These are the
same images we recommend in our VSCode `.devcontainer` setup and nightly build
-to ensure consistency between environments. We produce variants with and without
-CUDA, configured in `infra/ansible` (build config) and
-`infra/tpu-pytorch-releases/dev_images.tf` (build triggers).
+to ensure consistency between environments. We produce variants configured in
+`infra/ansible` (build config) and `infra/tpu-pytorch-releases/dev_images.tf`
+(build triggers).
The CI runs in two environments:
-1. Organization self-hosted runners for CPU and GPU: used for almost every step
+1. Organization self-hosted runners for CPU: used for almost every step
of the CI. These runners are managed by PyTorch and have access to the shared
ECR repository.
1. TPU self-hosted runners: these are managed by us and are only available in
@@ -68,24 +68,18 @@ The CI runs in two environments:
We have two build paths for each CI run:
-- `torch_xla`: we build the main package to support both TPU and GPU[^1], along
+- `torch_xla`: we build the main package to support TPU[^1], along
with a CPU build of `torch` from HEAD. This build step exports the
`torch-xla-wheels` artifact for downstream use in tests.
- Some CI tests also require `torchvision`. To reduce flakiness, we compile
`torchvision` from [`torch`'s CI pin][pytorch-vision-pin].
- C++ tests are piggybacked onto the same build and uploaded in the
`cpp-test-bin` artifact.
-- `torch_xla_cuda_plugin`: the XLA CUDA runtime can be built independently of
- either `torch` or `torch_xla` -- it depends only on our pinned OpenXLA. Thus,
- this build should be almost entirely cached, unless your PR changes the XLA
- pin or adds a patch.
-Both the main package build and plugin build are configured with ansible at
-`infra/ansible`, although they run in separate stages (`stage=build_srcs` vs
-`stage=build_plugin`). This is the same configuration we use for our nightly and
-release builds.
+The main package build is configured with ansible at `infra/ansible`. This is
+the same configuration we use for our nightly and release builds.
-The CPU and GPU test configs are defined in the same file, `_test.yml`. Since
+The CPU test config is defined in the file `_test.yml`. Since
some of the tests come from the upstream PyTorch repository, we check out
PyTorch at the same git rev as the `build` step (taken from
`torch_xla.version.__torch_gitrev__`). The tests are split up into multiple
@@ -93,23 +87,16 @@ groups that run in parallel; the `matrix` section of `_test.yml` corresponds to
in `.github/scripts/run_tests.sh`.
CPU tests run immediately after the `torch_xla` build completes. This will
-likely be the first test feedback on your commit. GPU tests will launch when
-both the `torch_xla` and `torch_xla_cuda_plugin` complete. GPU compilation is
-much slower due to the number of possible optimizations, and the GPU chips
-themselves are quite outdated, so these tests will take longer to run than the
-CPU tests.
+likely be the first test feedback on your commit.

-
-
For the C++ test groups in either case, the test binaries are pre-built during
the build phase and packaged in `cpp-test-bin`. This will only be downloaded if
necessary.
-[^1]: Note: both GPU and TPU support require their respective plugins to be
+[^1]: Note: TPU support requires its respective plugin to be
installed. This package will _not_ work on either out of the box.
### TPU CI
diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh
index 7ae422c47953..65f46f9cf48c 100755
--- a/.github/scripts/run_tests.sh
+++ b/.github/scripts/run_tests.sh
@@ -77,11 +77,6 @@ PYTORCH_DIR=$1
XLA_DIR=$2
USE_COVERAGE="${3:-0}"
-if [ -x "$(command -v nvidia-smi)" ]; then
- num_devices=$(nvidia-smi --list-gpus | wc -l)
- echo "Found $num_devices GPU devices..."
- export GPU_NUM_DEVICES=$num_devices
-fi
export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")
diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml
index 4ef00dcedaed..23ffe34f8a46 100644
--- a/.github/workflows/_test.yml
+++ b/.github/workflows/_test.yml
@@ -23,11 +23,6 @@ on:
description: |
Set the maximum (in minutes) how long the workflow should take to finish
timeout-minutes:
- install-cuda-plugin:
- required: false
- type: boolean
- default: false
- description: Whether to install CUDA plugin package
torch-commit:
required: true
type: string
@@ -46,7 +41,7 @@ jobs:
runs-on: ${{ inputs.runner }}
container:
image: ${{ inputs.dev-image }}
- options: "${{ inputs.install-cuda-plugin == true && '--gpus all' || '' }} --shm-size 16g"
+ options: "--shm-size 16g"
strategy:
fail-fast: false
matrix:
@@ -95,9 +90,7 @@ jobs:
uses: ./.actions/.github/workflows/setup
with:
torch-commit: ${{ inputs.torch-commit }}
- cuda: ${{ inputs.install-cuda-plugin && true || false }}
wheels-artifact: torch-xla-wheels
- cuda-plugin-artifact: ${{ inputs.install-cuda-plugin && 'cuda-plugin' || null }}
- name: Fetch CPP test binaries
if: inputs.has_code_changes == 'true' && matrix.run_cpp_tests
uses: actions/download-artifact@v4
@@ -111,9 +104,6 @@ jobs:
run: |
chmod +x /tmp/test/bin/*
ls -l /tmp/test/bin
- - name: Check GPU
- if: inputs.has_code_changes == 'true' && inputs.install-cuda-plugin
- run: nvidia-smi
- name: Install test deps
if: inputs.has_code_changes == 'true'
shell: bash
@@ -164,35 +154,24 @@ jobs:
exit 0
fi
docker cp "${pid}":/home/jenkins/htmlcov "${GITHUB_WORKSPACE}"
- if [ -n "${GPU_FLAG:-}" ]; then
- if [ -n "${PYTHON_TEST_NAME}" ]; then
- gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out
- gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out
- fi
- if [ -n "${CPP_TEST_NAME}" ]; then
- gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out
- gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out
- fi
- else
- if [ -n "${PYTHON_TEST_NAME}" ]; then
- gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out
- gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out
- fi
+ if [ -n "${PYTHON_TEST_NAME}" ]; then
+ gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out
+ gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out
+ fi
- if [ -n "${CPP_TEST_NAME}" ]; then
- gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out
- gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out
- fi
+ if [ -n "${CPP_TEST_NAME}" ]; then
+ gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out
+ gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out
+ fi
- if [ "${CPP_TEST_NAME}" == "cpp_tests" ]; then
- ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}'
- echo $ABS_METADATA > abs_metadata.json
- gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
+ if [ "${CPP_TEST_NAME}" == "cpp_tests" ]; then
+ ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}'
+ echo $ABS_METADATA > abs_metadata.json
+ gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
- INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}'
- echo $INC_METADATA > inc_metadata.json
- gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
- fi
+ INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}'
+ echo $INC_METADATA > inc_metadata.json
+ gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json
fi
- name: Report no code changes
if: inputs.has_code_changes == 'false'
diff --git a/.github/workflows/setup/action.yml b/.github/workflows/setup/action.yml
index 574b85e5b0d5..e1d6fdb8599d 100644
--- a/.github/workflows/setup/action.yml
+++ b/.github/workflows/setup/action.yml
@@ -3,20 +3,10 @@ inputs:
torch-commit:
type: string
description: PyTorch commit to check out, if provided
- cuda:
- type: boolean
- description: Whether to set up CUDA library paths
- default: false
wheels-artifact:
type: string
description: |
Artifact containing `torch` (cpu) and `torch-xla` wheels to install
- cuda-plugin-artifact:
- type: string
- description: Artifact containing `torch-xla-cuda-plugin` to install
- cuda-torch-artifact:
- type: string
- description: Artifact containing CUDA build of `torch`
runs:
using: "composite"
steps:
@@ -26,12 +16,6 @@ runs:
run: |
ls -la
rm -rvf ${GITHUB_WORKSPACE}/*
- - name: Setup CUDA environment
- shell: bash
- run: |
- echo "PATH=$PATH:/usr/local/cuda-12.3/bin" >> $GITHUB_ENV
- echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.3/lib64" >> $GITHUB_ENV
- if: ${{ inputs.cuda }}
- name: Setup gcloud
shell: bash
run: |
@@ -59,23 +43,6 @@ runs:
name: ${{ inputs.wheels-artifact }}
path: /tmp/wheels/
if: ${{ inputs.wheels-artifact }}
- - name: Fetch CUDA plugin
- uses: actions/download-artifact@v4
- with:
- name: ${{ inputs.cuda-plugin-artifact }}
- path: /tmp/wheels/
- if: ${{ inputs.cuda-plugin-artifact }}
- - name: Remove CPU `torch` build
- shell: bash
- run: |
- rm -rf /tmp/wheels/torch-*
- if: ${{ inputs.cuda-torch-artifact }}
- - name: Fetch CUDA `torch` build
- uses: actions/download-artifact@v4
- with:
- name: ${{ inputs.cuda-torch-artifact }}
- path: /tmp/wheels/
- if: ${{ inputs.cuda-torch-artifact }}
- name: Install wheels
shell: bash
run: |
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6c05fd88f747..b8d233c87002 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -238,10 +238,6 @@ first time, you may need to build everything again, for example, after a
python setup.py develop
```
-### Additional steps for GPU
-
-Please refer to this [guide](https://github.com/pytorch/xla/blob/master/plugins/cuda/README.md).
-
## Before Creating a Pull Request
In `pytorch/xla` repo we enforce coding style for both C++ and Python files.
diff --git a/README.md b/README.md
index 989858ef16fd..d02ae1a0968e 100644
--- a/README.md
+++ b/README.md
@@ -95,24 +95,23 @@ batch size 1024:
Our github contains many useful docs on working with different aspects of PyTorch XLA, here is a list of useful docs spread around our repository:
- [docs/source/learn](https://github.com/pytorch/xla/tree/master/docs/source/learn): docs for learning concepts associated with XLA, troubleshooting, pjrt, eager mode, and dynamic shape.
-- [docs/source/accelerators](https://github.com/pytorch/xla/tree/master/docs/source/accelerators): references to `GPU` and `TPU` accelerator documents.
+- [docs/source/accelerators](https://github.com/pytorch/xla/tree/master/docs/source/accelerators): references to `TPU` accelerator documents.
- [docs/source/perf](https://github.com/pytorch/xla/tree/master/docs/source/perf): documentation about performance specific aspects of PyTorch/XLA such as: `AMP`, `DDP`, `Dynamo`, Fori loop, `FSDP`, quantization, recompilation, and `SPMD`
- [docs/source/features](https://github.com/pytorch/xla/tree/master/docs/source/features): documentation on distributed torch, pallas, scan, and stable hlo.
- [docs/source/contribute](https://github.com/pytorch/xla/tree/master/docs/source/contribute): documents on setting up PyTorch for development, and guides for lowering operations.
- PJRT plugins:
- [CPU](https://github.com/pytorch/xla/blob/master/plugins/cpu/README.md)
- - [CUDA](https://github.com/pytorch/xla/blob/master/plugins/cuda/README.md)
- [torchax/docs](https://github.com/pytorch/xla/tree/master/torchax/docs): torchax documents
- [torchax/examples](https://github.com/pytorch/xla/tree/master/torchax/examples): torchax examples
## Getting Started
Following here are guides for two modes:
-- Single process: one Python interpreter controlling a single GPU/TPU at a time
-- Multi process: N Python interpreters are launched, corresponding to N GPU/TPUs
+- Single process: one Python interpreter controlling a single TPU at a time
+- Multi process: N Python interpreters are launched, corresponding to N TPUs
found on the system
-Another mode is SPMD, where one Python interpreter controls all N GPU/TPUs found on
+Another mode is SPMD, where one Python interpreter controls all N TPUs found on
the system. Multi processing is more complex, and is not compatible with SPMD. This
tutorial does not dive into SPMD. For more on that, check our
[SPMD guide](https://github.com/pytorch/xla/blob/master/docs/source/perf/spmd_basic.md).
@@ -223,7 +222,7 @@ If you're using `DistributedDataParallel`, make the following changes:
Additional information on PyTorch/XLA, including a description of its semantics
and functions, is available at [PyTorch.org](http://pytorch.org/xla/). See the
[API Guide](API_GUIDE.md) for best practices when writing networks that run on
-XLA devices (TPU, CUDA, CPU and...).
+XLA devices (TPU, CPU, and so on).
Our comprehensive user guides are available at:
@@ -234,13 +233,9 @@ Our comprehensive user guides are available at:
## PyTorch/XLA tutorials
-* [Cloud TPU VM
- quickstart](https://cloud.google.com/tpu/docs/run-calculation-pytorch)
-* [Cloud TPU Pod slice
- quickstart](https://cloud.google.com/tpu/docs/pytorch-pods)
-* [Profiling on TPU
- VM](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm)
-* [GPU guide](docs/gpu.md)
+* [Cloud TPU VM quickstart](https://cloud.google.com/tpu/docs/run-calculation-pytorch)
+* [Cloud TPU Pod slice quickstart](https://cloud.google.com/tpu/docs/pytorch-pods)
+* [Profiling on TPU VM](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm)
## Reference implementations
@@ -259,12 +254,10 @@ Cloud TPU plugin corresponding to your installed `torch_xla`, install the option
pip install torch_xla[tpu]
```
-GPU release builds and GPU/TPU nightly builds are available in our public GCS bucket.
+TPU nightly builds are available in our public GCS bucket.
-| Version | Cloud GPU VM Wheels |
+| Version | Cloud TPU Nightly Wheels |
| --- | ----------- |
-| 2.7 (CUDA 12.6 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.6/torch_xla-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl` |
-| 2.7 (CUDA 12.6 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.6/torch_xla-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl` |
| nightly (Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev-cp311-cp311-linux_x86_64.whl` |
| nightly (Python 3.12) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev-cp312-cp312-linux_x86_64.whl` |
| nightly (Python 3.13) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev-cp312-cp312-linux_x86_64.whl` |
@@ -296,27 +289,6 @@ The torch wheel version `2.9.0.dev20250423+cpu` can be found at https://download
| 2.1 (XRT + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/xrt/tpuvm/torch_xla-2.1.0%2Bxrt-cp310-cp310-manylinux_2_28_x86_64.whl` |
| 2.1 (Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.1.0-cp38-cp38-linux_x86_64.whl` |
-
-
-| Version | GPU Wheel |
-| --- | ----------- |
-| 2.5 (CUDA 12.1 + Python 3.9) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.5.0-cp39-cp39-manylinux_2_28_x86_64.whl` |
-| 2.5 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.5.0-cp310-cp310-manylinux_2_28_x86_64.whl` |
-| 2.5 (CUDA 12.1 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.5.0-cp311-cp311-manylinux_2_28_x86_64.whl` |
-| 2.5 (CUDA 12.4 + Python 3.9) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.4/torch_xla-2.5.0-cp39-cp39-manylinux_2_28_x86_64.whl` |
-| 2.5 (CUDA 12.4 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.4/torch_xla-2.5.0-cp310-cp310-manylinux_2_28_x86_64.whl` |
-| 2.5 (CUDA 12.4 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.4/torch_xla-2.5.0-cp311-cp311-manylinux_2_28_x86_64.whl` |
-| 2.4 (CUDA 12.1 + Python 3.9) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.4.0-cp39-cp39-manylinux_2_28_x86_64.whl` |
-| 2.4 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.4.0-cp310-cp310-manylinux_2_28_x86_64.whl` |
-| 2.4 (CUDA 12.1 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.4.0-cp311-cp311-manylinux_2_28_x86_64.whl` |
-| 2.3 (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
-| 2.3 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp310-cp310-manylinux_2_28_x86_64.whl` |
-| 2.3 (CUDA 12.1 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp311-cp311-manylinux_2_28_x86_64.whl` |
-| 2.2 (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
-| 2.2 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp310-cp310-manylinux_2_28_x86_64.whl` |
-| 2.1 + CUDA 11.8 | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/11.8/torch_xla-2.1.0-cp38-cp38-manylinux_2_28_x86_64.whl` |
-| nightly + CUDA 12.0 >= 2023/06/27| `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.0/torch_xla-nightly-cp38-cp38-linux_x86_64.whl` |
-
### Docker
@@ -337,46 +309,6 @@ To use the above dockers, please pass `--privileged --net host --shm-size=16G` a
```bash
docker run --privileged --net host --shm-size=16G -it us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm /bin/bash
```
-
-
-| Version | GPU CUDA 12.6 Docker |
-| --- | ----------- |
-| 2.7 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.7.0_3.10_cuda_12.6` |
-
-
-
-
-
-| Version | GPU CUDA 12.4 Docker |
-| --- | ----------- |
-| 2.5 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.5.0_3.10_cuda_12.4` |
-| 2.4 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_cuda_12.4` |
-
-
-
-
-| Version | GPU CUDA 12.1 Docker |
-| --- | ----------- |
-| 2.5 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.5.0_3.10_cuda_12.1` |
-| 2.4 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_cuda_12.1` |
-| 2.3 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.3.0_3.10_cuda_12.1` |
-| 2.2 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.2.0_3.10_cuda_12.1` |
-| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_12.1` |
-| nightly | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1` |
-| nightly at date | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1_YYYYMMDD` |
-
-
-
-| Version | GPU CUDA 11.8 + Docker |
-| --- | ----------- |
-| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_11.8` |
-| 2.0 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.0_3.8_cuda_11.8` |
-
-
-
-
-To run on [compute instances with
-GPUs](https://cloud.google.com/compute/docs/gpus/create-vm-with-gpus).
## Troubleshooting
diff --git a/contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb b/contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb
index 8d4fbd95bff7..f06e4a9b9f03 100644
--- a/contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb
+++ b/contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb
@@ -461,7 +461,7 @@
" torch.manual_seed(42)\n",
" model = nn.Linear(128, 10).to(device)\n",
"\n",
- " # Optional for TPU v4 and GPU\n",
+ " # Optional for TPU v4\n",
" xm.broadcast_master_param(model)\n",
"\n",
" model = DDP(model, gradient_as_bucket_view=True)\n",
diff --git a/docs/source/accelerators/gpu.md b/docs/source/accelerators/gpu.md
deleted file mode 100644
index 56abb192a704..000000000000
--- a/docs/source/accelerators/gpu.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Learn about GPUs
-
-For information on GPUs on Google Cloud, see:
-
-- [About GPUs on Google Cloud](https://cloud.google.com/compute/docs/gpus/overview)
-- [GPU machine types](https://cloud.google.com/compute/docs/gpus)
diff --git a/docs/source/contribute/bazel.md b/docs/source/contribute/bazel.md
index 0e41ec837057..69e1d5954c82 100644
--- a/docs/source/contribute/bazel.md
+++ b/docs/source/contribute/bazel.md
@@ -22,9 +22,7 @@ http_archive(
],
patch_tool = "patch",
patches = [
- "//openxla_patches:gpu_nvml.diff",
- "//openxla_patches:gpu_race_condition.diff",
- "//openxla_patches:count_down.diff",
+ "//openxla_patches:no_fortify.diff",
],
strip_prefix = "xla-" + xla_hash,
urls = [
@@ -223,7 +221,7 @@ The `xla_client` tests are pure hermetic tests that can be easily
executed. The `torch_xla` plugin tests are more complex: they require
`torch` and `torch_xla` to be installed, and they cannot run in
parallel, since they are using either XRT server/client on the same
-port, or because they use a GPU or TPU device and there's only one
+port, or because they use a TPU device and there's only one
available at the time. For that reason, all tests under
`torch_xla/csrc/` are bundled into a single target `:main` that runs
them all sequentially.
diff --git a/docs/source/contribute/plugins.md b/docs/source/contribute/plugins.md
index 40ae841e8d7b..84ca6fe1c9ea 100644
--- a/docs/source/contribute/plugins.md
+++ b/docs/source/contribute/plugins.md
@@ -1,8 +1,7 @@
# Custom Hardware Plugins
PyTorch/XLA supports custom hardware through OpenXLA's PJRT C API. The
-PyTorch/XLA team directly supports plugins for Cloud TPU (`libtpu`) and
-GPU ([OpenXLA](https://github.com/openxla/xla/tree/main/xla/pjrt/gpu)).
+PyTorch/XLA team directly supports plugins for Cloud TPU (`libtpu`).
The same plugins may also be used by JAX and TF.
## Implementing a PJRT Plugin
diff --git a/docs/source/learn/_pjrt.md b/docs/source/learn/_pjrt.md
index 16300239353a..2f4f446991de 100644
--- a/docs/source/learn/_pjrt.md
+++ b/docs/source/learn/_pjrt.md
@@ -38,7 +38,7 @@ the `runtime` tag.
## TL;DR
- To use the PJRT preview runtime, set the `PJRT_DEVICE` environment
- variable to `CPU`, `TPU`, or `CUDA`
+ variable to `CPU` or `TPU`
- In XRT, all distributed workloads are multiprocess, with one process
per device. On TPU v2 and v3 in PJRT, workloads are multiprocess and
multithreaded (4 processes with 2 threads each), so your workload
@@ -57,7 +57,7 @@ the `runtime` tag.
- To use `torch.distributed`, import
`torch_xla.experimental.pjrt_backend` and use the `xla://`
`init_method`.
- - These steps are optional for GPU and TPU v4.
+ - These steps are optional for TPU v4.
Sample diff from XRT to PJRT:
@@ -84,7 +84,7 @@ def _mp_fn(index):
torch.manual_seed(42)
model = nn.Linear(128, 10).to(device)
-+ # Optional for TPU v4 and GPU
++ # Optional for TPU v4
+ xm.broadcast_master_param(model)
model = DDP(model, gradient_as_bucket_view=True)
@@ -119,7 +119,7 @@ if __name__ == '__main__':
## Benefits
- Simple runtime configuration: just set `PJRT_DEVICE` to `TPU`,
- `CPU`, or `CUDA` and start using XLA! Or, let PJRT select a device
+ or `CPU` and start using XLA! Or, let PJRT select a device
automatically based on your environment.
- Improved performance: reduced overhead from gRPC means faster
end-to-end execution. On TorchBench 2.0, we observed a \>35%
diff --git a/docs/source/perf/amp.md b/docs/source/perf/amp.md
index 36d777fd865f..223e338f2135 100644
--- a/docs/source/perf/amp.md
+++ b/docs/source/perf/amp.md
@@ -2,7 +2,7 @@
Pytorch/XLA's AMP extends [Pytorch's AMP
package](https://pytorch.org/docs/stable/amp.html) with support for
-automatic mixed precision on `XLA:GPU` and `XLA:TPU` devices. AMP is
+automatic mixed precision on `XLA:TPU` devices. AMP is
used to accelerate training and inference by executing certain
operations in `float32` and other operations in a lower precision
datatype (`float16` or `bfloat16` depending on hardware support). This
@@ -99,4 +99,4 @@ unlisted ops run if they're downstream from autocasted ops.
Our [mnist training script](https://github.com/pytorch/xla/blob/master/test/test_train_mp_mnist_amp.py)
and [imagenet training script](https://github.com/pytorch/xla/blob/master/test/test_train_mp_imagenet_amp.py)
-demonstrate how AMP is used on both TPUs and GPUs.
+demonstrate how AMP is used on TPUs.
diff --git a/docs/source/perf/spmd_advanced.md b/docs/source/perf/spmd_advanced.md
index 7005ee5dd4c0..2a056dc3d693 100644
--- a/docs/source/perf/spmd_advanced.md
+++ b/docs/source/perf/spmd_advanced.md
@@ -110,7 +110,7 @@ torch.ops.xla.dynamo_mark_sharding(output, device_ids, mesh_shape, axis_names, p
### SPMD Debugging Tool
-We provide a `shard placement visualization debug tool` for PyTorch/XLA SPMD user on TPU/GPU/CPU with single-host/multi-host: you could use `visualize_tensor_sharding` to visualize sharded tensor, or you could use `visualize_sharding` to visualize sharing string. Here are two code examples on TPU single-host(v4-8) with `visualize_tensor_sharding` or `visualize_sharding`:
+We provide a `shard placement visualization debug tool` for PyTorch/XLA SPMD users on TPU/CPU with single-host/multi-host: you can use `visualize_tensor_sharding` to visualize a sharded tensor, or `visualize_sharding` to visualize a sharding string. Here are two code examples on a TPU single host (v4-8) with `visualize_tensor_sharding` or `visualize_sharding`:
- Code snippet used `visualize_tensor_sharding` and visualization result:
```python
@@ -141,7 +141,7 @@ generated_table = visualize_sharding(sharding, use_color=False)
-You could use these examples on TPU/GPU/CPU single-host and modify it to run on multi-host. And you could modify it to sharding-style `tiled`, `partial_replication` and `replicated`.
+You can use these examples on a TPU/CPU single host and modify them to run on multiple hosts. You can also modify them to use the sharding styles `tiled`, `partial_replication`, and `replicated`.
### Auto-Sharding
We are introducing a new PyTorch/XLA SPMD feature, called ``auto-sharding``, [RFC](https://github.com/pytorch/xla/issues/6322). This is an experimental feature in `r2.3` and `nightly`, that supports `XLA:TPU` and a single TPUVM host.
diff --git a/docs/source/perf/spmd_gpu.md b/docs/source/perf/spmd_gpu.md
deleted file mode 100644
index cda25723aaad..000000000000
--- a/docs/source/perf/spmd_gpu.md
+++ /dev/null
@@ -1,48 +0,0 @@
-# Running SPMD on GPU
-
-PyTorch/XLA supports SPMD on NVIDIA GPU (single-node or multi-nodes).
-The training/inference script remains the same as the one used for TPU,
-such as this [ResNet
-script](https://github.com/pytorch/xla/blob/1dc78948c0c9d018d8d0d2b4cce912552ab27083/test/spmd/test_train_spmd_imagenet.py).
-To execute the script using SPMD, we leverage `torchrun`:
-
- PJRT_DEVICE=CUDA \
- torchrun \
- --nnodes=${NUM_GPU_MACHINES} \
- --node_rank=${RANK_OF_CURRENT_MACHINE} \
- --nproc_per_node=1 \
- --rdzv_endpoint=":" \
- training_or_inference_script_using_spmd.py
-
-- `--nnodes`: how many GPU machines to be used.
-- `--node_rank`: the index of the current GPU machines. The value can
- be 0, 1, ..., \${NUMBER_GPU_VM}-1.
-- `--nproc_per_node`: the value must be 1 due to the SPMD requirement.
-- `--rdzv_endpoint`: the endpoint of the GPU machine with
- node_rank==0, in the form `host:port`. The host will be the internal
- IP address. The `port` can be any available port on the machine. For
- single-node training/inference, this parameter can be omitted.
-
-For example, if you want to train a ResNet model on 2 GPU machines using
-SPMD, you can run the script below on the first machine:
-
- XLA_USE_SPMD=1 PJRT_DEVICE=CUDA \
- torchrun \
- --nnodes=2 \
- --node_rank=0 \
- --nproc_per_node=1 \
- --rdzv_endpoint=":12355" \
- pytorch/xla/test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 128
-
-and run the following on the second machine:
-
- XLA_USE_SPMD=1 PJRT_DEVICE=CUDA \
- torchrun \
- --nnodes=2 \
- --node_rank=1 \
- --nproc_per_node=1 \
- --rdzv_endpoint=":12355" \
- pytorch/xla/test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 128
-
-For more information, please refer to the [SPMD support on GPU
-RFC](https://github.com/pytorch/xla/issues/6256).
diff --git a/examples/train_resnet_amp.py b/examples/train_resnet_amp.py
index f5ca308bed75..f63c3cad8544 100644
--- a/examples/train_resnet_amp.py
+++ b/examples/train_resnet_amp.py
@@ -22,8 +22,7 @@ def train_loop_fn(self, loader, epoch):
with autocast(torch_xla.device()):
output = self.model(data)
loss = self.loss_fn(output, target)
- # TPU amp uses bf16 hence gradient scaling is not necessary. If runnign with XLA:GPU
- # check https://github.com/pytorch/xla/blob/master/docs/amp.md#amp-for-xlagpu.
+ # TPU amp uses bf16, hence gradient scaling is not necessary.
loss.backward()
self.run_optimizer()
tracker.add(self.batch_size)
diff --git a/infra/ansible/README.md b/infra/ansible/README.md
index 9094f645de30..9ce34d962cff 100644
--- a/infra/ansible/README.md
+++ b/infra/ansible/README.md
@@ -23,11 +23,11 @@ behavior (installed pip/apt packages and set environment variables):
* `stage`: build or release. Different packages are installed depending on
the chosen stage.
* `arch`: aarch64 or amd64. Architecture of the built image and wheels.
-* `accelerator`: tpu or cuda. Available accelerator.
+* `accelerator`: tpu. The only available accelerator.
The variables can be passed through `-e` flag: `-e "="`.
-Example: `ansible-playbook playbook.yaml -e "stage=build arch=amd64 accelerator=tpu"`
+Example: `ansible-playbook playbook.yaml -e "stage=build arch=amd64"`
## Config structure
diff --git a/infra/ansible/config/apt.yaml b/infra/ansible/config/apt.yaml
index d026fea3e037..ae3d95468344 100644
--- a/infra/ansible/config/apt.yaml
+++ b/infra/ansible/config/apt.yaml
@@ -20,13 +20,6 @@ apt:
- lcov
- less
- build_cuda:
- - "cuda-libraries-{{ cuda_version | replace('.', '-') }}"
- - "cuda-toolkit-{{ cuda_version | replace('.', '-') }}"
- - "cuda-minimal-build-{{ cuda_version | replace('.', '-') }}"
- - "{{ cuda_deps['libcudnn'][cuda_version] }}"
- - "{{ cuda_deps['libcudnn-dev'][cuda_version] }}"
-
build_aarch64:
- scons
@@ -39,23 +32,13 @@ apt:
- patch
- vim
- release_cuda:
- - "cuda-libraries-{{ cuda_version | replace('.', '-') }}"
- - "cuda-minimal-build-{{ cuda_version | replace('.', '-') }}"
- - "{{ cuda_deps['libcudnn'][cuda_version] }}"
-
# Specify objects with string fields `url` and `keyring`.
# The keyring path should start with /usr/share/keyrings/ for debian and ubuntu.
signing_keys:
- url: https://apt.llvm.org/llvm-snapshot.gpg.key
keyring: /usr/share/keyrings/llvm.pgp
- # Get the recent key version from
- # https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-debian.
- - url: "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub"
- keyring: /usr/share/keyrings/cuda.pgp
repos:
# signed-by path should match the corresponding keyring path above.
- "deb [signed-by=/usr/share/keyrings/llvm.pgp] http://apt.llvm.org/{{ llvm_debian_repo }}/ llvm-toolchain-{{ llvm_debian_repo }}-{{ clang_version }} main"
- "deb-src [signed-by=/usr/share/keyrings/llvm.pgp] http://apt.llvm.org/{{ llvm_debian_repo }}/ llvm-toolchain-{{ llvm_debian_repo }}-{{ clang_version }} main"
- - "deb [signed-by=/usr/share/keyrings/cuda.pgp] https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_repo }}/x86_64/ /"
diff --git a/infra/ansible/config/cuda_deps.yaml b/infra/ansible/config/cuda_deps.yaml
deleted file mode 100644
index 3732bb0f93ec..000000000000
--- a/infra/ansible/config/cuda_deps.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-# Versions of cuda dependencies for given cuda versions.
-# Note: wrap version in quotes to ensure they're treated as strings.
-cuda_deps:
- # List all libcudnn8 versions with `apt list -a libcudnn8`
- libcudnn:
- "12.8": libcudnn9-cuda-12=9.1.1.17-1
- "12.6": libcudnn9-cuda-12=9.1.1.17-1
- "12.4": libcudnn9-cuda-12=9.1.1.17-1
- "12.3": libcudnn9-cuda-12=9.1.1.17-1
- "12.1": libcudnn8=8.9.2.26-1+cuda12.1
- "12.0": libcudnn8=8.8.0.121-1+cuda12.0
- "11.8": libcudnn8=8.7.0.84-1+cuda11.8
- "11.7": libcudnn8=8.5.0.96-1+cuda11.7
- "11.2": libcudnn8=8.1.1.33-1+cuda11.2
- libcudnn-dev:
- "12.8": libcudnn9-dev-cuda-12=9.1.1.17-1
- "12.6": libcudnn9-dev-cuda-12=9.1.1.17-1
- "12.4": libcudnn9-dev-cuda-12=9.1.1.17-1
- "12.3": libcudnn9-dev-cuda-12=9.1.1.17-1
- "12.1": libcudnn8-dev=8.9.2.26-1+cuda12.1
- "12.0": libcudnn8-dev=8.8.0.121-1+cuda12.0
- "11.8": libcudnn8-dev=8.7.0.84-1+cuda11.8
- "11.7": libcudnn8-dev=8.5.0.96-1+cuda11.7
- "11.2": libcudnn8-dev=8.1.1.33-1+cuda11.2
diff --git a/infra/ansible/config/vars.yaml b/infra/ansible/config/vars.yaml
index f34e2c3cb632..c336e7754f46 100644
--- a/infra/ansible/config/vars.yaml
+++ b/infra/ansible/config/vars.yaml
@@ -1,8 +1,3 @@
-# Used for fetching cuda from the right repo, see apt.yaml.
-cuda_repo: debian11
-cuda_version: "11.8"
-# Determines supported GPUs. See https://developer.nvidia.com/cuda-gpus
-cuda_compute_capabilities: 5.2,7.0,7.5,8.0,9.0
# Used for fetching clang from the right repo, see apt.yaml.
llvm_debian_repo: bullseye
clang_version: 17
@@ -10,7 +5,7 @@ clang_version: 17
package_version: 2.9.0
# If set to true, wheels will be renamed to $WHEEL_NAME-nightly-cp38-cp38-linux_x86_64.whl.
nightly_release: false
-# Whether to preinstall libtpu in the PyTorch/XLA wheel. Ignored for GPU build.
+# Whether to preinstall libtpu in the PyTorch/XLA wheel.
bundle_libtpu: 1
# Suffix for bazel remote cache key
cache_suffix: ""
diff --git a/infra/ansible/playbook.yaml b/infra/ansible/playbook.yaml
index 7626714e8d18..85153a43d3a2 100644
--- a/infra/ansible/playbook.yaml
+++ b/infra/ansible/playbook.yaml
@@ -6,7 +6,7 @@
# - stage: build or release. Different packages are installed depending on
# the chosen stage.
# - arch: aarch64 or amd64. Architecture of the built image and wheels.
- # - accelerator: tpu or cuda. Available accelerator.
+ # - accelerator: tpu.
pre_tasks:
- name: "Validate required variables"
ansible.builtin.assert:
@@ -20,7 +20,7 @@
- name: arch
pattern: ^(aarch64|amd64)$
- name: accelerator
- pattern: ^(tpu|cuda)$
+ pattern: ^tpu$
- name: "Include vars from config files"
ansible.builtin.include_vars:
@@ -28,8 +28,6 @@
loop:
# vars.yaml should be the first as other config files depend on it.
- vars.yaml
- # cuda_deps should be loaded before apt, since apt depends on it.
- - cuda_deps.yaml
- apt.yaml
- pip.yaml
- env.yaml
diff --git a/infra/tpu-pytorch-releases/README.md b/infra/tpu-pytorch-releases/README.md
index f173b3ee8575..a70e0b064a6e 100644
--- a/infra/tpu-pytorch-releases/README.md
+++ b/infra/tpu-pytorch-releases/README.md
@@ -39,13 +39,11 @@ consists of the following fields.
sources when building image and wheels.
* `package_version` (string) - Version of the built wheels. Passed to the
build steps.
-* `accelerator` ("tpu"|"cuda") - Supported accelerator. Affects build
+* `accelerator` ("tpu") - Supported accelerator. Affects build
process and installed dependencies, see [apt.yaml](../ansible/config/apt.yaml) and
[pip.yaml](../ansible/config/pip.yaml).
* `python_version` (optional, string, default = "3.8") - Python version used for
the docker image base and build process.
-* `cuda_version` (optional, string, default = "11.8") - CUDA version to install.
- Used only if `accelerator` is set to "cuda"
* `arch` (optional, "amd64"|"aarch64", default = "amd64") - Architecture
affects installed dependencies and build process, see [apt.yaml](../ansible/config/apt.yaml) and
[pip.yaml](../ansible/config/pip.yaml).
@@ -71,7 +69,6 @@ unset properties of existing triggers.
git_tag = "v3.0.0"
package_version = "3.0"
accelerator = "tpu"
- cuda_version = "11.8" # optional
python_version = "3.8" # optional
arch = "amd64" # optional
},
@@ -95,12 +92,10 @@ at midnight (`America/Los_Angeles` time zone).
Nightly builds in the `nightly_builds` variable in
[artifacts.auto.tfvars](./artifacts.auto.tfvars)
consists of the following fields.
-* `accelerator` ("tpu"|"cuda") - Supported accelerator. Impacts build
+* `accelerator` ("tpu") - Supported accelerator. Impacts build
process and installed dependencies.
* `python_version` (optional, string, default = "3.8") - Python version used for
the docker images base and build process.
-* `cuda_version` (optional, string, default = "11.8") - CUDA version to install.
- Used only if `accelerator` is set to "cuda"
* `arch` (optional, "amd64"|"aarch64", default = "amd64") - Architecture
influences installed dependencies and build process.
* `cxx11_abi` (optional, "0"|"1", default = "0") - Whether to use C++11 ABI or
@@ -115,9 +110,8 @@ unset properties of existing triggers.
#### Modify or add a new nightly release
-1. Modify or add an entry with specific `accelerator`, `python_version` and (optionally)
- `cuda_version` to the `nightly_builds` variable in the
- [artifacts.auto.tfvars](./artifacts.auto.tfvars) file.
+1. Modify or add an entry with specific `accelerator` and `python_version`
+ to the `nightly_builds` variable in the [artifacts.auto.tfvars](./artifacts.auto.tfvars) file.
See all variables in the section above.
**Example**
@@ -125,10 +119,13 @@ unset properties of existing triggers.
```hcl
nightly_builds = [
{
- accelerator = "cuda"
- cuda_version = "11.8" # optional
- python_version = "3.8" # optional
- arch = "amd64" # optional
+ git_tag = "v2.8.0"
+ package_version = "2.8.0"
+ pytorch_git_rev = "v2.8.0"
+ accelerator = "tpu"
+ python_version = "3.10"
+ bundle_libtpu = "0"
+ cxx11_abi = "1"
},
# ...
]
diff --git a/infra/tpu-pytorch-releases/artifacts_builds.tf b/infra/tpu-pytorch-releases/artifacts_builds.tf
index 099a2402afe9..b4e469b617be 100644
--- a/infra/tpu-pytorch-releases/artifacts_builds.tf
+++ b/infra/tpu-pytorch-releases/artifacts_builds.tf
@@ -6,10 +6,6 @@ locals {
release_package_version = "2.8.0-rc5"
release_pytorch_git_rev = "v2.8.0-rc8"
nightly_package_version = "2.9.0"
- cuda_versions = {
- "nightly": [],
- "r2.8": [] # Note: PyTorch 2.8 release doesn't have CUDA support
- }
# Built once a day from master
generated_nightly_builds = concat(
@@ -22,16 +18,6 @@ locals {
cxx11_abi = "1"
}
],
- # CUDA builds
- [
- for pair in setproduct(local.tpu_python_versions, local.cuda_versions["nightly"]) : {
- accelerator = "cuda"
- cuda_version = pair[1]
- python_version = pair[0]
- bundle_libtpu = "0"
- cxx11_abi = "1"
- }
- ]
)
# Built on push to specific tag.
@@ -59,19 +45,6 @@ locals {
bundle_libtpu = "1"
}
],
-
- # cuda build for latest release
- [
- for pair in setproduct(local.tpu_python_versions, local.cuda_versions["r2.8"]) : {
- git_tag = local.release_git_tag
- package_version = local.release_package_version
- pytorch_git_rev = local.release_pytorch_git_rev
- accelerator = "cuda"
- cuda_version = pair[1]
- python_version = pair[0]
- bundle_libtpu = "0"
- }
- ]
)
versioned_builds = concat(local.generated_versioned_builds, var.manual_versioned_builds)
nightly_builds = concat(local.generated_nightly_builds, var.manual_nightly_builds)
diff --git a/infra/tpu-pytorch-releases/dev_images.auto.tfvars b/infra/tpu-pytorch-releases/dev_images.auto.tfvars
index e1618f2a80c2..aee461990fd4 100644
--- a/infra/tpu-pytorch-releases/dev_images.auto.tfvars
+++ b/infra/tpu-pytorch-releases/dev_images.auto.tfvars
@@ -7,17 +7,5 @@ dev_images = [
accelerator = "tpu"
extra_tags = ["tpu"]
python_version = "3.12"
- },
- {
- accelerator = "cuda"
- cuda_version = "12.1"
- extra_tags = ["cuda"]
- python_version = "3.10"
- },
- {
- accelerator = "cuda"
- cuda_version = "12.3"
- extra_tags = ["cuda"]
- python_version = "3.10"
}
]
diff --git a/infra/tpu-pytorch-releases/dev_images.tf b/infra/tpu-pytorch-releases/dev_images.tf
index 54c340809efb..03798c9dbefb 100644
--- a/infra/tpu-pytorch-releases/dev_images.tf
+++ b/infra/tpu-pytorch-releases/dev_images.tf
@@ -3,10 +3,9 @@ variable "dev_images" {
accelerator = string
arch = optional(string, "amd64")
python_version = optional(string, "3.8")
- cuda_version = optional(string, "11.8")
# Additional tags on top of uniquely generated tag based on accelerator,
- # python and cuda versions.
+ # and python versions.
extra_tags = optional(list(string), [])
}))
}
@@ -16,7 +15,7 @@ locals {
for di in var.dev_images :
format("%s_%s",
di.python_version,
- di.accelerator == "tpu" ? "tpuvm" : format("cuda_%s", di.cuda_version)
+ "tpuvm"
) => di
}
}
@@ -55,7 +54,6 @@ module "dev_images" {
accelerator = each.value.accelerator
arch = each.value.arch
python_version = each.value.python_version
- cuda_version = each.value.cuda_version
}
docker_repo_url = module.docker_registry.url