From 1ccce715dc5340ae39e1bfc0a24f4d5106f6a19e Mon Sep 17 00:00:00 2001 From: Yukio Siraichi Date: Wed, 3 Sep 2025 16:09:22 -0300 Subject: [PATCH] Remove other CUDA usage in the repository. --- .circleci/common.sh | 25 +----- .devcontainer/gpu-internal/devcontainer.json | 30 ------- .github/ISSUE_TEMPLATE.md | 2 +- .github/ISSUE_TEMPLATE/bug-report.md | 2 +- .github/ci.md | 37 +++----- .github/scripts/run_tests.sh | 5 -- .github/workflows/_test.yml | 53 ++++------- .github/workflows/setup/action.yml | 33 ------- CONTRIBUTING.md | 4 - README.md | 88 +++---------------- ...ributed-pytorch-xla-basics-with-pjrt.ipynb | 2 +- docs/source/accelerators/gpu.md | 6 -- docs/source/contribute/bazel.md | 6 +- docs/source/contribute/plugins.md | 3 +- docs/source/learn/_pjrt.md | 8 +- docs/source/perf/amp.md | 4 +- docs/source/perf/spmd_advanced.md | 4 +- docs/source/perf/spmd_gpu.md | 48 ---------- examples/train_resnet_amp.py | 3 +- infra/ansible/README.md | 4 +- infra/ansible/config/apt.yaml | 17 ---- infra/ansible/config/cuda_deps.yaml | 24 ----- infra/ansible/config/vars.yaml | 7 +- infra/ansible/playbook.yaml | 6 +- infra/tpu-pytorch-releases/README.md | 25 +++--- .../tpu-pytorch-releases/artifacts_builds.tf | 27 ------ .../dev_images.auto.tfvars | 12 --- infra/tpu-pytorch-releases/dev_images.tf | 6 +- 28 files changed, 74 insertions(+), 417 deletions(-) delete mode 100644 .devcontainer/gpu-internal/devcontainer.json delete mode 100644 docs/source/accelerators/gpu.md delete mode 100644 docs/source/perf/spmd_gpu.md delete mode 100644 infra/ansible/config/cuda_deps.yaml diff --git a/.circleci/common.sh b/.circleci/common.sh index 3093a8006942..50ec8eae1ade 100755 --- a/.circleci/common.sh +++ b/.circleci/common.sh @@ -158,26 +158,12 @@ function run_torch_xla_cpp_tests() { fi if [ "$USE_COVERAGE" != "0" ]; then - if [ -x "$(command -v nvidia-smi)" ]; then - PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L"" - cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov1.dat - PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS - cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov2.dat - lcov --add-tracefile /tmp/cov1.dat -a /tmp/cov2.dat -o /tmp/merged.dat - else - PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L"" - cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat - fi + PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L"" + cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat genhtml /tmp/merged.dat -o ~/htmlcov/cpp/cpp_lcov.info mv /tmp/merged.dat ~/htmlcov/cpp_lcov.info else - # Shard GPU testing - if [ -x "$(command -v nvidia-smi)" ]; then - PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L"" - PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS - else - PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L"" - fi + PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L"" fi popd } @@ -196,11 +182,6 @@ function run_torch_xla_tests() { RUN_CPP="${RUN_CPP_TESTS:0}" RUN_PYTHON="${RUN_PYTHON_TESTS:0}" - if [ -x "$(command -v nvidia-smi)" ]; then - num_devices=$(nvidia-smi --list-gpus | wc -l) - echo "Found $num_devices GPU devices..." 
- export GPU_NUM_DEVICES=$num_devices - fi export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla" export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))") diff --git a/.devcontainer/gpu-internal/devcontainer.json b/.devcontainer/gpu-internal/devcontainer.json deleted file mode 100644 index ce06bab9e2e7..000000000000 --- a/.devcontainer/gpu-internal/devcontainer.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "name": "gpu-internal", - "image": "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1", - "runArgs": [ - "--gpus=all", - "--net=host", - "--shm-size=16G" - ], - "containerEnv": { - "BAZEL_REMOTE_CACHE": "1", - "SILO_NAME": "cache-silo-${localEnv:USER}-gpuvm" - }, - "initializeCommand": "docker pull us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.8_cuda_12.1", - "customizations": { - "vscode": { - "extensions": [ - "llvm-vs-code-extensions.vscode-clangd", - "ms-vscode.cpptools-themes", - "BazelBuild.vscode-bazel", - "DevonDCarew.bazel-code", - "StackBuild.bazel-stack-vscode", - "StackBuild.bazel-stack-vscode-cc", - "xaver.clang-format", - "ryanluker.vscode-coverage-gutters", - "ms-azuretools.vscode-docker", - "ms-python.python" - ] - } - } -} \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 6c37920bd137..b44f8dca7ad2 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -13,5 +13,5 @@ Error messages and stack traces are also helpful. ## System Info -- reproducible on XLA backend [CPU/TPU/CUDA]: +- reproducible on XLA backend [CPU/TPU]: - torch_xla version: diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md index 54f785623a50..3c10b58bfe5a 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.md +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -46,7 +46,7 @@ Steps to reproduce the behavior: ## Environment - - Reproducible on XLA backend [CPU/TPU/CUDA]: + - Reproducible on XLA backend [CPU/TPU]: - torch_xla version: diff --git a/.github/ci.md b/.github/ci.md index 2cc72b5abf50..cc3994c884e7 100644 --- a/.github/ci.md +++ b/.github/ci.md @@ -44,20 +44,20 @@ fail. Steps for fixing and merging such breaking PyTorch change is as following: ### Running TPU tests on PRs -The `build_and_test.yml` workflow runs tests on the TPU in addition to CPU and -GPU. The set of tests run on the TPU is defined in `test/tpu/run_tests.sh`. +The `build_and_test.yml` workflow runs tests on the TPU in addition to CPU. +The set of tests run on the TPU is defined in `test/tpu/run_tests.sh`. ## CI Environment Before the CI in this repository runs, we build a base dev image. These are the same images we recommend in our VSCode `.devcontainer` setup and nightly build -to ensure consistency between environments. We produce variants with and without -CUDA, configured in `infra/ansible` (build config) and -`infra/tpu-pytorch-releases/dev_images.tf` (build triggers). +to ensure consistency between environments. We produce variants configured in +`infra/ansible` (build config) and `infra/tpu-pytorch-releases/dev_images.tf` +(build triggers). The CI runs in two environments: -1. Organization self-hosted runners for CPU and GPU: used for almost every step +1. Organization self-hosted runners for CPU: used for almost every step of the CI. These runners are managed by PyTorch and have access to the shared ECR repository. 1. 
TPU self-hosted runners: these are managed by us and are only available in
@@ -68,24 +68,18 @@ The CI runs in two environments:
 
 We have two build paths for each CI run:
 
-- `torch_xla`: we build the main package to support both TPU and GPU[^1], along
+- `torch_xla`: we build the main package to support TPU, along
   with a CPU build of `torch` from HEAD. This build step exports the
   `torch-xla-wheels` artifact for downstream use in tests.
   - Some CI tests also require `torchvision`. To reduce flakiness, we compile
    `torchvision` from [`torch`'s CI pin][pytorch-vision-pin].
  - C++ tests are piggybacked onto the same build and uploaded in the
    `cpp-test-bin` artifact.
-- `torch_xla_cuda_plugin`: the XLA CUDA runtime can be built independently of
-  either `torch` or `torch_xla` -- it depends only on our pinned OpenXLA. Thus,
-  this build should be almost entirely cached, unless your PR changes the XLA
-  pin or adds a patch.
 
-Both the main package build and plugin build are configured with ansible at
-`infra/ansible`, although they run in separate stages (`stage=build_srcs` vs
-`stage=build_plugin`). This is the same configuration we use for our nightly and
-release builds.
+The main package build is configured with ansible at `infra/ansible`. This is
+the same configuration we use for our nightly and release builds.
 
-The CPU and GPU test configs are defined in the same file, `_test.yml`. Since
+The CPU test config is defined in the file `_test.yml`. Since
 some of the tests come from the upstream PyTorch repository, we check out
 PyTorch at the same git rev as the `build` step (taken from
 `torch_xla.version.__torch_gitrev__`). The tests are split up into multiple
@@ -93,23 +87,16 @@ groups that run in parallel; the `matrix` section of `_test.yml` corresponds to
 in `.github/scripts/run_tests.sh`.
 
 CPU tests run immediately after the `torch_xla` build completes. This will
-likely be the first test feedback on your commit. GPU tests will launch when
-both the `torch_xla` and `torch_xla_cuda_plugin` complete. GPU compilation is
-much slower due to the number of possible optimizations, and the GPU chips
-themselves are quite outdated, so these tests will take longer to run than the
-CPU tests.
+likely be the first test feedback on your commit.
 
 ![CPU tests launch when `torch_xla` is
 complete](../docs/assets/ci_test_dependency.png)
 
-![GPU tests also depend on CUDA
-plugin](../docs/assets/ci_test_dependency_gpu.png)
-
 For the C++ test groups in either case, the test binaries are pre-built during
 the build phase and packaged in `cpp-test-bin`. This will only be downloaded if
 necessary.
 
-[^1]: Note: both GPU and TPU support require their respective plugins to be
-    installed. This package will _not_ work on either out of the box.
+[^1]: Note: TPU support requires its respective plugin to be
+    installed. This package will _not_ work on TPU out of the box.
 
 ### TPU CI
diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh
index 7ae422c47953..65f46f9cf48c 100755
--- a/.github/scripts/run_tests.sh
+++ b/.github/scripts/run_tests.sh
@@ -77,11 +77,6 @@ PYTORCH_DIR=$1
 XLA_DIR=$2
 USE_COVERAGE="${3:-0}"
 
-if [ -x "$(command -v nvidia-smi)" ]; then
-  num_devices=$(nvidia-smi --list-gpus | wc -l)
-  echo "Found $num_devices GPU devices..." 
- export GPU_NUM_DEVICES=$num_devices -fi export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla" export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))") diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml index 4ef00dcedaed..23ffe34f8a46 100644 --- a/.github/workflows/_test.yml +++ b/.github/workflows/_test.yml @@ -23,11 +23,6 @@ on: description: | Set the maximum (in minutes) how long the workflow should take to finish timeout-minutes: - install-cuda-plugin: - required: false - type: boolean - default: false - description: Whether to install CUDA plugin package torch-commit: required: true type: string @@ -46,7 +41,7 @@ jobs: runs-on: ${{ inputs.runner }} container: image: ${{ inputs.dev-image }} - options: "${{ inputs.install-cuda-plugin == true && '--gpus all' || '' }} --shm-size 16g" + options: "--shm-size 16g" strategy: fail-fast: false matrix: @@ -95,9 +90,7 @@ jobs: uses: ./.actions/.github/workflows/setup with: torch-commit: ${{ inputs.torch-commit }} - cuda: ${{ inputs.install-cuda-plugin && true || false }} wheels-artifact: torch-xla-wheels - cuda-plugin-artifact: ${{ inputs.install-cuda-plugin && 'cuda-plugin' || null }} - name: Fetch CPP test binaries if: inputs.has_code_changes == 'true' && matrix.run_cpp_tests uses: actions/download-artifact@v4 @@ -111,9 +104,6 @@ jobs: run: | chmod +x /tmp/test/bin/* ls -l /tmp/test/bin - - name: Check GPU - if: inputs.has_code_changes == 'true' && inputs.install-cuda-plugin - run: nvidia-smi - name: Install test deps if: inputs.has_code_changes == 'true' shell: bash @@ -164,35 +154,24 @@ jobs: exit 0 fi docker cp "${pid}":/home/jenkins/htmlcov "${GITHUB_WORKSPACE}" - if [ -n "${GPU_FLAG:-}" ]; then - if [ -n "${PYTHON_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_python_coverage_${PYTHON_TEST_NAME}.out - fi - if [ -n "${CPP_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/gpu_cpp_coverage_${CPP_TEST_NAME}.out - fi - else - if [ -n "${PYTHON_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out - fi + if [ -n "${PYTHON_TEST_NAME}" ]; then + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_python_coverage_${PYTHON_TEST_NAME}.out + fi - if [ -n "${CPP_TEST_NAME}" ]; then - gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out - gsutil cp 
${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out - fi + if [ -n "${CPP_TEST_NAME}" ]; then + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out + gsutil cp ${GITHUB_WORKSPACE}/htmlcov/cpp_lcov.info gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/cpu_cpp_coverage_${CPP_TEST_NAME}.out + fi - if [ "${CPP_TEST_NAME}" == "cpp_tests" ]; then - ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' - echo $ABS_METADATA > abs_metadata.json - gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json + if [ "${CPP_TEST_NAME}" == "cpp_tests" ]; then + ABS_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "commit_id": '\"${GITHUB_SHA}\"', "ref": "HEAD", "source": "https://github.com/pytorch/xla", "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' + echo $ABS_METADATA > abs_metadata.json + gsutil cp abs_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/absolute/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json - INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' - echo $INC_METADATA > inc_metadata.json - gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json - fi + INC_METADATA='{"host": "github", "project": "pytorchxla", "trace_type": "LCOV", "patchset_num": 1, "change_id": '${CIRCLE_BUILD_NUM}', "owner": "cloud-tpu-pt-dev", "bug_component": "587012"}' + echo $INC_METADATA > inc_metadata.json + gsutil cp inc_metadata.json gs://ng3-metrics/ng3-pytorchxla-coverage/incremental/pytorchxla/${CIRCLE_WORKFLOW_ID}/metadata.json fi - name: Report no code changes if: inputs.has_code_changes == 'false' diff --git a/.github/workflows/setup/action.yml b/.github/workflows/setup/action.yml index 574b85e5b0d5..e1d6fdb8599d 100644 --- a/.github/workflows/setup/action.yml +++ b/.github/workflows/setup/action.yml @@ -3,20 +3,10 @@ inputs: torch-commit: type: string description: PyTorch commit to check out, if provided - cuda: - type: boolean - description: Whether to set up CUDA library paths - default: false wheels-artifact: type: string description: | Artifact containing `torch` (cpu) and `torch-xla` wheels to install - cuda-plugin-artifact: - type: string - description: Artifact containing `torch-xla-cuda-plugin` to install - cuda-torch-artifact: - type: string - description: Artifact containing CUDA build of `torch` runs: using: "composite" steps: @@ -26,12 +16,6 @@ runs: run: | ls -la rm -rvf ${GITHUB_WORKSPACE}/* - - name: Setup CUDA environment - shell: bash - run: | - echo "PATH=$PATH:/usr/local/cuda-12.3/bin" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.3/lib64" >> $GITHUB_ENV - if: ${{ inputs.cuda }} - name: Setup gcloud shell: bash run: | @@ -59,23 +43,6 @@ runs: name: ${{ inputs.wheels-artifact }} path: /tmp/wheels/ if: ${{ inputs.wheels-artifact }} - - name: Fetch CUDA plugin - uses: actions/download-artifact@v4 - with: - 
name: ${{ inputs.cuda-plugin-artifact }} - path: /tmp/wheels/ - if: ${{ inputs.cuda-plugin-artifact }} - - name: Remove CPU `torch` build - shell: bash - run: | - rm -rf /tmp/wheels/torch-* - if: ${{ inputs.cuda-torch-artifact }} - - name: Fetch CUDA `torch` build - uses: actions/download-artifact@v4 - with: - name: ${{ inputs.cuda-torch-artifact }} - path: /tmp/wheels/ - if: ${{ inputs.cuda-torch-artifact }} - name: Install wheels shell: bash run: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6c05fd88f747..b8d233c87002 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -238,10 +238,6 @@ first time, you may need to build everything again, for example, after a python setup.py develop ``` -### Additional steps for GPU - -Please refer to this [guide](https://github.com/pytorch/xla/blob/master/plugins/cuda/README.md). - ## Before Creating a Pull Request In `pytorch/xla` repo we enforce coding style for both C++ and Python files. diff --git a/README.md b/README.md index 989858ef16fd..d02ae1a0968e 100644 --- a/README.md +++ b/README.md @@ -95,24 +95,23 @@ batch size 1024: Our github contains many useful docs on working with different aspects of PyTorch XLA, here is a list of useful docs spread around our repository: - [docs/source/learn](https://github.com/pytorch/xla/tree/master/docs/source/learn): docs for learning concepts associated with XLA, troubleshooting, pjrt, eager mode, and dynamic shape. -- [docs/source/accelerators](https://github.com/pytorch/xla/tree/master/docs/source/accelerators): references to `GPU` and `TPU` accelerator documents. +- [docs/source/accelerators](https://github.com/pytorch/xla/tree/master/docs/source/accelerators): references to `TPU` accelerator documents. - [docs/source/perf](https://github.com/pytorch/xla/tree/master/docs/source/perf): documentation about performance specific aspects of PyTorch/XLA such as: `AMP`, `DDP`, `Dynamo`, Fori loop, `FSDP`, quantization, recompilation, and `SPMD` - [docs/source/features](https://github.com/pytorch/xla/tree/master/docs/source/features): documentation on distributed torch, pallas, scan, and stable hlo. - [docs/source/contribute](https://github.com/pytorch/xla/tree/master/docs/source/contribute): documents on setting up PyTorch for development, and guides for lowering operations. - PJRT plugins: - [CPU](https://github.com/pytorch/xla/blob/master/plugins/cpu/README.md) - - [CUDA](https://github.com/pytorch/xla/blob/master/plugins/cuda/README.md) - [torchax/docs](https://github.com/pytorch/xla/tree/master/torchax/docs): torchax documents - [torchax/examples](https://github.com/pytorch/xla/tree/master/torchax/examples): torchax examples ## Getting Started Following here are guides for two modes: -- Single process: one Python interpreter controlling a single GPU/TPU at a time -- Multi process: N Python interpreters are launched, corresponding to N GPU/TPUs +- Single process: one Python interpreter controlling a single TPU at a time +- Multi process: N Python interpreters are launched, corresponding to N TPUs found on the system -Another mode is SPMD, where one Python interpreter controls all N GPU/TPUs found on +Another mode is SPMD, where one Python interpreter controls all N TPUs found on the system. Multi processing is more complex, and is not compatible with SPMD. This tutorial does not dive into SPMD. For more on that, check our [SPMD guide](https://github.com/pytorch/xla/blob/master/docs/source/perf/spmd_basic.md). 
@@ -223,7 +222,7 @@ If you're using `DistributedDataParallel`, make the following changes: Additional information on PyTorch/XLA, including a description of its semantics and functions, is available at [PyTorch.org](http://pytorch.org/xla/). See the [API Guide](API_GUIDE.md) for best practices when writing networks that run on -XLA devices (TPU, CUDA, CPU and...). +XLA devices (TPU, CPU and...). Our comprehensive user guides are available at: @@ -234,13 +233,9 @@ Our comprehensive user guides are available at: ## PyTorch/XLA tutorials -* [Cloud TPU VM - quickstart](https://cloud.google.com/tpu/docs/run-calculation-pytorch) -* [Cloud TPU Pod slice - quickstart](https://cloud.google.com/tpu/docs/pytorch-pods) -* [Profiling on TPU - VM](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm) -* [GPU guide](docs/gpu.md) +* [Cloud TPU VM quickstart](https://cloud.google.com/tpu/docs/run-calculation-pytorch) +* [Cloud TPU Pod slice quickstart](https://cloud.google.com/tpu/docs/pytorch-pods) +* [Profiling on TPU VM](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm) ## Reference implementations @@ -259,12 +254,10 @@ Cloud TPU plugin corresponding to your installed `torch_xla`, install the option pip install torch_xla[tpu] ``` -GPU release builds and GPU/TPU nightly builds are available in our public GCS bucket. +TPU nightly builds are available in our public GCS bucket. -| Version | Cloud GPU VM Wheels | +| Version | Cloud TPU Nightly Wheels | | --- | ----------- | -| 2.7 (CUDA 12.6 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.6/torch_xla-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl` | -| 2.7 (CUDA 12.6 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.6/torch_xla-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl` | | nightly (Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev-cp311-cp311-linux_x86_64.whl` | | nightly (Python 3.12) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev-cp312-cp312-linux_x86_64.whl` | | nightly (Python 3.13) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev-cp312-cp312-linux_x86_64.whl` | @@ -296,27 +289,6 @@ The torch wheel version `2.9.0.dev20250423+cpu` can be found at https://download | 2.1 (XRT + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/xrt/tpuvm/torch_xla-2.1.0%2Bxrt-cp310-cp310-manylinux_2_28_x86_64.whl` | | 2.1 (Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.1.0-cp38-cp38-linux_x86_64.whl` | -
- -| Version | GPU Wheel | -| --- | ----------- | -| 2.5 (CUDA 12.1 + Python 3.9) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.5.0-cp39-cp39-manylinux_2_28_x86_64.whl` | -| 2.5 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.5.0-cp310-cp310-manylinux_2_28_x86_64.whl` | -| 2.5 (CUDA 12.1 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.5.0-cp311-cp311-manylinux_2_28_x86_64.whl` | -| 2.5 (CUDA 12.4 + Python 3.9) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.4/torch_xla-2.5.0-cp39-cp39-manylinux_2_28_x86_64.whl` | -| 2.5 (CUDA 12.4 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.4/torch_xla-2.5.0-cp310-cp310-manylinux_2_28_x86_64.whl` | -| 2.5 (CUDA 12.4 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.4/torch_xla-2.5.0-cp311-cp311-manylinux_2_28_x86_64.whl` | -| 2.4 (CUDA 12.1 + Python 3.9) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.4.0-cp39-cp39-manylinux_2_28_x86_64.whl` | -| 2.4 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.4.0-cp310-cp310-manylinux_2_28_x86_64.whl` | -| 2.4 (CUDA 12.1 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.4.0-cp311-cp311-manylinux_2_28_x86_64.whl` | -| 2.3 (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp38-cp38-manylinux_2_28_x86_64.whl` | -| 2.3 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp310-cp310-manylinux_2_28_x86_64.whl` | -| 2.3 (CUDA 12.1 + Python 3.11) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.3.0-cp311-cp311-manylinux_2_28_x86_64.whl` | -| 2.2 (CUDA 12.1 + Python 3.8) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp38-cp38-manylinux_2_28_x86_64.whl` | -| 2.2 (CUDA 12.1 + Python 3.10) | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.1/torch_xla-2.2.0-cp310-cp310-manylinux_2_28_x86_64.whl` | -| 2.1 + CUDA 11.8 | `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/11.8/torch_xla-2.1.0-cp38-cp38-manylinux_2_28_x86_64.whl` | -| nightly + CUDA 12.0 >= 2023/06/27| `https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.0/torch_xla-nightly-cp38-cp38-linux_x86_64.whl` | - ### Docker @@ -337,46 +309,6 @@ To use the above dockers, please pass `--privileged --net host --shm-size=16G` a ```bash docker run --privileged --net host --shm-size=16G -it us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm /bin/bash ``` -
- -| Version | GPU CUDA 12.6 Docker | -| --- | ----------- | -| 2.7 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.7.0_3.10_cuda_12.6` | - - -
- - -| Version | GPU CUDA 12.4 Docker | -| --- | ----------- | -| 2.5 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.5.0_3.10_cuda_12.4` | -| 2.4 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_cuda_12.4` | - -
- - -| Version | GPU CUDA 12.1 Docker | -| --- | ----------- | -| 2.5 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.5.0_3.10_cuda_12.1` | -| 2.4 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.4.0_3.10_cuda_12.1` | -| 2.3 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.3.0_3.10_cuda_12.1` | -| 2.2 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.2.0_3.10_cuda_12.1` | -| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_12.1` | -| nightly | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1` | -| nightly at date | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.8_cuda_12.1_YYYYMMDD` | - -
- -| Version | GPU CUDA 11.8 + Docker | -| --- | ----------- | -| 2.1 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.1.0_3.10_cuda_11.8` | -| 2.0 | `us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:r2.0_3.8_cuda_11.8` | - -
- - -To run on [compute instances with -GPUs](https://cloud.google.com/compute/docs/gpus/create-vm-with-gpus). ## Troubleshooting diff --git a/contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb b/contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb index 8d4fbd95bff7..f06e4a9b9f03 100644 --- a/contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb +++ b/contrib/kaggle/distributed-pytorch-xla-basics-with-pjrt.ipynb @@ -461,7 +461,7 @@ " torch.manual_seed(42)\n", " model = nn.Linear(128, 10).to(device)\n", "\n", - " # Optional for TPU v4 and GPU\n", + " # Optional for TPU v4\n", " xm.broadcast_master_param(model)\n", "\n", " model = DDP(model, gradient_as_bucket_view=True)\n", diff --git a/docs/source/accelerators/gpu.md b/docs/source/accelerators/gpu.md deleted file mode 100644 index 56abb192a704..000000000000 --- a/docs/source/accelerators/gpu.md +++ /dev/null @@ -1,6 +0,0 @@ -# Learn about GPUs - -For information on GPUs on Google Cloud, see: - -- [About GPUs on Google Cloud](https://cloud.google.com/compute/docs/gpus/overview) -- [GPU machine types](https://cloud.google.com/compute/docs/gpus) diff --git a/docs/source/contribute/bazel.md b/docs/source/contribute/bazel.md index 0e41ec837057..69e1d5954c82 100644 --- a/docs/source/contribute/bazel.md +++ b/docs/source/contribute/bazel.md @@ -22,9 +22,7 @@ http_archive( ], patch_tool = "patch", patches = [ - "//openxla_patches:gpu_nvml.diff", - "//openxla_patches:gpu_race_condition.diff", - "//openxla_patches:count_down.diff", + "//openxla_patches:no_fortify.diff", ], strip_prefix = "xla-" + xla_hash, urls = [ @@ -223,7 +221,7 @@ The `xla_client` tests are pure hermetic tests that can be easily executed. The `torch_xla` plugin tests are more complex: they require `torch` and `torch_xla` to be installed, and they cannot run in parallel, since they are using either XRT server/client on the same -port, or because they use a GPU or TPU device and there's only one +port, or because they use a TPU device and there's only one available at the time. For that reason, all tests under `torch_xla/csrc/` are bundled into a single target `:main` that runs them all sequentially. diff --git a/docs/source/contribute/plugins.md b/docs/source/contribute/plugins.md index 40ae841e8d7b..84ca6fe1c9ea 100644 --- a/docs/source/contribute/plugins.md +++ b/docs/source/contribute/plugins.md @@ -1,8 +1,7 @@ # Custom Hardware Plugins PyTorch/XLA supports custom hardware through OpenXLA's PJRT C API. The -PyTorch/XLA team directly supports plugins for Cloud TPU (`libtpu`) and -GPU ([OpenXLA](https://github.com/openxla/xla/tree/main/xla/pjrt/gpu)). +PyTorch/XLA team directly supports plugins for Cloud TPU (`libtpu`). The same plugins may also be used by JAX and TF. ## Implementing a PJRT Plugin diff --git a/docs/source/learn/_pjrt.md b/docs/source/learn/_pjrt.md index 16300239353a..2f4f446991de 100644 --- a/docs/source/learn/_pjrt.md +++ b/docs/source/learn/_pjrt.md @@ -38,7 +38,7 @@ the `runtime` tag. ## TL;DR - To use the PJRT preview runtime, set the `PJRT_DEVICE` environment - variable to `CPU`, `TPU`, or `CUDA` + variable to `CPU`, or `TPU` - In XRT, all distributed workloads are multiprocess, with one process per device. On TPU v2 and v3 in PJRT, workloads are multiprocess and multithreaded (4 processes with 2 threads each), so your workload @@ -57,7 +57,7 @@ the `runtime` tag. - To use `torch.distributed`, import `torch_xla.experimental.pjrt_backend` and use the `xla://` `init_method`. - - These steps are optional for GPU and TPU v4. 
+ - These steps are optional for TPU v4. Sample diff from XRT to PJRT: @@ -84,7 +84,7 @@ def _mp_fn(index): torch.manual_seed(42) model = nn.Linear(128, 10).to(device) -+ # Optional for TPU v4 and GPU ++ # Optional for TPU v4 + xm.broadcast_master_param(model) model = DDP(model, gradient_as_bucket_view=True) @@ -119,7 +119,7 @@ if __name__ == '__main__': ## Benefits - Simple runtime configuration: just set `PJRT_DEVICE` to `TPU`, - `CPU`, or `CUDA` and start using XLA! Or, let PJRT select a device + or `CPU` and start using XLA! Or, let PJRT select a device automatically based on your environment. - Improved performance: reduced overhead from gRPC means faster end-to-end execution. On TorchBench 2.0, we observed a \>35% diff --git a/docs/source/perf/amp.md b/docs/source/perf/amp.md index 36d777fd865f..223e338f2135 100644 --- a/docs/source/perf/amp.md +++ b/docs/source/perf/amp.md @@ -2,7 +2,7 @@ Pytorch/XLA's AMP extends [Pytorch's AMP package](https://pytorch.org/docs/stable/amp.html) with support for -automatic mixed precision on `XLA:GPU` and `XLA:TPU` devices. AMP is +automatic mixed precision on `XLA:TPU` devices. AMP is used to accelerate training and inference by executing certain operations in `float32` and other operations in a lower precision datatype (`float16` or `bfloat16` depending on hardware support). This @@ -99,4 +99,4 @@ unlisted ops run if they're downstream from autocasted ops. Our [mnist training script](https://github.com/pytorch/xla/blob/master/test/test_train_mp_mnist_amp.py) and [imagenet training script](https://github.com/pytorch/xla/blob/master/test/test_train_mp_imagenet_amp.py) -demonstrate how AMP is used on both TPUs and GPUs. +demonstrate how AMP is used on TPUs. diff --git a/docs/source/perf/spmd_advanced.md b/docs/source/perf/spmd_advanced.md index 7005ee5dd4c0..2a056dc3d693 100644 --- a/docs/source/perf/spmd_advanced.md +++ b/docs/source/perf/spmd_advanced.md @@ -110,7 +110,7 @@ torch.ops.xla.dynamo_mark_sharding(output, device_ids, mesh_shape, axis_names, p ### SPMD Debugging Tool -We provide a `shard placement visualization debug tool` for PyTorch/XLA SPMD user on TPU/GPU/CPU with single-host/multi-host: you could use `visualize_tensor_sharding` to visualize sharded tensor, or you could use `visualize_sharding` to visualize sharing string. Here are two code examples on TPU single-host(v4-8) with `visualize_tensor_sharding` or `visualize_sharding`: +We provide a `shard placement visualization debug tool` for PyTorch/XLA SPMD user on TPU/CPU with single-host/multi-host: you could use `visualize_tensor_sharding` to visualize sharded tensor, or you could use `visualize_sharding` to visualize sharing string. Here are two code examples on TPU single-host(v4-8) with `visualize_tensor_sharding` or `visualize_sharding`: - Code snippet used `visualize_tensor_sharding` and visualization result: ```python @@ -141,7 +141,7 @@ generated_table = visualize_sharding(sharding, use_color=False) visualize_sharding example on TPU v4-8(single-host) -You could use these examples on TPU/GPU/CPU single-host and modify it to run on multi-host. And you could modify it to sharding-style `tiled`, `partial_replication` and `replicated`. +You could use these examples on TPU/CPU single-host and modify it to run on multi-host. And you could modify it to sharding-style `tiled`, `partial_replication` and `replicated`. ### Auto-Sharding We are introducing a new PyTorch/XLA SPMD feature, called ``auto-sharding``, [RFC](https://github.com/pytorch/xla/issues/6322). 
This is an experimental feature in `r2.3` and `nightly`, that supports `XLA:TPU` and a single TPUVM host. diff --git a/docs/source/perf/spmd_gpu.md b/docs/source/perf/spmd_gpu.md deleted file mode 100644 index cda25723aaad..000000000000 --- a/docs/source/perf/spmd_gpu.md +++ /dev/null @@ -1,48 +0,0 @@ -# Running SPMD on GPU - -PyTorch/XLA supports SPMD on NVIDIA GPU (single-node or multi-nodes). -The training/inference script remains the same as the one used for TPU, -such as this [ResNet -script](https://github.com/pytorch/xla/blob/1dc78948c0c9d018d8d0d2b4cce912552ab27083/test/spmd/test_train_spmd_imagenet.py). -To execute the script using SPMD, we leverage `torchrun`: - - PJRT_DEVICE=CUDA \ - torchrun \ - --nnodes=${NUM_GPU_MACHINES} \ - --node_rank=${RANK_OF_CURRENT_MACHINE} \ - --nproc_per_node=1 \ - --rdzv_endpoint=":" \ - training_or_inference_script_using_spmd.py - -- `--nnodes`: how many GPU machines to be used. -- `--node_rank`: the index of the current GPU machines. The value can - be 0, 1, ..., \${NUMBER_GPU_VM}-1. -- `--nproc_per_node`: the value must be 1 due to the SPMD requirement. -- `--rdzv_endpoint`: the endpoint of the GPU machine with - node_rank==0, in the form `host:port`. The host will be the internal - IP address. The `port` can be any available port on the machine. For - single-node training/inference, this parameter can be omitted. - -For example, if you want to train a ResNet model on 2 GPU machines using -SPMD, you can run the script below on the first machine: - - XLA_USE_SPMD=1 PJRT_DEVICE=CUDA \ - torchrun \ - --nnodes=2 \ - --node_rank=0 \ - --nproc_per_node=1 \ - --rdzv_endpoint=":12355" \ - pytorch/xla/test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 128 - -and run the following on the second machine: - - XLA_USE_SPMD=1 PJRT_DEVICE=CUDA \ - torchrun \ - --nnodes=2 \ - --node_rank=1 \ - --nproc_per_node=1 \ - --rdzv_endpoint=":12355" \ - pytorch/xla/test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 128 - -For more information, please refer to the [SPMD support on GPU -RFC](https://github.com/pytorch/xla/issues/6256). diff --git a/examples/train_resnet_amp.py b/examples/train_resnet_amp.py index f5ca308bed75..f63c3cad8544 100644 --- a/examples/train_resnet_amp.py +++ b/examples/train_resnet_amp.py @@ -22,8 +22,7 @@ def train_loop_fn(self, loader, epoch): with autocast(torch_xla.device()): output = self.model(data) loss = self.loss_fn(output, target) - # TPU amp uses bf16 hence gradient scaling is not necessary. If runnign with XLA:GPU - # check https://github.com/pytorch/xla/blob/master/docs/amp.md#amp-for-xlagpu. + # TPU amp uses bf16 hence gradient scaling is not necessary. loss.backward() self.run_optimizer() tracker.add(self.batch_size) diff --git a/infra/ansible/README.md b/infra/ansible/README.md index 9094f645de30..9ce34d962cff 100644 --- a/infra/ansible/README.md +++ b/infra/ansible/README.md @@ -23,11 +23,11 @@ behavior (installed pip/apt packages and set environment variables): * `stage`: build or release. Different packages are installed depending on the chosen stage. * `arch`: aarch64 or amd64. Architecture of the built image and wheels. -* `accelerator`: tpu or cuda. Available accelerator. +* `accelerator`: tpu. Available accelerator. The variables can be passed through `-e` flag: `-e "="`. 
-Example: `ansible-playbook playbook.yaml -e "stage=build arch=amd64 accelerator=tpu"` +Example: `ansible-playbook playbook.yaml -e "stage=build arch=amd64"` ## Config structure diff --git a/infra/ansible/config/apt.yaml b/infra/ansible/config/apt.yaml index d026fea3e037..ae3d95468344 100644 --- a/infra/ansible/config/apt.yaml +++ b/infra/ansible/config/apt.yaml @@ -20,13 +20,6 @@ apt: - lcov - less - build_cuda: - - "cuda-libraries-{{ cuda_version | replace('.', '-') }}" - - "cuda-toolkit-{{ cuda_version | replace('.', '-') }}" - - "cuda-minimal-build-{{ cuda_version | replace('.', '-') }}" - - "{{ cuda_deps['libcudnn'][cuda_version] }}" - - "{{ cuda_deps['libcudnn-dev'][cuda_version] }}" - build_aarch64: - scons @@ -39,23 +32,13 @@ apt: - patch - vim - release_cuda: - - "cuda-libraries-{{ cuda_version | replace('.', '-') }}" - - "cuda-minimal-build-{{ cuda_version | replace('.', '-') }}" - - "{{ cuda_deps['libcudnn'][cuda_version] }}" - # Specify objects with string fields `url` and `keyring`. # The keyring path should start with /usr/share/keyrings/ for debian and ubuntu. signing_keys: - url: https://apt.llvm.org/llvm-snapshot.gpg.key keyring: /usr/share/keyrings/llvm.pgp - # Get the recent key version from - # https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-debian. - - url: "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub" - keyring: /usr/share/keyrings/cuda.pgp repos: # signed-by path should match the corresponding keyring path above. - "deb [signed-by=/usr/share/keyrings/llvm.pgp] http://apt.llvm.org/{{ llvm_debian_repo }}/ llvm-toolchain-{{ llvm_debian_repo }}-{{ clang_version }} main" - "deb-src [signed-by=/usr/share/keyrings/llvm.pgp] http://apt.llvm.org/{{ llvm_debian_repo }}/ llvm-toolchain-{{ llvm_debian_repo }}-{{ clang_version }} main" - - "deb [signed-by=/usr/share/keyrings/cuda.pgp] https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_repo }}/x86_64/ /" diff --git a/infra/ansible/config/cuda_deps.yaml b/infra/ansible/config/cuda_deps.yaml deleted file mode 100644 index 3732bb0f93ec..000000000000 --- a/infra/ansible/config/cuda_deps.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Versions of cuda dependencies for given cuda versions. -# Note: wrap version in quotes to ensure they're treated as strings. -cuda_deps: - # List all libcudnn8 versions with `apt list -a libcudnn8` - libcudnn: - "12.8": libcudnn9-cuda-12=9.1.1.17-1 - "12.6": libcudnn9-cuda-12=9.1.1.17-1 - "12.4": libcudnn9-cuda-12=9.1.1.17-1 - "12.3": libcudnn9-cuda-12=9.1.1.17-1 - "12.1": libcudnn8=8.9.2.26-1+cuda12.1 - "12.0": libcudnn8=8.8.0.121-1+cuda12.0 - "11.8": libcudnn8=8.7.0.84-1+cuda11.8 - "11.7": libcudnn8=8.5.0.96-1+cuda11.7 - "11.2": libcudnn8=8.1.1.33-1+cuda11.2 - libcudnn-dev: - "12.8": libcudnn9-dev-cuda-12=9.1.1.17-1 - "12.6": libcudnn9-dev-cuda-12=9.1.1.17-1 - "12.4": libcudnn9-dev-cuda-12=9.1.1.17-1 - "12.3": libcudnn9-dev-cuda-12=9.1.1.17-1 - "12.1": libcudnn8-dev=8.9.2.26-1+cuda12.1 - "12.0": libcudnn8-dev=8.8.0.121-1+cuda12.0 - "11.8": libcudnn8-dev=8.7.0.84-1+cuda11.8 - "11.7": libcudnn8-dev=8.5.0.96-1+cuda11.7 - "11.2": libcudnn8-dev=8.1.1.33-1+cuda11.2 diff --git a/infra/ansible/config/vars.yaml b/infra/ansible/config/vars.yaml index f34e2c3cb632..c336e7754f46 100644 --- a/infra/ansible/config/vars.yaml +++ b/infra/ansible/config/vars.yaml @@ -1,8 +1,3 @@ -# Used for fetching cuda from the right repo, see apt.yaml. -cuda_repo: debian11 -cuda_version: "11.8" -# Determines supported GPUs. 
See https://developer.nvidia.com/cuda-gpus -cuda_compute_capabilities: 5.2,7.0,7.5,8.0,9.0 # Used for fetching clang from the right repo, see apt.yaml. llvm_debian_repo: bullseye clang_version: 17 @@ -10,7 +5,7 @@ clang_version: 17 package_version: 2.9.0 # If set to true, wheels will be renamed to $WHEEL_NAME-nightly-cp38-cp38-linux_x86_64.whl. nightly_release: false -# Whether to preinstall libtpu in the PyTorch/XLA wheel. Ignored for GPU build. +# Whether to preinstall libtpu in the PyTorch/XLA wheel. bundle_libtpu: 1 # Suffix for bazel remote cache key cache_suffix: "" diff --git a/infra/ansible/playbook.yaml b/infra/ansible/playbook.yaml index 7626714e8d18..85153a43d3a2 100644 --- a/infra/ansible/playbook.yaml +++ b/infra/ansible/playbook.yaml @@ -6,7 +6,7 @@ # - stage: build or release. Different packages are installed depending on # the chosen stage. # - arch: aarch64 or amd64. Architecture of the built image and wheels. - # - accelerator: tpu or cuda. Available accelerator. + # - accelerator: tpu. pre_tasks: - name: "Validate required variables" ansible.builtin.assert: @@ -20,7 +20,7 @@ - name: arch pattern: ^(aarch64|amd64)$ - name: accelerator - pattern: ^(tpu|cuda)$ + pattern: ^tpu$ - name: "Include vars from config files" ansible.builtin.include_vars: @@ -28,8 +28,6 @@ loop: # vars.yaml should be the first as other config files depend on it. - vars.yaml - # cuda_deps should be loaded before apt, since apt depends on it. - - cuda_deps.yaml - apt.yaml - pip.yaml - env.yaml diff --git a/infra/tpu-pytorch-releases/README.md b/infra/tpu-pytorch-releases/README.md index f173b3ee8575..a70e0b064a6e 100644 --- a/infra/tpu-pytorch-releases/README.md +++ b/infra/tpu-pytorch-releases/README.md @@ -39,13 +39,11 @@ consists of the following fields. sources when building image and wheels. * `package_version` (string) - Version of the built wheels. Passed to the build steps. -* `accelerator` ("tpu"|"cuda") - Supported accelerator. Affects build +* `accelerator` ("tpu") - Supported accelerator. Affects build process and installed dependencies, see [apt.yaml](../ansible/config/apt.yaml) and [pip.yaml](../ansible/config/pip.yaml). * `python_version` (optional, string, default = "3.8") - Python version used for the docker image base and build process. -* `cuda_version` (optional, string, default = "11.8") - CUDA version to install. - Used only if `accelerator` is set to "cuda" * `arch` (optional, "amd64"|"aarch64", default = "amd64") - Architecture affects installed dependencies and build process, see [apt.yaml](../ansible/config/apt.yaml) and [pip.yaml](../ansible/config/pip.yaml). @@ -71,7 +69,6 @@ unset properties of existing triggers. git_tag = "v3.0.0" package_version = "3.0" accelerator = "tpu" - cuda_version = "11.8" # optional python_version = "3.8" # optional arch = "amd64" # optional }, @@ -95,12 +92,10 @@ at midnight (`America/Los_Angeles` time zone). Nightly builds in the `nightly_builds` variable in [artifacts.auto.tfvars](./artifacts.auto.tfvars) consists of the following fields. -* `accelerator` ("tpu"|"cuda") - Supported accelerator. Impacts build +* `accelerator` ("tpu") - Supported accelerator. Impacts build process and installed dependencies. * `python_version` (optional, string, default = "3.8") - Python version used for the docker images base and build process. -* `cuda_version` (optional, string, default = "11.8") - CUDA version to install. 
- Used only if `accelerator` is set to "cuda" * `arch` (optional, "amd64"|"aarch64", default = "amd64") - Architecture influences installed dependencies and build process. * `cxx11_abi` (optional, "0"|"1", default = "0") - Whether to use C++11 ABI or @@ -115,9 +110,8 @@ unset properties of existing triggers. #### Modify or add a new nightly release -1. Modify or add an entry with specific `accelerator`, `python_version` and (optionally) - `cuda_version` to the `nightly_builds` variable in the - [artifacts.auto.tfvars](./artifacts.auto.tfvars) file. +1. Modify or add an entry with specific `accelerator`, and `python_version` + to the `nightly_builds` variable in the [artifacts.auto.tfvars](./artifacts.auto.tfvars) file. See all variables in the section above. **Example** @@ -125,10 +119,13 @@ unset properties of existing triggers. ```hcl nightly_builds = [ { - accelerator = "cuda" - cuda_version = "11.8" # optional - python_version = "3.8" # optional - arch = "amd64" # optional + git_tag = "v2.8.0" + package_version = "2.8.0" + pytorch_git_rev = "v2.8.0" + accelerator = "tpu" + python_version = "3.10" + bundle_libtpu = "0" + cxx11_abi = "1" }, # ... ] diff --git a/infra/tpu-pytorch-releases/artifacts_builds.tf b/infra/tpu-pytorch-releases/artifacts_builds.tf index 099a2402afe9..b4e469b617be 100644 --- a/infra/tpu-pytorch-releases/artifacts_builds.tf +++ b/infra/tpu-pytorch-releases/artifacts_builds.tf @@ -6,10 +6,6 @@ locals { release_package_version = "2.8.0-rc5" release_pytorch_git_rev = "v2.8.0-rc8" nightly_package_version = "2.9.0" - cuda_versions = { - "nightly": [], - "r2.8": [] # Note: PyTorch 2.8 release doesn't have CUDA support - } # Built once a day from master generated_nightly_builds = concat( @@ -22,16 +18,6 @@ locals { cxx11_abi = "1" } ], - # CUDA builds - [ - for pair in setproduct(local.tpu_python_versions, local.cuda_versions["nightly"]) : { - accelerator = "cuda" - cuda_version = pair[1] - python_version = pair[0] - bundle_libtpu = "0" - cxx11_abi = "1" - } - ] ) # Built on push to specific tag. 
@@ -59,19 +45,6 @@ locals { bundle_libtpu = "1" } ], - - # cuda build for latest release - [ - for pair in setproduct(local.tpu_python_versions, local.cuda_versions["r2.8"]) : { - git_tag = local.release_git_tag - package_version = local.release_package_version - pytorch_git_rev = local.release_pytorch_git_rev - accelerator = "cuda" - cuda_version = pair[1] - python_version = pair[0] - bundle_libtpu = "0" - } - ] ) versioned_builds = concat(local.generated_versioned_builds, var.manual_versioned_builds) nightly_builds = concat(local.generated_nightly_builds, var.manual_nightly_builds) diff --git a/infra/tpu-pytorch-releases/dev_images.auto.tfvars b/infra/tpu-pytorch-releases/dev_images.auto.tfvars index e1618f2a80c2..aee461990fd4 100644 --- a/infra/tpu-pytorch-releases/dev_images.auto.tfvars +++ b/infra/tpu-pytorch-releases/dev_images.auto.tfvars @@ -7,17 +7,5 @@ dev_images = [ accelerator = "tpu" extra_tags = ["tpu"] python_version = "3.12" - }, - { - accelerator = "cuda" - cuda_version = "12.1" - extra_tags = ["cuda"] - python_version = "3.10" - }, - { - accelerator = "cuda" - cuda_version = "12.3" - extra_tags = ["cuda"] - python_version = "3.10" } ] diff --git a/infra/tpu-pytorch-releases/dev_images.tf b/infra/tpu-pytorch-releases/dev_images.tf index 54c340809efb..03798c9dbefb 100644 --- a/infra/tpu-pytorch-releases/dev_images.tf +++ b/infra/tpu-pytorch-releases/dev_images.tf @@ -3,10 +3,9 @@ variable "dev_images" { accelerator = string arch = optional(string, "amd64") python_version = optional(string, "3.8") - cuda_version = optional(string, "11.8") # Additional tags on top of uniquely generated tag based on accelerator, - # python and cuda versions. + # python versions. extra_tags = optional(list(string), []) })) } @@ -16,7 +15,7 @@ locals { for di in var.dev_images : format("%s_%s", di.python_version, - di.accelerator == "tpu" ? "tpuvm" : format("cuda_%s", di.cuda_version) + "tpuvm" ) => di } } @@ -55,7 +54,6 @@ module "dev_images" { accelerator = each.value.accelerator arch = each.value.arch python_version = each.value.python_version - cuda_version = each.value.cuda_version } docker_repo_url = module.docker_registry.url