diff --git a/.github/workflows/install_nvidia_utils_linux.sh b/.github/workflows/install_nvidia_utils_linux.sh deleted file mode 100755 index aa3a25182..000000000 --- a/.github/workflows/install_nvidia_utils_linux.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash - -set -ou pipefail - -DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) \ -DRIVER_FN="NVIDIA-Linux-x86_64-510.60.02.run" -YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo" - -install_nvidia_docker2_amzn2() { - ( - set -x - # Needed for yum-config-manager - sudo yum install -y yum-utils - sudo yum-config-manager --add-repo "${YUM_REPO_URL}" - sudo yum install -y nvidia-docker2 - sudo systemctl restart docker - ) -} - -install_nvidia_driver_amzn2() { - ( - set -x - sudo yum groupinstall -y "Development Tools" - # ensure our kernel install is the same as our underlying kernel, - # groupinstall "Development Tools" has a habit of mismatching kernel headers - sudo yum install -y "kernel-devel-uname-r == $(uname -r)" - sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN" - sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false) - sudo rm -fv /tmp/nvidia_driver - nvidia-smi - ) -} - -# Install container toolkit based on distribution -echo "== Installing nvidia container toolkit for ${DISTRIBUTION} ==" -case "${DISTRIBUTION}" in - amzn*) - install_nvidia_docker2_amzn2 - ;; - *) - echo "ERROR: Unknown distribution ${DISTRIBUTION}" - exit 1 - ;; -esac - -echo "== Installing nvidia driver ${DRIVER_FN} ==" -case "${DISTRIBUTION}" in - amzn*) - install_nvidia_driver_amzn2 - ;; - *) - echo "ERROR: Unknown distribution ${DISTRIBUTION}" - exit 1 - ;; -esac diff --git a/.github/workflows/pippy_tests.yaml b/.github/workflows/pippy_tests.yaml index 4788a3fa2..d91bae1f4 100644 --- a/.github/workflows/pippy_tests.yaml +++ b/.github/workflows/pippy_tests.yaml @@ -212,9 +212,7 @@ jobs: sudo yum remove -y cuda-drivers || true sudo yum remove -y "*nvidia*" || true - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - run: | - bash .github/workflows/install_nvidia_utils_linux.sh || true - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + uses: pytorch/test-infra/.github/actions/setup-nvidia@main - name: Pull Docker image run: | retry () { diff --git a/.github/workflows/spmd_tests.yaml b/.github/workflows/spmd_tests.yaml index 6f7f70787..e242d8c61 100644 --- a/.github/workflows/spmd_tests.yaml +++ b/.github/workflows/spmd_tests.yaml @@ -71,9 +71,7 @@ jobs: with: github-secret: ${{ secrets.GITHUB_TOKEN }} - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - run: | - bash .github/workflows/install_nvidia_utils_linux.sh || true - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + uses: pytorch/test-infra/.github/actions/setup-nvidia@main - name: Pull Docker image run: | retry () {