From c47eef86d0bc0d0f5827ae232d8ef856271a2cf7 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 3 Nov 2022 10:10:55 -0700
Subject: [PATCH 1/4] Update NVIDIA driver installation script

---
 .../workflows/install_nvidia_utils_linux.sh | 87 +++++++++++++++----
 1 file changed, 71 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/install_nvidia_utils_linux.sh b/.github/workflows/install_nvidia_utils_linux.sh
index aa3a25182..7806dced2 100755
--- a/.github/workflows/install_nvidia_utils_linux.sh
+++ b/.github/workflows/install_nvidia_utils_linux.sh
@@ -1,9 +1,11 @@
 #!/usr/bin/env bash
 
-set -ou pipefail
+set -eou pipefail
 
-DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID) \
-DRIVER_FN="NVIDIA-Linux-x86_64-510.60.02.run"
+
+DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID)
+DRIVER_VERSION="515.57"
+DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
 YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
 
 install_nvidia_docker2_amzn2() {
@@ -20,22 +22,74 @@ install_nvidia_docker2_amzn2() {
 install_nvidia_driver_amzn2() {
     (
         set -x
-        sudo yum groupinstall -y "Development Tools"
-        # ensure our kernel install is the same as our underlying kernel,
-        # groupinstall "Development Tools" has a habit of mismatching kernel headers
-        sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
-        sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
-        sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
-        sudo rm -fv /tmp/nvidia_driver
-        nvidia-smi
+
+        # Purge any nvidia driver installed from RHEL repo
+        sudo yum remove -y nvidia-driver-latest-dkms
+
+        # Try to gather more information about the runner and its existing NVIDIA driver if any
+        echo "Before installing NVIDIA driver"
+        lspci
+        lsmod
+        modinfo nvidia || true
+
+        HAS_NVIDIA_DRIVER=0
+        # Check if NVIDIA driver has already been installed
+        if [ -x "$(command -v nvidia-smi)" ]; then
+            set +e
+            # The driver exists, check its version next. Also check only the first GPU if there are more than one of them
+            # so that the same driver version is not printed over multiple lines
+            INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0)
+            NVIDIA_SMI_STATUS=$?
+
+            if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
+                echo "Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
+            elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
+                echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
+            else
+                HAS_NVIDIA_DRIVER=1
+                echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
+            fi
+            set -e
+        fi
+
+        if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
+            sudo yum groupinstall -y "Development Tools"
+            # ensure our kernel install is the same as our underlying kernel,
+            # groupinstall "Development Tools" has a habit of mismatching kernel headers
+            sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
+            sudo modprobe backlight
+            sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
+            sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
+            sudo rm -fv /tmp/nvidia_driver
+        fi
+
+        sudo modprobe nvidia || true
+        echo "After installing NVIDIA driver"
+        lspci
+        lsmod
+        modinfo nvidia || true
+
+        (
+            set +e
+            nvidia-smi
+            NVIDIA_SMI_STATUS=$?
+
+            # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
+            if [ "$NVIDIA_SMI_STATUS" -eq 0 ] || [ "$NVIDIA_SMI_STATUS" -eq 14 ]; then
+                echo "INFO: Ignoring allowed status ${NVIDIA_SMI_STATUS}"
+            else
+                echo "ERROR: nvidia-smi exited with unresolved status ${NVIDIA_SMI_STATUS}"
+                exit ${NVIDIA_SMI_STATUS}
+            fi
+            set -e
+        )
     )
 }
 
-# Install container toolkit based on distribution
-echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
+echo "== Installing nvidia driver ${DRIVER_FN} =="
 case "${DISTRIBUTION}" in
     amzn*)
-        install_nvidia_docker2_amzn2
+        install_nvidia_driver_amzn2
         ;;
     *)
         echo "ERROR: Unknown distribution ${DISTRIBUTION}"
@@ -43,10 +97,11 @@ case "${DISTRIBUTION}" in
         ;;
 esac
 
-echo "== Installing nvidia driver ${DRIVER_FN} =="
+# Install container toolkit based on distribution
+echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
 case "${DISTRIBUTION}" in
     amzn*)
-        install_nvidia_driver_amzn2
+        install_nvidia_docker2_amzn2
         ;;
     *)
         echo "ERROR: Unknown distribution ${DISTRIBUTION}"

From 7957a45da8d5a3cba6ac9c91ee55724b08e23306 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Thu, 3 Nov 2022 16:54:39 -0700
Subject: [PATCH 2/4] Sync with pytorch script

---
 .../workflows/install_nvidia_utils_linux.sh | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/install_nvidia_utils_linux.sh b/.github/workflows/install_nvidia_utils_linux.sh
index 7806dced2..b1fdd468a 100755
--- a/.github/workflows/install_nvidia_utils_linux.sh
+++ b/.github/workflows/install_nvidia_utils_linux.sh
@@ -59,8 +59,28 @@ install_nvidia_driver_amzn2() {
             sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
             sudo modprobe backlight
             sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
-            sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
+
+            set +e
+            sudo /bin/bash /tmp/nvidia_driver -s --no-drm
+            NVIDIA_INSTALLATION_STATUS=$?
+
+            if [ "$NVIDIA_INSTALLATION_STATUS" -ne 0 ]; then
+                sudo cat /var/log/nvidia-installer.log
+
+                NVIDIA_DEVICES=$(lspci -D | grep -i NVIDIA | cut -d' ' -f1)
+                # The GPU can get stuck in a failure state if somehow the test crashes the GPU microcode. When this
+                # happens, we'll try to reset all NVIDIA devices https://github.com/pytorch/pytorch/issues/88388
+                for PCI_ID in "$NVIDIA_DEVICES"; do
+                    DEVICE_ENABLED=$(cat /sys/bus/pci/devices/$PCI_ID/enable)
+
+                    echo "Resetting $PCI_ID (enabled state: $DEVICE_ENABLED)"
+                    echo "1" > /sys/bus/pci/devices/$PCI_ID/reset
+                    sleep 1
+                done
+            fi
+
             sudo rm -fv /tmp/nvidia_driver
+            set -e
         fi
 
         sudo modprobe nvidia || true

From 04ec71b8b5d0ff9ebc180bd79107522e41208c82 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Fri, 4 Nov 2022 13:29:26 -0700
Subject: [PATCH 3/4] Upgrade to 515.76 driver

---
 .github/workflows/install_nvidia_utils_linux.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/install_nvidia_utils_linux.sh b/.github/workflows/install_nvidia_utils_linux.sh
index b1fdd468a..3a2805d91 100755
--- a/.github/workflows/install_nvidia_utils_linux.sh
+++ b/.github/workflows/install_nvidia_utils_linux.sh
@@ -4,7 +4,7 @@ set -eou pipefail
 
 
 DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID)
-DRIVER_VERSION="515.57"
+DRIVER_VERSION="515.76"
 DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
 YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
 

From 5bf0934de9cf16d9f60bb7ead40337f782b91b11 Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Wed, 9 Nov 2022 15:38:51 -0800
Subject: [PATCH 4/4] Switch to setup-nvidia GitHub action

---
 .../workflows/install_nvidia_utils_linux.sh | 130 ------------------
 .github/workflows/pippy_tests.yaml          |   4 +-
 .github/workflows/spmd_tests.yaml           |   4 +-
 3 files changed, 2 insertions(+), 136 deletions(-)
 delete mode 100755 .github/workflows/install_nvidia_utils_linux.sh

diff --git a/.github/workflows/install_nvidia_utils_linux.sh b/.github/workflows/install_nvidia_utils_linux.sh
deleted file mode 100755
index 3a2805d91..000000000
--- a/.github/workflows/install_nvidia_utils_linux.sh
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env bash
-
-set -eou pipefail
-
-
-DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID)
-DRIVER_VERSION="515.76"
-DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
-
-install_nvidia_docker2_amzn2() {
-    (
-        set -x
-        # Needed for yum-config-manager
-        sudo yum install -y yum-utils
-        sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
-        sudo yum install -y nvidia-docker2
-        sudo systemctl restart docker
-    )
-}
-
-install_nvidia_driver_amzn2() {
-    (
-        set -x
-
-        # Purge any nvidia driver installed from RHEL repo
-        sudo yum remove -y nvidia-driver-latest-dkms
-
-        # Try to gather more information about the runner and its existing NVIDIA driver if any
-        echo "Before installing NVIDIA driver"
-        lspci
-        lsmod
-        modinfo nvidia || true
-
-        HAS_NVIDIA_DRIVER=0
-        # Check if NVIDIA driver has already been installed
-        if [ -x "$(command -v nvidia-smi)" ]; then
-            set +e
-            # The driver exists, check its version next. Also check only the first GPU if there are more than one of them
-            # so that the same driver version is not printed over multiple lines
-            INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0)
-            NVIDIA_SMI_STATUS=$?
-
-            if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
-                echo "Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
-            elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
-                echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
-            else
-                HAS_NVIDIA_DRIVER=1
-                echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
-            fi
-            set -e
-        fi
-
-        if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
-            sudo yum groupinstall -y "Development Tools"
-            # ensure our kernel install is the same as our underlying kernel,
-            # groupinstall "Development Tools" has a habit of mismatching kernel headers
-            sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
-            sudo modprobe backlight
-            sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
-
-            set +e
-            sudo /bin/bash /tmp/nvidia_driver -s --no-drm
-            NVIDIA_INSTALLATION_STATUS=$?
-
-            if [ "$NVIDIA_INSTALLATION_STATUS" -ne 0 ]; then
-                sudo cat /var/log/nvidia-installer.log
-
-                NVIDIA_DEVICES=$(lspci -D | grep -i NVIDIA | cut -d' ' -f1)
-                # The GPU can get stuck in a failure state if somehow the test crashes the GPU microcode. When this
-                # happens, we'll try to reset all NVIDIA devices https://github.com/pytorch/pytorch/issues/88388
-                for PCI_ID in "$NVIDIA_DEVICES"; do
-                    DEVICE_ENABLED=$(cat /sys/bus/pci/devices/$PCI_ID/enable)
-
-                    echo "Resetting $PCI_ID (enabled state: $DEVICE_ENABLED)"
-                    echo "1" > /sys/bus/pci/devices/$PCI_ID/reset
-                    sleep 1
-                done
-            fi
-
-            sudo rm -fv /tmp/nvidia_driver
-            set -e
-        fi
-
-        sudo modprobe nvidia || true
-        echo "After installing NVIDIA driver"
-        lspci
-        lsmod
-        modinfo nvidia || true
-
-        (
-            set +e
-            nvidia-smi
-            NVIDIA_SMI_STATUS=$?
-
-            # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
-            if [ "$NVIDIA_SMI_STATUS" -eq 0 ] || [ "$NVIDIA_SMI_STATUS" -eq 14 ]; then
-                echo "INFO: Ignoring allowed status ${NVIDIA_SMI_STATUS}"
-            else
-                echo "ERROR: nvidia-smi exited with unresolved status ${NVIDIA_SMI_STATUS}"
-                exit ${NVIDIA_SMI_STATUS}
-            fi
-            set -e
-        )
-    )
-}
-
-echo "== Installing nvidia driver ${DRIVER_FN} =="
-case "${DISTRIBUTION}" in
-    amzn*)
-        install_nvidia_driver_amzn2
-        ;;
-    *)
-        echo "ERROR: Unknown distribution ${DISTRIBUTION}"
-        exit 1
-        ;;
-esac
-
-# Install container toolkit based on distribution
-echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
-case "${DISTRIBUTION}" in
-    amzn*)
-        install_nvidia_docker2_amzn2
-        ;;
-    *)
-        echo "ERROR: Unknown distribution ${DISTRIBUTION}"
-        exit 1
-        ;;
-esac
diff --git a/.github/workflows/pippy_tests.yaml b/.github/workflows/pippy_tests.yaml
index 4788a3fa2..d91bae1f4 100644
--- a/.github/workflows/pippy_tests.yaml
+++ b/.github/workflows/pippy_tests.yaml
@@ -212,9 +212,7 @@ jobs:
           sudo yum remove -y cuda-drivers || true
           sudo yum remove -y "*nvidia*" || true
       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        run: |
-          bash .github/workflows/install_nvidia_utils_linux.sh || true
-          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
       - name: Pull Docker image
         run: |
           retry () {
diff --git a/.github/workflows/spmd_tests.yaml b/.github/workflows/spmd_tests.yaml
index 6f7f70787..e242d8c61 100644
--- a/.github/workflows/spmd_tests.yaml
+++ b/.github/workflows/spmd_tests.yaml
@@ -71,9 +71,7 @@ jobs:
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        run: |
-          bash .github/workflows/install_nvidia_utils_linux.sh || true
-          echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
       - name: Pull Docker image
         run: |
           retry () {
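
For reference, a minimal workflow job that consumes the new action might look like the sketch below. This is only an illustrative sketch, not part of the series: the workflow name, job name, runner label, and DOCKER_IMAGE placeholder are hypothetical, and it assumes the setup-nvidia action takes over what the deleted step did by hand (installing the driver and nvidia-docker2 and exporting GPU_FLAG to GITHUB_ENV), which is why the explicit `echo "GPU_FLAG=--gpus all"` line disappears in the workflow diffs above.

name: gpu-smoke-test                     # hypothetical workflow, for illustration only
on: workflow_dispatch
jobs:
  gpu_smoke_test:
    runs-on: linux.16xlarge.nvidia.gpu   # hypothetical self-hosted GPU runner label
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
      - name: Verify the GPU is visible inside a container
        run: |
          # GPU_FLAG (e.g. --gpus all) is assumed to be exported by setup-nvidia;
          # DOCKER_IMAGE stands in for whatever CUDA-enabled image the job already uses.
          docker run --rm ${GPU_FLAG:-} "${DOCKER_IMAGE}" nvidia-smi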