From 5d3fd8fbc76dc77d645ab9391d566772b21fa6e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 2 Oct 2025 08:59:20 +0000 Subject: [PATCH 1/8] ci: Bump runner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/publish.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 38d1cb72..0d5b0e2a 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -42,7 +42,7 @@ jobs: matrix: # Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. - os: [ubuntu-20.04] + os: [ubuntu-22.04] python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] torch-version: ["2.1.2", "2.2.2", "2.3.1", "2.4.0", "2.5.1", "2.6.0.dev20241001"] cuda-version: ["11.8.0", "12.3.2"] From 9d2855784167fee035a1241b954fa47c4b37837f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 2 Oct 2025 09:04:30 +0000 Subject: [PATCH 2/8] check for ngc wheels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/scripts/check_for_ngc_images.sh | 65 +++++++++++++++++++++++++ .github/workflows/publish.yaml | 16 +++++- 2 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 .github/scripts/check_for_ngc_images.sh diff --git a/.github/scripts/check_for_ngc_images.sh b/.github/scripts/check_for_ngc_images.sh new file mode 100644 index 00000000..9e17601f --- /dev/null +++ b/.github/scripts/check_for_ngc_images.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Configuration +BASE_IMAGE="nvcr.io/nvidia/pytorch" +TAG_SUFFIX="-py3" +MONTHS_TO_CHECK=7 # Check current month and previous 6 months (total 7) + +# Initialize an array to store existing tags +EXISTING_TAGS=() + +echo "Checking for 
existence of the last ${MONTHS_TO_CHECK} NGC PyTorch images: ${BASE_IMAGE}:YY.MM${TAG_SUFFIX}" +echo "---------------------------------------------------------------------" + +# Loop through the last N months +for i in $(seq 0 $((MONTHS_TO_CHECK - 1))); do + # Calculate Year and Month for the tag + CURRENT_YEAR=$(date +%Y) + CURRENT_MONTH=$(date +%m) + + # Calculate target month and year (anchored on day 01 so month arithmetic never overflows) + TARGET_DATE=$(date -d "$CURRENT_YEAR-$CURRENT_MONTH-01 -$i months" +%y.%m) + + # Construct the tag (YY.MM-py3) and the full image reference + IMAGE_TAG="${TARGET_DATE}${TAG_SUFFIX}" + FULL_IMAGE="${BASE_IMAGE}:${IMAGE_TAG}" + + echo "Checking: ${FULL_IMAGE}" + + # Use 'docker manifest inspect' to check for image existence without pulling. + if docker manifest inspect "${FULL_IMAGE}" > /dev/null 2>&1; then + echo "✅ EXISTS: Found." + # Add the full image reference (derived from BASE_IMAGE) to the array + EXISTING_TAGS+=("${FULL_IMAGE}") + else + echo "❌ MISSING: Not found." + fi +done + +echo "---------------------------------------------------------------------" + +## JSON Output Generation +# This uses the collected array to build a JSON string. + +# 1. Convert the shell array to a newline-separated string. +TAGS_NL_SEP=$(printf "%s\n" "${EXISTING_TAGS[@]}") + +# 2. Use jq to read the newline-separated list and format it into a JSON array. +# split("\n") | map(select(length > 0)) splits the input on newlines and drops empty entries, so an empty tag list yields [] instead of [""]. +if command -v jq &> /dev/null; then + JSON_STRING=$(echo -e "${TAGS_NL_SEP}" | jq -R -s 'split("\n") | map(select(length > 0))') + + echo "Generated JSON String of Existing Tags:" + echo "${JSON_STRING}" + + # Optional: Save the JSON string to a variable for further use + # echo "JSON_STRING is now available in the shell if you source this script." +else + echo "WARNING: 'jq' is not installed. Cannot format output as JSON." + echo "Found Tags: ${EXISTING_TAGS[*]}" +fi + +echo "---" +echo "Check complete." 
+ +echo "${JSON_STRING}" > ngc_images.json \ No newline at end of file diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 0d5b0e2a..a436545f 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -76,9 +76,23 @@ jobs: release-version: ${{ needs.setup_release.outputs.release-version }} upload-to-release: true + check_for_ngc_images: + runs-on: ubuntu-latest + outputs: + images: ${{ steps.check_for_ngc_images.outputs.IMAGES }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Check for NGC PyTorch images + id: check_for_ngc_images + run: | + bash ./.github/scripts/check_for_ngc_images.sh + echo "IMAGES=$(cat ngc_images.json| jq -cr)" >> $GITHUB_OUTPUT + build_ngc_wheels: name: Build Wheel for NGC PyTorch - needs: setup_release + needs: [setup_release, check_for_ngc_images] strategy: fail-fast: false matrix: From 7624c803e3b7b8d1206433636fa3882d9aa01bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 2 Oct 2025 09:16:52 +0000 Subject: [PATCH 3/8] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/publish.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index a436545f..09a9985b 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -96,7 +96,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04] + os: [ubuntu-22.04] container-image: - nvcr.io/nvidia/pytorch:25.05-py3 - nvcr.io/nvidia/pytorch:25.06-py3 From a7b82a25ab31c52ba26340c6e6044682a3ddf6aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 2 Oct 2025 09:25:54 +0000 Subject: [PATCH 4/8] fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/publish.yaml | 7 ++----- 1 file 
changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 09a9985b..ace53c6a 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -88,7 +88,7 @@ jobs: id: check_for_ngc_images run: | bash ./.github/scripts/check_for_ngc_images.sh - echo "IMAGES=$(cat ngc_images.json| jq -cr)" >> $GITHUB_OUTPUT + echo "IMAGES=$(cat ngc_images.json| jq -cr)" | tee -a $GITHUB_OUTPUT build_ngc_wheels: name: Build Wheel for NGC PyTorch @@ -97,10 +97,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-22.04] - container-image: - - nvcr.io/nvidia/pytorch:25.05-py3 - - nvcr.io/nvidia/pytorch:25.06-py3 - - nvcr.io/nvidia/pytorch:25.08-py3 + container-image: ${{ fromJson(needs.check_for_ngc_images.outputs.images) }} uses: ./.github/workflows/_build_in_container.yml with: runs-on: ${{ matrix.runs-on }} From cf8f1f420d36ab6b15378370041f7c2f880b9ef8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 2 Oct 2025 09:31:16 +0000 Subject: [PATCH 5/8] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/publish.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index ace53c6a..9dd501ec 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -100,7 +100,7 @@ jobs: container-image: ${{ fromJson(needs.check_for_ngc_images.outputs.images) }} uses: ./.github/workflows/_build_in_container.yml with: - runs-on: ${{ matrix.runs-on }} + runs-on: ${{ matrix.os }} container-image: ${{ matrix.container-image }} release-version: ${{ needs.setup_release.outputs.release-version }} upload-to-release: true From d1d627a30ce0ea0d6bea40d90c75c0c9b7c6a458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 3 Oct 2025 09:19:22 +0000 Subject: [PATCH 6/8] ci: Fix for cuda13 MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- setup.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b0ad57be..3a443f95 100755 --- a/setup.py +++ b/setup.py @@ -293,7 +293,15 @@ def get_wheel_url(): torch_cuda_version = parse(torch.version.cuda) # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.3 # to save CI time. Minor versions should be compatible. - torch_cuda_version = parse("11.8") if torch_cuda_version.major == 11 else parse("12.3") + if torch_cuda_version.major == 11: + torch_cuda_version = parse("11.8") + elif torch_cuda_version.major == 12: + torch_cuda_version = parse("12.3") + elif torch_cuda_version.major == 13: + torch_cuda_version = parse("13.0") + else: + raise ValueError(f"CUDA version {torch_cuda_version} not supported") + cuda_version = f"{torch_cuda_version.major}" gpu_compute_version = hip_ver if HIP_BUILD else cuda_version From 54a4079231c30a1aeb3c8d0b16f786ebc6c16909 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 3 Oct 2025 09:21:44 +0000 Subject: [PATCH 7/8] remove old versions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/publish.yaml | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 9dd501ec..0689b7c9 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -40,12 +40,12 @@ jobs: strategy: fail-fast: false matrix: - # Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the + # Using ubuntu-22.04 instead of 24.04 for more compatibility (glibc). Ideally we'd use the # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. 
os: [ubuntu-22.04] python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] - torch-version: ["2.1.2", "2.2.2", "2.3.1", "2.4.0", "2.5.1", "2.6.0.dev20241001"] - cuda-version: ["11.8.0", "12.3.2"] + torch-version: ["2.4.0", "2.5.1", "2.6.0", "2.7.1"] + cuda-version: ["11.8.0", "12.9.1"] # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not. # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI. # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs) @@ -53,19 +53,8 @@ jobs: cxx11_abi: ["FALSE", "TRUE"] exclude: # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix - # Pytorch < 2.2 does not support Python 3.12 - - torch-version: "2.1.2" - python-version: "3.12" - # Pytorch < 2.5 does not support Python 3.13 - - torch-version: "2.1.2" - python-version: "3.13" - - torch-version: "2.2.2" - python-version: "3.13" - - torch-version: "2.3.1" - python-version: "3.13" - torch-version: "2.4.0" python-version: "3.13" - uses: ./.github/workflows/_build.yml with: runs-on: ${{ matrix.os }} From 10eada186d95e11233351850e89c85fd4f7f8c78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 3 Oct 2025 09:41:48 +0000 Subject: [PATCH 8/8] PIP_CONSTRAINT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/_build_in_container.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/workflows/_build_in_container.yml b/.github/workflows/_build_in_container.yml index 675fd02c..029eea94 100644 --- a/.github/workflows/_build_in_container.yml +++ b/.github/workflows/_build_in_container.yml @@ -39,11 +39,11 @@ jobs: root-reserve-mb: 5120 temp-reserve-mb: 32 swap-size-mb: 10240 - remove-dotnet: 'true' - remove-android: 'true' - remove-haskell: 'true' - remove-codeql: 'true' - build-mount-path: 
'/var/lib/docker/' + remove-dotnet: "true" + remove-android: "true" + remove-haskell: "true" + remove-codeql: "true" + build-mount-path: "/var/lib/docker/" - name: Restore /var/lib/docker/ run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker" @@ -63,9 +63,9 @@ jobs: run: | echo "Free space:" df -h - + - name: Pull the container - run: docker pull ${{ inputs.container-image }} + run: docker pull ${{ inputs.container-image }} - name: Set CUDA and PyTorch versions run: | @@ -94,7 +94,7 @@ jobs: - name: Build wheel id: build_wheel - env: + env: CXX11_ABI: ${{ env.CXX11_ABI }} MATRIX_TORCH_VERSION: ${{ env.MATRIX_TORCH_VERSION}} WHEEL_CUDA_VERSION: ${{ env.WHEEL_CUDA_VERSION }} @@ -106,6 +106,7 @@ jobs: --workdir /workspace \ --volume $(pwd):/workspace \ --volume $GITHUB_ENV:$GITHUB_ENV \ + -e PIP_CONSTRAINT= \ -e GITHUB_ENV=$GITHUB_ENV \ -e CXX11_ABI=$CXX11_ABI \ -e MATRIX_TORCH_VERSION=$MATRIX_TORCH_VERSION \