From b29d77b54f5cee9e786f8a72329837043f36a349 Mon Sep 17 00:00:00 2001 From: atalman Date: Tue, 7 May 2024 11:50:54 +0000 Subject: [PATCH] Separate arm64 and amd64 docker builds (#125617) Fixes https://github.com/pytorch/pytorch/issues/125094 Please note: Docker CUDA 12.4 failure is an existing issue, related to the docker image not being available on gitlab: ``` docker.io/nvidia/cuda:12.4.0-cudnn8-devel-ubuntu22.04: docker.io/nvidia/cuda:12.4.0-cudnn8-devel-ubuntu22.04: not found ``` https://github.com/pytorch/pytorch/actions/runs/8974959068/job/24648540236?pr=125617 Here is the reference issue: https://gitlab.com/nvidia/container-images/cuda/-/issues/225 Tracked on our side: https://github.com/pytorch/builder/issues/1811 Pull Request resolved: https://github.com/pytorch/pytorch/pull/125617 Approved by: https://github.com/huydhn, https://github.com/malfet --- .../scripts/generate_docker_release_matrix.py | 14 ++++++++++++- .github/workflows/docker-release.yml | 21 ++++++++++++++----- docker.Makefile | 18 ++++++++++++++++ 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/.github/scripts/generate_docker_release_matrix.py b/.github/scripts/generate_docker_release_matrix.py index 68662b191ed10..49d567ceadf8d 100644 --- a/.github/scripts/generate_docker_release_matrix.py +++ b/.github/scripts/generate_docker_release_matrix.py @@ -21,6 +21,8 @@ def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]: ret: List[Dict[str, str]] = [] + # CUDA amd64 Docker images are available as both runtime and devel while + # CPU arm64 image is only available as runtime.
for cuda, version in generate_binary_build_matrix.CUDA_ARCHES_FULL_VERSION.items(): for image in DOCKER_IMAGE_TYPES: ret.append( @@ -31,9 +33,19 @@ def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]: cuda ], "image_type": image, - "platform": "linux/arm64,linux/amd64", + "platform": "linux/amd64", } ) + ret.append( + { + "cuda": "cpu", + "cuda_full_version": "", + "cudnn_version": "", + "image_type": "runtime", + "platform": "linux/arm64", + } + ) + return {"include": ret} diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index 1d566cb21099d..4ece88d5e47da 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -7,6 +7,7 @@ on: - Dockerfile - docker.Makefile - .github/workflows/docker-release.yml + - .github/scripts/generate_docker_release_matrix.py push: branches: - nightly @@ -129,17 +130,27 @@ jobs: if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' }} run: | PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime" + CUDA_SUFFIX="-cu${CUDA_VERSION}" + if [[ ${CUDA_VERSION_SHORT} == "cpu" ]]; then + PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-runtime" + CUDA_SUFFIX="" + fi PYTORCH_NIGHTLY_COMMIT=$(docker run ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \ python -c 'import torch; print(torch.version.git_version[:7],end="")') docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \ - ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" - docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" + ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" + + docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" + + # Please note, here we need to pin the specific version of CUDA that gets the latest label + if [[ ${CUDA_VERSION_SHORT} == "12.1" ]]; then + docker tag
ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" \ + ghcr.io/pytorch/pytorch-nightly:latest + docker push ghcr.io/pytorch/pytorch-nightly:latest + fi - docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" \ - ghcr.io/pytorch/pytorch-nightly:latest - docker push ghcr.io/pytorch/pytorch-nightly:latest - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main if: always() diff --git a/docker.Makefile b/docker.Makefile index 3975091c2de03..62ba7168d9ba9 100644 --- a/docker.Makefile +++ b/docker.Makefile @@ -83,6 +83,22 @@ devel-push: DOCKER_TAG := $(PYTORCH_VERSION)-cuda$(CUDA_VERSION_SHORT)-cudnn$(CU devel-push: $(DOCKER_PUSH) +ifeq ("$(CUDA_VERSION_SHORT)","cpu") + +.PHONY: runtime-image +runtime-image: BASE_IMAGE := $(BASE_RUNTIME) +runtime-image: DOCKER_TAG := $(PYTORCH_VERSION)-runtime +runtime-image: + $(DOCKER_BUILD) + +.PHONY: runtime-push +runtime-push: BASE_IMAGE := $(BASE_RUNTIME) +runtime-push: DOCKER_TAG := $(PYTORCH_VERSION)-runtime +runtime-push: + $(DOCKER_PUSH) + +else + .PHONY: runtime-image runtime-image: BASE_IMAGE := $(BASE_RUNTIME) runtime-image: DOCKER_TAG := $(PYTORCH_VERSION)-cuda$(CUDA_VERSION_SHORT)-cudnn$(CUDNN_VERSION)-runtime @@ -95,6 +111,8 @@ runtime-push: DOCKER_TAG := $(PYTORCH_VERSION)-cuda$(CUDA_VERSION_SHORT)-cudnn$( runtime-push: $(DOCKER_PUSH) +endif + .PHONY: clean clean: -docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME))