From 661c3de2a71f397b565f2743ca9cdd4b8cee33b7 Mon Sep 17 00:00:00 2001 From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Date: Thu, 23 May 2024 15:49:49 -0500 Subject: [PATCH] [release/2.3] Fix miopenStatusInternalError caused in new ROCm6.0 CI docker images (#126942) * Update setting of journal_mode to delete * Revert "[Release only] Pin rocm docker images (#126452)" This reverts commit ee68b41571287aaecf4216f752fb592496fea49e. * Replace tabs with spaces for lint --- .ci/docker/common/install_rocm.sh | 12 ++++++++++-- .github/workflows/_linux-build.yml | 27 ++++----------------------- .github/workflows/inductor.yml | 1 - .github/workflows/periodic.yml | 1 - .github/workflows/pull.yml | 1 - .github/workflows/rocm.yml | 1 - .github/workflows/slow.yml | 1 - .github/workflows/trunk.yml | 1 - 8 files changed, 14 insertions(+), 31 deletions(-) diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 1c56918ac9b67..d098922e4ed00 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -84,7 +84,11 @@ install_ubuntu() { if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then for kdb in /opt/rocm/share/miopen/db/*.kdb do - sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;" + # journal_mode=delete seems to work on some kdbs that have "wal" as initial journal_mode + sqlite3 $kdb "PRAGMA journal_mode=delete; PRAGMA VACUUM;" + JOURNAL_MODE=$(sqlite3 $kdb "PRAGMA journal_mode;") + # Both "delete" and "off" work in cases where user doesn't have write permissions to directory where kdbs are installed + if [[ $JOURNAL_MODE != "delete" ]] && [[ $JOURNAL_MODE != "off" ]]; then echo "kdb journal_mode change failed" && exit 1; fi done fi @@ -163,7 +167,11 @@ install_centos() { if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then for kdb in /opt/rocm/share/miopen/db/*.kdb do - sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;" + # journal_mode=delete seems to work on some kdbs that have "wal" as initial 
journal_mode + sqlite3 $kdb "PRAGMA journal_mode=delete; PRAGMA VACUUM;" + JOURNAL_MODE=$(sqlite3 $kdb "PRAGMA journal_mode;") + # Both "delete" and "off" work in cases where user doesn't have write permissions to directory where kdbs are installed + if [[ $JOURNAL_MODE != "delete" ]] && [[ $JOURNAL_MODE != "off" ]]; then echo "kdb journal_mode change failed" && exit 1; fi done fi diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index 9bd7a6dfd1c7f..6af95927ae1dd 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -11,11 +11,6 @@ on: required: true type: string description: Name of the base docker image to build with. - docker-image-tag: - required: false - type: string - description: Name of the base docker image tag - default: "" build-generates-artifacts: required: false type: boolean @@ -74,7 +69,7 @@ jobs: runs-on: ${{ inputs.runner }} timeout-minutes: 240 outputs: - docker-image: ${{ steps.calculate-docker.outputs.docker-image }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Setup SSH (Click me for login details) @@ -98,24 +93,10 @@ jobs: with: docker-image-name: ${{ inputs.docker-image-name }} - - name: Override docker image tag if pinned - id: calculate-docker - env: - ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - shell: bash - run: | - export NEW_TAG=${{ inputs.docker-image-tag }} - if [[ ${NEW_TAG} != '' ]]; then - IMAGE=${ECR_DOCKER_IMAGE%:*} - echo "docker-image=${IMAGE}:${NEW_TAG}" >> "${GITHUB_OUTPUT}" - else - echo "docker-image=${ECR_DOCKER_IMAGE}" >> "${GITHUB_OUTPUT}" - fi - - name: Use following to pull public copy of the image id: print-ghcr-mirror env: - ECR_DOCKER_IMAGE: ${{ steps.calculate-docker.outputs.docker-image }} + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} shell: bash run: | tag=${ECR_DOCKER_IMAGE##*/} @@ 
-124,7 +105,7 @@ jobs: - name: Pull docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.3 with: - docker-image: ${{ steps.calculate-docker.outputs.docker-image }} + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Parse ref id: parse-ref @@ -168,7 +149,7 @@ jobs: XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }} - DOCKER_IMAGE: ${{ steps.calculate-docker.outputs.docker-image }} + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} DEBUG: ${{ inputs.build-with-debug && '1' || '0' }} OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index af08fc3e9947e..1934d7fd86428 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -22,7 +22,6 @@ jobs: with: build-environment: linux-focal-rocm6.0-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 - docker-image-tag: cea4be730564c18dd285a12828c7c449490b10b9 test-matrix: | { include: [ { config: "inductor", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.2" }, diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index b6a4eab8762cb..99f4dd99395fc 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -215,7 +215,6 @@ jobs: with: build-environment: linux-focal-rocm6.0-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 - docker-image-tag: cea4be730564c18dd285a12828c7c449490b10b9 test-matrix: | { include: [ { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" }, diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index a604b095158c3..887902bb39ee5 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -409,7 +409,6 
@@ jobs: with: build-environment: linux-focal-rocm6.0-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 - docker-image-tag: cea4be730564c18dd285a12828c7c449490b10b9 sync-tag: rocm-build test-matrix: | { include: [ diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 1d3b5a53569e9..24542c3ddc47a 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -31,7 +31,6 @@ jobs: with: build-environment: linux-focal-rocm6.0-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 - docker-image-tag: cea4be730564c18dd285a12828c7c449490b10b9 sync-tag: rocm-build test-matrix: | { include: [ diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 04858e89b3759..33577986f643c 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -107,7 +107,6 @@ jobs: with: build-environment: linux-focal-rocm6.0-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 - docker-image-tag: cea4be730564c18dd285a12828c7c449490b10b9 test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" }, diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 8f5596700a965..c0538a8600d9e 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -196,7 +196,6 @@ jobs: with: build-environment: linux-focal-rocm6.0-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 - docker-image-tag: cea4be730564c18dd285a12828c7c449490b10b9 sync-tag: rocm-build test-matrix: | { include: [