From dd953929a5d88b13027e8707de09aa6e87053778 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 26 Sep 2024 00:12:29 -0700 Subject: [PATCH 01/11] Fix eval sanity check CI Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- .github/workflows/pull.yml | 55 +++++++++++++++++++------------------- torchchat/usages/eval.py | 4 +-- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 839d3b9b2..237f5b9cf 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -123,6 +123,7 @@ jobs: bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}" - name: Run validation run: | + # @NOCOMMIT Debug python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' pushd ${TORCHCHAT_ROOT} bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} @@ -164,39 +165,37 @@ jobs: bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-float16" test-cpu-eval-sanity-check-float32: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main name: test-cpu-eval-sanity-check-float32 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-cpu strategy: matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }} fail-fast: false - runs-on: ${{ matrix.runner }} - env: - TORCHCHAT_ROOT: ${{ github.workspace }} - REPO_NAME: ${{ matrix.repo_name }} - steps: - - name: Checkout repo - uses: actions/checkout@v3 - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: '3.10.11' - - name: Print machine info - run: | - echo "$(uname -a)" - - name: Install dependencies - run: | - ./install/install_requirements.sh - pip3 list - python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' - - name: Download checkpoints - run: | - bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}" 
- - name: Run validation - run: | - python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' - pushd ${TORCHCHAT_ROOT} - bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} - bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-float32" + with: + runner: linux.4xlarge + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install dependencies" + ./install/install_requirements.sh + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + echo "::endgroup::" + + echo "::group::Download checkpoint" + export REPO_NAME=${{ matrix.repo_name }} + bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }} + echo "::endgroup::" + + echo "::group::Convert checkpoint" + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + echo "::endgroup::" + + echo "::group::Run eval" + bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-float32" + echo "::endgroup::" gather-models-gpu: runs-on: ubuntu-22.04 diff --git a/torchchat/usages/eval.py b/torchchat/usages/eval.py index 5993c3781..9fec815bd 100644 --- a/torchchat/usages/eval.py +++ b/torchchat/usages/eval.py @@ -10,6 +10,8 @@ import torch._dynamo.config import torch._inductor.config +import lm_eval # noqa + from torchchat.cli.builder import ( _initialize_model, _initialize_tokenizer, @@ -28,8 +30,6 @@ torch._inductor.config.triton.cudagraphs = True torch._dynamo.config.cache_size_limit = 100000 -import lm_eval - from lm_eval.evaluator import evaluate from lm_eval.models.huggingface import HFLM as eval_wrapper from lm_eval.tasks import get_task_dict From b6e3db1eb53d1fcdea7af8483f5aaed1057feed4 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 26 Sep 2024 01:19:08 -0700 Subject: [PATCH 02/11] Fixate psutil to 6.0.0 Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- install/install_requirements.sh | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index b483acae4..d3b19d6b7 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -103,5 +103,5 @@ fi ( set -x - $PIP_EXECUTABLE install lm-eval=="0.4.2" + $PIP_EXECUTABLE install lm-eval=="0.4.2" psutil=="6.0.0" ) From 775fd60ac834b7f91e6e84ff954ca7517f090666 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 26 Sep 2024 01:34:31 -0700 Subject: [PATCH 03/11] Pre-import evaluate Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- torchchat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchchat.py b/torchchat.py index 9f85f0692..c3e49e050 100644 --- a/torchchat.py +++ b/torchchat.py @@ -8,6 +8,7 @@ import logging import subprocess import sys +import evaluate # noqa from torchchat.cli.cli import ( add_arguments_for_verb, From 2f6acd27c0bbb3f24a635fb40239e5dc5b0abeed Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 26 Sep 2024 09:23:48 -0700 Subject: [PATCH 04/11] Force install evaluate Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- install/install_requirements.sh | 2 +- torchchat.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index d3b19d6b7..7031c7afa 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -103,5 +103,5 @@ fi ( set -x - $PIP_EXECUTABLE install lm-eval=="0.4.2" psutil=="6.0.0" + $PIP_EXECUTABLE install lm-eval=="0.4.2" evaluate=="0.4.3" psutil=="6.0.0" ) diff --git a/torchchat.py b/torchchat.py index c3e49e050..a84fdac20 100644 --- a/torchchat.py +++ b/torchchat.py @@ -8,7 +8,7 @@ import logging import subprocess import sys -import evaluate # noqa +import lm_eval # noqa from torchchat.cli.cli import ( add_arguments_for_verb, From b9d7bd2793aa7ca798a47d298ed6c89d57f1306a Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 26 Sep 2024 
10:19:44 -0700 Subject: [PATCH 05/11] Revert test-cpu-eval-sanity-check-float32 changes since it breaks AOTI Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- install/install_requirements.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 7031c7afa..afc7e898d 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -103,5 +103,5 @@ fi ( set -x - $PIP_EXECUTABLE install lm-eval=="0.4.2" evaluate=="0.4.3" psutil=="6.0.0" + $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.2" "numpy>=1.17,<2.0" psutil=="6.0.0" ) From 1d6c13dfdbfd32421e1541336d4a2fe7a8a579e8 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 26 Sep 2024 10:20:55 -0700 Subject: [PATCH 06/11] Remove debug log Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- .github/workflows/pull.yml | 54 ++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 237f5b9cf..335b4da5e 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -123,7 +123,6 @@ jobs: bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}" - name: Run validation run: | - # @NOCOMMIT Debug python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' pushd ${TORCHCHAT_ROOT} bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} @@ -171,31 +170,34 @@ jobs: strategy: matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }} fail-fast: false - with: - runner: linux.4xlarge - script: | - echo "::group::Print machine info" - uname -a - echo "::endgroup::" - - echo "::group::Install dependencies" - ./install/install_requirements.sh - pip3 list - python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' - echo "::endgroup::" - - echo "::group::Download checkpoint" - export 
REPO_NAME=${{ matrix.repo_name }} - bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }} - echo "::endgroup::" - - echo "::group::Convert checkpoint" - bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} - echo "::endgroup::" - - echo "::group::Run eval" - bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-float32" - echo "::endgroup::" + runs-on: ${{ matrix.runner }} + env: + TORCHCHAT_ROOT: ${{ github.workspace }} + REPO_NAME: ${{ matrix.repo_name }} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.10.11' + - name: Print machine info + run: | + echo "$(uname -a)" + - name: Install dependencies + run: | + ./install/install_requirements.sh + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + - name: Download checkpoints + run: | + bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}" + - name: Run validation + run: | + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + pushd ${TORCHCHAT_ROOT} + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-float32" gather-models-gpu: runs-on: ubuntu-22.04 From a8c1e1ffb456e188ea5b07e6e10997161c0cea3e Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 26 Sep 2024 10:22:00 -0700 Subject: [PATCH 07/11] Cleanup Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- .github/workflows/pull.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 335b4da5e..839d3b9b2 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -164,7 +164,6 @@ jobs: bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "eval_sanity_check-float16" 
test-cpu-eval-sanity-check-float32: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main name: test-cpu-eval-sanity-check-float32 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-cpu strategy: From 477833286e0e82efce0807f358d2038737ccc195 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 26 Sep 2024 10:27:12 -0700 Subject: [PATCH 08/11] Remove redundant import Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- torchchat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchchat.py b/torchchat.py index a84fdac20..9f85f0692 100644 --- a/torchchat.py +++ b/torchchat.py @@ -8,7 +8,6 @@ import logging import subprocess import sys -import lm_eval # noqa from torchchat.cli.cli import ( add_arguments_for_verb, From 1f7748da0e19d0738204170d7234336b853f5616 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 26 Sep 2024 10:31:23 -0700 Subject: [PATCH 09/11] Ok can't remove import lm_eval. Move it inside eval condition Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- install/install_requirements.sh | 2 +- torchchat/model.py | 1 + torchchat/usages/eval.py | 4 ++-- torchchat/utils/scripts/install_et.sh | 5 +++++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index afc7e898d..fa921cc69 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -103,5 +103,5 @@ fi ( set -x - $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.2" "numpy>=1.17,<2.0" psutil=="6.0.0" + $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.2" psutil=="6.0.0" ) diff --git a/torchchat/model.py b/torchchat/model.py index ab0bc7e21..313f5f45a 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -31,6 +31,7 @@ ) from torch.nn import functional as F +import lm_eval # noqa from torchtune.models.clip import clip_vision_encoder from torchtune.models.llama3_1._component_builders import llama3_1 as llama3_1_builder from 
torchtune.models.llama3_2_vision._component_builders import ( diff --git a/torchchat/usages/eval.py b/torchchat/usages/eval.py index 9fec815bd..5993c3781 100644 --- a/torchchat/usages/eval.py +++ b/torchchat/usages/eval.py @@ -10,8 +10,6 @@ import torch._dynamo.config import torch._inductor.config -import lm_eval # noqa - from torchchat.cli.builder import ( _initialize_model, _initialize_tokenizer, @@ -30,6 +28,8 @@ torch._inductor.config.triton.cudagraphs = True torch._dynamo.config.cache_size_limit = 100000 +import lm_eval + from lm_eval.evaluator import evaluate from lm_eval.models.huggingface import HFLM as eval_wrapper from lm_eval.tasks import get_task_dict diff --git a/torchchat/utils/scripts/install_et.sh b/torchchat/utils/scripts/install_et.sh index 1d8c6e2b2..04db3b287 100755 --- a/torchchat/utils/scripts/install_et.sh +++ b/torchchat/utils/scripts/install_et.sh @@ -20,4 +20,9 @@ find_cmake_prefix_path clone_executorch install_executorch_libs $ENABLE_ET_PYBIND install_executorch_python_libs $ENABLE_ET_PYBIND +# TODO: figure out the root cause of 'AttributeError: module 'evaluate' +# has no attribute 'utils'' error from evaluate CI jobs and remove +# `import lm_eval` from torchchat/model.py since it requires a specific version +# of numpy. 
+pip install numpy=='1.26.4' popd From a602e23ca0efdbcb1507b3172437422beb18fd06 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 26 Sep 2024 12:04:02 -0700 Subject: [PATCH 10/11] Add file hash to cache key Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- .github/workflows/pull.yml | 4 ++-- torchchat/model.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 839d3b9b2..ea4cc60a1 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -904,9 +904,9 @@ jobs: echo "et-git-hash=$(cat ${TORCHCHAT_ROOT}/install/.pins/et-pin.txt)" >> "$GITHUB_ENV" - name: Load or install ET id: install-et - uses: actions/cache@v3 + uses: actions/cache@v4 env: - cache-key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}} + cache-key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('torchchat/utils/scripts/install_et.sh') }} with: path: ./et-build key: ${{env.cache-key}} diff --git a/torchchat/model.py b/torchchat/model.py index 313f5f45a..0f60ea318 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -30,8 +30,9 @@ SequenceParallel, ) from torch.nn import functional as F - -import lm_eval # noqa +# TODO: remove this after we figure out where in torchtune an `evaluate` module +# is being imported, which is being confused with huggingface's `evaluate`. 
+import lm_eval # noqa from torchtune.models.clip import clip_vision_encoder from torchtune.models.llama3_1._component_builders import llama3_1 as llama3_1_builder from torchtune.models.llama3_2_vision._component_builders import ( From 0917c14d8dc969470b968b95d01f2636da3965f6 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 26 Sep 2024 13:20:29 -0700 Subject: [PATCH 11/11] Update cache key for runner-et Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- .github/workflows/pull.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ea4cc60a1..3e92ed9c0 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -905,13 +905,11 @@ jobs: - name: Load or install ET id: install-et uses: actions/cache@v4 - env: - cache-key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('torchchat/utils/scripts/install_et.sh') }} with: - path: ./et-build - key: ${{env.cache-key}} - restore-keys: | - ${{env.cache-key}} + path: | + ./et-build + ./torchchat/utils/scripts + key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }} - if: ${{ steps.install-et.outputs.cache-hit != 'true' }} continue-on-error: true run: |