From fe5eadd4a01dc359f76128b06a5b45f653209766 Mon Sep 17 00:00:00 2001
From: Lunwen He
Date: Thu, 17 Oct 2024 14:41:32 -0700
Subject: [PATCH] Update

[ghstack-poisoned]
---
Reviewer notes (this region is ignored when the patch is applied):

* Adds .ci/scripts/test_eval_llama_mmlu.sh, which downloads the stories110M
  checkpoint and tokenizer, writes params.json, runs
  examples.models.llama.eval_llama with `--tasks mmlu -f 5 --limit 5`, and
  passes only if result.txt starts with `mmlu: {` and contains `acc`.
* Adds a `test-eval_llama-mmlu-linux` job to .github/workflows/pull.yml that
  sets up the environment and runs that script.
* In eval_llama(), sets datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
  when "mmlu" is in args.tasks, before simple_evaluate() is called.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Indentation below was reconstructed from the
shell/YAML/Python structure; confirm the context lines of the pull.yml and
eval_llama_lib.py hunks against the target tree (or apply with
`git apply --ignore-whitespace`). The author e-mail address is also missing
from the From: header and must be restored before using `git am`.

 .ci/scripts/test_eval_llama_mmlu.sh     | 64 +++++++++++++++++++++++++
 .github/workflows/pull.yml              | 27 +++++++++++
 examples/models/llama/eval_llama_lib.py |  7 +++
 3 files changed, 98 insertions(+)
 create mode 100644 .ci/scripts/test_eval_llama_mmlu.sh

diff --git a/.ci/scripts/test_eval_llama_mmlu.sh b/.ci/scripts/test_eval_llama_mmlu.sh
new file mode 100644
index 00000000000..c3c0a3d1a69
--- /dev/null
+++ b/.ci/scripts/test_eval_llama_mmlu.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+# Download and prepare stories model artifacts
+prepare_model_artifacts() {
+  echo "Preparing stories model artifacts"
+  wget -O stories110M.pt "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
+  wget -O tokenizer.model "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
+  echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json
+}
+
+run_and_verify() {
+  NOW=$(date +"%H:%M:%S")
+  echo "Starting to run eval_llama at ${NOW}"
+  if [[ ! -f "stories110M.pt" ]]; then
+    echo "stories110M.pt is missing."
+    exit 1
+  fi
+  if [[ ! -f "tokenizer.model" ]]; then
+    echo "tokenizer.model is missing."
+    exit 1
+  fi
+  if [[ ! -f "params.json" ]]; then
+    echo "params.json is missing."
+    exit 1
+  fi
+  $PYTHON_EXECUTABLE -m examples.models.llama.eval_llama \
+    -c stories110M.pt \
+    -p params.json \
+    -t tokenizer.model \
+    -kv \
+    -d fp32 \
+    --tasks mmlu \
+    -f 5 \
+    --max_seq_length 2048 \
+    --limit 5 > result.txt
+
+  # Verify result.txt
+  RESULT=$(cat result.txt)
+  EXPECTED_TASK="mmlu"
+  EXPECTED_RESULT="acc"
+  if [[ "${RESULT}" == "${EXPECTED_TASK}: {"*"${EXPECTED_RESULT}"* ]]; then
+    echo "Actual result: ${RESULT}"
+    echo "Success"
+    exit 0
+  else
+    echo "Actual result: ${RESULT}"
+    echo "Failure; results not the same"
+    exit 1
+  fi
+}
+
+prepare_model_artifacts
+run_and_verify
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index ae5d5d7da5d..6ea94f3c5d2 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -474,3 +474,30 @@ jobs:
 
         # run eval_llama wikitext task
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_wikitext.sh
+
+  test-eval_llama-mmlu-linux:
+    name: test-eval_llama-mmlu-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    strategy:
+      fail-fast: false
+    with:
+      runner: linux.24xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
+
+        # install pybind
+        bash install_requirements.sh --pybind xnnpack
+
+        # install llama requirements
+        bash examples/models/llama/install_requirements.sh
+
+        # run eval_llama mmlu task
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_eval_llama_mmlu.sh
diff --git a/examples/models/llama/eval_llama_lib.py b/examples/models/llama/eval_llama_lib.py
index e95e6998d9b..285d2f874df 100644
--- a/examples/models/llama/eval_llama_lib.py
+++ b/examples/models/llama/eval_llama_lib.py
@@ -291,6 +291,13 @@ def eval_llama(
     # Generate the eval wrapper
     eval_wrapper = gen_eval_wrapper(model_name, args)
 
+    # Needed for loading mmlu dataset.
+    # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1998/files
+    if args.tasks and "mmlu" in args.tasks:
+        import datasets
+
+        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
+
     # Evaluate the model
     with torch.no_grad():
         eval_results = simple_evaluate(