From dc5cd6ee643b8b5411d3ff2e55deb4ebacf95e2a Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Thu, 25 Sep 2025 18:10:39 -0700
Subject: [PATCH] Don't use_cache for lm_eval by default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
The use_cache option in lm_eval reuses the cached results for a (model_id, task) pair that has already been evaluated,
but during development we sometimes update the model and need to re-evaluate it, so the cache has to be disabled
to get new eval results.

This PR changes eval_quality.sh to not use the cache by default; users can still enable it by explicitly passing
`--use_cache`.

don't use cache:
sh eval.sh --eval_type quality --model_ids "$QMODEL_PREFIX-AWQ-INT4"

use cache:
sh eval.sh --eval_type quality --model_ids "$QMODEL_PREFIX-AWQ-INT4" --use_cache

Test Plan:
```
sh eval.sh --eval_type quality --model_ids "$QMODEL_PREFIX-AWQ-INT4" --use_cache 
Logs in /home/jerryzh/local/ao/.github/scripts/torchao_model_releases/jerryzh168_gemma-3-12b-it-AWQ-INT4_quality_mmlu.log:
MLoading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]^MLoading checkpoint shards:  50%|\█████     | 1/2 [00:01<00:01,  1.34s/it]^MLoading checkpoint shards: 100%|██████████| 2/2 [00:02<00:\00,  1.41s/it]^MLoading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.40s/it]
INFO:lm_eval.models.huggingface:Model type is 'gemma3', part of the Gemma family--a BOS token will b\e used as Gemma underperforms without it.
INFO:lm_eval.evaluator:Using cache at /tmp/jerryzh168_gemma-3-12b-it-AWQ-INT4_quality_mmlu_rank0.db

sh eval.sh --eval_type quality --model_ids "$QMODEL_PREFIX-AWQ-INT4"
Logs in /home/jerryzh/local/ao/.github/scripts/torchao_model_releases/jerryzh168_gemma-3-12b-it-AWQ-INT4_quality_mmlu.log:
^MLoading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]^MLoading checkpoint shards:  50%|\█████     | 1/2 [00:00<00:00,  1.04it/s]^MLoading checkpoint shards: 100%|██████████| 2/2 [00:01<00:\00,  1.23it/s]^MLoading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.20it/s]
INFO:lm_eval.models.huggingface:Model type is 'gemma3', part of the Gemma family--a BOS token will b\e used as Gemma underperforms without it.
INFO:lm_eval.api.task:Building contexts for mmlu_abstract_algebra on rank 0...
^M  0%|          | 0/100 [00:00<?, ?it/s]^M100%|██████████| 100/100 [00:00<00:00, 1051.54it/s]
INFO:lm_eval.api.task:Building contexts for mmlu_anatomy on rank 0...
^M  0%|          | 0/135 [00:00<?, ?it/s]^M 79%|███████▉  | 107/135 [00:00<00:00, 1067.20it/s]^M100%\|██████████| 135/135 [00:00<00:00, 1066.27it/s]
INFO:lm_eval.api.task:Building contexts for mmlu_astronomy on rank 0...
^M  0%|          | 0/152 [00:00<?, ?it/s]^M 70%|██████▉   | 106/152 [00:00<00:00, 1056.01it/s]^M100%\|██████████| 152/152 [00:00<00:00, 1054.96it/s]
INFO:lm_eval.api.task:Building contexts for mmlu_college_biology on rank 0...
^M  0%|          | 0/144 [00:00<?, ?it/s]^M 63%|██████▎   | 91/144 [00:00<00:00, 903.99it/s]^M100%|█\█████████| 144/144 [00:00<00:00, 938.81it/s]
INFO:lm_eval.api.task:Building contexts for mmlu_college_chemistry on rank 0...
^M  0%|          | 0/100 [00:00<?, ?it/s]^M100%|██████████| 100/100 [00:00<00:00, 1081.13it/s]
INFO:lm_eval.api.task:Building contexts for mmlu_college_computer_science on rank 0...
^M  0%|          | 0/100 [00:00<?, ?it/s]^M100%|██████████| 100/100 [00:00<00:00, 1109.74it/s]
INFO:lm_eval.api.task:Building contexts for mmlu_college_mathematics on rank 0...
^M  0%|          | 0/100 [00:00<?, ?it/s]^M100%|██████████| 100/100 [00:00<00:00, 1111.38it/s]
INFO:lm_eval.api.task:Building contexts for mmlu_college_physics on rank 0...
```

Reviewers:

Subscribers:

Tasks:

Tags:
---
 .../scripts/torchao_model_releases/README.md  | 11 ++++++++--
 .../scripts/torchao_model_releases/eval.sh    | 13 +++++++++--
 .../torchao_model_releases/eval_quality.sh    | 22 +++++++++++++------
 3 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/.github/scripts/torchao_model_releases/README.md b/.github/scripts/torchao_model_releases/README.md
index f4609fc7ee..4ff1f96b14 100644
--- a/.github/scripts/torchao_model_releases/README.md
+++ b/.github/scripts/torchao_model_releases/README.md
@@ -119,7 +119,7 @@ uv pip install vllm --pre --extra-index-url https://download.pytorch.org/whl/nig
 
 After environment is setup, we can run eval:
 ```
-sh eval.sh --eval_type latency --model_ids Qwen/Qwen3-8B --batch_sizes 1,256
+sh eval.sh --eval_type latency --model_ids Qwen/Qwen3-8B --batch_sizes 1 256
 ```
 
 #### Model Quality Eval
@@ -129,9 +129,16 @@ uv pip install lm-eval
 ```
 After environment is setup, we can run eval:
 ```
-sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag,mmlu
+sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu
 ```
 
+Note: you can pass `--use_cache` to reuse cached results if an eval run failed partway through
+and you don't want to re-run all evals.
+```
+sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu --use_cache
+```
+
+
 #### Summarize results
 After we have finished all evals for each model, we can summarize the results with:
 ```
diff --git a/.github/scripts/torchao_model_releases/eval.sh b/.github/scripts/torchao_model_releases/eval.sh
index f284b2a0c3..bf1bd25e59 100644
--- a/.github/scripts/torchao_model_releases/eval.sh
+++ b/.github/scripts/torchao_model_releases/eval.sh
@@ -9,7 +9,7 @@ set -e
 source eval_env_checks.sh
 
 usage() {
-  echo "Usage: $0 --model_ids <model1> <model2> ... [--eval_type <all|memory|latency|quality>] [--batch_sizes <batch_sizes>] [--tasks <tasks>]"
+  echo "Usage: $0 --model_ids <model1> <model2> ... [--eval_type <all|memory|latency|quality>] [--batch_sizes <batch_sizes>] [--tasks <tasks>] [--use_cache]"
   echo "Defaults:"
   echo "  batch_sizes: 1 256"
   echo "  tasks: mmlu"
@@ -20,6 +20,7 @@ EVAL_TYPE="all"
 # these will be parsed in the other scripts
 BATCH_SIZES="1 256"    # Default for latency eval
 TASKS="mmlu"           # Default for quality eval
+USE_CACHE=false      # default: do not use cache
 # Parse arguments
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -58,6 +59,10 @@ while [[ $# -gt 0 ]]; do
       TASKS="$1"
       shift
       ;;
+    --use_cache)
+      USE_CACHE=true
+      shift
+      ;;
     *)
       echo "Unknown argument: $1"
       usage
@@ -82,7 +87,11 @@ run_latency() {
 run_quality() {
   check_lm_eval
   local model_id="$1"
-  sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS
+  if $USE_CACHE; then
+    sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS --use_cache
+  else
+    sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS
+  fi
 }
 for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
   case "$EVAL_TYPE" in
diff --git a/.github/scripts/torchao_model_releases/eval_quality.sh b/.github/scripts/torchao_model_releases/eval_quality.sh
index dd0ab9c2b2..1674aee2d7 100644
--- a/.github/scripts/torchao_model_releases/eval_quality.sh
+++ b/.github/scripts/torchao_model_releases/eval_quality.sh
@@ -11,6 +11,7 @@ check_lm_eval
 
 MODEL_ID_ARRAY=()
 TASK_ARRAY=("mmlu")  # default can be overwritten by user input
+USE_CACHE=false      # default: do not use cache
 # Parse arguments
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -29,9 +30,13 @@ while [[ $# -gt 0 ]]; do
         shift
       done
       ;;
+    --use_cache)
+      USE_CACHE=true
+      shift
+      ;;
     *)
       echo "Unknown argument: $1"
-      echo "Usage: $0 --model_id <model_id> [--tasks <tasks> (comma-separated, e.g. mmlu,arc_challenge, default mmlu)]"
+      echo "Usage: $0 --model_ids <model1> <model2> ... [--tasks <task1> <task2> ... (space-separated, e.g. mmlu arc_challenge, default mmlu)] [--use_cache]"
       exit 1
       ;;
   esac
@@ -51,16 +56,19 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
         EVAL_CACHE_DB_PREFIX="/tmp/${SAFE_MODEL_ID}_quality_${TASK}"
         mkdir -p "${EVAL_CACHE_DB_PREFIX}"
         echo "Running model quality (accuracy) evaluation for model $MODEL_ID on task $TASK"
-
-        lm_eval \
+        LM_EVAL_ARGS=(  # argv array: each argument keeps its quoting, so no eval or hand-escaping is needed
             --model hf \
-            --model_args pretrained="$MODEL_ID" \
-            --tasks "$TASK" \
+            --model_args "pretrained=${MODEL_ID}" \
+            --tasks "${TASK}" \
             --device cuda:0 \
-            --use_cache "$EVAL_CACHE_DB_PREFIX" \
             --batch_size auto \
-            --output_path "$RESULTS_DIR" > "$OUTPUT_FILE" 2>&1
+            --output_path "$RESULTS_DIR"
+        )
+        if $USE_CACHE; then  # caching is opt-in so an updated model is re-evaluated by default
+            LM_EVAL_ARGS+=(--use_cache "$EVAL_CACHE_DB_PREFIX")
+        fi
 
+        lm_eval "${LM_EVAL_ARGS[@]}" > "$OUTPUT_FILE" 2>&1
         echo "Quality eval output for task '$TASK' saved to $OUTPUT_FILE"
     done
     echo "======================== Eval Model Quality $MODEL_ID End =================="