Merged
11 changes: 9 additions & 2 deletions .github/scripts/torchao_model_releases/README.md
@@ -119,7 +119,7 @@ uv pip install vllm --pre --extra-index-url https://download.pytorch.org/whl/nig

After the environment is set up, we can run the eval:
```
-sh eval.sh --eval_type latency --model_ids Qwen/Qwen3-8B --batch_sizes 1,256
+sh eval.sh --eval_type latency --model_ids Qwen/Qwen3-8B --batch_sizes 1 256
```

#### Model Quality Eval
@@ -129,9 +129,16 @@ uv pip install lm-eval
```
After the environment is set up, we can run the eval:
```
-sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag,mmlu
+sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu
```

+Note: you can pass `--use_cache` if an eval task failed partway through the run
+and you don't want to re-run all evals:
+```
+sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu --use_cache
+```
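A minimal usage sketch (an illustration, not text from this PR): the top-level `eval.sh` also accepts `--use_cache`, so the flag composes with `--eval_type all`, assuming the defaults of `batch_sizes: 1 256` and `tasks: mmlu` listed in `eval.sh`:
```
# Hypothetical invocation: re-run every eval type for one model; per eval.sh,
# only the quality eval reuses its lm_eval cache via --use_cache.
sh eval.sh --eval_type all --model_ids Qwen/Qwen3-8B --use_cache
```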


#### Summarize results
After we have finished all evals for each model, we can summarize the results with:
```
13 changes: 11 additions & 2 deletions .github/scripts/torchao_model_releases/eval.sh
@@ -9,7 +9,7 @@ set -e
source eval_env_checks.sh

usage() {
echo "Usage: $0 --model_ids <model1> <model2> ... [--eval_type <all|memory|latency|quality>] [--batch_sizes <batch_sizes>] [--tasks <tasks>]"
echo "Usage: $0 --model_ids <model1> <model2> ... [--eval_type <all|memory|latency|quality>] [--batch_sizes <batch_sizes>] [--tasks <tasks>] [--use_cache]"
echo "Defaults:"
echo " batch_sizes: 1 256"
echo " tasks: mmlu"
@@ -20,6 +20,7 @@ EVAL_TYPE="all"
# these will be parsed in the other scripts
BATCH_SIZES="1 256" # Default for latency eval
TASKS="mmlu" # Default for quality eval
+USE_CACHE=false # default: do not use cache
# Parse arguments
while [[ $# -gt 0 ]]; do
case "$1" in
@@ -58,6 +59,10 @@ while [[ $# -gt 0 ]]; do
TASKS="$1"
shift
;;
+--use_cache)
+USE_CACHE=true
+shift
+;;
*)
echo "Unknown argument: $1"
usage
@@ -82,7 +87,11 @@ run_latency() {
run_quality() {
check_lm_eval
local model_id="$1"
-sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS
+if $USE_CACHE; then
+sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS --use_cache
+else
+sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS
+fi
}
for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
case "$EVAL_TYPE" in
22 changes: 15 additions & 7 deletions .github/scripts/torchao_model_releases/eval_quality.sh
@@ -11,6 +11,7 @@ check_lm_eval

MODEL_ID_ARRAY=()
TASK_ARRAY=("mmlu") # default can be overwritten by user input
+USE_CACHE=false # default: do not use cache
# Parse arguments
while [[ $# -gt 0 ]]; do
case "$1" in
@@ -29,9 +30,13 @@ while [[ $# -gt 0 ]]; do
shift
done
;;
+--use_cache)
+USE_CACHE=true
+shift
+;;
*)
echo "Unknown argument: $1"
echo "Usage: $0 --model_id <model_id> [--tasks <tasks> (comma-separated, e.g. mmlu,arc_challenge, default mmlu)]"
echo "Usage: $0 --model_id <model_id> [--tasks <tasks> (comma-separated, e.g. mmlu,arc_challenge, default mmlu)] [--use_cache]"
exit 1
;;
esac
@@ -51,16 +56,19 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
EVAL_CACHE_DB_PREFIX="/tmp/${SAFE_MODEL_ID}_quality_${TASK}"
mkdir -p "${EVAL_CACHE_DB_PREFIX}"
echo "Running model quality (accuracy) evaluation for model $MODEL_ID on task $TASK"

-lm_eval \
+LM_EVAL_CMD="lm_eval \
--model hf \
---model_args pretrained="$MODEL_ID" \
---tasks "$TASK" \
+--model_args pretrained=\"$MODEL_ID\" \
+--tasks \"$TASK\" \
--device cuda:0 \
--use_cache "$EVAL_CACHE_DB_PREFIX" \
--batch_size auto \
--output_path "$RESULTS_DIR" > "$OUTPUT_FILE" 2>&1
--output_path \"$RESULTS_DIR\""

+if $USE_CACHE; then
+LM_EVAL_CMD="$LM_EVAL_CMD --use_cache \"$EVAL_CACHE_DB_PREFIX\""
+fi

+eval "$LM_EVAL_CMD" > "$OUTPUT_FILE" 2>&1
echo "Quality eval output for task '$TASK' saved to $OUTPUT_FILE"
done
echo "======================== Eval Model Quality $MODEL_ID End =================="