From dc5cd6ee643b8b5411d3ff2e55deb4ebacf95e2a Mon Sep 17 00:00:00 2001
From: Jerry Zhang
Date: Thu, 25 Sep 2025 18:10:39 -0700
Subject: [PATCH] Don't use_cache for lm_eval by default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
The use_cache option in lm_eval will read the results for a (model_id, task) pair if it has already been evaluated. During development, however, we sometimes update the model and need to re-evaluate it, so we have to disable the cache to get new eval results.

This PR changes eval_quality.sh to not use the cache by default; users can still enable it by explicitly passing `--use_cache`.

Don't use cache:
sh eval.sh --eval_type quality --model_ids "$QMODEL_PREFIX-AWQ-INT4"

Use cache:
sh eval.sh --eval_type quality --model_ids "$QMODEL_PREFIX-AWQ-INT4" --use_cache

Test Plan:
```
sh eval.sh --eval_type quality --model_ids "$QMODEL_PREFIX-AWQ-INT4" --use_cache
 Logs in /home/jerryzh/local/ao/.github/scripts/torchao_model_releases/jerryzh168_gemma-3-12b-it-AWQ-INT4_quality_mmlu.log: MLoading checkpoint shards: 0%| | 0/2 [00:00 ... [--eval_type ] [--batch_sizes ] [--tasks ]" + echo "Usage: $0 --model_ids ... [--eval_type ] [--batch_sizes ] [--tasks ] [--use_cache]" echo "Defaults:" echo " batch_sizes: 1 256" echo " tasks: mmlu" @@ -20,6 +20,7 @@ EVAL_TYPE="all" # these will be parsed in the other scripts BATCH_SIZES="1 256" # Default for latency eval TASKS="mmlu" # Default for quality eval +USE_CACHE=false # default: do not use cache # Parse arguments while [[ $# -gt 0 ]]; do case "$1" in @@ -58,6 +59,10 @@ while [[ $# -gt 0 ]]; do TASKS="$1" shift ;; + --use_cache) + USE_CACHE=true + shift + ;; *) echo "Unknown argument: $1" usage @@ -82,7 +87,11 @@ run_latency() { run_quality() { check_lm_eval local model_id="$1" - sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS + if $USE_CACHE; then + sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS --use_cache + else + sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS + fi } for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do case "$EVAL_TYPE" in diff --git a/.github/scripts/torchao_model_releases/eval_quality.sh b/.github/scripts/torchao_model_releases/eval_quality.sh index dd0ab9c2b2..1674aee2d7 100644 --- a/.github/scripts/torchao_model_releases/eval_quality.sh +++ b/.github/scripts/torchao_model_releases/eval_quality.sh @@ -11,6 +11,7 @@ check_lm_eval MODEL_ID_ARRAY=() TASK_ARRAY=("mmlu") # default can be overwritten by user input +USE_CACHE=false # default: do not use cache # Parse arguments while [[ $# -gt 0 ]]; do case "$1" in @@ -29,9 +30,13 @@ while [[ $# -gt 0 ]]; do shift done ;; + --use_cache) + USE_CACHE=true + shift + ;; *) echo "Unknown argument: $1" - echo "Usage: $0 --model_id [--tasks (comma-separated, e.g. mmlu,arc_challenge, default mmlu)]" + echo "Usage: $0 --model_id [--tasks (comma-separated, e.g. 
mmlu,arc_challenge, default mmlu)] [--use_cache]" exit 1 ;; esac @@ -51,16 +56,19 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do EVAL_CACHE_DB_PREFIX="/tmp/${SAFE_MODEL_ID}_quality_${TASK}" mkdir -p "${EVAL_CACHE_DB_PREFIX}" echo "Running model quality (accuracy) evaluation for model $MODEL_ID on task $TASK" - - lm_eval \ + LM_EVAL_CMD="lm_eval \ --model hf \ - --model_args pretrained="$MODEL_ID" \ - --tasks "$TASK" \ + --model_args pretrained=\"$MODEL_ID\" \ + --tasks \"$TASK\" \ --device cuda:0 \ - --use_cache "$EVAL_CACHE_DB_PREFIX" \ --batch_size auto \ - --output_path "$RESULTS_DIR" > "$OUTPUT_FILE" 2>&1 + --output_path \"$RESULTS_DIR\"" + + if $USE_CACHE; then + LM_EVAL_CMD="$LM_EVAL_CMD --use_cache \"$EVAL_CACHE_DB_PREFIX\"" + fi + eval "$LM_EVAL_CMD" > "$OUTPUT_FILE" 2>&1 echo "Quality eval output for task '$TASK' saved to $OUTPUT_FILE" done echo "======================== Eval Model Quality $MODEL_ID End =================="