diff --git a/.github/scripts/torchao_model_releases/README.md b/.github/scripts/torchao_model_releases/README.md
index f4609fc7ee..4ff1f96b14 100644
--- a/.github/scripts/torchao_model_releases/README.md
+++ b/.github/scripts/torchao_model_releases/README.md
@@ -119,7 +119,7 @@ uv pip install vllm --pre --extra-index-url https://download.pytorch.org/whl/nig
 
 After environment is setup, we can run eval:
 ```
-sh eval.sh --eval_type latency --model_ids Qwen/Qwen3-8B --batch_sizes 1,256
+sh eval.sh --eval_type latency --model_ids Qwen/Qwen3-8B --batch_sizes 1 256
 ```
 
 #### Model Quality Eval
@@ -129,9 +129,16 @@ uv pip install lm-eval
 ```
 
 After environment is setup, we can run eval:
 ```
-sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag,mmlu
+sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu
 ```
+Note: you can pass `--use_cache` if an eval task failed partway through a run
+and you don't want to re-run the evals that already completed.
+```
+sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu --use_cache
+```
+
+
 #### Summarize results
 After we have finished all evals for each model, we can summarize the results with:
 ```
diff --git a/.github/scripts/torchao_model_releases/eval.sh b/.github/scripts/torchao_model_releases/eval.sh
index f284b2a0c3..bf1bd25e59 100644
--- a/.github/scripts/torchao_model_releases/eval.sh
+++ b/.github/scripts/torchao_model_releases/eval.sh
@@ -9,7 +9,7 @@ set -e
 source eval_env_checks.sh
 
 usage() {
-  echo "Usage: $0 --model_ids ... [--eval_type <eval_type>] [--batch_sizes <batch_sizes>] [--tasks <tasks>]"
+  echo "Usage: $0 --model_ids ... [--eval_type <eval_type>] [--batch_sizes <batch_sizes>] [--tasks <tasks>] [--use_cache]"
   echo "Defaults:"
   echo "  batch_sizes: 1 256"
   echo "  tasks: mmlu"
@@ -20,6 +20,7 @@ EVAL_TYPE="all"
 # these will be parsed in the other scripts
 BATCH_SIZES="1 256" # Default for latency eval
 TASKS="mmlu" # Default for quality eval
+USE_CACHE=false # default: do not use cache
 # Parse arguments
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -58,6 +59,10 @@
       TASKS="$1"
       shift
       ;;
+    --use_cache)
+      USE_CACHE=true
+      shift
+      ;;
     *)
       echo "Unknown argument: $1"
       usage
@@ -82,7 +87,11 @@ run_latency() {
 run_quality() {
   check_lm_eval
   local model_id="$1"
-  sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS
+  if $USE_CACHE; then
+    sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS --use_cache
+  else
+    sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS
+  fi
 }
 for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
   case "$EVAL_TYPE" in
diff --git a/.github/scripts/torchao_model_releases/eval_quality.sh b/.github/scripts/torchao_model_releases/eval_quality.sh
index dd0ab9c2b2..1674aee2d7 100644
--- a/.github/scripts/torchao_model_releases/eval_quality.sh
+++ b/.github/scripts/torchao_model_releases/eval_quality.sh
@@ -11,6 +11,7 @@ check_lm_eval
 
 MODEL_ID_ARRAY=()
 TASK_ARRAY=("mmlu") # default can be overwritten by user input
+USE_CACHE=false # default: do not use cache
 # Parse arguments
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -29,9 +30,13 @@
         shift
       done
       ;;
+    --use_cache)
+      USE_CACHE=true
+      shift
+      ;;
     *)
       echo "Unknown argument: $1"
-      echo "Usage: $0 --model_id [--tasks (comma-separated, e.g. mmlu,arc_challenge, default mmlu)]"
+      echo "Usage: $0 --model_ids ... [--tasks ... (space-separated, e.g. mmlu arc_challenge, default mmlu)] [--use_cache]"
       exit 1
       ;;
   esac
@@ -51,16 +56,19 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
 
     EVAL_CACHE_DB_PREFIX="/tmp/${SAFE_MODEL_ID}_quality_${TASK}"
     mkdir -p "${EVAL_CACHE_DB_PREFIX}"
    echo "Running model quality (accuracy) evaluation for model $MODEL_ID on task $TASK"
-
-    lm_eval \
+    LM_EVAL_CMD="lm_eval \
       --model hf \
-      --model_args pretrained="$MODEL_ID" \
-      --tasks "$TASK" \
+      --model_args pretrained=\"$MODEL_ID\" \
+      --tasks \"$TASK\" \
       --device cuda:0 \
-      --use_cache "$EVAL_CACHE_DB_PREFIX" \
       --batch_size auto \
-      --output_path "$RESULTS_DIR" > "$OUTPUT_FILE" 2>&1
+      --output_path \"$RESULTS_DIR\""
+
+    if $USE_CACHE; then
+      LM_EVAL_CMD="$LM_EVAL_CMD --use_cache \"$EVAL_CACHE_DB_PREFIX\""
+    fi
+    eval "$LM_EVAL_CMD" > "$OUTPUT_FILE" 2>&1
     echo "Quality eval output for task '$TASK' saved to $OUTPUT_FILE"
   done
   echo "======================== Eval Model Quality $MODEL_ID End =================="
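
As a side note on the `LM_EVAL_CMD` string-plus-`eval` construction in `eval_quality.sh`: an equivalent way to append an optional flag is to build the command as a bash array, which avoids escaping the inner quotes. Below is a minimal sketch of that pattern; the values assigned to `MODEL_ID`, `TASK`, `RESULTS_DIR`, `OUTPUT_FILE`, and `EVAL_CACHE_DB_PREFIX` are illustrative only and not part of the patch:

```
# Build the lm_eval invocation as an array; each element is one argument,
# so quoting is handled per element and no `eval` is needed.
MODEL_ID="Qwen/Qwen3-8B"              # illustrative
TASK="mmlu"                           # illustrative
RESULTS_DIR="results"                 # illustrative
OUTPUT_FILE="out.log"                 # illustrative
EVAL_CACHE_DB_PREFIX="/tmp/cache_db"  # illustrative
USE_CACHE=true

LM_EVAL_ARGS=(
  --model hf
  --model_args "pretrained=$MODEL_ID"
  --tasks "$TASK"
  --device cuda:0
  --batch_size auto
  --output_path "$RESULTS_DIR"
)
if $USE_CACHE; then
  # lm_eval reuses cached per-request results from this sqlite prefix on re-runs
  LM_EVAL_ARGS+=(--use_cache "$EVAL_CACHE_DB_PREFIX")
fi
lm_eval "${LM_EVAL_ARGS[@]}" > "$OUTPUT_FILE" 2>&1
```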