diff --git a/.github/scripts/torchao_model_releases/README.md b/.github/scripts/torchao_model_releases/README.md index be8c32be46..07c18412f3 100644 --- a/.github/scripts/torchao_model_releases/README.md +++ b/.github/scripts/torchao_model_releases/README.md @@ -140,12 +140,36 @@ After environment is setup, we can run eval: sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu ``` +See https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks for all supported tasks. + Note: you can pass in `--use_cache` if the eval task failed during the middle of the run -and you don't want to re-run all evals. +and you don't want to re-run all evals and there is no change to the model checkpoint. ``` sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu --use_cache ``` +#### Multi-modal Model Quality Eval +For multi-modal model quality eval, we need to install lmms-eval +``` +uv pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git +``` +After environment is setup, we can run eval: +``` +sh eval.sh --eval_type mm_quality --model_ids google/gemma-3-12b-it --mm_tasks chartqa --model_type gemma3 --mm_eval_batch_size 32 +``` + +See https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/models/simple for supported model types. +See https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/tasks for supported multi-modal tasks. + +Note: a larger `--mm_eval_batch_size` could speed up eval but may cause OOM; when that happens, please reduce the batch size + +Note: you can pass in `--use_cache` if the eval task failed during the middle of the run +and you don't want to re-run all evals and there is no change to the model checkpoint. 
+``` +sh eval.sh --eval_type mm_quality --model_ids google/gemma-3-12b-it --mm_tasks chartqa --model_type gemma3 --mm_eval_batch_size 32 --use_cache +``` + +Alternatively, please feel free to use the example scripts directly from the lmms-eval repo: https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/examples/models to run the evaluation. #### Summarize results After we have finished all evals for each model, we can summarize the results with: diff --git a/.github/scripts/torchao_model_releases/eval.sh b/.github/scripts/torchao_model_releases/eval.sh index bf1bd25e59..e7c71b1533 100644 --- a/.github/scripts/torchao_model_releases/eval.sh +++ b/.github/scripts/torchao_model_releases/eval.sh @@ -19,7 +19,10 @@ MODEL_ID_ARRAY=() EVAL_TYPE="all" # these will be parsed in the other scripts BATCH_SIZES="1 256" # Default for latency eval +MM_EVAL_BATCH_SIZE=1 # Default batch size for mm quality eval TASKS="mmlu" # Default for quality eval +MM_TASKS="chartqa" # Default for multi-modal quality eval (not included in all) +MODEL_TYPE="" USE_CACHE=false # default: do not use cache # Parse arguments while [[ $# -gt 0 ]]; do @@ -50,6 +53,10 @@ while [[ $# -gt 0 ]]; do BATCH_SIZES="$1" shift ;; + --mm_eval_batch_size) + MM_EVAL_BATCH_SIZE="$2" + shift 2 + ;; --tasks) shift if [[ $# -eq 0 ]]; then @@ -59,6 +66,24 @@ while [[ $# -gt 0 ]]; do TASKS="$1" shift ;; + --model_type) + shift + if [[ $# -eq 0 ]]; then + echo "Error: --model_type requires a value" + exit 1 + fi + MODEL_TYPE="$1" + shift + ;; + --mm_tasks) + shift + if [[ $# -eq 0 ]]; then + echo "Error: --mm_tasks requires a value" + exit 1 + fi + MM_TASKS="$1" + shift + ;; --use_cache) USE_CACHE=true shift @@ -93,6 +118,16 @@ run_quality() { sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS fi } +run_mm_quality() { + check_lmms_eval + local model_id="$1" + echo "run_mm_quality" $model_id $MODEL_TYPE + if $USE_CACHE; then + sh eval_mm_quality.sh --model_ids "$model_id" --tasks $MM_TASKS --model_type $MODEL_TYPE 
--batch_size $MM_EVAL_BATCH_SIZE --use_cache + else + sh eval_mm_quality.sh --model_ids "$model_id" --tasks $MM_TASKS --model_type $MODEL_TYPE --batch_size $MM_EVAL_BATCH_SIZE + fi +} for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do case "$EVAL_TYPE" in memory) @@ -104,6 +139,9 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do quality) run_quality "$MODEL_ID" ;; + mm_quality) + run_mm_quality "$MODEL_ID" + ;; all) run_quality "$MODEL_ID" run_memory "$MODEL_ID" diff --git a/.github/scripts/torchao_model_releases/eval_env_checks.sh b/.github/scripts/torchao_model_releases/eval_env_checks.sh index d6eb9c8801..f81b7572bc 100644 --- a/.github/scripts/torchao_model_releases/eval_env_checks.sh +++ b/.github/scripts/torchao_model_releases/eval_env_checks.sh @@ -24,3 +24,10 @@ check_lm_eval() { exit 1 fi } + +check_lmms_eval() { + if ! pip show lmms_eval > /dev/null 2>&1; then + echo "Error: lmms_eval package is NOT installed. Please install with \`uv pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git\`" >&2 + exit 1 + fi +} diff --git a/.github/scripts/torchao_model_releases/summarize_results.sh b/.github/scripts/torchao_model_releases/summarize_results.sh index 7e9c43b99b..c90e8dafc8 100644 --- a/.github/scripts/torchao_model_releases/summarize_results.sh +++ b/.github/scripts/torchao_model_releases/summarize_results.sh @@ -51,8 +51,8 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do PATTERN="pretrained=${MODEL_ID}" LAST_LINE=$(grep -n "$PATTERN" "$Q_LOG" | tail -1 | cut -d: -f1) if [ -n "$LAST_LINE" ]; then - echo "--- Quality log: $Q_LOG (lines starting from $((LAST_LINE + 1))) ---" - tail -n +"$((LAST_LINE + 1))" "$Q_LOG" + echo "--- Quality log: $Q_LOG (lines starting from $((LAST_LINE))) ---" + tail -n +"$((LAST_LINE))" "$Q_LOG" else echo "Pattern not found in $Q_LOG" fi @@ -61,6 +61,26 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do echo "--- No quality logs found matching pattern: $QUALITY_LOG_PATTERN" fi + MM_QUALITY_LOG_PATTERN="${SAFE_MODEL_ID}_mm_quality_*.log" + 
# Multi-modal Quality logs (multiple files, one per task) + MM_QUALITY_LOGS=( $MM_QUALITY_LOG_PATTERN ) + if [ -e "${MM_QUALITY_LOGS[0]}" ]; then + for Q_LOG in "${MM_QUALITY_LOGS[@]}"; do + # find last appearance of pretrained={MODEL_ID} and + # extract that line and all lines after it + PATTERN="pretrained=${MODEL_ID}" + LAST_LINE=$(grep -n "$PATTERN" "$Q_LOG" | tail -1 | cut -d: -f1) + if [ -n "$LAST_LINE" ]; then + echo "--- Multi-modal Quality log: $Q_LOG (lines starting from $((LAST_LINE))) ---" + tail -n +"$((LAST_LINE))" "$Q_LOG" + else + echo "Pattern not found in $Q_LOG" + fi + done + else + echo "--- No multi-modal quality logs found matching pattern: $MM_QUALITY_LOG_PATTERN" + fi + MEMORY_LOG="${SAFE_MODEL_ID}_memory.log" if [ -f "$MEMORY_LOG" ]; then echo "--- Memory log (last 1 lines) ---"