Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion .github/scripts/torchao_model_releases/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,36 @@ After environment is setup, we can run eval:
sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu
```

See https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks for all supported tasks.

Note: you can pass in `--use_cache` if the eval task failed during the middle of the run
and you don't want to re-run all evals.
and you don't want to re-run all evals, provided the model checkpoint has not changed.
```
sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu --use_cache
```

#### Multi-modal Model Quality Eval
For multi-modal model quality eval, we need to install lmms-eval
```
uv pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git
```
After environment is setup, we can run eval:
```
sh eval.sh --eval_type mm_quality --model_ids google/gemma-3-12b-it --mm_tasks chartqa --model_type gemma3 --mm_eval_batch_size 32
```

See https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/models/simple for supported model types.
See https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/tasks for supported multi-modal tasks.

Note: a larger `mm_eval_batch_size` can speed up eval but may cause OOM; if that happens, reduce the batch size.

Note: you can pass in `--use_cache` if the eval task failed partway through the run
and you don't want to re-run all evals, provided the model checkpoint has not changed.
```
sh eval.sh --eval_type mm_quality --model_ids google/gemma-3-12b-it --mm_tasks chartqa --model_type gemma3 --mm_eval_batch_size 32 --use_cache
```

Alternatively, feel free to run the evaluation using the example scripts directly from the lmms-eval repo: https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/examples/models

#### Summarize results
After we have finished all evals for each model, we can summarize the results with:
Expand Down
38 changes: 38 additions & 0 deletions .github/scripts/torchao_model_releases/eval.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ MODEL_ID_ARRAY=()
EVAL_TYPE="all"
# these will be parsed in the other scripts
BATCH_SIZES="1 256" # Default for latency eval
MM_EVAL_BATCH_SIZE=1 # Default batch size for mm quality eval
TASKS="mmlu" # Default for quality eval
MM_TASKS="chartqa" # Default for multi-modal quality eval (not included in all)
MODEL_TYPE=""
USE_CACHE=false # default: do not use cache
# Parse arguments
while [[ $# -gt 0 ]]; do
Expand Down Expand Up @@ -50,6 +53,10 @@ while [[ $# -gt 0 ]]; do
BATCH_SIZES="$1"
shift
;;
--mm_eval_batch_size)
MM_EVAL_BATCH_SIZE="$2"
shift 2
;;
--tasks)
shift
if [[ $# -eq 0 ]]; then
Expand All @@ -59,6 +66,24 @@ while [[ $# -gt 0 ]]; do
TASKS="$1"
shift
;;
--model_type)
shift
if [[ $# -eq 0 ]]; then
echo "Error: --model_type requires a value"
exit 1
fi
MODEL_TYPE="$1"
shift
;;
--mm_tasks)
shift
if [[ $# -eq 0 ]]; then
echo "Error: --mm_tasks requires a value"
exit 1
fi
MM_TASKS="$1"
shift
;;
--use_cache)
USE_CACHE=true
shift
Expand Down Expand Up @@ -93,6 +118,16 @@ run_quality() {
sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS
fi
}
run_mm_quality() {
    # Run multi-modal quality eval (lmms-eval) for a single model id.
    # Reads globals: MODEL_TYPE, MM_TASKS, MM_EVAL_BATCH_SIZE, USE_CACHE.
    check_lmms_eval
    local model_id="$1"
    # lmms-eval needs an explicit model type; with the empty default the
    # generated command line would be `--model_type --batch_size …`, which
    # misparses downstream. Fail early with a clear message instead.
    if [ -z "$MODEL_TYPE" ]; then
        echo "Error: --model_type is required for mm_quality eval (e.g. --model_type gemma3)" >&2
        exit 1
    fi
    echo "run_mm_quality" "$model_id" "$MODEL_TYPE"
    # Only flag that differs between the two invocations.
    local cache_flag=""
    if $USE_CACHE; then
        cache_flag="--use_cache"
    fi
    # $MM_TASKS and $cache_flag are intentionally unquoted: multiple tasks
    # must split into separate arguments, and an empty flag must vanish.
    sh eval_mm_quality.sh --model_ids "$model_id" --tasks $MM_TASKS --model_type "$MODEL_TYPE" --batch_size "$MM_EVAL_BATCH_SIZE" $cache_flag
}
for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
case "$EVAL_TYPE" in
memory)
Expand All @@ -104,6 +139,9 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
quality)
run_quality "$MODEL_ID"
;;
mm_quality)
run_mm_quality "$MODEL_ID"
;;
all)
run_quality "$MODEL_ID"
run_memory "$MODEL_ID"
Expand Down
7 changes: 7 additions & 0 deletions .github/scripts/torchao_model_releases/eval_env_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,10 @@ check_lm_eval() {
exit 1
fi
}

check_lmms_eval() {
    # Verify the lmms-eval package is installed; exit with guidance if not.
    if ! pip show lmms_eval > /dev/null 2>&1; then
        # Single quotes are required here: inside double quotes the backticks
        # would be executed as command substitution instead of printed.
        echo 'Error: lmms_eval package is NOT installed. please install with `uv pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git`' >&2
        exit 1
    fi
}
24 changes: 22 additions & 2 deletions .github/scripts/torchao_model_releases/summarize_results.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
PATTERN="pretrained=${MODEL_ID}"
LAST_LINE=$(grep -n "$PATTERN" "$Q_LOG" | tail -1 | cut -d: -f1)
if [ -n "$LAST_LINE" ]; then
echo "--- Quality log: $Q_LOG (lines starting from $((LAST_LINE + 1))) ---"
tail -n +"$((LAST_LINE + 1))" "$Q_LOG"
echo "--- Quality log: $Q_LOG (lines starting from $((LAST_LINE))) ---"
tail -n +"$((LAST_LINE))" "$Q_LOG"
else
echo "Pattern not found in $Q_LOG"
fi
Expand All @@ -61,6 +61,26 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
echo "--- No quality logs found matching pattern: $QUALITY_LOG_PATTERN"
fi

MM_QUALITY_LOG_PATTERN="${SAFE_MODEL_ID}_mm_quality_*.log"
# Multi-modal quality logs (multiple files, one per task).
# Intentionally unquoted so the pattern glob-expands to matching files.
MM_QUALITY_LOGS=( $MM_QUALITY_LOG_PATTERN )
if [ -e "${MM_QUALITY_LOGS[0]}" ]; then
  for Q_LOG in "${MM_QUALITY_LOGS[@]}"; do
    # Find the last occurrence of pretrained=<MODEL_ID> and print everything
    # from that line onward (the final results table of the last run).
    PATTERN="pretrained=${MODEL_ID}"
    LAST_LINE=$(grep -n "$PATTERN" "$Q_LOG" | tail -1 | cut -d: -f1)
    if [ -n "$LAST_LINE" ]; then
      echo "--- Multi-modal Quality log: $Q_LOG (lines starting from ${LAST_LINE}) ---"
      tail -n +"${LAST_LINE}" "$Q_LOG"
    else
      echo "Pattern not found in $Q_LOG"
    fi
  done
else
  # Was "No quality logs" — made distinct from the single-modal branch above
  # so the two cases are distinguishable in the summary output.
  echo "--- No multi-modal quality logs found matching pattern: $MM_QUALITY_LOG_PATTERN"
fi

MEMORY_LOG="${SAFE_MODEL_ID}_memory.log"
if [ -f "$MEMORY_LOG" ]; then
echo "--- Memory log (last 1 lines) ---"
Expand Down
Loading