# 05 - Evaluation

Compute all metrics for each model's outputs against ground truth.
Run locally after all benchmark notebooks complete.

In [None]:
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory
# (presumably the notebook is launched from a sub-folder of the project --
# confirm if the notebook is ever run from elsewhere).
PROJECT_ROOT = Path(".").resolve().parent
# Put the project root on sys.path so project-local packages can be imported.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import yaml

# Read the YAML files as UTF-8 explicitly: without an explicit encoding,
# open() falls back to the platform locale, which can mis-decode non-ASCII
# characters (e.g. on Windows).
with open(PROJECT_ROOT / "config" / "benchmark_config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)
with open(PROJECT_ROOT / "config" / "model_registry.yaml", encoding="utf-8") as f:
    registry = yaml.safe_load(f)

In [None]:
# Input/output locations; all but the sample set are resolved relative to the
# project root from the "paths" section of the benchmark config.
SAMPLE_SET = PROJECT_ROOT / "data" / "sample_sets" / "stratified_100.json"
RAW_OUTPUTS = PROJECT_ROOT / config["paths"]["raw_outputs_dir"]
GT_TEXT = PROJECT_ROOT / config["paths"]["embedded_text_dir"]
GT_TABLES = PROJECT_ROOT / config["paths"]["table_gt_dir"]
METRICS_DIR = PROJECT_ROOT / config["paths"]["metrics_dir"]

# Find which models have outputs: each sub-directory of RAW_OUTPUTS is treated
# as one model key.
if not RAW_OUTPUTS.is_dir():
    # Fail with an actionable message instead of the bare iterdir() error.
    raise FileNotFoundError(
        f"Raw outputs directory not found: {RAW_OUTPUTS} "
        "(run the benchmark notebooks first)"
    )
# Path.iterdir() yields entries in arbitrary order, so sort for reproducible
# results. Hidden directories (e.g. Jupyter's .ipynb_checkpoints) are skipped
# because they are not model outputs.
available_models = sorted(
    d.name
    for d in RAW_OUTPUTS.iterdir()
    if d.is_dir() and not d.name.startswith(".")
)
print(f"Models with outputs: {available_models}")

In [None]:
from src.pipeline.evaluator import evaluate_all_models

# Evaluate every discovered model in one call and keep the returned mapping
# in `results` for the summary cells below.
# The table ground-truth directory is only handed over when it exists on
# disk; otherwise None is passed (presumably this makes the evaluator skip
# table metrics -- confirm against evaluate_all_models).
results = evaluate_all_models(
    config=config,
    model_keys=available_models,
    sample_set_path=SAMPLE_SET,
    raw_outputs_dir=RAW_OUTPUTS,
    metrics_dir=METRICS_DIR,
    gt_text_dir=GT_TEXT,
    gt_tables_dir=None if not GT_TABLES.exists() else GT_TABLES,
)

In [None]:
# Summary comparison: one row per model with the mean of each aggregate text
# metric, plus a derived "text_accuracy" percentage.
import pandas as pd

rows = []
for model_key, result in results.items():
    agg = result.get("aggregate", {})
    text_agg = agg.get("text", {})
    row = {"model": model_key, "pages": result.get("total_pages_evaluated", 0)}
    # Copy only the metrics the evaluator actually reported for this model.
    for metric in ["ned", "cer", "wer", "bleu", "fuzzy_ratio"]:
        if metric in text_agg:
            row[metric] = text_agg[metric]["mean"]
    # Text accuracy is defined here as (1 - mean NED), expressed in percent.
    if "ned" in row:
        row["text_accuracy"] = round((1 - row["ned"]) * 100, 2)
    rows.append(row)

df = pd.DataFrame(rows)
# The "text_accuracy" column only exists when at least one model reported an
# aggregate NED; sorting on a missing column would raise KeyError (this also
# covers the case of no results at all). Models lacking the value sort last.
if "text_accuracy" in df.columns:
    df = df.sort_values("text_accuracy", ascending=False)
print(df.to_string(index=False))

In [None]:
# Per content-type breakdown for the three best models.
import statistics


def _aggregate_mean_ned(model_key):
    """Return the model's aggregate mean NED, or +inf when it is not reported."""
    text_agg = results[model_key].get("aggregate", {}).get("text", {})
    return text_agg.get("ned", {}).get("mean", float("inf"))


# Rank by aggregate mean NED (lower is better) so that "top 3" really means
# the three most accurate models rather than the first three directory
# entries. Iterating over `results` also avoids a KeyError for any model that
# has an output directory but no evaluation result.
top_models = sorted(results, key=_aggregate_mean_ned)[:3]

for model_key in top_models:
    result = results[model_key]
    pages = result.get("per_page", [])
    # Group the per-page NED values by the page's content type.
    by_type = {}
    for p in pages:
        ned = p.get("text_metrics", {}).get("ned")
        if ned is None:
            # Page carries no text NED (presumably a failed or skipped page
            # -- confirm); it cannot contribute to the mean.
            continue
        ct = p.get("content_type", "unknown")
        by_type.setdefault(ct, []).append(ned)

    print(f"\n{model_key}:")
    for ct, neds in sorted(by_type.items()):
        mean_ned = statistics.mean(neds)
        print(f"  {ct}: NED={mean_ned:.4f}, accuracy={100*(1-mean_ned):.1f}%")