# 06 - Results Dashboard

Generate comparison visualizations across all models.
Run locally after `05_evaluation.ipynb`.

In [None]:
import sys
from pathlib import Path

import yaml

# The project root is taken to be the parent of the current working directory,
# so this cell must be run with the notebook's own folder as the cwd.
PROJECT_ROOT = Path(".").resolve().parent

# Put the project root on sys.path so the `src.*` imports in later cells resolve.
# The membership check keeps re-running this cell from adding duplicates.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Load the benchmark configuration used by the following cells.
# The encoding is explicit so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open(PROJECT_ROOT / "config" / "benchmark_config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [None]:
from src.reporting.dashboard import load_all_results, generate_summary_table

# Input locations come from the "paths" section of the benchmark config;
# the dashboard output location is fixed relative to the project root.
METRICS_DIR = PROJECT_ROOT.joinpath(config["paths"]["metrics_dir"])
OUTPUTS_DIR = PROJECT_ROOT.joinpath(config["paths"]["raw_outputs_dir"])
DASHBOARD_DIR = PROJECT_ROOT.joinpath("results", "dashboard")

# Load everything into one table (`df`) that the remaining cells reuse,
# then report how many rows and distinct models were found.
df = load_all_results(METRICS_DIR, OUTPUTS_DIR)
print(f"Loaded {len(df)} page evaluations across {df['model'].nunique()} models")

In [None]:
# Summary table
# Built from the results loaded into `df` by an earlier cell;
# `generate_summary_table` is imported in that same earlier cell.
summary = generate_summary_table(df)
# index=False presumably drops the row-index column from the printed table
# (assumes `summary` is a pandas DataFrame - confirm).
print(summary.to_string(index=False))

In [None]:
# Accuracy bar chart built from the loaded results in `df`.
# NOTE(review): what exactly is plotted is defined in
# src.reporting.dashboard.plot_accuracy_bar - not visible from this notebook.
from src.reporting.dashboard import plot_accuracy_bar

fig = plot_accuracy_bar(df)
fig.show()  # display the returned figure

In [None]:
# Accuracy-vs-speed chart built from the loaded results in `df`.
# NOTE(review): the axes and metrics used are defined in
# src.reporting.dashboard.plot_accuracy_vs_speed - confirm there.
from src.reporting.dashboard import plot_accuracy_vs_speed

fig = plot_accuracy_vs_speed(df)
fig.show()  # display the returned figure

In [None]:
# Radar chart built from the loaded results in `df`.
from src.reporting.dashboard import plot_radar

# top_n=5 presumably restricts the chart to five models; how they are
# ranked is decided inside plot_radar - TODO confirm.
fig = plot_radar(df, top_n=5)
fig.show()  # display the returned figure

In [None]:
# Content-type heatmap built from the loaded results in `df`.
# NOTE(review): which columns form the heatmap axes is defined in
# src.reporting.dashboard.plot_content_type_heatmap - confirm there.
from src.reporting.dashboard import plot_content_type_heatmap

fig = plot_content_type_heatmap(df)
fig.show()  # display the returned figure

In [None]:
# Metric distribution plots built from the loaded results in `df`.
# NOTE(review): the set of metrics shown is chosen inside
# src.reporting.dashboard.plot_metric_distributions - confirm there.
from src.reporting.dashboard import plot_metric_distributions

fig = plot_metric_distributions(df)
fig.show()  # display the returned figure

In [None]:
# Generate the full dashboard in one call, passing the metrics and raw-output
# directories as inputs and DASHBOARD_DIR as the third argument.
# Note that this takes the directories themselves rather than the `df`
# loaded earlier.
from src.reporting.dashboard import generate_full_dashboard

generate_full_dashboard(METRICS_DIR, OUTPUTS_DIR, DASHBOARD_DIR)
# Report the save location (the leading "\n" prints a blank line first).
print(f"\nDashboard saved to {DASHBOARD_DIR}")

In [None]:
# Qualitative comparison (side-by-side viewer)
from src.reporting.qualitative import generate_comparison_report

# Inputs for the report: a fixed sample-set file plus the embedded-text and
# image directories named in the "paths" section of the benchmark config.
SAMPLE_SET = PROJECT_ROOT / "data" / "sample_sets" / "quick_dev.json"
GT_TEXT = PROJECT_ROOT / config["paths"]["embedded_text_dir"]
IMAGE_DIR = PROJECT_ROOT / config["paths"]["image_dir"]

# Get available models: every sub-directory name of OUTPUTS_DIR is used as a
# model key. iterdir() yields entries in arbitrary order, so the names are
# sorted to make the [:4] selection below the same on every run and machine.
model_keys = sorted(d.name for d in Path(OUTPUTS_DIR).iterdir() if d.is_dir())

# Create the output directory here so this cell does not depend on the
# full-dashboard cell having been run first.
DASHBOARD_DIR.mkdir(parents=True, exist_ok=True)

generate_comparison_report(
    sample_set_path=SAMPLE_SET,
    model_keys=model_keys[:4],  # Limit to 4 models for readability
    raw_outputs_dir=OUTPUTS_DIR,
    gt_text_dir=GT_TEXT,
    image_dir=IMAGE_DIR,
    output_path=DASHBOARD_DIR / "qualitative_comparison.html",
    max_pages=20,
)
print("Qualitative comparison saved!")