# Explore Evaluation Results

This notebook provides interactive exploration of evaluation data collected from the resume optimizer pipeline.

In [None]:
import sys
from pathlib import Path

# Add project root to path
# The root is taken as two directories above the current working directory
# and is inserted at the front of sys.path so the `evals` imports below
# resolve against it first.
# NOTE(review): this depends on the kernel's working directory being two
# levels below the project root — confirm how the notebook is launched.
project_root = Path.cwd().parent.parent
sys.path.insert(0, str(project_root))

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from evals.db.eval_db import EvalDatabase
from evals.framework.analyzer import EvalAnalyzer
from evals.framework.config_resume import get_resume_eval_config, RESUME_STAGES

In [None]:
# Initialize database and analyzer
# The resume eval config supplies the database path; the analyzer is built
# on top of the database object. `db` and `analyzer` are reused by the
# cells below.
config = get_resume_eval_config()
db = EvalDatabase(config.db_path)
analyzer = EvalAnalyzer(db)

# Echo the configured path so it is clear which database is being explored.
print(f"Database: {config.db_path}")

## 1. Overview Statistics

In [None]:
# Count scenarios and evaluations.
# NOTE(review): list_scenarios is called with limit=1000, so the total
# printed below is presumably capped at 1000 — confirm against EvalDatabase.
scenarios = db.list_scenarios(limit=1000)
print(f"Total scenarios: {len(scenarios)}")

# The loop variable is deliberately NOT named `stage_id`: the later sections
# read the notebook-global `stage_id` chosen in the "Win Rates by Stage"
# cell, and re-running this overview must not silently overwrite that
# selection with the last entry of RESUME_STAGES.
for overview_stage in RESUME_STAGES:
    judgments = db.get_judgments_for_stage(overview_stage)
    print(f"  {overview_stage}: {len(judgments)} judgments")

## 2. Win Rates by Stage

In [None]:
# Select stage to analyze
stage_id = "optimizer"  # Change as needed

win_rates = analyzer.compute_win_rates(stage_id)

if not win_rates:
    print("No evaluation data available for this stage.")
else:
    # One table row per model; only the text after the last "/" of the
    # model id is shown to keep the labels short.
    win_rate_rows = [
        {
            "Model": r.model_id.rpartition("/")[2],
            "Win Rate": r.win_rate,
            "Wins": r.wins,
            "Appearances": r.appearances,
        }
        for r in win_rates
    ]
    df = pd.DataFrame(win_rate_rows)
    display(df)

    # Bar chart of win rate per model, y-axis fixed to the [0, 1] range.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=df, x="Model", y="Win Rate", ax=ax)
    ax.set_title(f"Win Rates for {stage_id} Stage")
    ax.set_ylim(0, 1)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

## 3. Bradley-Terry Ranking

In [None]:
# Rank the models of the selected stage with the analyzer's
# Bradley-Terry ranking and show the result as a table.
bt_results = analyzer.bradley_terry_ranking(stage_id)

if not bt_results:
    print("Insufficient data for Bradley-Terry ranking.")
else:
    # Model ids are shortened to the text after their last "/".
    bt_rows = [
        {
            "Rank": r.rank,
            "Model": r.model_id.rpartition("/")[2],
            "Strength": r.strength,
        }
        for r in bt_results
    ]
    bt_df = pd.DataFrame(bt_rows)
    display(bt_df)

## 4. Pairwise Comparisons

In [None]:
# Tabulate every pairwise comparison the analyzer reports for the stage.
pairwise = analyzer.all_pairwise_comparisons(stage_id)

if not pairwise:
    print("No pairwise data available.")
else:
    # One row per comparison result; both model ids are shortened to the
    # text after their last "/".
    pw_rows = [
        {
            "Model A": r.model_a.rpartition("/")[2],
            "Model B": r.model_b.rpartition("/")[2],
            "P(A > B)": r.p_a_preferred,
            "CI Low": r.ci_low,
            "CI High": r.ci_high,
            "Significant": r.significant,
            "N": r.total,
        }
        for r in pairwise
    ]
    pw_df = pd.DataFrame(pw_rows)
    display(pw_df)

## 5. Score Analysis

In [None]:
# Heatmap of the mean score each model received on each criterion.
mean_scores = analyzer.compute_mean_scores(stage_id)

if not mean_scores:
    print("No score data available.")
else:
    # Flatten the {model: {criterion: score}} mapping into long-form rows.
    scores_data = [
        {
            "Model": model.rpartition("/")[2],
            "Criterion": criterion,
            "Score": score,
        }
        for model, criteria in mean_scores.items()
        for criterion, score in criteria.items()
    ]
    scores_df = pd.DataFrame(scores_data)

    # Pivot for heatmap: models as rows, criteria as columns.
    pivot = scores_df.pivot(index="Model", columns="Criterion", values="Score")

    fig, ax = plt.subplots(figsize=(10, 6))
    # The colour scale is pinned to 1..5 via vmin/vmax.
    sns.heatmap(pivot, annot=True, fmt=".2f", cmap="YlGnBu", ax=ax, vmin=1, vmax=5)
    ax.set_title(f"Mean Scores by Criterion ({stage_id})")
    plt.tight_layout()
    plt.show()

## 6. Tag Analysis

In [None]:
# Table of how often each tag was recorded for each model.
tag_freqs = analyzer.compute_tag_frequencies(stage_id)

if not tag_freqs:
    print("No tag data available.")
else:
    # Flatten the {model: {tag: count}} mapping into one row per pair.
    tag_data = [
        {
            "Model": model.rpartition("/")[2],
            "Tag": tag,
            "Count": count,
        }
        for model, tags in tag_freqs.items()
        for tag, count in tags.items()
    ]
    tag_df = pd.DataFrame(tag_data)
    display(tag_df)

## 7. Export Report

In [None]:
import json

# Build the analyzer's report for the selected stage.
report = analyzer.generate_report(stage_id)

# Save to file: the report goes to ../results/<stage>_report.json relative
# to the current working directory.
output_path = f"../results/{stage_id}_report.json"
results_dir = Path(output_path).parent
results_dir.mkdir(exist_ok=True)  # tolerate an already-existing directory

with Path(output_path).open("w") as f:
    json.dump(report, f, indent=2)

print(f"Report saved to {output_path}")