# Convergence Analysis: Baseline vs BODHI v0.1.3

This notebook analyzes how metrics stabilize as the sample size increases (5 → 10 → 20 → 40 → 80 → 150 → 200). Each sample size uses the first *n* examples of the same run, so the subsets are nested.

**Dataset**: HealthBench Hard - 200 stratified samples  
**Model**: GPT-4o-mini

## Setup

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Configuration: plot styling
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

# Sample sizes at which every metric is evaluated
SAMPLE_SIZES = [5, 10, 20, 40, 80, 150, 200]

# Output directory for saved charts and exported data; created if missing
ASSETS_DIR = Path('assets')
ASSETS_DIR.mkdir(exist_ok=True)

In [None]:
# Load data: the two result files being compared
results_dir = Path('../Results')

# JSON is UTF-8 by specification. Passing the encoding explicitly keeps the
# decode independent of the platform's default locale encoding.
with open(results_dir / 'baseline' / 'healthbench_hard_gpt-4o-mini_20260114_222940_allresults.json',
          encoding='utf-8') as f:
    baseline_data = json.load(f)
with open(results_dir / 'v0.1.3' / 'healthbench_hard_gpt-4o-mini_20260114_211821_bodhi_allresults.json',
          encoding='utf-8') as f:
    bodhi_data = json.load(f)

# Per-example records are stored under the run metadata
baseline_examples = baseline_data['metadata']['example_level_metadata']
bodhi_examples = bodhi_data['metadata']['example_level_metadata']

print(f"Loaded {len(baseline_examples)} baseline and {len(bodhi_examples)} BODHI examples")

In [None]:
# Helper functions
def compute_tag_score(examples, tag, n_samples=None):
    """Compute the pass rate for one rubric tag over the first ``n_samples`` examples.

    Every rubric item (from each example's ``'rubric_items'`` list) whose
    ``'tags'`` contain ``tag`` contributes 1 if its ``'criteria_met'`` value
    is truthy and 0 otherwise.

    Args:
        examples: Sequence of per-example dicts.
        tag: Tag string to look for in each rubric item's tags.
        n_samples: Number of leading examples to use. ``None`` uses all of
            them; ``0`` uses none.

    Returns:
        A ``(score, count)`` tuple. ``score`` is the percentage of matching
        rubric items that were met and ``count`` is how many items matched.
        Returns ``(None, 0)`` when no rubric item carries the tag.
    """
    # Compare against None explicitly: a plain truthiness test would treat
    # n_samples=0 as "use everything" rather than "use nothing".
    if n_samples is not None:
        examples = examples[:n_samples]
    hits = [
        1 if item.get('criteria_met', False) else 0
        for ex in examples
        for item in ex.get('rubric_items', [])
        if tag in item.get('tags', [])
    ]
    if not hits:
        return None, 0
    return np.mean(hits) * 100, len(hits)

def compute_overall_score(examples, n_samples=None):
    """Compute the mean overall score, as a percentage, over the first ``n_samples`` examples.

    Args:
        examples: Sequence of per-example dicts. An example without a
            ``'score'`` key counts as 0.
        n_samples: Number of leading examples to use. ``None`` uses all of
            them; ``0`` uses none.

    Returns:
        The mean of the examples' ``'score'`` values multiplied by 100, or
        ``None`` when there are no examples to average (matching how
        ``compute_tag_score`` reports "no data").
    """
    # Compare against None explicitly so that n_samples=0 selects no examples
    # instead of silently falling back to the full list.
    if n_samples is not None:
        examples = examples[:n_samples]
    # Averaging an empty list would produce NaN plus a RuntimeWarning, and NaN
    # is not valid JSON when the results are exported; report None instead.
    if len(examples) == 0:
        return None
    return np.mean([ex.get('score', 0) for ex in examples]) * 100

def compute_convergence(tag):
    """Build the baseline and BODHI score series for ``tag`` across SAMPLE_SIZES.

    Returns a dict with keys ``'baseline'`` and ``'bodhi'``; each value is a
    list holding one score per entry of ``SAMPLE_SIZES`` (``None`` where
    ``compute_tag_score`` found no matching rubric items).
    """
    def score_series(examples):
        # Keep only the score; the matching-item count is not needed here.
        return [compute_tag_score(examples, tag, size)[0] for size in SAMPLE_SIZES]

    return {
        'baseline': score_series(baseline_examples),
        'bodhi': score_series(bodhi_examples),
    }

def print_table(title, data):
    """Print one convergence table: a row per sample size with both scores and their difference.

    ``data`` must hold ``'baseline'`` and ``'bodhi'`` lists aligned with
    ``SAMPLE_SIZES``. A ``None`` score is shown as ``N/A``, and so is the
    difference whenever either score is missing.
    """
    def as_percent(value, signed=False):
        # Missing values have no numeric rendering.
        if value is None:
            return "N/A"
        return f"{value:+.1f}%" if signed else f"{value:.1f}%"

    rule_width = 70
    print(f"\n{title}")
    print("=" * rule_width)
    print(f"{'Cases':<8} {'Baseline':<12} {'BODHI':<12} {'Δ':<12} {'Note'}")
    print("-" * rule_width)

    for idx, size in enumerate(SAMPLE_SIZES):
        base_val = data['baseline'][idx]
        bodhi_val = data['bodhi'][idx]
        if base_val is None or bodhi_val is None:
            delta = None
        else:
            delta = bodhi_val - base_val

        # Annotate the smallest and largest sample sizes.
        if size <= 10:
            remark = "(high var)"
        elif size >= 150:
            remark = "(stable)"
        else:
            remark = ""

        print(f"{size:<8} {as_percent(base_val):<12} {as_percent(bodhi_val):<12} "
              f"{as_percent(delta, signed=True):<12} {remark}")

def plot_convergence(name, data, filename):
    """Plot baseline and BODHI scores against sample size, then save and show the chart.

    ``data`` must hold ``'baseline'`` and ``'bodhi'`` lists aligned with
    ``SAMPLE_SIZES``. The figure is written to ``ASSETS_DIR/<filename>.png``,
    displayed, and closed.
    """
    def nan_filled(values):
        # NaN stands in for missing (None) scores.
        return [np.nan if v is None else v for v in values]

    fig, ax = plt.subplots(figsize=(10, 5))
    baseline_series = nan_filled(data['baseline'])
    bodhi_series = nan_filled(data['bodhi'])

    # (values, line/marker format, colour, legend label) for each curve
    curves = (
        (baseline_series, 'o-', '#4A90A4', 'Baseline'),
        (bodhi_series, 's-', '#E67E22', 'BODHI v0.1.3'),
    )
    for values, fmt, colour, label in curves:
        ax.plot(SAMPLE_SIZES, values, fmt, color=colour, lw=2, ms=8,
                label=label, markeredgecolor='black')
    # Shade the gap between the two curves
    ax.fill_between(SAMPLE_SIZES, baseline_series, bodhi_series,
                    alpha=0.2, color='#27ae60')

    ax.set(xlabel='Number of Cases', ylabel='Score (%)',
           title=f'Convergence: {name}')
    # Set the log scale before the ticks so the explicit tick positions and
    # labels are applied on top of it.
    ax.set_xscale('log')
    ax.set_xticks(SAMPLE_SIZES)
    ax.set_xticklabels(SAMPLE_SIZES)
    ax.set_ylim(-5, 105)
    ax.legend(loc='best')

    output_path = ASSETS_DIR / f'{filename}.png'
    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches='tight')
    plt.show()
    plt.close()

## 1. Overall Score

In [None]:
# Overall score convergence: one mean score per sample size for each run
overall_data = {
    run_label: [compute_overall_score(run_examples, n) for n in SAMPLE_SIZES]
    for run_label, run_examples in (
        ('baseline', baseline_examples),
        ('bodhi', bodhi_examples),
    )
}
print_table("Overall Score", overall_data)
plot_convergence("Overall Score", overall_data, "overall_score")

## 2. Context Seeking

In [None]:
# Display name -> rubric tag for the context-seeking theme
context_metrics = {
    'Context Seeking (insufficient info)': 'cluster:context_seeking_not-enough-context_context_seeking',
    'Helpful & Safe (insufficient info)': 'cluster:context_seeking_not-enough-context_helpful_safe',
    'Helpful & Safe (sufficient info)': 'cluster:context_seeking_enough-context_helpful_safe',
    'Precise (sufficient info)': 'cluster:context_seeking_enough-context_precise',
}

# Convergence series per display name
context_data = {
    label: compute_convergence(context_metrics[label])
    for label in context_metrics
}

for name, data in context_data.items():
    print_table(name, data)

In [None]:
# Plot main context-seeking metric
plot_convergence(
    name="Context Seeking (insufficient info)",
    data=context_data['Context Seeking (insufficient info)'],
    filename="context_seeking_main",
)

## 3. Emergency Referrals

In [None]:
# Display name -> rubric tag for the emergency-referrals theme
emergency_metrics = {
    'Context Seeking (conditionally emergent)': 'cluster:emergency_referrals_conditionally-emergent_context_seeking',
    'Emergency Behavior (conditionally emergent)': 'cluster:emergency_referrals_conditionally-emergent_emergency_behavior',
    'Context Seeking (non-emergent)': 'cluster:emergency_referrals_non-emergent_context_seeking',
    'Emergency Behavior (non-emergent)': 'cluster:emergency_referrals_non-emergent_emergency_behavior',
}

# Convergence series per display name
emergency_data = {
    label: compute_convergence(emergency_metrics[label])
    for label in emergency_metrics
}

for name, data in emergency_data.items():
    print_table(name, data)

In [None]:
# Plot context seeking for conditionally emergent cases
plot_convergence(
    name="Emergency Context Seeking",
    data=emergency_data['Context Seeking (conditionally emergent)'],
    filename="emergency_context_seeking",
)

## 4. Hedging

In [None]:
# Display name -> rubric tag for the hedging theme
hedging_metrics = {
    'Accurate (reducible uncertainty)': 'cluster:hedging_any-reducible-uncertainty_accurate',
    'Hedges (reducible uncertainty)': 'cluster:hedging_any-reducible-uncertainty_hedges',
    'Seeks Context (reducible uncertainty)': 'cluster:hedging_any-reducible-uncertainty_seeks_context',
    'Accurate (no uncertainty)': 'cluster:hedging_no-uncertainty_accurate',
    'Hedges (no uncertainty)': 'cluster:hedging_no-uncertainty_hedges',
}

# Convergence series per display name
hedging_data = {
    label: compute_convergence(hedging_metrics[label])
    for label in hedging_metrics
}

for name, data in hedging_data.items():
    print_table(name, data)

In [None]:
# Plot the seeks-context metric under reducible uncertainty
plot_convergence(
    name="Seeks Context (reducible uncertainty)",
    data=hedging_data['Seeks Context (reducible uncertainty)'],
    filename="hedging_seeks_context",
)

## 5. Communication

In [None]:
# Display name -> rubric tag for the communication theme
communication_metrics = {
    'Accuracy (health professional)': 'cluster:communication_health-professional_accuracy_completeness',
    'Tailored (health professional)': 'cluster:communication_health-professional_tailored',
    'Accuracy (non health professional)': 'cluster:communication_not-health-professional_accuracy_completeness',
    'Tailored (non health professional)': 'cluster:communication_not-health-professional_tailored',
}

# Convergence series per display name
communication_data = {
    label: compute_convergence(communication_metrics[label])
    for label in communication_metrics
}

for name, data in communication_data.items():
    print_table(name, data)

## 6. Complex Responses

In [None]:
# Display name -> rubric tag for the complex-responses theme
complex_metrics = {
    'Accuracy & Hedging (detailed)': 'cluster:complex_responses_detailed_accuracy_hedging',
    'Appropriate (detailed)': 'cluster:complex_responses_detailed_appropriate',
    'Accuracy & Hedging (simple)': 'cluster:complex_responses_simple_accuracy_hedging',
    'Appropriate (simple)': 'cluster:complex_responses_simple_appropriate',
}

# Convergence series per display name
complex_data = {
    label: compute_convergence(complex_metrics[label])
    for label in complex_metrics
}

for name, data in complex_data.items():
    print_table(name, data)

## 7. Health Data Tasks

In [None]:
# Display name -> rubric tag for the health-data-tasks theme
health_data_metrics = {
    'Accuracy & Safety (enough info)': 'cluster:health_data_tasks_enough-info-to-complete-task_accuracy_safety',
    'Instruction Following (enough info)': 'cluster:health_data_tasks_enough-info-to-complete-task_response_instruction_following',
    'Helpfulness (not enough info)': 'cluster:health_data_tasks_not-enough-info-to-complete-task_helpfulness',
    'Safety (not enough info)': 'cluster:health_data_tasks_not-enough-info-to-complete-task_safety',
}

# Convergence series per display name
health_tasks_data = {
    label: compute_convergence(health_data_metrics[label])
    for label in health_data_metrics
}

for name, data in health_tasks_data.items():
    print_table(name, data)

## 8. Global Health

In [None]:
# Display name -> rubric tag for the global-health theme
global_health_metrics = {
    'Aligned & Accurate (context unclear)': 'cluster:global_health_context-matters-but-unclear_aligned_accurate',
    'Language (context unclear)': 'cluster:global_health_context-matters-but-unclear_language',
    'Aligned & Accurate (context clear)': 'cluster:global_health_context-matters-is-clear_aligned_accurate',
    'Aligned & Accurate (context irrelevant)': 'cluster:global_health_context-does-not-matter_aligned_accurate',
}

# Convergence series per display name
global_health_data = {
    label: compute_convergence(global_health_metrics[label])
    for label in global_health_metrics
}

for name, data in global_health_data.items():
    print_table(name, data)

## 9. Axes (High-Level Dimensions)

In [None]:
# Display name -> rubric tag for the high-level axes
axis_metrics = {
    'Accuracy': 'axis:accuracy',
    'Context Awareness': 'axis:context_awareness',
    'Completeness': 'axis:completeness',
    'Communication Quality': 'axis:communication_quality',
    'Instruction Following': 'axis:instruction_following',
}

# Convergence series per display name
axis_data = {
    label: compute_convergence(axis_metrics[label])
    for label in axis_metrics
}

for name, data in axis_data.items():
    print_table(name, data)

In [None]:
# Plot the context-awareness axis
plot_convergence(
    name="Context Awareness (Axis)",
    data=axis_data['Context Awareness'],
    filename="axis_context_awareness",
)

## Final Summary

In [None]:
# Collect all metrics for summary, prefixing each name with its theme
all_metrics = {
    'Overall Score': overall_data,
    **{f"[Context] {k}": v for k, v in context_data.items()},
    **{f"[Emergency] {k}": v for k, v in emergency_data.items()},
    **{f"[Hedging] {k}": v for k, v in hedging_data.items()},
    **{f"[Communication] {k}": v for k, v in communication_data.items()},
    **{f"[Complex] {k}": v for k, v in complex_data.items()},
    **{f"[Health Data] {k}": v for k, v in health_tasks_data.items()},
    **{f"[Global Health] {k}": v for k, v in global_health_data.items()},
    **{f"[Axis] {k}": v for k, v in axis_data.items()},
}

# Each row below reports the last entry of a series, which corresponds to the
# last configured sample size. Deriving the header from SAMPLE_SIZES keeps the
# label in step with the data if the sizes are ever changed.
print("\n" + "="*90)
print(f"FINAL SUMMARY: All Metrics at n={SAMPLE_SIZES[-1]}")
print("="*90)
print(f"{'Metric':<60} {'Baseline':>10} {'BODHI':>10} {'Δ':>10}")
print("-"*90)

for name, data in all_metrics.items():
    b, bo = data['baseline'][-1], data['bodhi'][-1]
    # Skip metrics for which neither run has any data
    if b is None and bo is None:
        continue
    b_str = f"{b:.1f}%" if b is not None else "N/A"
    bo_str = f"{bo:.1f}%" if bo is not None else "N/A"
    # A difference is only meaningful when both runs have a score
    d_str = f"{bo-b:+.1f}%" if (b is not None and bo is not None) else "N/A"
    print(f"{name:<60} {b_str:>10} {bo_str:>10} {d_str:>10}")

print("="*90)

## Export Data

In [None]:
# Export all convergence data (sample sizes plus every metric series) as JSON
export_data = dict(
    sample_sizes=SAMPLE_SIZES,
    metrics=dict(all_metrics),
)

with (ASSETS_DIR / 'convergence_data.json').open('w') as f:
    json.dump(export_data, f, indent=2)

print(f"Exported {len(all_metrics)} metrics to {ASSETS_DIR / 'convergence_data.json'}")
print(f"Images saved to {ASSETS_DIR}/")