# NAME MVP Interactive Exploration

This notebook lets you explore the NAME evaluation results interactively.

You can:
- Adjust parameters and re-run the analysis
- Inspect individual learning curves
- Compare different AI capability levels
- Generate custom visualizations

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pathlib import Path

# Load data
# Pipeline outputs read as-is; the prints below describe them as fitted
# learning curves and model horizons.
learning_curves = pd.read_csv("data/wrangled/name_learning_curves.csv")
horizons = pd.read_csv("data/wrangled/name_horizons.csv")

# Load runs (sample for memory efficiency)
MAX_RUNS = 10_000  # Only the first MAX_RUNS records of the JSONL file are kept
runs = []
with open("data/external/name_runs.jsonl", 'r', encoding="utf-8") as f:
    for line in f:
        if len(runs) >= MAX_RUNS:
            # Stop reading here instead of scanning the remainder of the file
            break
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline); json.loads would raise on them
            continue
        runs.append(json.loads(line))
runs_df = pd.DataFrame(runs)

print(f"Loaded {len(learning_curves)} learning curves")
print(f"Loaded {len(horizons)} model horizons")
print(f"Loaded {len(runs_df)} runs (sample)")

## 1. Explore a Specific Learning Curve

In [None]:
# Select the task to inspect by its row position in `learning_curves`
task_row_index = 25  # Change index to explore different tasks
task_id = learning_curves.iloc[task_row_index]['task_id']
print(f"Exploring: {task_id}")

# Curve record for this task (first matching row) and its novice runs in the sample
is_selected_task = learning_curves['task_id'] == task_id
task_curve = learning_curves[is_selected_task].iloc[0]
novice_mask = (runs_df['task_id'] == task_id) & (runs_df['learner_type'] == 'human_novice')
task_runs = runs_df[novice_mask]

# Empirical success rate (and number of runs) per attempt number
empirical = task_runs.groupby('attempt_number')['score_binarized'].agg(['mean', 'count'])

# Curve parameters stored on the task's row
base = task_curve['first_attempt_success_rate']
asymptote = task_curve['fitted_asymptote']
lam = task_curve['fitted_lambda']
rmse = task_curve['rmse']
gain = asymptote - base

# Evaluate the exponential-saturation curve on a dense grid of attempt numbers.
# NOTE(review): at attempt 1 this gives base + gain * (1 - exp(-lam)), not `base`
# itself -- confirm this matches the parameterisation used when the curve was fitted.
attempts = np.linspace(1, 10, 100)
fitted = base + gain * (1 - np.exp(-lam * attempts))

# Plot: empirical points and fitted curve share one colour
curve_color = '#1f77b4'
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(empirical.index, empirical['mean'], 'o-',
        markersize=8, linewidth=2,
        label='Empirical (novices)', color=curve_color)
ax.plot(attempts, fitted, '--',
        linewidth=2, alpha=0.7,
        label='Fitted model', color=curve_color)

# Horizontal reference lines at the 50% and 80% success levels
for level, line_color, line_label in (
    (0.5, 'red', '50% threshold'),
    (0.8, 'orange', '80% threshold'),
):
    ax.axhline(level, color=line_color, linestyle=':', alpha=0.5, label=line_label)

title_text = (
    f'Learning Curve: {task_id}\n'
    f'Difficulty: {base:.2f}, λ: {lam:.3f}, RMSE: {rmse:.4f}'
)
ax.set_xlabel('Attempt Number', fontsize=12)
ax.set_ylabel('Success Rate', fontsize=12)
ax.set_title(title_text, fontsize=13)
ax.set_ylim(-0.05, 1.05)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)

plt.tight_layout()
plt.show()

# Text summary of the same task record
print("\nTask Statistics:")
print(f"  First-attempt success: {base:.1%}")
print(f"  Final asymptote: {asymptote:.1%}")
print(f"  Learning rate (λ): {lam:.3f}")
print(f"  RMSE: {rmse:.4f}")
print(f"  N for 50% success: {task_curve['n_for_50pct']:.2f} attempts")
print(f"  N for 80% success: {task_curve['n_for_80pct']:.2f} attempts")
print(f"  Novices sampled: {task_curve['n_novices']}")

## 2. Compare AI Models Across Difficulty Spectrum

In [None]:
# Get AI performance by task difficulty
from scipy.ndimage import uniform_filter1d  # imported once, not on every loop iteration

ai_models = ['baseline-weak', 'baseline-medium', 'baseline-strong']
colors = {'baseline-weak': '#d62728', 'baseline-medium': '#ff7f0e', 'baseline-strong': '#2ca02c'}
FALLBACK_COLOR = 'gray'  # for horizon rows whose model has no entry in `colors`
SMOOTHING_WINDOW = 5     # number of neighbouring tasks averaged by the trend line

fig, ax = plt.subplots(figsize=(12, 7))

for model in ai_models:
    model_runs = runs_df[
        (runs_df['learner_type'] == 'ai_zero_shot') &
        (runs_df['alias'] == model)
    ]
    if model_runs.empty:
        # `runs_df` holds only a sample of the runs file, so a model can be absent from it
        print(f"No zero-shot runs for {model} in the loaded sample; skipping")
        continue

    # Calculate success rate by task; the difficulty column is taken from each
    # task's first run row
    task_performance = model_runs.groupby('task_id').agg({
        'score_binarized': 'mean',
        'first_attempt_success_rate': 'first'
    }).reset_index()

    # Sort by difficulty for smooth line
    task_performance = task_performance.sort_values('first_attempt_success_rate')

    # Plot
    ax.scatter(task_performance['first_attempt_success_rate'],
               task_performance['score_binarized'],
               alpha=0.4, s=30, color=colors[model], label=f'{model} (points)')

    # Add smoothed trend line: a moving average over neighbouring tasks in
    # difficulty order (by rank, not by distance along the x axis)
    smoothed = uniform_filter1d(task_performance['score_binarized'].values,
                                size=SMOOTHING_WINDOW)
    ax.plot(task_performance['first_attempt_success_rate'], smoothed,
            linewidth=2.5, color=colors[model], label=f'{model} (trend)', alpha=0.8)

# Add diagonal reference line
ax.plot([0, 1], [0, 1], 'k--', alpha=0.3, linewidth=1.5, label='y=x (perfect match)')

# Add model horizons as vertical lines
for _, row in horizons.iterrows():
    model = row['model']
    diff = row['difficulty_at_50pct']
    if not np.isfinite(diff):
        continue  # no finite 50% point recorded for this model
    # Models outside the hard-coded palette get a neutral colour instead of a KeyError
    line_color = colors.get(model, FALLBACK_COLOR)
    ax.axvline(diff, color=line_color, linestyle=':', alpha=0.5, linewidth=1.5)
    ax.text(diff, 0.95, f"{model}\n50% point",
            rotation=90, va='top', ha='right', fontsize=8, color=line_color)

ax.set_xlabel('Human First-Attempt Success Rate (Task Difficulty)', fontsize=12)
ax.set_ylabel('AI Zero-Shot Success Rate', fontsize=12)
ax.set_title('AI Performance vs Task Difficulty\n(Vertical lines show where each AI reaches 50%)',
             fontsize=13, fontweight='bold')
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(-0.05, 1.05)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=9, loc='upper left')
ax.set_aspect('equal')

plt.tight_layout()
plt.show()

## 3. Experiment: What if We Change AI Capability?

Let's simulate a new AI model with a custom capability and see what its NAME horizon would be.

In [None]:
from src.simulate.novice_learning import AIZeroShotSimulator
from scipy.interpolate import interp1d

# Create a custom AI model
custom_capability = 0.6  # Try values between 0.0 and 1.0
N_SIMULATED_ATTEMPTS = 20  # attempts simulated per task to estimate its success rate
print(f"Simulating AI with capability: {custom_capability}")

ai_simulator = AIZeroShotSimulator(capability=custom_capability, random_seed=42)

# Simulate on all tasks
custom_results = []
for _, task in learning_curves.iterrows():
    difficulty = task['first_attempt_success_rate']

    # Average several simulated attempts to get a stable estimate
    successes = [ai_simulator.simulate_attempt(difficulty)
                 for _ in range(N_SIMULATED_ATTEMPTS)]
    success_rate = np.mean(successes)

    custom_results.append({
        'task_id': task['task_id'],
        'difficulty': difficulty,
        'ai_success': success_rate,
        'n_for_50pct': task['n_for_50pct']
    })

custom_df = pd.DataFrame(custom_results)

# Reset the results so a value left over from an earlier run of this cell can
# never be reported or plotted when the current run cannot compute one.
difficulty_at_50 = None
n_horizon = None

# Find where AI reaches 50% success.
# Each success rate is a mean over only N_SIMULATED_ATTEMPTS draws, so many tasks
# share the same `ai_success` value. Collapse duplicates to one point (mean
# difficulty) so the interpolation is over strictly increasing x values.
success_to_difficulty = custom_df.groupby('ai_success')['difficulty'].mean()
if len(success_to_difficulty) >= 2:
    # NOTE: 'extrapolate' extends the end segment linearly when 0.5 lies outside the
    # observed success rates, so the result may fall outside [0, 1].
    interp = interp1d(success_to_difficulty.index.to_numpy(),
                      success_to_difficulty.to_numpy(),
                      bounds_error=False, fill_value='extrapolate')
    difficulty_at_50 = float(interp(0.5))

    # Find N-attempts at that difficulty, using only tasks with a finite
    # n_for_50pct and one averaged point per distinct difficulty
    has_finite_n = np.isfinite(custom_df['n_for_50pct'].astype(float))
    difficulty_to_n = custom_df[has_finite_n].groupby('difficulty')['n_for_50pct'].mean()
    if len(difficulty_to_n) >= 2:
        interp_n = interp1d(difficulty_to_n.index.to_numpy(),
                            difficulty_to_n.to_numpy(),
                            bounds_error=False, fill_value='extrapolate')
        n_horizon = float(interp_n(difficulty_at_50))

    print(f"\nResults for capability {custom_capability}:")
    print(f"  Overall success rate: {custom_df['ai_success'].mean():.1%}")
    print(f"  Difficulty @ 50% AI success: {difficulty_at_50:.3f}")
    if n_horizon is not None:
        print(f"  N-attempt horizon: {n_horizon:.2f} attempts")
        print(f"\nInterpretation: This AI performs like a novice after {n_horizon:.1f} practice attempts")
    else:
        print("  N-attempt horizon: unavailable (fewer than two tasks with a finite n_for_50pct)")
else:
    print("\nNot enough distinct AI success rates to interpolate a 50% point")

# Plot
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(custom_df['difficulty'], custom_df['ai_success'],
           alpha=0.6, s=50, color='purple', label=f'Custom AI (capability={custom_capability})')
ax.plot([0, 1], [0, 1], 'k--', alpha=0.3, linewidth=1.5, label='y=x')

# Mark the 50% point only when this run actually computed one
if difficulty_at_50 is not None:
    ax.axvline(difficulty_at_50, color='purple', linestyle=':', alpha=0.5, linewidth=2)
    ax.axhline(0.5, color='red', linestyle=':', alpha=0.5, linewidth=1)
    ax.plot([difficulty_at_50], [0.5], 'r*', markersize=15, label='50% intersection')

ax.set_xlabel('Task Difficulty (Human First-Attempt)', fontsize=12)
ax.set_ylabel('AI Success Rate', fontsize=12)
ax.set_title(f'Custom AI Performance (Capability={custom_capability})', fontsize=13)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)

plt.tight_layout()
plt.show()

## 4. Analyze Learning Rates by Domain

In [None]:
# Derive each task's domain: the part of task_id before the first '/'
learning_curves['domain'] = learning_curves['task_id'].str.split('/').str[0]

# One box-plot panel per metric, both grouped by domain
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column to plot, y-axis label, panel title) for the left and right panels
panels = [
    ('first_attempt_success_rate', 'First-Attempt Success Rate', 'Task Difficulty by Domain'),
    ('fitted_lambda', 'Learning Rate (λ)', 'Learning Speed by Domain'),
]

for ax, (column, y_label, panel_title) in zip(axes, panels):
    learning_curves.boxplot(column=column, by='domain', ax=ax)
    ax.set_xlabel('Domain', fontsize=11)
    ax.set_ylabel(y_label, fontsize=11)
    ax.set_title(panel_title, fontsize=12)
    # Make this panel current so plt.xticks rotates its domain labels
    plt.sca(ax)
    plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

# Per-domain summary table, rounded to three decimals
print("\nDomain Statistics:")
summary_spec = {
    'first_attempt_success_rate': ['mean', 'std'],
    'fitted_lambda': ['mean', 'std'],
    'fitted_asymptote': 'mean',
}
domain_stats = learning_curves.groupby('domain').agg(summary_spec).round(3)
print(domain_stats)

## 5. Re-run Analysis with Different Parameters

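Want to test the system with different settings? The cell below generates a fresh synthetic task suite with custom parameters and prints its summary statistics; re-running the full pipeline is a separate step (see the note at the end of the cell).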

In [None]:
# Example: Generate tasks with different difficulty range
from src.generate.synthetic_tasks import SyntheticTaskGenerator

# Try different parameters -- edit the values here and re-run the cell
generator_params = dict(
    n_tasks=50,                   # Fewer tasks for quick testing
    difficulty_range=(0.1, 0.7),  # Different range
    default_lambda=0.5,           # Faster learning
    random_seed=123,              # Different seed
)
generator = SyntheticTaskGenerator(**generator_params)

# Build the suite, then ask the generator to summarise it
new_tasks = generator.generate_task_suite()
stats = generator.get_summary_statistics(new_tasks)

print("New task suite with custom parameters:")
print(f"  Tasks: {stats['total_tasks']}")
difficulty_summary = stats['difficulty']
print(f"  Difficulty: {difficulty_summary['mean']:.3f} ± {difficulty_summary['std']:.3f}")
learning_rate_summary = stats['learning_rate']
print(f"  Learning rate: {learning_rate_summary['mean']:.3f} ± {learning_rate_summary['std']:.3f}")

# Note: To fully regenerate the pipeline, you'd run:
# !poetry run dvc repro plot_name_results

## Next Steps

To explore further:
1. Modify the cells above to look at different tasks or models
2. Change parameters and re-run the pipeline: `!poetry run dvc repro plot_name_results`
3. Add your own AI models by modifying `src/simulate/novice_learning.py`
4. Add real tasks by creating a YAML file matching the schema in `data/external/name_tasks.yaml`

See `docs/NAME_README.md` for complete documentation.