# Multi-Seed Results Analysis

This notebook analyzes results from the 5-seed evaluation pipeline, comparing:
- **Sampling strategies**: RSS (Random Stratified Sampling) vs RS (Random Sampling)
- **Evaluation modes**: Baseline vs BODHI v0.1.3

## Metrics Computed
- Mean score across seeds
- Standard deviation across seeds
- BODHI improvement over baseline
- Theme-level performance variance
- Statistical significance tests

In [None]:
import json
import os
from pathlib import Path
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Configuration: location of the result folders and the seeds that were run.
RESULTS_BASE = Path('../Results')
RSS_DIR = RESULTS_BASE.joinpath('results-5-seeds-200rss')
RS_DIR = RESULTS_BASE.joinpath('results-5-seeds-200rs')
SEEDS = [42, 43, 44, 45, 46]

# Echo the configured directories so a wrong working directory is obvious.
print(
    "Results directories:",
    f"  RSS: {RSS_DIR}",
    f"  RS:  {RS_DIR}",
    sep="\n",
)

## 1. Data Verification

First, verify that all expected result files exist and contain the correct number of samples.

In [None]:
def verify_results_directory(base_dir, strategy_name, seeds=None, expected_samples=200):
    """Check that every seed/mode run under ``base_dir`` has a usable results file.

    For each seed and each evaluation mode the function looks for a directory
    named ``<prefix>-seed<seed>`` (prefix ``baseline`` or ``bodhiv0.1.3``),
    loads the first ``*_allresults.json`` file inside it (in sorted order) and
    checks that its ``results`` list has ``expected_samples`` entries.  One
    status line is printed per run.

    Args:
        base_dir: ``pathlib.Path`` of the directory holding the per-seed run
            folders.
        strategy_name: Label used only in the printed header (e.g. ``'RSS'``).
        seeds: Iterable of seeds to check; defaults to the module-level
            ``SEEDS``.
        expected_samples: Required length of the ``results`` list.

    Returns:
        ``None`` if ``base_dir`` does not exist.  Otherwise a dict
        ``{'baseline': {seed: entry}, 'bodhi': {seed: entry}}`` containing only
        the runs that passed verification, where ``entry`` has the keys
        ``file``, ``data``, ``num_samples`` and ``score``.  Partial results
        are returned even when some runs are missing or invalid.
    """
    if seeds is None:
        seeds = SEEDS

    print(f"\n{'='*60}")
    print(f"Verifying {strategy_name} Results: {base_dir}")
    print('='*60)

    if not base_dir.exists():
        print("ERROR: Directory does not exist!")
        return None

    results = {
        'baseline': {},
        'bodhi': {}
    }

    all_valid = True

    for seed in seeds:
        for mode, prefix in [('baseline', 'baseline'), ('bodhi', 'bodhiv0.1.3')]:
            run_name = f"{prefix}-seed{seed}"
            seed_dir = base_dir / run_name

            if not seed_dir.exists():
                print(f"  [ ] {run_name}: Directory missing")
                all_valid = False
                continue

            # Sort so the chosen file is deterministic when several match.
            allresults_files = sorted(seed_dir.glob('*_allresults.json'))

            if not allresults_files:
                print(f"  [ ] {run_name}: No allresults.json found")
                all_valid = False
                continue

            allresults_file = allresults_files[0]
            try:
                with open(allresults_file, encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                print(f"  [ ] {run_name}: Error loading - {e}")
                all_valid = False
                continue

            if not isinstance(data, dict):
                print(f"  [ ] {run_name}: Error loading - top-level JSON value is not an object")
                all_valid = False
                continue

            entries = data.get('results')
            num_results = len(entries) if isinstance(entries, list) else 0
            score = data.get('score', 'N/A')

            if num_results != expected_samples:
                print(f"  [!] {run_name}: {num_results} samples (expected {expected_samples})")
                all_valid = False
                continue

            # A missing score is kept as 'N/A'; only real numbers can take the
            # ':.4f' format, so format conditionally instead of raising.
            score_text = f"{score:.4f}" if isinstance(score, (int, float)) else str(score)
            print(f"  [x] {run_name}: {num_results} samples, score={score_text}")
            results[mode][seed] = {
                'file': allresults_file,
                'data': data,
                'num_samples': num_results,
                'score': score
            }

    print(f"\nVerification: {'PASSED' if all_valid else 'INCOMPLETE'}")
    # Partial results are still useful downstream, so always return them.
    return results

# Run the verification for both sampling strategies (RSS first, then RS).
rss_results, rs_results = [
    verify_results_directory(directory, label)
    for directory, label in ((RSS_DIR, 'RSS'), (RS_DIR, 'RS'))
]

## 2. Load and Aggregate Results

In [None]:
def extract_scores(results_dict):
    """Collect the usable overall scores per mode, ordered as in ``SEEDS``.

    A score is skipped when its seed is absent from ``results_dict[mode]`` or
    when the stored value is ``None`` or the ``'N/A'`` placeholder.

    Returns a dict ``{'baseline': [...], 'bodhi': [...]}``.
    """
    collected = {}

    for mode in ('baseline', 'bodhi'):
        per_seed = results_dict.get(mode, {})
        candidates = (
            per_seed[seed].get('score') for seed in SEEDS if seed in per_seed
        )
        collected[mode] = [
            value for value in candidates
            if value is not None and value != 'N/A'
        ]

    return collected

def compute_statistics(scores_list):
    """Summarise a list of scores.

    Args:
        scores_list: Sequence of numeric scores; may be empty.

    Returns:
        Dict with the keys ``mean``, ``std``, ``min``, ``max``, ``n`` and
        ``values``.  ``std`` is the sample standard deviation (``ddof=1``),
        or ``0.0`` when there is only one score.  For empty input the four
        statistics are ``None``, ``n`` is ``0`` and ``values`` is ``[]``.
        The statistics are plain Python floats rather than NumPy scalars so
        they can be passed to ``json.dump`` unchanged (``np.min``/``np.max``
        of integer scores would otherwise yield non-serialisable ``np.int64``).
    """
    # Copy so the returned 'values' list is not aliased to the caller's list.
    values = list(scores_list)

    if not values:
        return {'mean': None, 'std': None, 'min': None, 'max': None, 'n': 0, 'values': []}

    return {
        'mean': float(np.mean(values)),
        'std': float(np.std(values, ddof=1)) if len(values) > 1 else 0.0,
        'min': float(np.min(values)),
        'max': float(np.max(values)),
        'n': len(values),
        'values': values
    }

# Build all_stats[strategy][mode] and print one summary line per mode.
print("\nScore Statistics:")
print("=" * 70)

all_stats = {}

for strategy, results in (('RSS', rss_results), ('RS', rs_results)):
    # A strategy whose results directory was missing has results == None.
    if results is not None:
        scores = extract_scores(results)
        all_stats[strategy] = {}

        print(f"\n{strategy}:")
        print("-" * 50)

        for mode in ('baseline', 'bodhi'):
            stats_dict = all_stats[strategy][mode] = compute_statistics(scores[mode])

            if stats_dict['mean'] is None:
                print(f"  {mode.upper():10} No data available")
                continue

            print(f"  {mode.upper():10} mean={stats_dict['mean']:.4f} "
                  f"std={stats_dict['std']:.4f} "
                  f"[{stats_dict['min']:.4f}, {stats_dict['max']:.4f}] "
                  f"n={stats_dict['n']}")

## 3. BODHI Improvement Analysis

In [None]:
print("\nBODHI Improvement Over Baseline:")
print("="*70)


def _numeric_score(results, mode, seed):
    """Return the stored score for (mode, seed) if it is a real number, else None."""
    if not results:
        return None
    score = results.get(mode, {}).get(seed, {}).get('score')
    if isinstance(score, bool) or not isinstance(score, (int, float)):
        return None
    return score


for strategy, results in [('RSS', rss_results), ('RS', rs_results)]:
    if strategy not in all_stats:
        continue

    baseline_stats = all_stats[strategy].get('baseline', {})
    bodhi_stats = all_stats[strategy].get('bodhi', {})

    if baseline_stats.get('mean') is None or bodhi_stats.get('mean') is None:
        print(f"\n{strategy}: Insufficient data for comparison")
        continue

    # Absolute improvement
    abs_improvement = bodhi_stats['mean'] - baseline_stats['mean']

    print(f"\n{strategy}:")
    print(f"  Baseline mean:     {baseline_stats['mean']:.4f}")
    print(f"  BODHI mean:        {bodhi_stats['mean']:.4f}")
    print(f"  Absolute change:   {abs_improvement:+.4f}")

    # Relative improvement is undefined for a zero baseline; say so instead of
    # reporting a misleading 0%.
    if baseline_stats['mean'] != 0:
        rel_improvement = (abs_improvement / baseline_stats['mean']) * 100
        print(f"  Relative change:   {rel_improvement:+.2f}%")
    else:
        print("  Relative change:   n/a (baseline mean is 0)")

    # Paired t-test.  Pair the scores BY SEED: two score lists of equal length
    # are not necessarily aligned (e.g. baseline lacks seed 43 while BODHI
    # lacks seed 44), and a positional pairing would then compare different
    # seeds with each other.
    paired_scores = [
        (base, bodhi)
        for base, bodhi in (
            (_numeric_score(results, 'baseline', seed),
             _numeric_score(results, 'bodhi', seed))
            for seed in SEEDS
        )
        if base is not None and bodhi is not None
    ]

    if len(paired_scores) > 1:
        baseline_paired, bodhi_paired = zip(*paired_scores)
        # Argument order is (baseline, BODHI): a negative t means BODHI scored higher.
        t_stat, p_value = stats.ttest_rel(baseline_paired, bodhi_paired)
        print(f"  Paired t-test:     t={t_stat:.3f}, p={p_value:.4f}")
        print(f"  Significant (p<0.05): {'Yes' if p_value < 0.05 else 'No'}")
        if len(paired_scores) < max(baseline_stats['n'], bodhi_stats['n']):
            print(f"  (t-test used the {len(paired_scores)} seeds scored in both modes)")
    else:
        print("  Paired t-test:     skipped (fewer than 2 seeds scored in both modes)")

## 4. RSS vs RS Comparison

In [None]:
# Compare the two sampling strategies within each evaluation mode.
print("\nRSS vs RS Comparison:")
print("=" * 70)

for mode in ('baseline', 'bodhi'):
    rss_stats = all_stats.get('RSS', {}).get(mode, {})
    rs_stats = all_stats.get('RS', {}).get(mode, {})

    if rss_stats.get('mean') is not None and rs_stats.get('mean') is not None:
        diff = rss_stats['mean'] - rs_stats['mean']

        print(f"\n{mode.upper()}:")
        print(f"  RSS mean: {rss_stats['mean']:.4f} (std={rss_stats['std']:.4f})")
        print(f"  RS mean:  {rs_stats['mean']:.4f} (std={rs_stats['std']:.4f})")
        print(f"  Difference (RSS - RS): {diff:+.4f}")

        # The independent-samples t-test needs at least two scores per group.
        if min(rss_stats['n'], rs_stats['n']) > 1:
            t_stat, p_value = stats.ttest_ind(rss_stats['values'], rs_stats['values'])
            print(f"  Independent t-test: t={t_stat:.3f}, p={p_value:.4f}")
    else:
        print(f"\n{mode.upper()}: Insufficient data for comparison")

## 5. Visualization

In [None]:
# Two-panel figure: mean score per strategy/mode (left) and the per-seed
# score distribution (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))


def _stat_or_zero(strategy, mode, key):
    """Return all_stats[strategy][mode][key], or 0 when it is missing or None."""
    return all_stats.get(strategy, {}).get(mode, {}).get(key) or 0


def _label_bars(ax, bars, means):
    """Write each positive mean just above its bar (0 marks missing data)."""
    for bar, mean in zip(bars, means):
        if mean > 0:
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                    f'{mean:.3f}', ha='center', va='bottom', fontsize=9)


# Plot 1: Score comparison by strategy and mode
ax1 = axes[0]
x = np.arange(2)  # RSS, RS
width = 0.35

baseline_means = [_stat_or_zero(s, 'baseline', 'mean') for s in ('RSS', 'RS')]
baseline_stds = [_stat_or_zero(s, 'baseline', 'std') for s in ('RSS', 'RS')]
bodhi_means = [_stat_or_zero(s, 'bodhi', 'mean') for s in ('RSS', 'RS')]
bodhi_stds = [_stat_or_zero(s, 'bodhi', 'std') for s in ('RSS', 'RS')]

bars1 = ax1.bar(x - width/2, baseline_means, width, yerr=baseline_stds,
                label='Baseline', color='#3498db', capsize=5)
bars2 = ax1.bar(x + width/2, bodhi_means, width, yerr=bodhi_stds,
                label='BODHI v0.1.3', color='#2ecc71', capsize=5)

ax1.set_ylabel('Score')
ax1.set_title('Mean Scores by Strategy and Mode')
ax1.set_xticks(x)
ax1.set_xticklabels(['RSS (Stratified)', 'RS (Random)'])
ax1.legend()

# Leave headroom above the tallest bar INCLUDING its error bar, and fall back
# to a unit axis when there is nothing to plot (a (0, 0) range is degenerate).
tallest = max(
    mean + std
    for mean, std in zip(baseline_means + bodhi_means, baseline_stds + bodhi_stds)
)
ax1.set_ylim(0, tallest * 1.2 if tallest > 0 else 1)

# Add value labels on bars
_label_bars(ax1, bars1, baseline_means)
_label_bars(ax1, bars2, bodhi_means)

# Plot 2: Score distribution across seeds
ax2 = axes[1]

# Colors are keyed by (strategy, mode) so a group keeps its color even when
# another group has no data and is left out of the plot.
box_colors = {
    ('RSS', 'baseline'): '#3498db',
    ('RSS', 'bodhi'): '#2ecc71',
    ('RS', 'baseline'): '#e74c3c',
    ('RS', 'bodhi'): '#f39c12',
}

data_to_plot = []
labels = []
colors = []

for strategy in ['RSS', 'RS']:
    for mode in ['baseline', 'bodhi']:
        values = all_stats.get(strategy, {}).get(mode, {}).get('values', [])
        if values:
            data_to_plot.append(values)
            labels.append(f"{strategy}\n{mode.capitalize()}")
            colors.append(box_colors[(strategy, mode)])

if data_to_plot:
    bp = ax2.boxplot(data_to_plot, patch_artist=True)
    # Set tick labels on the axes instead of via boxplot(labels=...), which
    # newer Matplotlib releases deprecate in favour of tick_labels.
    ax2.set_xticklabels(labels)
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)

ax2.set_ylabel('Score')
ax2.set_title('Score Distribution Across Seeds')

plt.tight_layout()
# Save next to the other results and report the path actually written.
plot_path = RESULTS_BASE / 'multi_seed_comparison.png'
plt.savefig(plot_path, dpi=150, bbox_inches='tight')
plt.show()

print(f"\nPlot saved to: {plot_path}")

## 6. Per-Seed Results Table

In [None]:
# Per-seed score table: one row per seed, one column per strategy/mode.
header = f"{'Seed':<6} {'RSS Baseline':<14} {'RSS BODHI':<14} {'RS Baseline':<14} {'RS BODHI':<14}"

print("\nPer-Seed Results:")
print("="*len(header))
print(header)
print("-"*len(header))

for seed in SEEDS:
    cells = [f"{seed:<6}"]

    for results in (rss_results, rs_results):
        for mode in ['baseline', 'bodhi']:
            # results is None when the strategy's directory was missing.
            entry = (results or {}).get(mode, {}).get(seed)
            if entry is None:
                cells.append(f"{'--':<14}")
                continue

            score = entry.get('score', 'N/A')
            if isinstance(score, (int, float)):
                cells.append(f"{score:<14.4f}")
            else:
                cells.append(f"{str(score):<14}")

    # Join with the same single-space separator the header uses so the
    # columns line up under their headings.
    print(' '.join(cells))

print("-"*len(header))

## 7. Theme-Level Analysis

In [None]:
def extract_theme_scores(results_dict):
    """Group per-example scores by theme for each evaluation mode.

    The theme of an example is taken from the first string in
    ``result['example']['example_tags']`` that starts with ``'theme:'``
    (with that prefix removed); examples without such a tag are filed
    under ``'unknown'``.  Results whose ``score`` is ``None`` are skipped.

    Args:
        results_dict: Dict ``{mode: {seed: {'data': {'results': [...]}}}}``
            for the modes ``'baseline'`` and ``'bodhi'``.

    Returns:
        Nested ``defaultdict`` ``theme_scores[mode][theme] -> list of scores``,
        pooling the examples of all seeds.
    """
    theme_prefix = 'theme:'
    theme_scores = defaultdict(lambda: defaultdict(list))

    for mode in ['baseline', 'bodhi']:
        for seed_data in results_dict.get(mode, {}).values():
            data = seed_data.get('data') or {}

            for result in data.get('results') or []:
                score = result.get('score')
                if score is None:
                    continue

                # 'example' / 'example_tags' may be absent or explicitly null.
                example = result.get('example') or {}
                example_tags = example.get('example_tags') or []

                # Slice off only the leading prefix; str.replace would also
                # strip any later 'theme:' occurrence inside the tag.
                theme = next(
                    (tag[len(theme_prefix):] for tag in example_tags
                     if isinstance(tag, str) and tag.startswith(theme_prefix)),
                    'unknown',
                )

                theme_scores[mode][theme].append(score)

    return theme_scores

# Analyze themes for RSS results (skipped when the RSS directory was missing).
if rss_results:
    print("\nTheme-Level Analysis (RSS):")
    print("="*80)

    rss_theme_scores = extract_theme_scores(rss_results)

    themes = sorted(set(rss_theme_scores['baseline']) | set(rss_theme_scores['bodhi']))

    print(f"{'Theme':<25} {'Baseline':<20} {'BODHI':<20} {'Improvement':<15}")
    print("-"*80)

    def _mean_cell(scores):
        """Format 'mean (n=count)'; the mean is 'n/a' when there are no scores."""
        mean_text = f"{np.mean(scores):.4f}" if scores else "n/a"
        return f"{mean_text} (n={len(scores)})"

    for theme in themes:
        baseline_scores = rss_theme_scores['baseline'].get(theme, [])
        bodhi_scores = rss_theme_scores['bodhi'].get(theme, [])

        # Only report an improvement when both modes have scores for the
        # theme; treating a missing side as 0 would fabricate a difference.
        if baseline_scores and bodhi_scores:
            improvement = f"{np.mean(bodhi_scores) - np.mean(baseline_scores):+.4f}"
        else:
            improvement = "n/a"

        # Pad every cell to its header width so the columns line up.
        print(f"{theme:<25} {_mean_cell(baseline_scores):<20} "
              f"{_mean_cell(bodhi_scores):<20} {improvement:<15}")

## 8. Summary Report

In [None]:
# Condensed report: data completeness, relative BODHI change, and spread.
print("\n" + "="*70)
print("SUMMARY REPORT")
print("="*70)

print("\n1. DATA COMPLETENESS:")
for strategy, results in [('RSS', rss_results), ('RS', rs_results)]:
    if results:
        baseline_count = len(results.get('baseline', {}))
        bodhi_count = len(results.get('bodhi', {}))
        # Derive the denominator from SEEDS so it follows the configuration.
        print(f"   {strategy}: {baseline_count}/{len(SEEDS)} baseline, "
              f"{bodhi_count}/{len(SEEDS)} BODHI")
    else:
        print(f"   {strategy}: No data")

print("\n2. KEY FINDINGS:")

# BODHI improvement
for strategy in ['RSS', 'RS']:
    if strategy in all_stats:
        baseline = all_stats[strategy].get('baseline', {}).get('mean')
        bodhi = all_stats[strategy].get('bodhi', {}).get('mean')
        # Compare against None explicitly: a mean of exactly 0.0 is valid data
        # and must not be dropped by a truthiness test.
        if baseline is None or bodhi is None:
            continue
        if baseline == 0:
            print(f"   {strategy} BODHI improvement: n/a (baseline mean is 0)")
            continue
        imp = ((bodhi - baseline) / baseline) * 100
        print(f"   {strategy} BODHI improvement: {imp:+.2f}%")

# Variance comparison
print("\n3. VARIANCE (Lower = More Stable):")
for strategy in ['RSS', 'RS']:
    if strategy in all_stats:
        for mode in ['baseline', 'bodhi']:
            std = all_stats[strategy].get(mode, {}).get('std')
            if std is not None:
                print(f"   {strategy} {mode}: std={std:.4f}")

print("\n" + "="*70)

## 9. Export Results

In [None]:
# Export summary to JSON
def _json_safe(value):
    """Convert a NumPy scalar to the equivalent built-in number; pass others through."""
    return value.item() if isinstance(value, np.generic) else value


export_data = {
    'seeds': SEEDS,
    'statistics': {},
    'per_seed_scores': {}
}

for strategy, results in [('RSS', rss_results), ('RS', rs_results)]:
    if strategy not in all_stats:
        continue

    export_data['statistics'][strategy] = {}
    export_data['per_seed_scores'][strategy] = {}

    for mode in ['baseline', 'bodhi']:
        stats_dict = all_stats[strategy].get(mode, {})
        # The statistics may be NumPy scalars; np.int64 in particular is
        # rejected by the json module, so normalise every value.
        export_data['statistics'][strategy][mode] = {
            key: _json_safe(stats_dict.get(key))
            for key in ('mean', 'std', 'min', 'max', 'n')
        }

        # Per-seed scores
        if results and mode in results:
            export_data['per_seed_scores'][strategy][mode] = {
                str(seed): entry.get('score')
                for seed, entry in results[mode].items()
            }

# Serialise first so a serialisation error cannot leave a truncated file.
payload = json.dumps(export_data, indent=2)

# Save
output_path = RESULTS_BASE / 'multi_seed_analysis_summary.json'
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(payload)

print(f"Summary exported to: {output_path}")