In [3]:
# ============================================================================
# COMPREHENSIVE RESULTS ANALYSIS
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import json
import os
# This cell reads BASE_OUTPUT_DIR and SEEDS from the notebook namespace. Fail fast
# with an actionable message when they are absent (e.g. on a fresh kernel) instead
# of surfacing a bare NameError from the first line that happens to use them.
_missing_config = [name for name in ("BASE_OUTPUT_DIR", "SEEDS") if name not in globals()]
if _missing_config:
    raise NameError(
        f"{', '.join(_missing_config)} not defined; define BASE_OUTPUT_DIR and SEEDS "
        "before running this analysis cell"
    )

# Load all results (one row per generation / per minimal pair, across all seeds)
gen_df = pd.read_csv(os.path.join(BASE_OUTPUT_DIR, "all_seeds_generation_results.csv"))
pair_df = pd.read_csv(os.path.join(BASE_OUTPUT_DIR, "all_seeds_minimal_pairs.csv"))

# Load training perplexities: one record per seed whose run_info.json exists.
perplexity_results = []
seeds_without_run_info = []
for seed in SEEDS:
    run_info_path = os.path.join(BASE_OUTPUT_DIR, f"curriculum_seed_{seed}", "run_info.json")
    if not os.path.exists(run_info_path):
        seeds_without_run_info.append(seed)
        continue
    with open(run_info_path, "r", encoding="utf-8") as f:
        run_info = json.load(f)
    # Keys absent from run_info.json become None (NaN once in the DataFrame).
    perplexity_results.append({
        'seed': seed,
        'perplexity': run_info.get('final_eval_perplexity'),
        'eval_loss': run_info.get('final_eval_loss'),
        'train_loss': run_info.get('final_train_loss')
    })

# Explicit columns keep the frame well-formed even when no run_info.json was found;
# without them an empty result list yields a DataFrame with no 'seed' column at all.
perplexity_df = pd.DataFrame(
    perplexity_results,
    columns=['seed', 'perplexity', 'eval_loss', 'train_loss'],
)

print("="*70)
print("1. DATA OVERVIEW")
print("="*70)
print(f"Generation results: {len(gen_df)} entries ({len(gen_df['seed'].unique())} seeds)")
print(f"Minimal pair results: {len(pair_df)} entries ({len(pair_df['seed'].unique())} seeds)")
print(f"Training perplexities: {len(perplexity_df)} seeds")
if seeds_without_run_info:
    # Make silently skipped seeds visible; they have no training metrics to analyse.
    print(f"Seeds without run_info.json (skipped): {seeds_without_run_info}")
print(f"\nSeeds in analysis: {sorted(gen_df['seed'].unique().tolist())}")


NameError: name 'BASE_OUTPUT_DIR' is not defined

In [None]:
# ============================================================================
# 2. TEXT GENERATION ANALYSIS
# ============================================================================
# Adds a 'gen_length' column to gen_df and prints per-seed and overall
# length statistics, plus a warning for very short generations.

print("="*70)
print("2. TEXT GENERATION ANALYSIS")
print("="*70)

# Calculate generation length (in characters).
# pd.read_csv parses empty fields as NaN by default, and NaN.str.len() is NaN,
# which would make empty generations invisible to the "< 5" check below and
# drop them from the mean/std. Treat missing text as an empty string (length 0).
gen_df['gen_length'] = gen_df['generated_text'].fillna('').astype(str).str.len()

# Group by seed
gen_summary = gen_df.groupby('seed').agg({
    'gen_length': ['mean', 'std', 'min', 'max'],
}).round(2)

print("\nGeneration Length Statistics (characters) by Seed:")
print(gen_summary)

# Check for empty/very short generations
empty_gens = gen_df[gen_df['gen_length'] < 5]
if len(empty_gens) > 0:
    # \u26a0 is the warning sign; written as an escape so the source stays ASCII
    # and cannot be corrupted by a wrong file encoding.
    print(f"\n\u26a0 Warning: {len(empty_gens)} very short generations (< 5 chars)")
    print(empty_gens[['seed', 'prompt_id', 'generated_text']].head())

# Average generation length across all seeds
print(f"\nOverall average generation length: {gen_df['gen_length'].mean():.1f} characters")
print(f"Overall std: {gen_df['gen_length'].std():.1f} characters")


In [None]:
# ============================================================================
# 3. MINIMAL PAIR CONSISTENCY ANALYSIS
# ============================================================================
# For every minimal pair, report how often each sentence received the lower
# perplexity across seeds, then rank pairs by how strongly the seeds agree.

print("="*70)
print("3. MINIMAL PAIR CONSISTENCY ANALYSIS")
print("="*70)

# Per-pair aggregates across seeds: value counts of the preferred sentence plus
# mean/std of both perplexities and of their difference.
consistency_aggs = {
    'lower_ppl': lambda x: x.value_counts().to_dict(),
    'ppl1': ['mean', 'std'],
    'ppl2': ['mean', 'std'],
    'ppl_difference': ['mean', 'std']
}
pair_consistency = pair_df.groupby('pair_id').agg(consistency_aggs).round(2)

print("\nConsistency across seeds (which sentence has lower PPL):")
pair_agreement = []
for pair_id in sorted(pair_df['pair_id'].unique()):
    pair_data = pair_df.loc[pair_df['pair_id'] == pair_id]
    preferred = pair_data['lower_ppl']
    total = len(pair_data)
    sent1_count = preferred.eq('sentence1').sum()
    sent2_count = preferred.eq('sentence2').sum()

    print(f"\nPair {pair_id}:")
    print(f"  Sentence 1 preferred: {sent1_count}/{total} ({sent1_count/total*100:.1f}%)")
    print(f"  Sentence 2 preferred: {sent2_count}/{total} ({sent2_count/total*100:.1f}%)")
    print(f"  Avg PPL difference: {pair_data['ppl_difference'].mean():.1f}")

    # Majority choice = first entry returned by mode(); 'tie' is only used when
    # mode() comes back empty. Agreement = share of rows matching that choice.
    modes = preferred.mode()
    majority = modes.iloc[0] if len(modes) > 0 else 'tie'
    agreement = preferred.eq(majority).sum() / total
    pair_agreement.append({
        'pair_id': pair_id,
        'agreement': agreement,
        'majority_choice': majority
    })

# Most/least consistent pairs, least consistent first
agreement_df = pd.DataFrame(pair_agreement).sort_values('agreement')
print("\n\nPair Consistency Ranking (higher = more consistent across seeds):")
print(agreement_df.to_string(index=False))


In [None]:
# ============================================================================
# 4. CORRELATION ANALYSIS: Training Perplexity vs Test Performance
# ============================================================================
# Joins per-seed training metrics with per-seed test metrics (mean generation
# length, mean minimal-pair PPL difference) and prints their correlations.

print("="*70)
print("4. CORRELATION ANALYSIS")
print("="*70)

# Per-seed mean generation length (uses the 'gen_length' column of gen_df)
avg_gen_length = (
    gen_df.groupby('seed')['gen_length']
    .mean()
    .rename('avg_gen_length')
    .reset_index()
)

# Per-seed mean minimal pair PPL difference
avg_pair_ppl_diff = pair_df.groupby('seed')['ppl_difference'].mean().reset_index()

# Inner joins: only seeds present in all three tables survive
merged_df = (
    perplexity_df
    .merge(avg_gen_length, on='seed', how='inner')
    .merge(avg_pair_ppl_diff, on='seed', how='inner')
)

# Correlate each training metric with each test metric; keys are
# '<training label>_vs_<test label>', in the order training-major, test-minor.
training_metrics = ('perplexity', 'eval_loss')
test_metrics = (('gen_length', 'avg_gen_length'), ('pair_diff', 'ppl_difference'))
correlations = {
    f"{train_col}_vs_{test_label}": merged_df[train_col].corr(merged_df[test_col])
    for train_col in training_metrics
    for test_label, test_col in test_metrics
}

print("\nCorrelations:")
for name, coefficient in correlations.items():
    print(f"  {name}: {coefficient:.3f}")

print("\nMerged Data (Training Perplexity vs Test Metrics):")
report_columns = ['seed', 'perplexity', 'eval_loss', 'avg_gen_length', 'ppl_difference']
print(merged_df[report_columns].to_string(index=False))
