In [None]:
# SEAL Knowledge Incorporation Results Viewer

This notebook provides comprehensive analysis and visualization of SEAL knowledge incorporation experiment results.

## Features:
- 📊 **Summary View**: Quick overview of all experiments
- 🔍 **Detailed Analysis**: Deep dive into specific results
- 📈 **Comparisons**: Side-by-side experiment comparison
- 📋 **Result Tables**: Summary and comparison tables, returned as pandas DataFrames that you can filter further
- 🎨 **Visualizations**: Charts and heatmaps for better understanding

## Usage:
1. Run the setup cells below
2. Use the provided functions to analyze your results
3. Customize visualizations as needed


In [None]:
# Setup and imports
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List, Any, Optional
import warnings
# Suppress every warning for the rest of the session to keep cell outputs tidy
warnings.filterwarnings(action='ignore')

# Plot styling: start from matplotlib's defaults, then apply the seaborn
# "husl" palette and the notebook-wide figure size / font size
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

print("✅ Setup complete! Ready to analyze SEAL knowledge incorporation results.")


In [None]:
# Configuration - Update these paths as needed
RESULTS_DIR = Path("knowledge-incorporation/results")

# Report what is available under RESULTS_DIR.
# `is_dir()` is used instead of `exists()` so that a path pointing at a
# regular file is reported as "not found" rather than crashing in `iterdir()`.
if RESULTS_DIR.is_dir():
    print(f"✅ Found results directory: {RESULTS_DIR}")
    print("📁 Available subdirectories:")
    # Sorted so the listing is the same on every run and filesystem.
    for subdir in sorted(RESULTS_DIR.iterdir()):
        if subdir.is_dir():
            # Count JSON files at any depth below this subdirectory.
            file_count = sum(1 for _ in subdir.glob("**/*.json"))
            print(f"   📊 {subdir.name}: {file_count} JSON files")
else:
    print(f"❌ Results directory not found: {RESULTS_DIR}")
    print("💡 Please update RESULTS_DIR variable above to point to your results folder")


In [None]:
# Helper functions for loading and analyzing results

def load_json_file(filepath):
    """Load a JSON file, returning ``None`` instead of raising on failure.

    Args:
        filepath: Path (``str`` or ``pathlib.Path``) of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` if the file could not be opened
        or parsed. In the failure case a one-line error message is printed.
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default encoding, which can garble or reject non-ASCII text
        # (e.g. article titles) on systems whose default is not UTF-8.
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        # Deliberately best-effort: callers treat None as "skip this file".
        print(f"Error loading {filepath}: {e}")
        return None

def detect_result_type(filepath):
    """Classify a result file by inspecting its path.

    Args:
        filepath: Path of the result file, as ``str`` or ``pathlib.Path``.

    Returns:
        One of:
            - ``"continual"``: the path contains ``continual_self_edits`` and
              the file name contains ``summary``.
            - ``"query_server"``: the parent directory is ``eval`` or
              ``train`` and the path contains ``query_server``.
            - ``"cpt"``: the file name contains ``cpt_``.
            - ``"unknown"``: none of the above.
        The checks are applied in that order; the first match wins.
    """
    # Coerce so plain string paths work too (load_json_file accepts them).
    path = Path(filepath)
    path_text = str(path)

    if "continual_self_edits" in path_text and "summary" in path.name:
        return "continual"
    if path.parent.name in ("eval", "train") and "query_server" in path_text:
        return "query_server"
    if "cpt_" in path.name:
        return "cpt"
    return "unknown"

def find_all_results():
    """Find all result files under ``RESULTS_DIR``, grouped by result type.

    Searched locations (relative to ``RESULTS_DIR``):
        - ``query_server/``: every ``*.json`` at any depth
        - ``cpt/``: ``*.json`` directly inside the directory
        - ``continual_self_edits/``: every ``summary_*.json`` at any depth

    Returns:
        dict with keys ``"query_server"``, ``"cpt"`` and ``"continual"``,
        each mapping to a sorted list of ``pathlib.Path`` objects. A list is
        empty when ``RESULTS_DIR`` or the corresponding subdirectory is
        missing.
    """
    results = {"query_server": [], "cpt": [], "continual": []}

    if not RESULTS_DIR.exists():
        return results

    # result type -> (subdirectory, glob pattern)
    search_spec = {
        "query_server": ("query_server", "**/*.json"),
        "cpt": ("cpt", "*.json"),
        "continual": ("continual_self_edits", "**/summary_*.json"),
    }

    for result_type, (subdir, pattern) in search_spec.items():
        type_dir = RESULTS_DIR / subdir
        if type_dir.exists():
            # glob() yields files in arbitrary order; sort so that callers
            # which number the files or pick the first one are deterministic.
            results[result_type] = sorted(type_dir.glob(pattern))

    return results

print("✅ Helper functions loaded")


In [None]:
# 📊 SUMMARY: Overview of All Results

def show_summary():
    """Print an overview of every result file and return it as a table.

    Files are discovered with find_all_results() and read with
    load_json_file(); files that fail to load (or decode to an empty value)
    are skipped. For each remaining file a short text block is printed and
    one row is collected for the returned table.

    Metrics read per result type:
        - query_server: ``overall.baseline_mean_accuracy``,
          ``overall.adapter_mean_accuracy``, ``overall.mean_gain``
        - cpt: ``overall.baseline_accuracy``, ``overall.adapter_accuracy``,
          ``overall.gain``
        - continual: the first cell of the first row and the last cell of
          the last row of ``mean_over_sequences``

    Missing numeric fields default to 0 and a missing ``n_articles`` to "?".

    Returns:
        pandas.DataFrame with columns Type, Name, Baseline, Adapter, Gain and
        Articles (one row per summarized file), or None when no file
        produced a row.
    """
    results = find_all_results()

    print("🔬 SEAL Knowledge Incorporation Results Summary")
    print("=" * 70)

    summary_data = []

    for result_type, files in results.items():
        print(f"\n📊 {result_type.upper().replace('_', ' ')} RESULTS:")

        if not files:
            print("   No results found")
            continue

        for filepath in sorted(files):
            data = load_json_file(filepath)
            if not data:
                continue

            # Extract key metrics based on result type
            if result_type == "query_server":
                overall = data.get("overall", {})
                exp_name = data.get("exp_name", filepath.stem)
                baseline_acc = overall.get("baseline_mean_accuracy", 0)
                adapter_acc = overall.get("adapter_mean_accuracy", 0)
                gain = overall.get("mean_gain", 0)
                n_articles = data.get("n_articles", "?")

                print(f"   📄 {exp_name}")
                # `+` format flag so a negative gain prints as "-0.050"
                # rather than "+-0.050".
                print(f"      Baseline: {baseline_acc:.3f} → Adapter: {adapter_acc:.3f} (Gain: {gain:+.3f})")
                print(f"      Articles: {n_articles}")

                summary_data.append({
                    'Type': 'Query Server',
                    'Name': exp_name,
                    'Baseline': baseline_acc,
                    'Adapter': adapter_acc,
                    'Gain': gain,
                    'Articles': n_articles
                })

            elif result_type == "cpt":
                overall = data.get("overall", {})
                baseline_acc = overall.get("baseline_accuracy", 0)
                adapter_acc = overall.get("adapter_accuracy", 0)
                gain = overall.get("gain", 0)
                n_articles = data.get("n_articles", "?")

                print(f"   📄 {filepath.stem}")
                print(f"      Baseline: {baseline_acc:.3f} → Adapter: {adapter_acc:.3f} (Gain: {gain:+.3f})")
                print(f"      Articles: {n_articles}")

                summary_data.append({
                    'Type': 'CPT',
                    'Name': filepath.stem,
                    'Baseline': baseline_acc,
                    'Adapter': adapter_acc,
                    'Gain': gain,
                    'Articles': n_articles
                })

            elif result_type == "continual":
                n_seq = data.get("n_sequences", 0)
                n_data = data.get("n_datapoints", 0)
                mean_matrix = data.get("mean_over_sequences", [])

                if mean_matrix:
                    # NOTE(review): "initial" is taken from column 0 of the
                    # first row and "final" from the last column of the last
                    # row, i.e. two different columns — confirm that this is
                    # the intended comparison.
                    final_acc = mean_matrix[-1][-1] if mean_matrix[-1] else 0
                    initial_acc = mean_matrix[0][0] if mean_matrix[0] else 0
                    improvement = final_acc - initial_acc

                    # The run is identified by the directory holding the file.
                    print(f"   📄 {filepath.parent.name}")
                    print(f"      Initial: {initial_acc:.3f} → Final: {final_acc:.3f} (Change: {improvement:+.3f})")
                    print(f"      Sequences: {n_seq}, Datapoints: {n_data}")

                    summary_data.append({
                        'Type': 'Continual',
                        'Name': filepath.parent.name,
                        'Baseline': initial_acc,
                        'Adapter': final_acc,
                        'Gain': improvement,
                        'Articles': f"{n_seq}seq×{n_data}pts"
                    })

    # Create summary DataFrame
    if summary_data:
        print(f"\n📋 SUMMARY TABLE:")
        df = pd.DataFrame(summary_data)
        print(df.to_string(index=False, float_format='%.3f'))
        return df
    else:
        print("\n❌ No results found to summarize")
        return None

# Run the summary. `summary_df` holds the DataFrame returned by show_summary(),
# or None when nothing could be summarized; the visualization cell checks it.
summary_df = show_summary()


In [None]:
# 📈 VISUALIZATION: Results Overview

def plot_summary_comparison(df):
    """Draw a 2x2 overview figure from a summary table.

    Panels:
        1. Baseline vs adapter accuracy per experiment (grouped bars)
        2. Accuracy gain per experiment (green when > 0, red otherwise)
        3. Mean gain per experiment type, with standard-deviation error bars
           and the number of experiments as a label
        4. Baseline vs adapter scatter, coloured by gain, with a diagonal
           marking "no improvement"

    Args:
        df: DataFrame with columns Name, Type, Baseline, Adapter and Gain.
            When it is None or empty a message is printed and nothing is
            drawn.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if df is None or df.empty:
        print("No data to plot")
        return

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('SEAL Knowledge Incorporation Results Overview', fontsize=16, fontweight='bold')

    # Bar positions shared by panels 1 and 2 (one slot per row of `df`).
    x = list(range(len(df)))
    width = 0.35

    # 1. Accuracy comparison by experiment
    ax1 = axes[0, 0]
    ax1.bar([i - width/2 for i in x], df['Baseline'], width, label='Baseline', alpha=0.8, color='lightcoral')
    ax1.bar([i + width/2 for i in x], df['Adapter'], width, label='Adapter', alpha=0.8, color='lightblue')

    ax1.set_xlabel('Experiment')
    ax1.set_ylabel('Accuracy')
    ax1.set_title('Baseline vs Adapter Accuracy')
    ax1.set_xticks(x)
    ax1.set_xticklabels(df['Name'], rotation=45, ha='right')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Gain by experiment
    ax2 = axes[0, 1]
    colors = ['green' if g > 0 else 'red' for g in df['Gain']]
    # Bars are placed by position, not by name: with names as a categorical
    # axis, rows sharing a name would be drawn on top of each other.
    bars = ax2.bar(x, df['Gain'], color=colors, alpha=0.8)
    ax2.set_xlabel('Experiment')
    ax2.set_ylabel('Accuracy Gain')
    ax2.set_title('Accuracy Gain by Experiment')
    ax2.set_xticks(x)
    ax2.set_xticklabels(df['Name'], rotation=45, ha='right')
    ax2.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    ax2.grid(True, alpha=0.3)

    # Add value labels on bars (above positive bars, below negative ones)
    for bar, gain in zip(bars, df['Gain']):
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height + (0.005 if height >= 0 else -0.01),
                f'{gain:.3f}', ha='center', va='bottom' if height >= 0 else 'top', fontsize=9)

    # 3. Performance by experiment type
    ax3 = axes[1, 0]
    type_summary = df.groupby('Type').agg({'Gain': ['mean', 'std', 'count']}).round(3)
    type_summary.columns = ['Mean_Gain', 'Std_Gain', 'Count']
    type_summary = type_summary.reset_index()

    bars = ax3.bar(type_summary['Type'], type_summary['Mean_Gain'],
                   yerr=type_summary['Std_Gain'], capsize=5, alpha=0.8)
    ax3.set_xlabel('Experiment Type')
    ax3.set_ylabel('Mean Accuracy Gain')
    ax3.set_title('Average Performance by Experiment Type')
    ax3.grid(True, alpha=0.3)

    # Add count labels
    for bar, count in zip(bars, type_summary['Count']):
        height = bar.get_height()
        ax3.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'n={int(count)}', ha='center', va='bottom', fontsize=9)

    # 4. Baseline vs Adapter scatter
    ax4 = axes[1, 1]
    scatter = ax4.scatter(df['Baseline'], df['Adapter'],
                         c=df['Gain'], cmap='RdYlGn', s=100, alpha=0.8)

    # Add diagonal line (no improvement) spanning both axes' current limits
    lims = [min(ax4.get_xlim()[0], ax4.get_ylim()[0]),
            max(ax4.get_xlim()[1], ax4.get_ylim()[1])]
    ax4.plot(lims, lims, 'k--', alpha=0.5, zorder=0)

    ax4.set_xlabel('Baseline Accuracy')
    ax4.set_ylabel('Adapter Accuracy')
    ax4.set_title('Baseline vs Adapter Performance')
    ax4.grid(True, alpha=0.3)

    # Add colorbar
    cbar = plt.colorbar(scatter, ax=ax4)
    cbar.set_label('Accuracy Gain')

    plt.tight_layout()
    plt.show()

# Draw the overview only when the summary cell produced at least one row
if summary_df is None or summary_df.empty:
    print("📊 No data available for visualization")
else:
    plot_summary_comparison(summary_df)


In [None]:
# 🔍 DETAILED ANALYSIS: Examine Specific Results

def analyze_query_server_results(filepath):
    """Print and plot a detailed breakdown of one query-server result file.

    Prints the ``overall`` metrics, the experiment configuration
    (``dataset``, ``n_articles``, ``k_completions``), the ``lora_params``
    and, when per-article stats are present, the five best and five worst
    articles by gain. It then shows a histogram of gains and a
    baseline-vs-adapter scatter plot.

    Args:
        filepath: Path (``str`` or ``pathlib.Path``) to the result JSON.

    Returns:
        pandas.DataFrame with one row per article that has a ``stats`` entry
        (columns Title, Baseline, Adapter, Gain), or None when the file
        cannot be loaded or contains no such article.
    """
    # Accept plain strings as well: load_json_file() already does, and the
    # report header below needs `.name`.
    filepath = Path(filepath)

    data = load_json_file(filepath)
    if not data:
        return

    print(f"🔍 DETAILED ANALYSIS: {filepath.name}")
    print("=" * 70)

    overall = data.get("overall", {})

    print("📈 OVERALL METRICS:")
    for key, value in overall.items():
        if isinstance(value, float):
            print(f"   {key}: {value:.4f}")
        else:
            print(f"   {key}: {value}")

    print(f"\n⚙️ EXPERIMENT CONFIG:")
    print(f"   Dataset: {data.get('dataset', 'N/A')}")
    print(f"   Articles: {data.get('n_articles', 'N/A')}")
    print(f"   Completions: {data.get('k_completions', 'N/A')}")

    lora_params = data.get("lora_params", {})
    if lora_params:
        print(f"\n🔧 LORA PARAMETERS:")
        for key, value in lora_params.items():
            print(f"   {key}: {value}")

    # Analyze per-article performance
    articles = data.get("articles", [])
    if articles:
        print(f"\n📊 PER-ARTICLE ANALYSIS:")

        article_data = []
        for art in articles:
            # Articles without a "stats" entry are left out of the table.
            if "stats" in art:
                title = art["title"]
                stats = art["stats"]
                article_data.append({
                    # Titles longer than 30 characters are truncated for display.
                    'Title': title[:30] + "..." if len(title) > 30 else title,
                    'Baseline': stats["baseline_accuracy"],
                    'Adapter': stats["adapter_mean_accuracy"],
                    'Gain': stats["mean_gain"]
                })

        if article_data:
            df = pd.DataFrame(article_data)

            # Show top and bottom performers
            df_sorted = df.sort_values('Gain', ascending=False)

            print("\n🏆 TOP 5 PERFORMERS:")
            print(df_sorted.head().to_string(index=False, float_format='%.3f'))

            print("\n📉 BOTTOM 5 PERFORMERS:")
            print(df_sorted.tail().to_string(index=False, float_format='%.3f'))

            # Plot article performance distribution
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

            # Gain distribution, with the mean marked by a dashed line
            ax1.hist(df['Gain'], bins=20, alpha=0.7, edgecolor='black')
            ax1.axvline(df['Gain'].mean(), color='red', linestyle='--',
                       label=f'Mean: {df["Gain"].mean():.3f}')
            ax1.set_xlabel('Accuracy Gain')
            ax1.set_ylabel('Number of Articles')
            ax1.set_title('Distribution of Accuracy Gains')
            ax1.legend()
            ax1.grid(True, alpha=0.3)

            # Baseline vs Adapter scatter
            scatter = ax2.scatter(df['Baseline'], df['Adapter'],
                                c=df['Gain'], cmap='RdYlGn', alpha=0.7)

            # Add diagonal line (points above it improved over the baseline)
            lims = [min(ax2.get_xlim()[0], ax2.get_ylim()[0]),
                    max(ax2.get_xlim()[1], ax2.get_ylim()[1])]
            ax2.plot(lims, lims, 'k--', alpha=0.5)

            ax2.set_xlabel('Baseline Accuracy')
            ax2.set_ylabel('Adapter Accuracy')
            ax2.set_title('Baseline vs Adapter Accuracy by Article')
            plt.colorbar(scatter, ax=ax2, label='Gain')
            ax2.grid(True, alpha=0.3)

            plt.tight_layout()
            plt.show()

            return df

# Example usage - uncomment and modify path as needed
# analyze_query_server_results(RESULTS_DIR / "query_server/eval/base.json")


In [None]:
# 📊 CONTINUAL SELF-EDITS ANALYSIS

def analyze_continual_results(filepath):
    """Print and plot the accuracy matrix of a continual self-edits summary.

    The matrix is read from ``mean_over_sequences`` (rows = steps, columns =
    datapoints). Row 0 is labelled "Base" and row ``i >= 1`` is labelled
    ``Step{i-1}``. The printed table, the heatmap and the per-step
    statistics use every cell of row 0 and, for row ``i >= 1``, only the
    cells in columns ``j < i``.

    Args:
        filepath: Path (``str`` or ``pathlib.Path``) to a summary JSON file.

    Returns:
        Tuple ``(mean_array, std_array)`` of numpy arrays, where
        ``std_array`` is None when ``std_over_sequences`` is missing or
        empty. Returns None when the file cannot be loaded or has no matrix.
    """
    # Accept plain strings as well: load_json_file() already does, and the
    # report header below needs `.name`.
    filepath = Path(filepath)

    data = load_json_file(filepath)
    if not data:
        return

    print(f"🔍 CONTINUAL SELF-EDITS ANALYSIS: {filepath.name}")
    print("=" * 70)

    print("📈 EXPERIMENT CONFIG:")
    print(f"   Sequences: {data.get('n_sequences', 'N/A')}")
    print(f"   Datapoints: {data.get('n_datapoints', 'N/A')}")
    print(f"   Dataset: {data.get('dataset', 'N/A')}")
    print(f"   Base model: {data.get('base_model', 'N/A')}")

    mean_matrix = data.get("mean_over_sequences", [])
    std_matrix = data.get("std_over_sequences", [])

    if not mean_matrix:
        print("❌ No matrix data found")
        return

    # Convert to numpy arrays for easier manipulation
    mean_array = np.array(mean_matrix)
    std_array = np.array(std_matrix) if std_matrix else None

    n_steps, n_datapoints = mean_array.shape

    print(f"\n📊 ACCURACY MATRIX ({n_steps} steps × {n_datapoints} datapoints):")

    # Print matrix in readable format.
    # Layout: a 13-character row-label gutter followed by 7-character cells,
    # so that header names, separator and values share the same columns.
    print("   Step\\Data ", end="")
    for i in range(n_datapoints):
        print(f"    d{i:>2}", end="")
    print()
    print("   " + "-" * (10 + 7 * n_datapoints))

    for i in range(n_steps):
        step_name = "Base" if i == 0 else f"Step{i-1}"
        print(f"   {step_name:<9} ", end="")

        for j in range(n_datapoints):
            if i == 0 or j < i:  # Only show relevant cells
                print(f"{mean_array[i, j]:>7.3f}", end="")
            else:
                print(f"{'':>7}", end="")
        print()

    # Analysis and visualization
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Continual Self-Edits Analysis', fontsize=16, fontweight='bold')

    # 1. Heatmap of the accuracy matrix
    ax1 = axes[0, 0]

    # Create lower triangular matrix for plotting: cells that the table
    # above leaves blank stay NaN and are not coloured.
    plot_matrix = np.full((n_steps, n_datapoints), np.nan)
    for i in range(n_steps):
        max_j = n_datapoints if i == 0 else i
        for j in range(min(n_datapoints, max_j)):
            plot_matrix[i, j] = mean_array[i, j]

    im = ax1.imshow(plot_matrix, cmap='viridis', aspect='auto')
    ax1.set_title('Accuracy Matrix Heatmap')
    ax1.set_xlabel('Datapoint')
    ax1.set_ylabel('Training Step')
    ax1.set_xticks(range(n_datapoints))
    ax1.set_xticklabels([f'd{i}' for i in range(n_datapoints)])
    ax1.set_yticks(range(n_steps))
    ax1.set_yticklabels(['Base'] + [f'Step{i}' for i in range(n_steps-1)])
    plt.colorbar(im, ax=ax1, label='Accuracy')

    # Add text annotations
    for i in range(n_steps):
        max_j = n_datapoints if i == 0 else i
        for j in range(min(n_datapoints, max_j)):
            if not np.isnan(plot_matrix[i, j]):
                ax1.text(j, i, f'{plot_matrix[i, j]:.2f}',
                        ha='center', va='center', fontsize=8)

    # 2. Diagonal progression (how each datapoint evolves)
    # NOTE(review): for datapoint j this reads rows 0..j+1 of column j. Rows
    # 1..j of that column are cells the table/heatmap above treat as not
    # relevant (j >= i) — confirm which convention matches the data.
    ax2 = axes[0, 1]
    for j in range(min(n_datapoints, n_steps-1)):
        values = [mean_array[0, j]] + [mean_array[i+1, j] for i in range(j+1)]
        steps = ['Base'] + [f'Step{i}' for i in range(j+1)]
        ax2.plot(steps, values, 'o-', label=f'Datapoint {j}', alpha=0.8)

    ax2.set_title('Performance Evolution by Datapoint')
    ax2.set_xlabel('Training Step')
    ax2.set_ylabel('Accuracy')
    ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax2.grid(True, alpha=0.3)
    plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45)

    # 3. Final step performance comparison
    # NOTE(review): "Final" is the cell at row j+1 of column j (the first row
    # in which the table shows that column), not the last row of the matrix
    # — confirm that this is the intended meaning of "final".
    ax3 = axes[1, 0]
    if n_steps > 1:
        final_step_data = []
        for j in range(min(n_datapoints, n_steps-1)):
            final_step_data.append({
                'Datapoint': f'd{j}',
                'Base': mean_array[0, j],
                'Final': mean_array[j+1, j],
                'Change': mean_array[j+1, j] - mean_array[0, j]
            })

        if final_step_data:
            df_final = pd.DataFrame(final_step_data)

            x = range(len(df_final))
            width = 0.35

            ax3.bar([i - width/2 for i in x], df_final['Base'], width,
                    label='Base', alpha=0.8)
            ax3.bar([i + width/2 for i in x], df_final['Final'], width,
                    label='Final', alpha=0.8)

            ax3.set_title('Base vs Final Performance')
            ax3.set_xlabel('Datapoint')
            ax3.set_ylabel('Accuracy')
            ax3.set_xticks(x)
            ax3.set_xticklabels(df_final['Datapoint'])
            ax3.legend()
            ax3.grid(True, alpha=0.3)

    # 4. Performance statistics
    ax4 = axes[1, 1]

    # Calculate statistics for each step over the cells shown in the table
    step_stats = []
    for i in range(n_steps):
        if i == 0:
            valid_values = mean_array[i, :n_datapoints]
        else:
            valid_values = mean_array[i, :i]

        if len(valid_values) > 0:
            step_stats.append({
                'Step': 'Base' if i == 0 else f'Step{i-1}',
                'Mean': np.mean(valid_values),
                'Std': np.std(valid_values),
                'Min': np.min(valid_values),
                'Max': np.max(valid_values),
                'Count': len(valid_values)
            })

    if step_stats:
        df_stats = pd.DataFrame(step_stats)

        # Mean with std error bars; the shaded band spans min..max.
        ax4.errorbar(range(len(df_stats)), df_stats['Mean'],
                    yerr=df_stats['Std'], fmt='o-', capsize=5)
        ax4.fill_between(range(len(df_stats)),
                        df_stats['Min'], df_stats['Max'], alpha=0.3)

        ax4.set_title('Performance Statistics by Step')
        ax4.set_xlabel('Training Step')
        ax4.set_ylabel('Accuracy')
        ax4.set_xticks(range(len(df_stats)))
        ax4.set_xticklabels(df_stats['Step'], rotation=45)
        ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return mean_array, std_array

# Example usage - uncomment and modify path as needed
# analyze_continual_results(RESULTS_DIR / "continual_self_edits/run0/summary_1748902664.json")


In [None]:
# 🔄 EXPERIMENT COMPARISON

def compare_experiments(experiment_names):
    """Compare several experiments side by side in a table and four charts.

    For each name, result files are looked up in this order and the first
    match is used (the name is inserted into a glob pattern):
        1. ``RESULTS_DIR/query_server/eval/<name>.json``
        2. ``RESULTS_DIR/cpt/cpt_<name>.json``
        3. ``RESULTS_DIR/cpt/<name>.json`` — lets a CPT file stem such as
           ``cpt_base`` (the name shown by show_summary()) be passed as is

    Files whose type, as reported by detect_result_type(), is neither
    ``query_server`` nor ``cpt`` are left out of the comparison.

    Args:
        experiment_names: Iterable of experiment names.

    Returns:
        pandas.DataFrame with columns Experiment, Type, Baseline, Adapter,
        Gain, Articles and Dataset, or None when fewer than two experiments
        could be loaded or none of them had a comparable type.
    """
    print("🔄 EXPERIMENT COMPARISON")
    print("=" * 70)

    results = []

    # Try to find results for each experiment name
    for exp_name in experiment_names:
        found_files = []

        # Search in query_server/eval
        eval_dir = RESULTS_DIR / "query_server" / "eval"
        if eval_dir.exists():
            found_files.extend(eval_dir.glob(f"{exp_name}.json"))

        # Search in cpt: first with the "cpt_" prefix added, then the name
        # as given, so that an already-prefixed stem also resolves.
        cpt_dir = RESULTS_DIR / "cpt"
        if cpt_dir.exists():
            found_files.extend(cpt_dir.glob(f"cpt_{exp_name}.json"))
            found_files.extend(cpt_dir.glob(f"{exp_name}.json"))

        if found_files:
            data = load_json_file(found_files[0])
            if data:
                results.append((exp_name, data, found_files[0]))
        else:
            print(f"⚠️  No results found for experiment: {exp_name}")

    if len(results) < 2:
        print("❌ Need at least 2 experiments to compare")
        return None

    # Extract comparison data
    comparison_data = []
    for exp_name, data, filepath in results:
        result_type = detect_result_type(filepath)

        if result_type == "query_server":
            overall = data.get("overall", {})
            baseline = overall.get("baseline_mean_accuracy", 0)
            adapter = overall.get("adapter_mean_accuracy", 0)
            gain = overall.get("mean_gain", 0)
        elif result_type == "cpt":
            overall = data.get("overall", {})
            baseline = overall.get("baseline_accuracy", 0)
            adapter = overall.get("adapter_accuracy", 0)
            gain = overall.get("gain", 0)
        else:
            continue

        n_articles = data.get("n_articles", "?")

        comparison_data.append({
            'Experiment': exp_name,
            'Type': result_type,
            'Baseline': baseline,
            'Adapter': adapter,
            'Gain': gain,
            'Articles': n_articles,
            'Dataset': data.get('dataset', 'N/A')
        })

    if not comparison_data:
        print("❌ No comparable data found")
        return None

    # Create comparison DataFrame
    df = pd.DataFrame(comparison_data)

    print("📊 COMPARISON TABLE:")
    print(df.to_string(index=False, float_format='%.4f'))

    # Create comparison visualizations
    if len(df) > 1:
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle('Experiment Comparison', fontsize=16, fontweight='bold')

        # Bar positions shared by panels 1 and 2 (one slot per row of `df`).
        x = list(range(len(df)))
        width = 0.35

        # 1. Accuracy comparison
        ax1 = axes[0, 0]
        ax1.bar([i - width/2 for i in x], df['Baseline'], width,
               label='Baseline', alpha=0.8, color='lightcoral')
        ax1.bar([i + width/2 for i in x], df['Adapter'], width,
               label='Adapter', alpha=0.8, color='lightblue')

        ax1.set_xlabel('Experiment')
        ax1.set_ylabel('Accuracy')
        ax1.set_title('Baseline vs Adapter Accuracy')
        ax1.set_xticks(x)
        ax1.set_xticklabels(df['Experiment'], rotation=45, ha='right')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # 2. Gain comparison
        ax2 = axes[0, 1]
        colors = ['green' if g > 0 else 'red' for g in df['Gain']]
        # Bars are placed by position, not by name: with names as a
        # categorical axis, a repeated name would stack its bars on top of
        # each other.
        bars = ax2.bar(x, df['Gain'], color=colors, alpha=0.8)
        ax2.set_xlabel('Experiment')
        ax2.set_ylabel('Accuracy Gain')
        ax2.set_title('Accuracy Gain by Experiment')
        ax2.set_xticks(x)
        ax2.set_xticklabels(df['Experiment'], rotation=45, ha='right')
        ax2.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        ax2.grid(True, alpha=0.3)

        # Add value labels on bars (above positive bars, below negative ones)
        for bar, gain in zip(bars, df['Gain']):
            height = bar.get_height()
            ax2.text(bar.get_x() + bar.get_width()/2.,
                    height + (0.002 if height >= 0 else -0.005),
                    f'{gain:.3f}', ha='center',
                    va='bottom' if height >= 0 else 'top', fontsize=10)

        # 3. Baseline vs Adapter scatter
        ax3 = axes[1, 0]
        scatter = ax3.scatter(df['Baseline'], df['Adapter'],
                            c=df['Gain'], cmap='RdYlGn', s=100, alpha=0.8)

        # Add diagonal line (no improvement)
        lims = [min(ax3.get_xlim()[0], ax3.get_ylim()[0]),
                max(ax3.get_xlim()[1], ax3.get_ylim()[1])]
        ax3.plot(lims, lims, 'k--', alpha=0.5, zorder=0)

        ax3.set_xlabel('Baseline Accuracy')
        ax3.set_ylabel('Adapter Accuracy')
        ax3.set_title('Baseline vs Adapter Performance')
        ax3.grid(True, alpha=0.3)

        # Add experiment labels
        for i, exp in enumerate(df['Experiment']):
            ax3.annotate(exp, (df['Baseline'].iloc[i], df['Adapter'].iloc[i]),
                        xytext=(5, 5), textcoords='offset points', fontsize=9)

        # Add colorbar
        cbar = plt.colorbar(scatter, ax=ax3)
        cbar.set_label('Accuracy Gain')

        # 4. Performance ranking (smallest gain at the bottom of the axis)
        ax4 = axes[1, 1]
        df_sorted = df.sort_values('Gain', ascending=True)

        colors = ['green' if g > 0 else 'red' for g in df_sorted['Gain']]
        bars = ax4.barh(range(len(df_sorted)), df_sorted['Gain'], color=colors, alpha=0.8)

        ax4.set_yticks(range(len(df_sorted)))
        ax4.set_yticklabels(df_sorted['Experiment'])
        ax4.set_xlabel('Accuracy Gain')
        ax4.set_title('Performance Ranking')
        ax4.axvline(x=0, color='black', linestyle='-', alpha=0.3)
        ax4.grid(True, alpha=0.3)

        # Add value labels just past the end of each bar
        for bar, gain in zip(bars, df_sorted['Gain']):
            bar_length = bar.get_width()
            ax4.text(bar_length + (0.002 if bar_length >= 0 else -0.005),
                    bar.get_y() + bar.get_height()/2.,
                    f'{gain:.3f}', ha='left' if bar_length >= 0 else 'right',
                    va='center', fontsize=10)

        plt.tight_layout()
        plt.show()

    return df

# Example usage - uncomment and modify experiment names as needed
# compare_experiments(['base', 'iter1', 'iter2'])


In [None]:
# 🎯 QUICK ACCESS: Easy Analysis Functions

def quick_analysis():
    """Run the summary, an automatic comparison and two detailed analyses.

    Steps:
        1. show_summary() over all result files.
        2. compare_experiments() on up to five distinct experiment names
           from the summary, when at least two are available.
        3. analyze_continual_results() on the first continual summary file.
        4. analyze_query_server_results() on the first query-server file.

    "First" means first in sorted path order, so repeated runs analyze the
    same files.

    Returns:
        None. Everything is printed or plotted.
    """
    print("🎯 QUICK ANALYSIS: Running comprehensive analysis...")

    results = find_all_results()

    # Summary
    print("\n" + "="*50)
    print("📊 SUMMARY")
    print("="*50)
    summary_df = show_summary()

    # Auto-compare available experiments
    if summary_df is not None and len(summary_df) > 1:
        # compare_experiments() only looks in query_server/eval and cpt, so
        # continual rows can never be resolved; leave them out. Repeated
        # names are collapsed while keeping their first-seen order.
        comparable = summary_df[summary_df['Type'] != 'Continual']
        unique_names = list(dict.fromkeys(comparable['Name']))
        if len(unique_names) > 1:
            print("\n" + "="*50)
            print("🔄 AUTO-COMPARISON")
            print("="*50)
            compare_experiments(unique_names[:5])  # Compare up to 5 experiments

    # Analyze first continual result if available
    continual_files = sorted(results.get("continual", []))
    if continual_files:
        print("\n" + "="*50)
        print("📊 CONTINUAL ANALYSIS")
        print("="*50)
        analyze_continual_results(continual_files[0])

    # Analyze first query server result if available
    qs_files = sorted(results.get("query_server", []))
    if qs_files:
        print("\n" + "="*50)
        print("🔍 DETAILED QUERY SERVER ANALYSIS")
        print("="*50)
        analyze_query_server_results(qs_files[0])

def list_available_files():
    """Print every result file, grouped by type, with a one-line statistic.

    Paths are shown relative to ``RESULTS_DIR`` and numbered in sorted
    order, so the numbering is the same on every run. The statistic is the
    gain for query-server and CPT files, and the sequence and datapoint
    counts for continual files; files that fail to load get no statistic.

    Returns:
        None. Everything is printed.
    """
    results = find_all_results()

    print("📁 AVAILABLE RESULT FILES:")
    print("="*50)

    for result_type, files in results.items():
        if not files:
            continue

        print(f"\n📊 {result_type.upper().replace('_', ' ')} FILES:")
        # glob order is arbitrary; sort for a stable, reproducible listing.
        for i, filepath in enumerate(sorted(files), 1):
            rel_path = filepath.relative_to(RESULTS_DIR)
            print(f"   {i}. {rel_path}")

            # Show quick stats
            data = load_json_file(filepath)
            if not data:
                continue

            if result_type == "query_server":
                gain = data.get("overall", {}).get("mean_gain", 0)
                print(f"      → Gain: {gain:+.3f}")
            elif result_type == "cpt":
                gain = data.get("overall", {}).get("gain", 0)
                print(f"      → Gain: {gain:+.3f}")
            elif result_type == "continual":
                n_seq = data.get("n_sequences", 0)
                n_data = data.get("n_datapoints", 0)
                print(f"      → {n_seq} sequences × {n_data} datapoints")

# Print a short help banner listing the analysis entry points.
# Nothing is executed here; an empty string produces a blank line.
for banner_line in (
    "🎯 Ready for analysis! Use the functions below:",
    "",
    "📋 Available functions:",
    "• quick_analysis()                           - Run comprehensive analysis",
    "• list_available_files()                     - List all result files",
    "• show_summary()                             - Show results summary",
    "• compare_experiments(['exp1', 'exp2'])      - Compare specific experiments",
    "• analyze_query_server_results(filepath)     - Detailed query server analysis",
    "• analyze_continual_results(filepath)        - Detailed continual analysis",
    "",
    "💡 Tip: Start with quick_analysis() for a comprehensive overview!",
):
    print(banner_line)


In [None]:
# 🚀 RUN ANALYSIS
# Uncomment the function you want to run:

# Quick comprehensive analysis of everything
# quick_analysis()

# List all available files (the only call in this cell that is active by default)
list_available_files()

# Show just the summary
# show_summary()
