# Transfer Learning Analysis for Atari Games

This notebook analyzes transfer learning results including:
1. Learning curves for each experiment
2. Transfer benefit analysis (comparing transfer vs from-scratch performance)
3. Visualizations split by algorithm

In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tensorboard.backend.event_processing import event_accumulator
import warnings
warnings.filterwarnings('ignore')

# Global plot appearance: apply seaborn's "whitegrid" style first, then set the
# default figure size and font size on top of it.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

## Configuration

Define the results directory and experiment parameters.

In [None]:
# Configuration
# Input and output locations, relative to the notebook's working directory.
RESULTS_DIR = "results"  # Transfer learning results
BASELINE_DIR = "results_baseline"  # Baseline (from-scratch) results
OUTPUT_DIR = "analysis_plots"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # created up front so every savefig below has a target

# Games and algorithms from your config
GAMES = ["Pong", "Breakout", "SpaceInvaders", "Tennis"]
ALGORITHMS = ["dqn", "ppo", "qrdqn"]

# Metrics to extract from TensorBoard
METRICS = [
    "rollout/ep_rew_mean",  # Episode reward
    "rollout/ep_len_mean",  # Episode length
    "train/loss",            # Training loss
]

# Echo the resolved configuration so the run is self-describing.
print("\n".join([
    f"Transfer results directory: {RESULTS_DIR}",
    f"Baseline results directory: {BASELINE_DIR}",
    f"Output directory: {OUTPUT_DIR}",
    f"Plots will be saved to: {os.path.abspath(OUTPUT_DIR)}",
]))

## Helper Functions

In [None]:
def load_tensorboard_data(log_dir, metric="rollout/ep_rew_mean"):
    """
    Load one scalar series from TensorBoard event files.

    Args:
        log_dir: Path to TensorBoard log directory
        metric: Scalar tag to extract

    Returns:
        DataFrame with columns: step, value. The frame is empty (but still
        carries both columns) when the tag is missing, when the tag has no
        events, or when loading fails; in the first and last case a message
        is printed.
    """
    columns = ['step', 'value']
    try:
        ea = event_accumulator.EventAccumulator(log_dir)
        ea.Reload()

        if metric not in ea.Tags()['scalars']:
            print(f"Warning: Metric '{metric}' not found in {log_dir}")
            return pd.DataFrame(columns=columns)

        # Passing `columns` explicitly keeps the schema stable even when the
        # tag exists but holds no events; building the frame from an empty
        # list of dicts would otherwise yield a frame with no columns at all.
        return pd.DataFrame(
            [(event.step, event.value) for event in ea.Scalars(metric)],
            columns=columns,
        )
    except Exception as e:
        # Deliberately best-effort: one unreadable run should not abort the
        # whole analysis, so report it and hand back an empty series.
        print(f"Error loading {log_dir}: {e}")
        return pd.DataFrame(columns=columns)


def parse_experiment_name(exp_name):
    """
    Parse experiment name to extract metadata.

    Expected format: {algorithm}_{source}_to_{target}_{timestamp}
    or: {algorithm}_{source}_to_{target}_pretrained_{timestamp}

    The name is split on underscores. The algorithm is the first component,
    the source and target are the single components directly before and after
    the first 'to', and the timestamp is whatever follows the target (or
    follows 'pretrained' when that marker is present after the target).

    Args:
        exp_name: Experiment directory name

    Returns:
        dict with algorithm, source, target, pretrained, timestamp, or None
        when the name does not match the expected layout (fewer than four
        components, no 'to', or 'to' without an algorithm and source before
        it and a target after it).
    """
    parts = exp_name.split('_')

    if len(parts) < 4:
        return None

    # Find 'to' index
    try:
        to_idx = parts.index('to')
    except ValueError:
        return None

    # 'to' must leave room for algorithm + source on its left and a target on
    # its right. Without this check a leading 'to' would read the source from
    # the end of the name (negative index) and a trailing 'to' would raise.
    if to_idx < 2 or to_idx + 1 >= len(parts):
        return None

    trailing = parts[to_idx + 2:]

    # Only a 'pretrained' marker that follows the target counts, matching the
    # documented format.
    pretrained = 'pretrained' in trailing
    if pretrained:
        trailing = trailing[trailing.index('pretrained') + 1:]

    return {
        'algorithm': parts[0],
        'source': parts[to_idx - 1],
        'target': parts[to_idx + 1],
        'pretrained': pretrained,
        'timestamp': '_'.join(trailing)
    }


def find_experiments(results_dir, pretrained_only=True):
    """
    Scan a results directory for transfer experiments.

    Each sub-directory whose name parses via parse_experiment_name becomes one
    experiment record. The parsed metadata is extended with 'name', 'path',
    and 'source_logs' / 'target_logs' (the path when that sub-directory
    exists, otherwise None).

    Args:
        results_dir: Path to results directory
        pretrained_only: If True, only include pretrained experiments

    Returns:
        List of experiment dicts, one per (algorithm, source, target)
        combination. When several runs share a combination, only the one with
        the greatest timestamp string is kept. An empty list is returned (and
        a message printed) when results_dir does not exist.
    """
    if not os.path.exists(results_dir):
        print(f"Results directory '{results_dir}' not found!")
        return []

    # Housekeeping directories that live next to the experiments.
    ignored_dirs = {'slurm_scripts', 'slurm_scripts_pretrained', 'slurm_logs'}

    discovered = []
    for entry in os.listdir(results_dir):
        entry_path = os.path.join(results_dir, entry)
        if not os.path.isdir(entry_path) or entry in ignored_dirs:
            continue

        info = parse_experiment_name(entry)
        if info is None:
            continue
        if pretrained_only and not info['pretrained']:
            continue

        info['name'] = entry
        info['path'] = entry_path
        for log_kind in ('source_logs', 'target_logs'):
            candidate = os.path.join(entry_path, log_kind)
            info[log_kind] = candidate if os.path.exists(candidate) else None

        discovered.append(info)

    # Keep the newest run per combination. Timestamps are compared as strings,
    # which orders correctly for the YYYYMMDD_HHMMSS format; on a tie the run
    # encountered first wins.
    newest = {}
    for info in discovered:
        combo = (info['algorithm'], info['source'], info['target'])
        incumbent = newest.get(combo)
        if incumbent is None or info['timestamp'] > incumbent['timestamp']:
            newest[combo] = info
    latest_experiments = list(newest.values())

    print(f"\nFiltering: pretrained_only={pretrained_only}")
    print(f"Found {len(discovered)} total experiments")
    print(f"Kept {len(latest_experiments)} latest unique experiments")

    # Report the runs that lost out to a newer duplicate.
    if len(discovered) > len(latest_experiments):
        print("\nDuplicate experiments found (keeping only latest):")
        kept_names = {info['name'] for info in latest_experiments}
        for info in discovered:
            if info['name'] not in kept_names:
                print(f"  Discarded: {info['name']}")

    return latest_experiments

## Load Experiments

In [None]:
# Discover the transfer runs under RESULTS_DIR and tabulate what was found.
experiments = find_experiments(RESULTS_DIR)
exp_df = pd.DataFrame(experiments)

print(f"Found {len(experiments)} transfer experiments")
print("\nTransfer experiment summary:")
if len(exp_df) == 0:
    print("No experiments found!")
else:
    print(exp_df.loc[:, ['algorithm', 'source', 'target', 'pretrained']].to_string())

# Load baseline experiments
print("\n" + "="*80)
print("LOADING BASELINE EXPERIMENTS")
print("="*80)


def _baseline_recency_key(exp_name):
    """
    Ordering key used to pick the most recent baseline run of an (algorithm, game).

    The name components after 'baseline' (the job id in the expected
    {algorithm}_{game}_baseline_{job_id} layout) are compared numerically when
    they consist of decimal digits, so job 1000 ranks after job 999; comparing
    the raw directory names as strings would order them the other way round.
    Non-numeric components compare as strings and rank after numeric ones.
    The full name breaks any remaining tie.

    Args:
        exp_name: Baseline directory name; must contain a 'baseline' component.

    Returns:
        A tuple that compares greater for more recent runs.
    """
    parts = exp_name.split('_')
    suffix = parts[parts.index('baseline') + 1:]
    ranked = [(0, int(p), '') if p.isdecimal() else (1, 0, p) for p in suffix]
    return (ranked, exp_name)


# Maps (algorithm, game) -> metadata of the chosen from-scratch run.
baseline_experiments = {}

if os.path.exists(BASELINE_DIR):
    for exp_name in os.listdir(BASELINE_DIR):
        exp_path = os.path.join(BASELINE_DIR, exp_name)

        if not os.path.isdir(exp_path):
            continue

        # Parse baseline experiment name: {algorithm}_{game}_baseline_{job_id}
        parts = exp_name.split('_')
        if len(parts) < 3 or 'baseline' not in parts:
            continue

        algorithm = parts[0]
        game = parts[1]

        # Runs without a logs directory cannot be plotted, so skip them.
        logs_dir = os.path.join(exp_path, 'logs')
        if not os.path.exists(logs_dir):
            continue

        # Store baseline experiment (keep latest if multiple)
        key = (algorithm, game)
        current = baseline_experiments.get(key)
        if current is None or _baseline_recency_key(exp_name) > _baseline_recency_key(current['name']):
            baseline_experiments[key] = {
                'algorithm': algorithm,
                'game': game,
                'name': exp_name,
                'path': exp_path,
                'logs': logs_dir
            }

print(f"\nFound {len(baseline_experiments)} baseline experiments")
if baseline_experiments:
    print("\nBaseline summary:")
    for (algo, game), exp in sorted(baseline_experiments.items()):
        print(f"  {algo.upper()}: {game}")
else:
    print("No baseline experiments found. Make sure BASELINE_DIR is correct.")

## 1. Learning Curves

Plot learning curves for each experiment, organized by algorithm.

In [None]:
def _average_on_union_steps(curves):
    """
    Average several learning curves on the union of their timesteps.

    Args:
        curves: List of DataFrames with 'step' and 'value' columns.

    Returns:
        (steps, mean_values): the sorted union of all steps and the pointwise
        mean of the curves after linear interpolation onto those steps.
    """
    steps = sorted(set(step for curve in curves for step in curve['step']))
    # np.interp holds a curve's first/last value outside its own step range,
    # so a shorter run contributes its final value to the tail of the mean.
    resampled = [np.interp(steps, curve['step'], curve['value']) for curve in curves]
    return steps, np.mean(resampled, axis=0)


def plot_learning_curves_by_algorithm(experiments, metric="rollout/ep_rew_mean"):
    """
    Plot learning curves grouped by algorithm with consistent colors and average curves.

    One two-panel figure is drawn per algorithm, shown, and saved to
    OUTPUT_DIR as ``learning_curves_{algo}.png``:
    - left: source-game training, one curve per distinct source game
    - right: target-game training, one curve per experiment
    A dashed black average is overlaid on a panel that holds more than one curve.

    Args:
        experiments: List of experiment dicts with 'algorithm', 'source',
            'target', 'pretrained', 'source_logs' and 'target_logs' keys.
        metric: TensorBoard scalar tag to plot.
    """
    # Nothing to draw; also avoids touching matplotlib for an empty input.
    if not experiments:
        return

    algorithms = sorted(set(exp['algorithm'] for exp in experiments))

    # Collect every game and every (source, target) pair so a given game/pair
    # gets the same color in every algorithm's figure.
    all_games = set()
    all_pairs = set()
    for exp in experiments:
        all_games.add(exp['source'])
        all_games.add(exp['target'])
        all_pairs.add((exp['source'], exp['target']))

    # plt.get_cmap works across matplotlib versions; plt.cm.get_cmap was
    # deprecated in 3.7 and removed in 3.9.
    source_cmap = plt.get_cmap('tab10')
    pair_cmap = plt.get_cmap('tab20')

    # Colors wrap around once the palette (10 / 20 entries) is exhausted.
    game_colors = {game: source_cmap(i % 10) for i, game in enumerate(sorted(all_games))}
    pair_colors = {pair: pair_cmap(i % 20) for i, pair in enumerate(sorted(all_pairs))}

    for algo in algorithms:
        algo_exps = [exp for exp in experiments if exp['algorithm'] == algo]

        # Create subplots for source and target
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))
        fig.suptitle(f'{algo.upper()} Learning Curves - {metric}', fontsize=14, fontweight='bold')

        # Plot source training
        ax_source = axes[0]
        plotted_sources = set()
        source_data_list = []

        for exp in algo_exps:
            if exp['source_logs'] is None or not os.path.exists(exp['source_logs']):
                continue

            # Several experiments can share a source game; draw it only once.
            if exp['source'] in plotted_sources:
                continue

            data = load_tensorboard_data(exp['source_logs'], metric)
            if len(data) > 0:
                ax_source.plot(data['step'], data['value'], label=f"{exp['source']}",
                               color=game_colors[exp['source']], alpha=0.6, linewidth=1.5)
                plotted_sources.add(exp['source'])
                source_data_list.append(data)

        # Compute and plot average for source
        if len(source_data_list) > 1:
            avg_steps, avg_values = _average_on_union_steps(source_data_list)
            ax_source.plot(avg_steps, avg_values, label='Average',
                           color='black', linewidth=3, linestyle='--', alpha=0.9, zorder=100)

        ax_source.set_xlabel('Timesteps', fontsize=11)
        ax_source.set_ylabel('Reward', fontsize=11)
        ax_source.set_title('Source Game Training (from scratch)', fontsize=12)
        ax_source.legend()
        ax_source.grid(True, alpha=0.3)

        # Plot target training
        ax_target = axes[1]
        target_data_list = []

        for exp in algo_exps:
            if exp['target_logs'] is None or not os.path.exists(exp['target_logs']):
                continue

            data = load_tensorboard_data(exp['target_logs'], metric)
            if len(data) > 0:
                pretrained_tag = " (pretrained)" if exp['pretrained'] else ""
                label = f"{exp['source']} → {exp['target']}{pretrained_tag}"
                ax_target.plot(data['step'], data['value'], label=label,
                               color=pair_colors[(exp['source'], exp['target'])],
                               alpha=0.6, linewidth=1.5)
                target_data_list.append(data)

        # Compute and plot average for target
        if len(target_data_list) > 1:
            avg_steps, avg_values = _average_on_union_steps(target_data_list)
            ax_target.plot(avg_steps, avg_values, label='Average',
                           color='black', linewidth=3, linestyle='--', alpha=0.9, zorder=100)

        ax_target.set_xlabel('Timesteps', fontsize=11)
        ax_target.set_ylabel('Reward', fontsize=11)
        ax_target.set_title('Target Game Training (with transfer)', fontsize=12)
        # The pair legend can get long, so it sits outside the axes.
        ax_target.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        ax_target.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_DIR, f'learning_curves_{algo}.png'), dpi=300, bbox_inches='tight')
        plt.show()
        # Release the figure so one-per-algorithm figures do not accumulate.
        plt.close(fig)

# Render the per-algorithm learning-curve figures for the loaded transfer runs,
# using episode reward as the plotted metric.
plot_learning_curves_by_algorithm(experiments, metric="rollout/ep_rew_mean")

## 2. Individual Transfer Comparisons

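For each algorithm and target game, compare the from-scratch baseline on that game (loaded from `BASELINE_DIR`) against every transfer run into it, and plot the absolute and relative improvement over the baseline.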

In [None]:
def smooth_curve(data, window_size=10):
    """
    Apply a centered moving average to a sequence of values.

    Args:
        data: Array of values to smooth
        window_size: Size of the smoothing window

    Returns:
        Numpy array of smoothed values. When the input has fewer than
        window_size elements it is returned unchanged (the very same object).
    """
    if len(data) >= window_size:
        # min_periods=1 lets the window shrink at both ends instead of
        # producing NaN there, so the output has the same length as the input.
        windowed = pd.Series(data).rolling(window=window_size, min_periods=1, center=True)
        return windowed.mean().values

    return data


def _align_with_baseline(baseline_data, transfer_data):
    """
    Resample a baseline curve and a transfer curve onto shared timesteps.

    The shared steps are the steps logged by both runs; when the runs have no
    step in common, the union of their steps is used instead. Both curves are
    linearly interpolated onto the chosen steps.

    Returns:
        (steps, baseline_values, transfer_values)
    """
    shared_steps = sorted(set(baseline_data['step']).intersection(set(transfer_data['step'])))
    if len(shared_steps) == 0:
        # NOTE(review): on the union, np.interp holds each curve's end value
        # outside its own range, so differences there compare against a
        # frozen value.
        shared_steps = sorted(set(list(baseline_data['step']) + list(transfer_data['step'])))
    baseline_values = np.interp(shared_steps, baseline_data['step'], baseline_data['value'])
    transfer_values = np.interp(shared_steps, transfer_data['step'], transfer_data['value'])
    return shared_steps, baseline_values, transfer_values


def _plot_average_curve(ax, step_lists, curves, smooth_window):
    """
    Draw the smoothed mean of several curves, aligned by timestep.

    Each curve is interpolated onto one common grid (every logged step that
    lies inside the range covered by all curves) before averaging, so values
    are averaged at equal timesteps rather than at equal array positions.
    Nothing is drawn for fewer than two curves or when the curves share no
    step range.
    """
    if len(curves) < 2:
        return

    first_shared = max(steps[0] for steps in step_lists)
    last_shared = min(steps[-1] for steps in step_lists)
    grid = sorted({
        step
        for steps in step_lists
        for step in steps
        if first_shared <= step <= last_shared
    })
    if not grid:
        return

    resampled = [np.interp(grid, steps, curve) for steps, curve in zip(step_lists, curves)]
    averaged = np.mean(resampled, axis=0)

    ax.plot(grid, smooth_curve(averaged, window_size=smooth_window),
            label='Average (smoothed)',
            color='black', linewidth=3, linestyle='-', alpha=0.9, zorder=100)


def plot_transfer_vs_baseline(experiments, baseline_experiments, metric="rollout/ep_rew_mean", 
                              smooth_window=10):
    """
    Plot transfer learning vs baseline with improvement metric.

    One three-panel figure is drawn per (algorithm, target game), shown, and
    saved to OUTPUT_DIR as ``transfer_vs_baseline_{algo}_{target}.png``:
    1. Baseline curve and every transfer curve for that target
    2. Absolute improvement (transfer - baseline) per run, plus smoothed average
    3. Relative improvement in percent per run, plus smoothed average
    Groups without a baseline entry, or whose baseline has no data, are
    skipped with a printed message.

    Args:
        experiments: List of transfer experiments
        baseline_experiments: Dict of baseline experiments keyed by
            (algorithm, game); each value needs a 'logs' entry
        metric: Metric to plot
        smooth_window: Window size for smoothing the average curve
    """
    # Group by algorithm and target game
    transfer_by_target = {}
    for exp in experiments:
        transfer_by_target.setdefault((exp['algorithm'], exp['target']), []).append(exp)

    for (algo, target_game), transfer_exps in sorted(transfer_by_target.items()):
        # Check if we have a baseline for this algorithm and target game
        baseline_key = (algo, target_game)
        if baseline_key not in baseline_experiments:
            print(f"Skipping {algo}/{target_game}: No baseline found")
            continue

        baseline_exp = baseline_experiments[baseline_key]

        # Load baseline data
        baseline_data = load_tensorboard_data(baseline_exp['logs'], metric)
        if len(baseline_data) == 0:
            print(f"Skipping {algo}/{target_game}: No baseline data")
            continue

        colors = plt.cm.tab10(np.linspace(0, 1, len(transfer_exps)))

        # Load and align every transfer run once; all three panels reuse it.
        runs = []
        for i, exp in enumerate(transfer_exps):
            if exp['target_logs'] is None or not os.path.exists(exp['target_logs']):
                continue

            transfer_data = load_tensorboard_data(exp['target_logs'], metric)
            if len(transfer_data) == 0:
                continue

            steps, baseline_values, transfer_values = _align_with_baseline(baseline_data, transfer_data)
            runs.append({
                'source': exp['source'],
                'color': colors[i],
                'data': transfer_data,
                'steps': steps,
                'baseline': baseline_values,
                'transfer': transfer_values,
            })

        # Create figure with 3 subplots
        fig, axes = plt.subplots(1, 3, figsize=(20, 5))
        fig.suptitle(f'{algo.upper()}: Transfer vs Baseline on {target_game}', 
                    fontsize=14, fontweight='bold')

        # Subplot 1: Learning curves comparison
        ax1 = axes[0]
        ax1.plot(baseline_data['step'], baseline_data['value'], 
                label='Baseline (from scratch)', 
                color='gray', linewidth=2.5, linestyle='--', alpha=0.8)
        for run in runs:
            ax1.plot(run['data']['step'], run['data']['value'],
                    label=f"Transfer: {run['source']} → {target_game}",
                    color=run['color'], linewidth=2, alpha=0.8)

        ax1.set_xlabel('Timesteps', fontsize=11)
        ax1.set_ylabel('Episode Reward', fontsize=11)
        ax1.set_title('Learning Curves', fontsize=12)
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Subplot 2: Improvement over baseline (absolute difference)
        ax2 = axes[1]
        all_improvements = []
        for run in runs:
            # Positive = better than baseline
            improvement = run['transfer'] - run['baseline']
            # Individual curves are kept faint so the average stands out.
            ax2.plot(run['steps'], improvement,
                    label=f"{run['source']} → {target_game}",
                    color=run['color'], linewidth=1.5, alpha=0.4)
            all_improvements.append(improvement)

        _plot_average_curve(ax2, [run['steps'] for run in runs], all_improvements, smooth_window)

        ax2.axhline(y=0, color='red', linestyle='--', linewidth=1, alpha=0.5)
        ax2.set_xlabel('Timesteps', fontsize=11)
        ax2.set_ylabel('Reward Improvement vs Baseline', fontsize=11)
        ax2.set_title('Absolute Improvement (Transfer - Baseline)', fontsize=12)
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        # Subplot 3: Relative improvement (percentage)
        ax3 = axes[2]
        all_rel_improvements = []
        for run in runs:
            # Where the baseline is (almost) zero the ratio is undefined; those
            # points are reported as 0 rather than dividing by ~0.
            with np.errstate(divide='ignore', invalid='ignore'):
                relative_improvement = np.where(
                    np.abs(run['baseline']) > 1e-6,
                    ((run['transfer'] - run['baseline']) / np.abs(run['baseline'])) * 100,
                    0
                )
            ax3.plot(run['steps'], relative_improvement,
                    label=f"{run['source']} → {target_game}",
                    color=run['color'], linewidth=1.5, alpha=0.4)
            all_rel_improvements.append(relative_improvement)

        _plot_average_curve(ax3, [run['steps'] for run in runs], all_rel_improvements, smooth_window)

        ax3.axhline(y=0, color='red', linestyle='--', linewidth=1, alpha=0.5)
        ax3.set_xlabel('Timesteps', fontsize=11)
        ax3.set_ylabel('Relative Improvement (%)', fontsize=11)
        ax3.set_title('Relative Improvement ((Transfer - Baseline) / |Baseline| × 100)', fontsize=12)
        ax3.legend()
        ax3.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_DIR, f'transfer_vs_baseline_{algo}_{target_game}.png'), 
                   dpi=300, bbox_inches='tight')
        plt.show()
        # Release the figure so one-per-target figures do not accumulate.
        plt.close(fig)

# Plot transfer vs baseline comparisons
SMOOTH_WINDOW = 10  # moving-average window for the averaged improvement curves

print("\n" + "=" * 80 + "\nTRANSFER VS BASELINE COMPARISON\n" + "=" * 80)
print(f"Smoothing window size: {SMOOTH_WINDOW} (adjustable via smooth_window parameter)")
plot_transfer_vs_baseline(experiments, baseline_experiments, smooth_window=SMOOTH_WINDOW)

## 3. Transfer Benefit Analysis

Compute transfer benefit as the improvement over from-scratch baseline.

In [None]:
def compute_final_performance(log_dir, metric="rollout/ep_rew_mean", last_n_steps=50000):
    """
    Average a metric over the final stretch of a run.

    Args:
        log_dir: Path to TensorBoard log directory
        metric: Metric name to average
        last_n_steps: Width, in timesteps, of the trailing window; every
            logged point with step >= (max step - last_n_steps) is included

    Returns:
        Mean metric value over the trailing window, or None when the run has
        no data for the metric.
    """
    curve = load_tensorboard_data(log_dir, metric)
    if len(curve) == 0:
        return None

    cutoff = curve['step'].max() - last_n_steps
    tail_values = curve.loc[curve['step'] >= cutoff, 'value']

    # Fall back to the whole run if the trailing window selects nothing.
    if len(tail_values) == 0:
        return curve['value'].mean()

    return tail_values.mean()


def compute_transfer_benefits(experiments, baseline_experiments=None):
    """
    Compute transfer benefit for each experiment.

    Transfer benefit = (target performance - baseline) / |baseline| * 100

    Args:
        experiments: List of transfer experiment dicts with 'algorithm',
            'source', 'target', 'pretrained', 'source_logs' and 'target_logs'.
        baseline_experiments: Optional dict keyed by (algorithm, game) whose
            values carry a 'logs' path. When given, the baseline is the
            from-scratch run on the experiment's *target* game, so both
            numbers in the ratio are rewards from the same game; experiments
            without such a baseline are skipped. When None, the baseline is
            the experiment's own source-game training ('source_logs'), which
            compares rewards across two different games.

    Returns:
        DataFrame with one row per experiment that has both a target
        performance and a non-zero baseline performance.
    """
    columns = [
        'algorithm', 'source', 'target', 'pretrained',
        'baseline_performance', 'target_performance', 'transfer_benefit_pct',
    ]
    results = []

    for exp in experiments:
        # Get target performance (with transfer)
        if exp['target_logs'] is None or not os.path.exists(exp['target_logs']):
            continue

        target_perf = compute_final_performance(exp['target_logs'])
        if target_perf is None:
            continue

        # Pick the logs that define "baseline" for this experiment.
        if baseline_experiments is not None:
            baseline_exp = baseline_experiments.get((exp['algorithm'], exp['target']))
            baseline_logs = baseline_exp['logs'] if baseline_exp else None
        else:
            baseline_logs = exp['source_logs']

        baseline_perf = None
        if baseline_logs and os.path.exists(baseline_logs):
            baseline_perf = compute_final_performance(baseline_logs)

        # A zero baseline makes the percentage undefined.
        if baseline_perf is None or baseline_perf == 0:
            continue

        # Compute transfer benefit
        benefit = ((target_perf - baseline_perf) / abs(baseline_perf)) * 100

        results.append({
            'algorithm': exp['algorithm'],
            'source': exp['source'],
            'target': exp['target'],
            'pretrained': exp['pretrained'],
            'baseline_performance': baseline_perf,
            'target_performance': target_perf,
            'transfer_benefit_pct': benefit
        })

    # Explicit columns keep the schema stable when no experiment qualifies.
    return pd.DataFrame(results, columns=columns)


# Compute transfer benefits. Prefer the from-scratch baselines on the target
# games; fall back to the source-game logs only when none were loaded.
if baseline_experiments:
    print("Baseline for transfer benefit: from-scratch run on each target game")
    benefits_df = compute_transfer_benefits(experiments, baseline_experiments)
else:
    print("Baseline for transfer benefit: source-game training logs (no target-game baselines loaded)")
    benefits_df = compute_transfer_benefits(experiments)

if len(benefits_df) > 0:
    print("\nTransfer Benefit Analysis:")
    print("=" * 80)
    print(benefits_df.to_string(index=False))
    
    # Save to CSV
    benefits_df.to_csv(os.path.join(OUTPUT_DIR, 'transfer_benefits.csv'), index=False)
    print(f"\nSaved to {os.path.join(OUTPUT_DIR, 'transfer_benefits.csv')}")
else:
    print("No transfer benefit data available yet.")

## 4. Transfer Benefit Visualization

In [None]:
if len(benefits_df) > 0:
    # One horizontal bar chart per algorithm, bars ordered by benefit.
    algorithms = benefits_df['algorithm'].unique()
    
    for algo in algorithms:
        algo_df = (
            benefits_df[benefits_df['algorithm'] == algo]
            .copy()
        )
        algo_df['transfer_pair'] = algo_df['source'] + ' → ' + algo_df['target']
        algo_df = algo_df.sort_values('transfer_benefit_pct')
        
        # Figure height grows with the number of pairs so labels stay legible.
        fig, ax = plt.subplots(figsize=(12, max(6, len(algo_df) * 0.4)))
        
        # Green for a positive benefit, red otherwise.
        colors = ['green' if x > 0 else 'red' for x in algo_df['transfer_benefit_pct']]
        bars = ax.barh(algo_df['transfer_pair'], algo_df['transfer_benefit_pct'], color=colors, alpha=0.7)
        
        ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
        ax.set_xlabel('Transfer Benefit (%)', fontsize=12)
        ax.set_ylabel('Transfer Pair', fontsize=12)
        ax.set_title(f'{algo.upper()} Transfer Benefit Analysis', fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3, axis='x')
        
        # Print each value just past the end of its bar, on the outward side.
        for i, value in enumerate(algo_df['transfer_benefit_pct']):
            if value > 0:
                x_pos, ha = value + 5, 'left'
            else:
                x_pos, ha = value - 5, 'right'
            ax.text(x_pos, i, f'{value:.1f}%', ha=ha, va='center', fontweight='bold')
        
        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_DIR, f'transfer_benefit_{algo}.png'), dpi=300, bbox_inches='tight')
        plt.show()
    
    # Summary statistics
    print("\n" + "="*80)
    print("TRANSFER BENEFIT SUMMARY BY ALGORITHM")
    print("="*80)
    summary = benefits_df.groupby('algorithm')['transfer_benefit_pct'].agg(['mean', 'std', 'min', 'max', 'count'])
    summary.columns = ['Mean (%)', 'Std (%)', 'Min (%)', 'Max (%)', 'N']
    print(summary.to_string())
else:
    print("No transfer benefit data to visualize.")

## 5. Transfer Matrix Heatmap

Create a heatmap showing transfer benefits between all game pairs.

In [None]:
if len(benefits_df) > 0:
    algorithms = benefits_df['algorithm'].unique()
    
    for algo in algorithms:
        algo_df = benefits_df[benefits_df['algorithm'] == algo]
        
        # Square source x target grid over every game this algorithm touched;
        # cells without a matching run keep the frame's initial NaN.
        games = sorted(set(algo_df['source']) | set(algo_df['target']))
        matrix = pd.DataFrame(index=games, columns=games, dtype=float)
        
        for src_game, tgt_game, benefit_pct in zip(
            algo_df['source'], algo_df['target'], algo_df['transfer_benefit_pct']
        ):
            matrix.at[src_game, tgt_game] = benefit_pct
        
        # Diverging palette centered on 0 so gains and losses read at a glance.
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(
            matrix,
            annot=True,
            fmt=".1f",
            cmap="RdYlGn",
            center=0,
            cbar_kws={'label': 'Transfer Benefit (%)'},
            ax=ax,
            linewidths=0.5,
            linecolor='gray',
        )
        
        ax.set_xlabel('Target Game', fontsize=12)
        ax.set_ylabel('Source Game', fontsize=12)
        ax.set_title(f'{algo.upper()} Transfer Learning Matrix\n(Source → Target)', 
                    fontsize=14, fontweight='bold')
        
        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_DIR, f'transfer_matrix_{algo}.png'), dpi=300, bbox_inches='tight')
        plt.show()
else:
    print("No data available for transfer matrix.")

## 6. Compare Pretrained vs From-Scratch Source

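If you have both pretrained and from-scratch experiments, compare them. Note that `find_experiments` keeps only pretrained runs by default (`pretrained_only=True`), so this comparison has data only when the experiments were loaded with `pretrained_only=False`.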

In [None]:
if len(benefits_df) > 0 and 'pretrained' in benefits_df.columns:
    # Coerce to a real boolean mask first: `~` on a non-bool column would be a
    # bitwise inversion of the stored values rather than a logical negation.
    is_pretrained = benefits_df['pretrained'].astype(bool)
    has_pretrained = is_pretrained.any()
    has_scratch = (~is_pretrained).any()
    
    if has_pretrained and has_scratch:
        fig, ax = plt.subplots(figsize=(12, 6))
        
        algo_names = list(benefits_df['algorithm'].unique())
        # Two groups on the x axis: from-scratch (0) and pretrained (1).
        x = np.arange(2)
        # Bars keep the 0.25 width while they fit, and shrink for larger
        # algorithm counts so one group never runs into the next.
        width = min(0.25, 0.8 / len(algo_names))
        
        for position, algo in enumerate(algo_names):
            algo_mask = benefits_df['algorithm'] == algo
            
            pretrained_mean = benefits_df.loc[algo_mask & is_pretrained, 'transfer_benefit_pct'].mean()
            scratch_mean = benefits_df.loc[algo_mask & ~is_pretrained, 'transfer_benefit_pct'].mean()
            
            ax.bar(x + position * width, [scratch_mean, pretrained_mean], width, 
                  label=algo.upper(), alpha=0.8)
        
        ax.set_xlabel('Source Model Type', fontsize=12)
        ax.set_ylabel('Average Transfer Benefit (%)', fontsize=12)
        ax.set_title('Pretrained vs From-Scratch Source Models', fontsize=14, fontweight='bold')
        # Center each tick under its group whatever the number of algorithms.
        ax.set_xticks(x + width * (len(algo_names) - 1) / 2)
        ax.set_xticklabels(['From Scratch', 'Pretrained (Zoo)'])
        ax.legend()
        ax.grid(True, alpha=0.3, axis='y')
        ax.axhline(y=0, color='black', linestyle='-', linewidth=0.8)
        
        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_DIR, 'pretrained_vs_scratch.png'), dpi=300, bbox_inches='tight')
        plt.show()
    else:
        print("Need both pretrained and from-scratch experiments for comparison.")
else:
    print("No pretrained comparison data available.")

## Summary

All plots have been saved to the `analysis_plots/` directory.

In [None]:
# Closing report: what was analyzed and which files ended up in OUTPUT_DIR.
algorithm_names = sorted({exp['algorithm'] for exp in experiments})

print("\n" + "=" * 80, "ANALYSIS COMPLETE", "=" * 80, sep="\n")
print(f"\nTotal experiments analyzed: {len(experiments)}")
print(f"Algorithms: {', '.join(algorithm_names)}")
print(f"\nPlots saved to: {OUTPUT_DIR}/")
print("\nGenerated files:")
for filename in sorted(os.listdir(OUTPUT_DIR)):
    print(f"  - {filename}")

In [None]:
# Configuration for phased experiments
PHASED_DIR = "results_phased"  # or "/mnt/home/slee1/ceph/atari_phased_results"

print(f"Phased experiments directory: {PHASED_DIR}")
# Show the resolved location only when the directory is actually present.
if os.path.exists(PHASED_DIR):
    print(f"Looking for experiments in: {os.path.abspath(PHASED_DIR)}")
else:
    print("Looking for experiments in: NOT FOUND")

In [None]:
# Configuration for phased experiments
# NOTE(review): this cell repeats the preceding cell (same PHASED_DIR value and
# the same two printed lines); one of the two cells can be deleted.
PHASED_DIR = "results_phased"  # or "/mnt/home/slee1/ceph/atari_phased_results"

print(f"Phased experiments directory: {PHASED_DIR}")
print(f"Looking for experiments in: {os.path.abspath(PHASED_DIR) if os.path.exists(PHASED_DIR) else 'NOT FOUND'}")

In [None]:
def _collect_phase_checkpoints(checkpoint_dir):
    """Return {phase_number: checkpoint_path} for each ``phase_<N>_model.zip`` in a directory."""
    phase_checkpoints = {}
    # Sorted so that, should two file names map to the same phase number, the
    # surviving entry does not depend on the arbitrary order of os.listdir().
    for ckpt_file in sorted(os.listdir(checkpoint_dir)):
        if not (ckpt_file.startswith('phase_') and ckpt_file.endswith('_model.zip')):
            continue
        phase_str = ckpt_file.replace('phase_', '').replace('_model.zip', '')
        try:
            phase_num = int(phase_str)
        except ValueError:
            # Something like "phase_final_model.zip": not a numbered phase.
            continue
        phase_checkpoints[phase_num] = os.path.join(checkpoint_dir, ckpt_file)
    return phase_checkpoints


def _parse_transfer_name(exp_name):
    """Split a transfer run directory name into its fields; return None if it does not match."""
    parts = exp_name.split('_')
    try:
        to_idx = parts.index('to')
        from_idx = parts.index('from')
    except ValueError:
        return None

    # There must be a token before 'to' (the source game) and a token between
    # 'to' and 'from' (the target game); otherwise the indices below would
    # wrap around or pick up the literal 'from' as the target.
    if to_idx < 1 or from_idx <= to_idx + 1:
        return None

    # Everything after 'from' is the checkpoint name (phase_X_model) plus job id.
    checkpoint_parts = parts[from_idx + 1:]
    if len(checkpoint_parts) < 2 or checkpoint_parts[0] != 'phase':
        return None
    try:
        phase_num = int(checkpoint_parts[1])
    except ValueError:
        return None

    return {
        'algorithm': parts[0],
        'source': parts[to_idx - 1],
        'target': parts[to_idx + 1],
        'phase': phase_num,
        'checkpoint_name': '_'.join(checkpoint_parts[:3]),  # phase_X_model
    }


def find_phased_experiments(phased_dir):
    """
    Find phased transfer learning experiments.

    Sub-directories of ``phased_dir`` are visited in sorted name order, so the
    returned structures are identical from run to run. Directories whose name
    contains 'slurm' are ignored.

    Args:
        phased_dir: Root directory containing the phased experiment folders

    Returns:
        source_experiments: Dict of source training experiments
            (directories named {algorithm}_{game}_source that contain a
            'checkpoints' sub-directory)
            key: (algorithm, source_game)
            value: {algorithm, source_game, path, checkpoints, logs, num_phases}
                checkpoints maps phase number -> path of phase_<N>_model.zip
                logs is None when the 'logs' sub-directory does not exist
        transfer_experiments: List of transfer experiments
            (directories named
            {algorithm}_{source}_to_{target}_from_{checkpoint}_{jobid})
            [{algorithm, source, target, phase, checkpoint_name, path, logs, name}, ...]

        Returns ({}, []) when ``phased_dir`` does not exist.
    """
    if not os.path.exists(phased_dir):
        print(f"Phased directory not found: {phased_dir}")
        return {}, []

    source_experiments = {}
    transfer_experiments = []

    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for exp_name in sorted(os.listdir(phased_dir)):
        exp_path = os.path.join(phased_dir, exp_name)

        if not os.path.isdir(exp_path):
            continue

        # Skip script directories
        if 'slurm' in exp_name:
            continue

        logs_dir = os.path.join(exp_path, 'logs')
        logs = logs_dir if os.path.exists(logs_dir) else None

        # Parse source experiments: {algorithm}_{game}_source
        if exp_name.endswith('_source'):
            parts = exp_name.split('_')
            if len(parts) < 3:
                continue

            checkpoint_dir = os.path.join(exp_path, 'checkpoints')
            if not os.path.exists(checkpoint_dir):
                continue

            algorithm, source_game = parts[0], parts[1]
            phase_checkpoints = _collect_phase_checkpoints(checkpoint_dir)

            source_experiments[(algorithm, source_game)] = {
                'algorithm': algorithm,
                'source_game': source_game,
                'path': exp_path,
                'checkpoints': phase_checkpoints,
                'logs': logs,
                'num_phases': len(phase_checkpoints)
            }

        # Parse transfer experiments: {algorithm}_{source}_to_{target}_from_{checkpoint}_{jobid}
        elif '_to_' in exp_name and '_from_' in exp_name:
            parsed = _parse_transfer_name(exp_name)
            if parsed is None:
                continue

            parsed['path'] = exp_path
            parsed['logs'] = logs
            parsed['name'] = exp_name
            transfer_experiments.append(parsed)

    return source_experiments, transfer_experiments


# Discover the phased runs on disk and print a short inventory of what was found.
phased_sources, phased_transfers = find_phased_experiments(PHASED_DIR)

print(f"\nFound {len(phased_sources)} source training experiments")
print(f"Found {len(phased_transfers)} transfer experiments\n")

if phased_sources:
    print("Source experiments:")
    # Keys are (algorithm, game) tuples; iterate them in sorted order.
    for algo, game in sorted(phased_sources):
        exp = phased_sources[(algo, game)]
        print(f"  {algo.upper()}/{game}: {exp['num_phases']} phases")

if phased_transfers:
    print(f"\nTransfer experiments by phase:")
    transfer_df = pd.DataFrame(phased_transfers)
    # Count runs per (algorithm, phase), then spread phases across columns.
    runs_per_phase = transfer_df.groupby(['algorithm', 'phase']).size()
    phase_summary = runs_per_phase.unstack(fill_value=0)
    print(phase_summary)

## Phased Analysis: Transfer Performance by Phase

For each phase checkpoint, compare baseline vs transfer performance.

In [None]:
def plot_phased_transfer_comparison(phased_sources, phased_transfers, baseline_experiments,
                                    metric="rollout/ep_rew_mean", smooth_window=10):
    """
    Plot transfer performance by phase compared to baseline.

    For each (algorithm, target game) pair that has a baseline:
    - Show baseline (training from scratch)
    - Show transfer from each source at each phase (one subplot per source game)
    - Compare how performance changes with more source training

    One figure per (algorithm, target) is saved to OUTPUT_DIR as
    ``phased_transfer_{algo}_{target}.png`` and then shown.

    Args:
        phased_sources: Source experiments dict. Not used by this function;
            kept so existing callers keep working.
        phased_transfers: List of transfer experiment dicts with at least the
            keys 'algorithm', 'source', 'target', 'phase' and 'logs'
        baseline_experiments: Mapping (algorithm, target_game) -> experiment
            dict with a 'logs' entry
        metric: TensorBoard scalar tag to plot
        smooth_window: Width (in logged points) of the rolling mean applied to
            every curve; values <= 1 or None disable smoothing

    Returns:
        None
    """
    def _smooth(values):
        # Rolling mean with min_periods=1 so the curve keeps its full length
        # and the first points are averaged over whatever is available.
        if smooth_window and smooth_window > 1:
            return pd.Series(values).rolling(window=int(smooth_window), min_periods=1).mean()
        return values

    # Group transfers by algorithm and target
    transfers_by_target = {}
    for transfer in phased_transfers:
        key = (transfer['algorithm'], transfer['target'])
        transfers_by_target.setdefault(key, []).append(transfer)

    for (algo, target_game), transfers in sorted(transfers_by_target.items()):
        # Check for baseline
        baseline_key = (algo, target_game)
        if baseline_key not in baseline_experiments:
            print(f"Skipping {algo}/{target_game}: No baseline found")
            continue

        baseline_exp = baseline_experiments[baseline_key]
        baseline_data = load_tensorboard_data(baseline_exp['logs'], metric)

        if len(baseline_data) == 0:
            print(f"Skipping {algo}/{target_game}: No baseline data")
            continue

        # The baseline curve is identical in every subplot; smooth it once.
        baseline_values = _smooth(baseline_data['value'])

        # Group transfers by source and phase. If several runs share the same
        # (source, phase), the last one in the list is the one plotted.
        transfers_by_source = {}
        for transfer in transfers:
            transfers_by_source.setdefault(transfer['source'], {})[transfer['phase']] = transfer

        # Create figure - one subplot per source game. squeeze=False always
        # yields a 2-D array, so the single-subplot case needs no special path.
        num_sources = len(transfers_by_source)
        fig, axes = plt.subplots(1, num_sources, figsize=(8 * num_sources, 6), squeeze=False)
        axes = axes[0]

        fig.suptitle(f'{algo.upper()}: Transfer to {target_game} by Phase',
                    fontsize=14, fontweight='bold')

        for ax, (source_game, phase_transfers) in zip(axes, sorted(transfers_by_source.items())):
            # Plot baseline (high zorder keeps it on top of the phase curves)
            ax.plot(baseline_data['step'], baseline_values,
                   label='Baseline (from scratch)',
                   color='gray', linewidth=2.5, linestyle='--', alpha=0.8, zorder=100)

            # Plot each phase transfer, coloured from early (dark) to late (light)
            phases = sorted(phase_transfers.keys())
            colors = plt.cm.viridis(np.linspace(0, 1, len(phases)))

            for i, phase in enumerate(phases):
                transfer = phase_transfers[phase]
                if transfer['logs'] is None or not os.path.exists(transfer['logs']):
                    continue

                data = load_tensorboard_data(transfer['logs'], metric)
                if len(data) == 0:
                    continue

                label = f"Phase {phase}"
                ax.plot(data['step'], _smooth(data['value']),
                       label=label, color=colors[i], linewidth=2, alpha=0.7)

            ax.set_xlabel('Timesteps', fontsize=11)
            ax.set_ylabel('Episode Reward', fontsize=11)
            ax.set_title(f'Source: {source_game}', fontsize=12)
            ax.legend(loc='best')
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_DIR, f'phased_transfer_{algo}_{target_game}.png'),
                   dpi=300, bbox_inches='tight')
        plt.show()
        # Release the figure so a long loop does not accumulate open figures.
        plt.close(fig)


# Draw the per-phase learning-curve comparison; both the phased transfer runs
# and the from-scratch baselines are required.
if not (phased_transfers and baseline_experiments):
    print("Need both phased transfers and baseline experiments for comparison.")
else:
    section_rule = "=" * 80
    print("\n" + section_rule)
    print("PHASED TRANSFER ANALYSIS")
    print(section_rule)
    plot_phased_transfer_comparison(phased_sources, phased_transfers, baseline_experiments)

## Final Performance by Phase

Analyze how final transfer performance changes with more source training.

In [None]:
def analyze_performance_by_phase(phased_transfers, baseline_experiments,
                                metric="rollout/ep_rew_mean", last_n_steps=50000):
    """
    Analyze how transfer performance changes with phase.

    Each transfer run is compared with the baseline for the same
    (algorithm, target) pair. A run is skipped when its logs are missing, when
    no baseline exists for it, when either performance value is None, or when
    the baseline performance is (numerically) zero, since the relative
    improvement is then undefined.

    Args:
        phased_transfers: List of transfer experiment dicts with the keys
            'algorithm', 'source', 'target', 'phase' and 'logs'
        baseline_experiments: Mapping (algorithm, target_game) -> experiment
            dict with a 'logs' entry
        metric: TensorBoard scalar tag passed to compute_final_performance
        last_n_steps: Passed through to compute_final_performance

    Returns:
        DataFrame with columns:
        - algorithm, source, target, phase
        - baseline_perf, transfer_perf, improvement, improvement_pct
        The columns are present even when no run qualifies (zero rows).
    """
    columns = ['algorithm', 'source', 'target', 'phase',
               'baseline_perf', 'transfer_perf', 'improvement', 'improvement_pct']
    results = []
    # Many transfers share one baseline; read each baseline's logs only once.
    baseline_perf_cache = {}

    for transfer in phased_transfers:
        if transfer['logs'] is None or not os.path.exists(transfer['logs']):
            continue

        # Check the cheap condition first: without a baseline there is no
        # point in loading the transfer run's logs.
        baseline_key = (transfer['algorithm'], transfer['target'])
        if baseline_key not in baseline_experiments:
            continue

        # Get transfer performance
        transfer_perf = compute_final_performance(transfer['logs'], metric, last_n_steps)
        if transfer_perf is None:
            continue

        # Get baseline performance
        if baseline_key not in baseline_perf_cache:
            baseline_exp = baseline_experiments[baseline_key]
            baseline_perf_cache[baseline_key] = compute_final_performance(
                baseline_exp['logs'], metric, last_n_steps)
        baseline_perf = baseline_perf_cache[baseline_key]

        if baseline_perf is None or abs(baseline_perf) < 1e-6:
            continue

        # Compute improvement; abs() keeps the sign meaningful for negative
        # baseline rewards.
        improvement = transfer_perf - baseline_perf
        improvement_pct = (improvement / abs(baseline_perf)) * 100

        results.append({
            'algorithm': transfer['algorithm'],
            'source': transfer['source'],
            'target': transfer['target'],
            'phase': transfer['phase'],
            'baseline_perf': baseline_perf,
            'transfer_perf': transfer_perf,
            'improvement': improvement,
            'improvement_pct': improvement_pct
        })

    return pd.DataFrame(results, columns=columns)


# Build the per-phase results table, print it, and persist it as CSV.
if phased_transfers and baseline_experiments:
    phased_results = analyze_performance_by_phase(phased_transfers, baseline_experiments)

    if len(phased_results) == 0:
        print("No phased results data available yet.")
    else:
        phased_csv_path = os.path.join(OUTPUT_DIR, 'phased_transfer_results.csv')

        print("\nPerformance by Phase:")
        print("="*80)
        print(phased_results.to_string(index=False))

        # Save to CSV
        phased_results.to_csv(phased_csv_path, index=False)
        print(f"\nSaved to {phased_csv_path}")
else:
    print("Need phased transfers and baseline experiments.")

## Phase Progression Plot

Show how transfer benefit changes with more source training.

In [None]:
# One figure per algorithm with two panels (absolute and relative improvement
# over the baseline, by source-training phase), followed by a per-phase
# summary table.
if 'phased_results' not in locals() or len(phased_results) == 0:
    print("No phased results available for progression plot.")
else:
    # (column to plot, y-axis label, panel title) for the left and right panel.
    panel_specs = [
        ('improvement', 'Improvement vs Baseline (Reward)', 'Absolute Improvement'),
        ('improvement_pct', 'Improvement vs Baseline (%)', 'Relative Improvement'),
    ]

    for algo in phased_results['algorithm'].unique():
        algo_df = phased_results[phased_results['algorithm'] == algo]

        fig, axes = plt.subplots(1, 2, figsize=(16, 6))
        fig.suptitle(f'{algo.upper()}: Transfer Performance vs Source Training Phase',
                    fontsize=14, fontweight='bold')

        for ax, (column, ylabel, title) in zip(axes, panel_specs):
            # One line per source -> target pair, ordered by phase.
            for (source, target), group in algo_df.groupby(['source', 'target']):
                ordered = group.sort_values('phase')
                ax.plot(ordered['phase'], ordered[column],
                        marker='o', label=f"{source} → {target}",
                        linewidth=2, markersize=8)

            # Zero line: above it the transfer run beats the baseline.
            ax.axhline(y=0, color='red', linestyle='--', linewidth=1, alpha=0.5)
            ax.set_xlabel('Source Training Phase', fontsize=11)
            ax.set_ylabel(ylabel, fontsize=11)
            ax.set_title(title, fontsize=12)
            ax.legend(loc='best')
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        figure_path = os.path.join(OUTPUT_DIR, f'phased_progression_{algo}.png')
        plt.savefig(figure_path, dpi=300, bbox_inches='tight')
        plt.show()

        # Summary statistics
        print(f"\n{algo.upper()} - Summary by Phase:")
        print("="*80)
        phase_summary = algo_df.groupby('phase').agg({
            'improvement_pct': ['mean', 'std', 'min', 'max'],
            'transfer_perf': 'mean'
        })
        # Flatten the two-level column index produced by agg().
        phase_summary.columns = ['Mean Improvement (%)', 'Std (%)', 'Min (%)', 'Max (%)', 'Mean Transfer Perf']
        print(phase_summary.to_string())

## Heatmap: Transfer Benefit by Phase

Visualize how different source games transfer to targets at each phase.

In [None]:
# One figure per algorithm: a grid of source x target heatmaps (one per
# phase, at most three per row) showing the relative improvement.
if 'phased_results' not in locals() or len(phased_results) == 0:
    print("No phased results for heatmap.")
else:
    for algo in phased_results['algorithm'].unique():
        algo_df = phased_results[phased_results['algorithm'] == algo]

        # Get unique phases
        phases = sorted(algo_df['phase'].unique())
        num_phases = len(phases)

        # Grid layout: up to 3 columns, as many rows as needed (ceiling division).
        cols = min(3, num_phases)
        rows = -(-num_phases // cols)

        # squeeze=False always returns a 2-D array of axes, so a single
        # subplot needs no special handling before flattening.
        fig, axes_grid = plt.subplots(rows, cols, figsize=(7 * cols, 6 * rows), squeeze=False)
        axes = axes_grid.flatten()

        fig.suptitle(f'{algo.upper()}: Transfer Benefit Heatmap by Phase',
                    fontsize=16, fontweight='bold')

        for ax, phase in zip(axes, phases):
            phase_df = algo_df[algo_df['phase'] == phase]

            # Source x target matrix; pairs without a result stay NaN.
            sources = sorted(phase_df['source'].unique())
            targets = sorted(phase_df['target'].unique())
            matrix = pd.DataFrame(index=sources, columns=targets, dtype=float)
            for src, tgt, pct in zip(phase_df['source'], phase_df['target'],
                                     phase_df['improvement_pct']):
                matrix.loc[src, tgt] = pct

            # Plot heatmap (diverging colour map centred on zero improvement)
            sns.heatmap(matrix, annot=True, fmt=".1f", cmap="RdYlGn", center=0,
                       cbar_kws={'label': 'Improvement (%)'},
                       linewidths=0.5, linecolor='gray', ax=ax)
            ax.set_xlabel('Target Game', fontsize=10)
            ax.set_ylabel('Source Game', fontsize=10)
            ax.set_title(f'Phase {phase}', fontsize=12, fontweight='bold')

        # Hide extra subplots left over when the grid is not completely filled
        for unused_ax in axes[num_phases:]:
            unused_ax.axis('off')

        plt.tight_layout()
        heatmap_path = os.path.join(OUTPUT_DIR, f'phased_heatmap_{algo}.png')
        plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
        plt.show()

## Phased Analysis Summary

All phased-experiment plots and the per-phase results CSV have been saved to the `analysis_plots/` directory.

In [None]:
# Closing recap of the phased analysis: experiment count, algorithms, phases,
# and the phased output files found in OUTPUT_DIR.
print("\n" + "="*80)
print("PHASED ANALYSIS COMPLETE")
print("="*80)

if 'phased_results' not in locals() or len(phased_results) == 0:
    print("\nNo phased experiments found or not yet complete.")
    print(f"Make sure {PHASED_DIR} contains completed phased experiments.")
else:
    algorithms_in_results = sorted(phased_results['algorithm'].unique())
    phases_in_results = sorted(phased_results['phase'].unique())

    print(f"\nTotal phased experiments analyzed: {len(phased_results)}")
    print(f"Algorithms: {', '.join(algorithms_in_results)}")
    print(f"Phases: {phases_in_results}")
    print(f"\nPhased plots saved to: {OUTPUT_DIR}/")

    # Only files whose name contains 'phased' belong to this part of the notebook.
    phased_files = sorted(f for f in os.listdir(OUTPUT_DIR) if 'phased' in f)
    if phased_files:
        print("\nGenerated phased analysis files:")
        for f in phased_files:
            print(f"  - {f}")