In [2]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import glob
from collections import defaultdict
import argparse

# Seed identifiers; collect_algorithm_metrics looks for one sub-directory per seed.
SEEDS = [13, 19, 42, 94, 1337]
# Step values at which metrics are read: 100k, 200k, ..., 500k.
STEPS = list(range(100_000, 500_001, 100_000))
# Algorithm names, in the order used for plots and table rows.
ALGORITHMS = ['Dreamer', 'R2I', 'DRQN']

def extract_metrics(metrics_file, metric_name="episodes/score_mean"):
    """Extract one metric from a JSONL file, keyed by step.

    Parameters
    ----------
    metrics_file : object with an ``open(mode)`` method (e.g. ``pathlib.Path``)
        File containing one JSON document per line.
    metric_name : str
        Key to look up in every record; records without it are ignored.

    Returns
    -------
    dict
        Maps ``record["step"]`` to ``record[metric_name]``. If a step occurs
        more than once, the last record in the file wins.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record contains ``metric_name`` but has no ``"step"`` key.
    """
    steps_to_values = {}
    with metrics_file.open('r') as f:
        for line in f:
            # Blank / whitespace-only lines (e.g. a stray trailing newline)
            # are not JSON documents; json.loads would raise on them.
            if not line.strip():
                continue
            data = json.loads(line)
            if metric_name in data:
                steps_to_values[data["step"]] = data[metric_name]

    return steps_to_values

def collect_algorithm_metrics(algorithm_dir, eval_dir, metric_name="episodes/score_mean"):
    """Collect a metric at every step in ``STEPS`` for each seed of an algorithm.

    For every seed in the module-level ``SEEDS`` list, the file
    ``algorithm_dir / "<seed>" / eval_dir / "metrics.jsonl"`` is read with
    ``extract_metrics``. Seeds whose file does not exist are reported with a
    printed message and skipped.

    Parameters
    ----------
    algorithm_dir : path object supporting ``/`` (e.g. ``pathlib.Path``)
        Directory that contains one sub-directory per seed.
    eval_dir : str
        Name of the evaluation sub-directory inside each seed directory.
    metric_name : str
        Metric key forwarded to ``extract_metrics``.

    Returns
    -------
    numpy.ndarray
        Array with one row per seed that was found and one column per entry
        of ``STEPS``. When no seed file exists the result has shape
        ``(0, len(STEPS))`` so the step axis is still present.

    Raises
    ------
    KeyError
        If a metrics file lacks the metric for any step in ``STEPS``; the
        message names the file, the metric and the missing steps.
    """
    all_seed_metrics = []

    for seed in SEEDS:
        metrics_file = algorithm_dir / f"{seed}" / eval_dir / "metrics.jsonl"
        if not metrics_file.exists():
            print(f"Metrics file {metrics_file} does not exist")
            continue
        steps_to_values = extract_metrics(metrics_file, metric_name)

        # Fail with context instead of a bare KeyError(step).
        missing_steps = [step for step in STEPS if step not in steps_to_values]
        if missing_steps:
            raise KeyError(
                f"Metric {metric_name!r} is missing for steps {missing_steps} in {metrics_file}"
            )
        all_seed_metrics.append([steps_to_values[step] for step in STEPS])

    if not all_seed_metrics:
        # np.array([]) would be 1-D; keep the (seeds, steps) layout.
        return np.empty((0, len(STEPS)))
    # return array of seeds x steps
    return np.array(all_seed_metrics)

In [2]:
import pathlib
from rliable import metrics
from rliable import library as rly
from rliable import plot_utils
from collections import defaultdict




def scores_2_iqm(scores):
    """Apply ``metrics.aggregate_iqm`` to every slice along the last axis.

    For each index ``k`` of the last axis of ``scores``, the slice
    ``scores[..., k]`` is passed to ``metrics.aggregate_iqm``; the results are
    returned as a NumPy array with one entry per index.
    """
    num_slices = scores.shape[-1]
    per_slice_iqm = []
    for env_step_idx in range(num_slices):
        per_slice_iqm.append(metrics.aggregate_iqm(scores[..., env_step_idx]))
    return np.array(per_slice_iqm)


# Root directory that holds every experiment run referenced below.
base_dir = pathlib.Path('/work/dlclarge1/ramans-powm/powm/experiments/mordor_hike')

# env2algos[difficulty][algorithm] -> run directory
# ("<run prefix>_<difficulty>_tuned" under base_dir).
env2algos = {
    env: {
        algo: base_dir / f'{run_prefix}_{env}_tuned'
        for algo, run_prefix in (
            ('Dreamer', '044_dreamer'),
            ('R2I', '046_r2i'),
            ('DRQN', '045_drqn'),
        )
    }
    for env in ('easy', 'medium', 'hard')
}


# Per-metric plot settings: y-axis label and output file stem.
metric_meta = dict(
    score_mean=dict(ylabel='Average Score', file_name='iqm_scores'),
    episodic_kldiv=dict(ylabel='Average KL Divergence', file_name='episodic_kldiv'),
    success_mean=dict(ylabel='Success Rate', file_name='success_mean'),
)

# Metric-key prefix of each trajectory type -> suffix used in plot file names.
trajectory_meta = dict(
    episodes='in_distribution',
    noisy_episodes='noisy_ood',
    waypoint_episodes='waypoint_ood',
)

# Name of the evaluation sub-directory inside every seed directory; the same
# name is used as the local output directory for figures and tables.
eval_dir = 'eval_with_angle'
# Nested store of everything that gets plotted / tabulated:
# plot_data[metric][env][trajectory_type] -> {'iqm_scores': ..., 'iqm_cis': ...}
plot_data = defaultdict(lambda: defaultdict(dict))
save_path = pathlib.Path(eval_dir)
save_path.mkdir(parents=True, exist_ok=True)

# Running global min / max per metric, used later for shared y-limits.
metric_global_min = defaultdict(lambda: float('inf'))
metric_global_max = defaultdict(lambda: float('-inf'))

# Loop through each trajectory type and metric to compute and store the data
for trajectory_type in trajectory_meta:
    for metric in metric_meta:
        # Collect data for all environments
        for env in ['easy', 'medium', 'hard']:
            algo_scores = {
                algo: collect_algorithm_metrics(
                    pathlib.Path(path), eval_dir, f'{trajectory_type}/{metric}')
                for algo, path in env2algos[env].items()
            }

            # Point estimates and bootstrap interval estimates per algorithm.
            iqm_scores, iqm_cis = rly.get_interval_estimates(
                algo_scores, scores_2_iqm, reps=50000)

            # Update global min and max for this metric. The interval bounds
            # are included as well as the point estimates: the plots shade the
            # intervals, and y-limits derived from the point estimates alone
            # can cut those bands off.
            for algo in ALGORITHMS:
                metric_global_min[metric] = min(
                    metric_global_min[metric],
                    np.min(iqm_scores[algo]),
                    np.min(iqm_cis[algo]))
                metric_global_max[metric] = max(
                    metric_global_max[metric],
                    np.max(iqm_scores[algo]),
                    np.max(iqm_cis[algo]))

            # Store the data for plots and tables
            plot_data[metric][env][trajectory_type] = {
                'iqm_scores': iqm_scores,
                'iqm_cis': iqm_cis
            }

In [14]:
# x tick labels ("100k", "200k", ...) are identical for every panel.
step_labels = [f"{step / 1000:.0f}k" for step in STEPS]

for trajectory_type, trajectory_name in trajectory_meta.items():
    for metric, metric_info in metric_meta.items():
        # One figure per (trajectory type, metric): easy | medium | hard panels
        fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)

        # Shared y-range from the global extrema of this metric, widened by 5%
        # so curves do not touch the panel edges.
        y_min = metric_global_min[metric]
        y_max = metric_global_max[metric]
        padding = abs((y_max - y_min) * 0.05)
        y_min, y_max = y_min - padding, y_max + padding

        for i, env in enumerate(['easy', 'medium', 'hard']):
            ax = axes[i]
            curves = plot_data[metric][env][trajectory_type]
            is_first_panel = i == 0
            is_middle_panel = i == 1

            # Axis labels appear once per figure: y on the first panel,
            # x on the middle one. The legend is added at figure level below.
            plot_utils.plot_sample_efficiency_curve(
                step_labels,
                curves['iqm_scores'],
                curves['iqm_cis'],
                algorithms=ALGORITHMS,
                ax=ax,
                xlabel=r'Steps' if is_middle_panel else '',
                ylabel=metric_info['ylabel'] if is_first_panel else '',
                legend=False)

            ax.set_ylim(y_min, y_max)
            # Enlarge whichever axis label this panel carries
            if is_middle_panel:
                ax.xaxis.label.set_fontsize(18)
            if is_first_panel:
                ax.yaxis.label.set_fontsize(18)

            ax.set_title(env.capitalize(), fontsize=18)

        # Single legend for the whole figure, taken from the first panel
        handles, labels = axes[0].get_legend_handles_labels()
        fig.legend(handles, labels, loc='lower right', ncol=len(ALGORITHMS), fontsize=18)

        # Lay out, leave room at the bottom for the legend, then save
        plt.tight_layout()
        fig.subplots_adjust(bottom=0.2)
        output_file = save_path / f'{metric_info["file_name"]}_{trajectory_name}.pdf'
        fig.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close(fig)


In [None]:


def create_consolidated_latex_tables(plot_data, save_path, metrics_to_include=None, traj_types_to_include=None, file_suffix=""):
    """
    Create one LaTeX table per metric showing every algorithm's final-step
    value across the selected trajectory types and the three difficulty levels.

    Rows are the entries of the module-level ``ALGORITHMS`` list; columns are
    grouped by trajectory type, with Easy / Medium / Hard inside each group.
    The best value of every column (max or min, depending on the metric) is
    set in bold. Each table is written to ``save_path`` and its path printed.

    Parameters:
    -----------
    plot_data: Nested mapping indexed as
        plot_data[metric][env][traj_type]['iqm_scores'][algo][step_index]
    save_path: Directory for the generated .tex files (must support ``/``)
    metrics_to_include: List of metrics to include (if None, includes all metrics)
    traj_types_to_include: List of trajectory types to include (if None, includes all types)
    file_suffix: Optional suffix appended verbatim to both the output
        filename stem and the LaTeX ``\\label`` (e.g. "_ood")
    """

    all_metrics_table = {
        'score_mean': {
            'caption': 'Final average score comparison (IQM across seeds) across different evaluation conditions and difficulty levels. Agent is evaluated at the 500k steps. Higher values are better.',
            'label': 'tab:final_scores_consolidated',
            'filename': 'final_scores_consolidated',
            'higher_is_better': True
        },
        'episodic_kldiv': {
            'caption': 'Final KL divergence comparison (IQM across seeds) across different evaluation conditions and difficulty levels. Agent is evaluated at the 500k steps. Lower values indicate better state estimation.',
            'label': 'tab:final_kldiv_consolidated',
            'filename': 'final_kldiv_consolidated',
            'higher_is_better': False
        },
        'success_mean': {
            'caption': 'Final average success rate comparison (IQM across seeds) across different evaluation conditions and difficulty levels. Agent is evaluated at the 500k steps. Higher values are better.',
            'label': 'tab:final_success_consolidated',
            'filename': 'final_success_consolidated',
            'higher_is_better': True
        }
    }
    if metrics_to_include is not None:
        all_metrics_table = {k: v for k, v in all_metrics_table.items() if k in metrics_to_include}

    # Default trajectory types if not specified
    if traj_types_to_include is None:
        traj_types_to_include = ['episodes', 'noisy_episodes', 'waypoint_episodes']

    traj_display_names = {
        'episodes': 'In-distribution',
        'noisy_episodes': 'Action noise OOD',
        'waypoint_episodes': 'Waypoint OOD'
    }

    envs = ['easy', 'medium', 'hard']

    # The last stored step is reported as the "final" value
    final_step_idx = -1

    num_traj_types = len(traj_types_to_include)
    num_cols_per_traj = len(envs)

    # Create one table per metric
    for metric, meta in all_metrics_table.items():
        # Read every cell's final-step value once; reused for best-value
        # detection and for the data rows.
        final_values = {
            (traj_type, env, algo): plot_data[metric][env][traj_type]['iqm_scores'][algo][final_step_idx]
            for traj_type in traj_types_to_include
            for env in envs
            for algo in ALGORITHMS
        }

        # Best value per column
        pick_best = max if meta['higher_is_better'] else min
        best_values = {
            (traj_type, env): pick_best(final_values[(traj_type, env, algo)] for algo in ALGORITHMS)
            for traj_type in traj_types_to_include
            for env in envs
        }

        # Table preamble; column groups are separated by vertical rules
        column_spec = "l|" + ("c" * num_cols_per_traj + "|") * (num_traj_types - 1) + "c" * num_cols_per_traj
        table = "\\begin{table}[ht]\n\\centering\n\\resizebox{\\textwidth}{!}{\n"
        table += f"\\begin{{tabular}}{{{column_spec}}}\n\\toprule\n"

        # Trajectory-type headers, each spanning one group of difficulty columns
        table += "& "
        for i, traj_type in enumerate(traj_types_to_include):
            is_last = i == num_traj_types - 1
            border, separator = ("", "") if is_last else ("|", "& ")
            table += (
                f"\\multicolumn{{{num_cols_per_traj}}}{{c{border}}}"
                f"{{\\textbf{{{traj_display_names[traj_type]}}}}} {separator}"
            )
        table += "\\\\\n"

        # Difficulty-level subheaders, repeated for every trajectory type
        table += "\\textbf{Algorithm} "
        for _ in range(num_traj_types):
            table += "".join(f"& \\textbf{{{env.capitalize()}}} " for env in envs)
        table += "\\\\\n\\midrule\n"

        # Data rows
        for algo in ALGORITHMS:
            table += f"{algo} "
            for traj_type in traj_types_to_include:
                for env in envs:
                    final_value = final_values[(traj_type, env, algo)]

                    # Tolerance comparison so exact ties are all highlighted
                    is_best = abs(final_value - best_values[(traj_type, env)]) < 1e-6

                    if metric == 'success_mean':
                        # Format as percentage
                        formatted_value = f"{final_value*100:.1f}\\%"
                    elif metric == 'episodic_kldiv':
                        # Format with more decimals for small values
                        formatted_value = f"{final_value:.3f}"
                    else:
                        # Standard format for scores
                        formatted_value = f"{final_value:.2f}"

                    if is_best:
                        table += f"& \\textbf{{{formatted_value}}} "
                    else:
                        table += f"& {formatted_value} "
            table += "\\\\\n"

        # Finish the table
        table += "\\bottomrule\n\\end{tabular}\n}\n"
        table += f"\\caption{{{meta['caption']}}}\n"
        # The suffix is appended exactly as for the filename, so an empty
        # suffix leaves the label unchanged and "_ood" yields "..._ood"
        # (no trailing or doubled underscore).
        table += f"\\label{{{meta['label']}{file_suffix}}}\n"
        table += "\\end{table}"

        filename = f"{meta['filename']}{file_suffix}.tex"

        # Save the table to file
        with open(save_path / filename, 'w') as f:
            f.write(table)

        print(f"Created consolidated LaTeX table for {metric} at {save_path / filename}")


# All metrics, restricted to the two OOD trajectory types; files get the "_ood" suffix.
create_consolidated_latex_tables(
    plot_data,
    save_path,
    metrics_to_include=None,
    traj_types_to_include=['noisy_episodes', 'waypoint_episodes'],
    file_suffix="_ood",
)

# Success rate only, across every trajectory type, with no filename suffix.
create_consolidated_latex_tables(
    plot_data,
    save_path,
    metrics_to_include=['success_mean'],
    traj_types_to_include=None,
    file_suffix="",
)


NameError: name 'plot_data' is not defined

In [12]:

# Per-metric templates for the per-trajectory tables. Each "{}" placeholder is
# filled by create_separate_latex_tables: the caption with the display name,
# the label and filename with the trajectory label.
metrics_to_table = dict(
    score_mean=dict(
        caption_template='\\textbf{{{}: Average score}} comparison across difficulty levels. We report the IQM across seeds. Agent is evaluated at the 500k steps. Higher values are better.',
        label_template='tab:final_scores_{}',
        filename_template='final_scores_{}.tex',
        higher_is_better=True,
    ),
    episodic_kldiv=dict(
        caption_template='\\textbf{{{}: Average KL divergence}} comparison across difficulty levels. We report the IQM across seeds. Agent is evaluated at the 500k steps. Lower values indicate better state estimation.',
        label_template='tab:final_kldiv_{}',
        filename_template='final_kldiv_{}.tex',
        higher_is_better=False,
    ),
    success_mean=dict(
        caption_template='\\textbf{{{}: Success rate}} comparison across difficulty levels. We report the IQM across seeds. Agent is evaluated at the 500k steps. Higher values are better.',
        label_template='tab:final_success_{}',
        filename_template='final_success_{}.tex',
        higher_is_better=True,
    ),
)

# Trajectory type -> human-readable name used in captions.
traj_display_names = dict(
    episodes='In-distribution',
    noisy_episodes='Action noise OOD',
    waypoint_episodes='Waypoint OOD',
)
# Trajectory type -> identifier used in LaTeX labels and file names.
trajectory_label = dict(
    episodes='in_distribution',
    noisy_episodes='action_noise_ood',
    waypoint_episodes='waypoint_ood',
)

def create_separate_latex_tables(plot_data, save_path):
    """
    Write one LaTeX table per (trajectory type, metric) pair.

    Trajectory types come from the module-level ``traj_display_names``, metrics
    from ``metrics_to_table`` and rows from ``ALGORITHMS``. Every table has one
    row per algorithm and one column per difficulty level (easy, medium, hard),
    filled with the last entry of
    ``plot_data[metric][env][traj_type]['iqm_scores'][algo]``.
    The best entry of each column (largest or smallest, according to the
    metric's ``higher_is_better`` flag) is set in bold. Each table is written
    into ``save_path`` and its path printed.
    """
    difficulty_levels = ('easy', 'medium', 'hard')
    last_step = -1  # index of the final stored step

    def render(metric_key, value):
        # Success rates as percentages, KL with three decimals, everything
        # else with two.
        if metric_key == 'success_mean':
            return f"{value*100:.1f}\\%"
        if metric_key == 'episodic_kldiv':
            return f"{value:.3f}"
        return f"{value:.2f}"

    for traj_type, traj_name in traj_display_names.items():
        for metric, meta in metrics_to_table.items():
            # Fill the templates for this trajectory type
            caption = meta['caption_template'].format(traj_name)
            traj_label = trajectory_label[traj_type]
            label = meta['label_template'].format(traj_label)
            filename = meta['filename_template'].format(traj_label)

            # Final-step value per (difficulty, algorithm)
            finals = {
                env: {
                    algo: plot_data[metric][env][traj_type]['iqm_scores'][algo][last_step]
                    for algo in ALGORITHMS
                }
                for env in difficulty_levels
            }

            # Column-wise best value
            choose = max if meta['higher_is_better'] else min
            best_values = {
                env: choose([finals[env][algo] for algo in ALGORITHMS])
                for env in difficulty_levels
            }

            # One row per algorithm
            rows = []
            for algo in ALGORITHMS:
                cells = []
                for env in difficulty_levels:
                    value = finals[env][algo]
                    cell = render(metric, value)
                    # Within tolerance of the column's best -> bold
                    if abs(value - best_values[env]) < 1e-6:
                        cell = f"\\textbf{{{cell}}}"
                    cells.append(f"& {cell} ")
                rows.append(f"{algo} " + "".join(cells) + "\\\\\n")

            table = "".join([
                "\\begin{table}[ht]\n\\centering\n",
                "\\begin{tabular}{l|ccc}\n\\toprule\n",
                "\\textbf{Algorithm} & \\textbf{Easy} & \\textbf{Medium} & \\textbf{Hard} \\\\\n\\midrule\n",
                *rows,
                "\\bottomrule\n\\end{tabular}\n",
                f"\\caption{{{caption}}}\n",
                f"\\label{{{label}}}\n",
                "\\end{table}",
            ])

            # Save the table to file
            target = save_path / filename
            with open(target, 'w') as f:
                f.write(table)

            print(f"Created LaTeX table for {metric} ({traj_name}) at {save_path / filename}")

# Write the per-trajectory-type tables into the output directory
create_separate_latex_tables(plot_data=plot_data, save_path=save_path)

Created LaTeX table for score_mean (In-distribution) at eval_with_angle/final_scores_in_distribution.tex
Created LaTeX table for episodic_kldiv (In-distribution) at eval_with_angle/final_kldiv_in_distribution.tex
Created LaTeX table for success_mean (In-distribution) at eval_with_angle/final_success_in_distribution.tex
Created LaTeX table for score_mean (Action noise OOD) at eval_with_angle/final_scores_action_noise_ood.tex
Created LaTeX table for episodic_kldiv (Action noise OOD) at eval_with_angle/final_kldiv_action_noise_ood.tex
Created LaTeX table for success_mean (Action noise OOD) at eval_with_angle/final_success_action_noise_ood.tex
Created LaTeX table for score_mean (Waypoint OOD) at eval_with_angle/final_scores_waypoint_ood.tex
Created LaTeX table for episodic_kldiv (Waypoint OOD) at eval_with_angle/final_kldiv_waypoint_ood.tex
Created LaTeX table for success_mean (Waypoint OOD) at eval_with_angle/final_success_waypoint_ood.tex
