In [None]:
import os
import glob
import re
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from offline_compositional_rl_datasets.utils.data_utils import *

# Repository root. Set the COMPOSITIONAL_RL_ROOT environment variable to run
# this notebook on another machine; the default keeps the original location.
project_root = os.environ.get(
    'COMPOSITIONAL_RL_ROOT',
    '/Users/shubhankar/Developer/compositional-rl-synth-data',
)

# Directory handed to get_task_list to load the train/test task splits.
task_list_path = os.path.join(
    project_root, 'offline_compositional_rl_datasets', '_train_test_splits'
)
# Directory that get_log_file searches for policy-training "*.out" logs.
base_logs_path = os.path.join(project_root, 'scripts', 'policies_slurm')

In [None]:
def get_log_file(base_logs_path,
                 dataset_type,
                 algorithm, algorithm_seed,
                 task_list_seed,
                 robot, obj, obst, subtask,
                 model_run=None, denoiser=None, num_train=None):
    """
    Find the log file of one policy-training run from its file-name convention.

    The file name is matched with a glob whose leading "*" absorbs any prefix:

        expert:    *_<algorithm>_<algorithm_seed>_tl_<task_list_seed>_<task>.out
        synthetic: *_<algorithm>_<algorithm_seed>_<denoiser>_<task_list_seed>_
                   <num_train>_<model_run>_<task>.out

    where <task> is "<robot>_<obj>_<obst>_<subtask>".

    Args:
        base_logs_path: directory that contains the log files
        dataset_type: 'synthetic' or 'expert'
        algorithm: algorithm name (e.g., 'td3_bc')
        algorithm_seed: algorithm seed value
        task_list_seed: task list seed value
        robot: robot type (e.g., 'Jaco')
        obj: object type (e.g., 'Hollowbox')
        obst: obstacle type or 'None'
        subtask: subtask name (e.g., 'Trashcan')
        model_run: model run number (required for 'synthetic')
        denoiser: denoiser type (required for 'synthetic')
        num_train: number of training tasks in diffusion model; it is
            formatted into the synthetic pattern as-is (None becomes "None")

    Returns:
        Path to the matching log file or None if not found. If several files
        match, the lexicographically smallest path is returned so that the
        choice does not depend on directory listing order.

    Raises:
        ValueError: if dataset_type is neither 'synthetic' nor 'expert', or if
            model_run/denoiser is missing for a synthetic dataset.
    """
    task_suffix = f"{robot}_{obj}_{obst}_{subtask}.out"

    if dataset_type == "synthetic":
        if model_run is None or denoiser is None:
            raise ValueError("model_run and denoiser required for synthetic dataset")
        # No special case is needed for 'td3_bc': formatting the name directly
        # yields the same "td3_bc" text the former dedicated branch produced.
        pattern = (
            f"*_{algorithm}_{algorithm_seed}_{denoiser}_{task_list_seed}_"
            f"{num_train}_{model_run}_{task_suffix}"
        )
    elif dataset_type == "expert":
        pattern = f"*_{algorithm}_{algorithm_seed}_tl_{task_list_seed}_{task_suffix}"
    else:
        raise ValueError("dataset_type must be 'synthetic' or 'expert'")

    # glob.glob returns matches in arbitrary order; sort for reproducibility.
    matching_files = sorted(glob.glob(os.path.join(base_logs_path, pattern)))

    return matching_files[0] if matching_files else None


def parse_log_file(file_path):
    """Extract the final best score from a log file.

    Searches the file for the first occurrence of
    "Training completed. Best score: <number>" and returns that number.
    The number may be an integer, a decimal, signed, or in exponent form
    (e.g. "100", "0.75", "-3.5", "1e-3").

    Args:
        file_path: path of the log file to read.

    Returns:
        The score as a float, or None if the line is absent or the file
        cannot be read (a message is printed in the latter case).
    """
    score_pattern = (
        r'Training completed\. Best score: '
        r'([-+]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][-+]?[0-9]+)?)'
    )
    try:
        with open(file_path, 'r') as f:
            content = f.read()
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError covers UnicodeDecodeError.
        print(f"Error parsing log file: {e}")
        return None

    match = re.search(score_pattern, content)
    return float(match.group(1)) if match else None
    
    
def run_evaluation_sweep(base_logs_path, task_list_path, task_list_seeds,
                         algorithm, algorithm_seeds,
                         dataset_type,
                         experiment_type='default', holdout_elem=None,
                         denoiser=None, num_train=None, model_run=1):
    """
    Gather best scores for every (task list seed, algorithm seed, task) combination.

    For each task list seed the task list is obtained from get_task_list; the
    fourth element of its return value is iterated as (robot, obj, obst,
    subtask) tuples. For each such task and each algorithm seed the log file
    is located with get_log_file and parsed with parse_log_file. Missing logs
    and unparseable scores are reported with print and skipped.

    For synthetic data, denoiser and model_run must be provided.

    Returns:
        defaultdict(list) mapping "<robot>_<obj>_<obst>_<subtask>" to a list
        of (task_list_seed, algorithm_seed, score) tuples.
    """
    results = defaultdict(list)

    for task_list_seed in task_list_seeds:
        # Only the last of the four returned items is needed here.
        _, _, _, test_task_list = get_task_list(
            task_list_path, dataset_type, experiment_type, holdout_elem, task_list_seed,
        )

        for algorithm_seed in algorithm_seeds:
            for task_num, (robot, obj, obst, subtask) in enumerate(test_task_list):
                task_id = f"{robot}_{obj}_{obst}_{subtask}"

                log_file = get_log_file(
                    base_logs_path, dataset_type,
                    algorithm, algorithm_seed,
                    task_list_seed,
                    robot, obj, obst, subtask,
                    model_run=model_run, denoiser=denoiser, num_train=num_train,
                )
                if not log_file:
                    print(f"Task {task_num + 1}: {task_id} - No log file found")
                    continue

                score = parse_log_file(log_file)
                if score is None:
                    print(f"Task {task_num + 1}: {task_id} - Failed to parse score")
                    continue

                results[task_id].append((task_list_seed, algorithm_seed, score))

    return results


def create_aggregate_results_dataframe(results):
    """
    Aggregate sweep results into one row per (task_id, task_list_seed).

    Args:
        results: mapping of task_id to a list of
            (task_list_seed, algorithm_seed, score) tuples, as returned by
            run_evaluation_sweep.

    Returns:
        DataFrame with columns 'task_id', 'task_list_seed', 'avg_score',
        'std_error' and 'num_rl_seeds'. The columns are present even when
        `results` is empty, so downstream column access and groupby calls do
        not fail with KeyError.
    """
    columns = ['task_id', 'task_list_seed', 'avg_score', 'std_error', 'num_rl_seeds']

    df_results = []
    for task_id, scores in results.items():
        # Group scores by task_list_seed (the algorithm seed is averaged over)
        seed_scores = defaultdict(list)
        for task_list_seed, _algorithm_seed, score in scores:
            seed_scores[task_list_seed].append(score)

        for task_list_seed, seed_data in seed_scores.items():
            num_seeds = len(seed_data)
            df_results.append({
                'task_id': task_id,
                'task_list_seed': task_list_seed,
                'avg_score': np.mean(seed_data),
                # NOTE(review): np.std defaults to ddof=0 (population std);
                # confirm that is intended for a standard error with few seeds.
                'std_error': np.std(seed_data) / np.sqrt(num_seeds),
                'num_rl_seeds': num_seeds,
            })

    return pd.DataFrame(df_results, columns=columns)


def create_results_dataframe(results):
    """
    Flatten sweep results into one row per individual score.

    Args:
        results: mapping of task_id ("<robot>_<obj>_<obst>_<subtask>") to a
            list of (task_list_seed, algorithm_seed, score) tuples.

    Returns:
        DataFrame with columns 'task_id', 'robot', 'object', 'obstacle',
        'subtask', 'task_list_seed', 'algorithm_seed' and 'score'. The columns
        are present even when `results` is empty.

    Raises:
        ValueError: if a task_id does not split into exactly four
            '_'-separated components.
    """
    columns = ['task_id', 'robot', 'object', 'obstacle', 'subtask',
               'task_list_seed', 'algorithm_seed', 'score']

    df_results = []
    for task_id, scores in results.items():
        # Extract the robot, object, obstacle, subtask components
        robot, obj, obst, subtask = task_id.split('_')

        # One row per data point so that all seed information is preserved
        df_results.extend(
            {
                'task_id': task_id,
                'robot': robot,
                'object': obj,
                'obstacle': obst,
                'subtask': subtask,
                'task_list_seed': task_list_seed,
                'algorithm_seed': algorithm_seed,
                'score': score,
            }
            for task_list_seed, algorithm_seed, score in scores
        )

    return pd.DataFrame(df_results, columns=columns)


def percent_diff(synthetic, expert):
    """Return the signed difference of `synthetic` from `expert`, as a percentage of `expert`.

    Positive when `synthetic` exceeds `expert` (for a positive `expert`).
    No guard is applied for `expert == 0`.
    """
    relative_gap = (synthetic - expert) / expert
    return relative_gap * 100


def create_comparison_pivot(combined_df, task_list_seed, num_train):
    """
    Create a pivot table comparing expert and synthetic scores for a specific task_list_seed and num_train

    Args:
        combined_df: The combined dataframe with all results; must contain the
            columns 'task_id', 'task_list_seed', 'algorithm_seed', 'score',
            'data_source' and 'num_train'. It is not modified.
        task_list_seed: Task list seed to filter by
        num_train: Number of training examples to include from synthetic data

    Returns:
        Pivot table with tasks as rows and one column per source/seed
        ('expert_seed_<s>' or 'synth_<num_train>_seed_<s>'). When at least one
        expert column exists, rows are sorted in descending order of the first
        expert column.

    Raises:
        ValueError: raised by DataFrame.pivot if a (task_id, source/seed) pair
            occurs more than once in the filtered data.
    """
    keep = (combined_df['task_list_seed'] == task_list_seed) & (
        (combined_df['data_source'] == 'expert')
        | (combined_df['num_train'] == num_train)
    )
    # Explicit copy: a column is added below, and writing to a slice of
    # combined_df would trigger SettingWithCopyWarning.
    filtered_df = combined_df.loc[keep].copy()

    # Column label identifying where each score came from.
    filtered_df['source_seed'] = [
        'expert_seed_' + str(seed) if source == 'expert'
        else 'synth_' + str(n_train) + '_seed_' + str(seed)
        for source, n_train, seed in zip(
            filtered_df['data_source'],
            filtered_df['num_train'],
            filtered_df['algorithm_seed'],
        )
    ]

    pivot_df = filtered_df.pivot(
        index='task_id',
        columns='source_seed',
        values='score'
    )

    expert_cols = [col for col in pivot_df.columns if col.startswith('expert')]
    if expert_cols:
        pivot_df = pivot_df.sort_values(by=expert_cols[0], ascending=False)

    return pivot_df

In [None]:
# Sweep configuration shared by the cells below.
algorithm = 'td3_bc'          # algorithm name used in the log file names
num_train_tasks = [56, 98]    # num_train values swept for the synthetic data
algorithm_seeds = [0, 1]      # algorithm seeds to look up
task_list_seeds = [0, 1]      # task list seeds to look up

In [None]:
# Collect the expert-data scores, then aggregate them per (task, task list seed).
expert_results = run_evaluation_sweep(
    base_logs_path,
    task_list_path,
    task_list_seeds,
    algorithm,
    algorithm_seeds,
    "expert",
)
expert_df = create_aggregate_results_dataframe(expert_results).assign(
    data_source='expert',
    num_train=None,  # not applicable to expert data
)
expert_df.head()

In [None]:
# Per task list seed: mean score over tasks, with the per-task standard errors
# combined as sqrt(sum(se^2)) / n.
summary = (
    expert_df
    .groupby('task_list_seed')
    .agg({
        'avg_score': 'mean',
        'std_error': lambda errs: np.sqrt((errs ** 2).sum()) / len(errs),  # Propagate error
    })
    .reset_index()
)

print("Expert data:")
display(summary)

In [None]:
# One aggregated frame per entry of num_train_tasks, in the same order.
synthetic_results_dfs = []

for num_train in num_train_tasks:
    synthetic_results = run_evaluation_sweep(
        base_logs_path,
        task_list_path,
        task_list_seeds,
        algorithm,
        algorithm_seeds,
        "synthetic",
        denoiser='monolithic',
        num_train=num_train,
    )
    synthetic_results_df = create_aggregate_results_dataframe(synthetic_results)
    synthetic_results_dfs.append(synthetic_results_df)

    # Same per-seed summary as for the expert data.
    summary = (
        synthetic_results_df
        .groupby('task_list_seed')
        .agg({
            'avg_score': 'mean',
            'std_error': lambda errs: np.sqrt((errs ** 2).sum()) / len(errs),  # Propagate error
        })
        .reset_index()
    )

    print(f"Synthetic data; Training tasks: {num_train}:")
    display(summary)

In [None]:
# Tag each synthetic frame with its provenance, then stack everything with a
# single concat instead of re-copying a growing frame on every iteration.
synthetic_tagged_dfs = [
    synthetic_results_dfs[i].assign(data_source='synthetic', num_train=num_train)
    for i, num_train in enumerate(num_train_tasks)
]
combined_df = pd.concat([expert_df, *synthetic_tagged_dfs], ignore_index=True)

combined_df.head()

In [None]:
# Expert baseline score per (task_id, task_list_seed)
expert_scores = (
    combined_df[combined_df['data_source'] == 'expert']
    .groupby(['task_id', 'task_list_seed'])['avg_score']
    .mean()
    .reset_index()
)

# Synthetic score per (task_id, task_list_seed, num_train)
synthetic_scores = (
    combined_df[combined_df['data_source'] == 'synthetic']
    .groupby(['task_id', 'task_list_seed', 'num_train'])['avg_score']
    .mean()
    .reset_index()
)

# Pair every synthetic score with its expert baseline in one inner join.
# Tasks/seeds without an expert score or without any synthetic score drop out,
# and an empty result still has all columns.
diff_df = expert_scores.rename(columns={'avg_score': 'expert_score'}).merge(
    synthetic_scores.rename(columns={'avg_score': 'synthetic_score'}),
    on=['task_id', 'task_list_seed'],
    how='inner',
)
diff_df['percent_difference'] = percent_diff(
    diff_df['synthetic_score'], diff_df['expert_score']
)
# infer_objects gives num_train a numeric dtype if it came through as object.
diff_df = diff_df[
    ['task_id', 'task_list_seed', 'num_train',
     'expert_score', 'synthetic_score', 'percent_difference']
].infer_objects()

# Compute average percentage difference for each num_train across all tasks/seeds
avg_diffs = diff_df.groupby('num_train')['percent_difference'].agg(['mean', 'std']).reset_index()

print("Average percentage difference by number of training tasks:")
display(avg_diffs)

In [None]:
# Mean percentage gap to the expert score for each num_train value; the error
# bars are the 'std' column of avg_diffs.
plt.figure(figsize=(10, 6))
plt.errorbar(
    avg_diffs['num_train'],
    avg_diffs['mean'],
    yerr=avg_diffs['std'],
    marker='o',
    linestyle='-',
    capsize=5,
    linewidth=2,
    label='Monolithic Model Data',
)
# Zero line: no difference from the expert score.
plt.axhline(y=0, color='r', linestyle='--', alpha=0.7, label='Expert')

plt.title('Score Gap: Synthetic and Expert Data', fontsize=12)
plt.xlabel('Number of Training Tasks', fontsize=12)
plt.ylabel('Percentage Difference from Expert Score (%)', fontsize=12)
plt.grid(True, alpha=0.5)

plt.xticks(avg_diffs['num_train'], fontsize=12)
plt.yticks(fontsize=12)

# Label each point with its mean, offset 10 points above the marker
for x_val, mean_gap in zip(avg_diffs['num_train'], avg_diffs['mean']):
    plt.annotate(
        f"{mean_gap:.1f}%",
        (x_val, mean_gap),
        textcoords="offset points",
        xytext=(0, 10),
        ha='center',
        fontsize=12,
    )

plt.legend(fontsize=12)
plt.tight_layout()

In [None]:
# Per-score (non-aggregated) frames: one row per (task, task list seed, algorithm seed).
fine_expert_df = create_results_dataframe(expert_results)

# NOTE: this repeats the synthetic log sweep of the aggregate cell above, which
# kept only the aggregated frames and not the raw results.
fine_synthetic_results_dfs = [
    create_results_dataframe(
        run_evaluation_sweep(
            base_logs_path=base_logs_path,
            task_list_path=task_list_path,
            task_list_seeds=task_list_seeds,
            algorithm=algorithm,
            algorithm_seeds=algorithm_seeds,
            dataset_type="synthetic",
            denoiser='monolithic',
            num_train=num_train,
        )
    )
    for num_train in num_train_tasks
]

# Tag every frame with its provenance and stack them with a single concat
# rather than growing the frame inside a loop.
fine_combined_df = pd.concat(
    [
        fine_expert_df.assign(data_source='expert', num_train=None),  # num_train not applicable
        *(
            synthetic_df.assign(data_source='synthetic', num_train=num_train)
            for num_train, synthetic_df in zip(num_train_tasks, fine_synthetic_results_dfs)
        ),
    ],
    ignore_index=True,
)

fine_combined_df.head()

In [None]:
# Per-task comparison: task list seed 0, synthetic data with num_train = 56.
task_list_seed_of_interest, num_train_of_interest = 0, 56
comparison_df = create_comparison_pivot(
    fine_combined_df,
    task_list_seed=task_list_seed_of_interest,
    num_train=num_train_of_interest,
)
comparison_df.head(32)

In [None]:
# Per-task comparison: task list seed 0, synthetic data with num_train = 98.
task_list_seed_of_interest, num_train_of_interest = 0, 98
comparison_df = create_comparison_pivot(
    fine_combined_df,
    task_list_seed=task_list_seed_of_interest,
    num_train=num_train_of_interest,
)
comparison_df.head(32)

In [None]:
# Per-task comparison: task list seed 1, synthetic data with num_train = 56.
task_list_seed_of_interest, num_train_of_interest = 1, 56
comparison_df = create_comparison_pivot(
    fine_combined_df,
    task_list_seed=task_list_seed_of_interest,
    num_train=num_train_of_interest,
)
comparison_df.head(32)

In [None]:
# Per-task comparison: task list seed 1, synthetic data with num_train = 98.
task_list_seed_of_interest, num_train_of_interest = 1, 98
comparison_df = create_comparison_pivot(
    fine_combined_df,
    task_list_seed=task_list_seed_of_interest,
    num_train=num_train_of_interest,
)
comparison_df.head(32)