# In [None]:
import os

import numpy as np
import pandas as pd
from scipy import stats

from utils.plot_utils import load_results

# Load all monitor results found under logs/monitor into one DataFrame.
# NOTE(review): presumably one row per logged episode, with a 'file' column
# naming the source CSV — confirm against utils.plot_utils.load_results.
df = load_results("logs/monitor")
if df.empty:
    print("No data found for analysis.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # injected by the `site` module for interactive use (it is missing under
    # `python -S`), and inside IPython/Jupyter it is bound to a command that
    # shuts the kernel down. The exit status is still 0.
    raise SystemExit(0)

# Parse algorithm, environment, and seed from the file path for grouping.
# Expected path layout: logs/monitor/<algorithm>_<env_id>_seed<seed>/monitor.csv
# The run id is the name of the directory that contains the monitor file,
# e.g. "ppo_HalfCheetah-v5_seed0".
df['run_id'] = df['file'].apply(lambda x: os.path.basename(os.path.dirname(x)))
# The pattern is anchored at the start and its first group is greedy, so an
# algorithm name that itself contains underscores (e.g. "ppo_lstm") is kept
# whole. An unanchored `[^_]+` would match later in the string and silently
# report only the last underscore-separated piece. The env id is still assumed
# to contain no underscore and to end in "-v<digits>".
df[['algorithm', 'env_id', 'seed']] = df['run_id'].str.extract(r'^(.+)_([^_]+-v\d+)_seed(\d+)')

# Rows whose run id does not match get NaN in all three columns; casting NaN to
# int would raise, so report those runs by name and leave them out instead.
unparsed = df['seed'].isna()
if unparsed.any():
    bad_runs = sorted(df.loc[unparsed, 'run_id'].astype(str).unique())
    print(f"Warning: skipping {len(bad_runs)} run(s) whose directory name does not match "
          f"<algorithm>_<env_id>_seed<seed>: {bad_runs}")
    df = df.loc[~unparsed].copy()
    if df.empty:
        # Nothing left to analyse; stop with a non-zero status and a clear reason.
        raise SystemExit("No run directory matched <algorithm>_<env_id>_seed<seed>.")
df['seed'] = df['seed'].astype(int)

# Reduce every run (algorithm, environment, seed) to two summary metrics:
#   auc               - trapezoidal integral of the per-episode reward `r` over `t`
#   final_mean_reward - mean reward of the last 10 logged episodes of the run
# NOTE(review): if these are Stable-Baselines3 Monitor logs, `t` is elapsed
# wall-clock seconds rather than environment timesteps, which would make the AUC
# depend on machine speed — confirm what load_results puts in `t`.

# np.trapz was renamed to np.trapezoid in NumPy 2.0 and the old name is
# deprecated; use the new name when it exists and fall back on older NumPy.
integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

metric_columns = ['algorithm', 'env_id', 'seed', 'auc', 'final_mean_reward']
results = []  # one summary dict per run
for (alg, env, seed), group in df.groupby(['algorithm', 'env_id', 'seed']):
    # Episodes must be in increasing `t` for the integral to be meaningful.
    group = group.sort_values(by='t')
    auc = integrate(y=group['r'].values, x=group['t'].values)
    # tail(10) simply returns fewer rows when the run has fewer than 10 episodes.
    final_mean = group['r'].tail(10).mean()
    results.append({
        'algorithm': alg,
        'env_id': env,
        'seed': seed,
        'auc': auc,
        'final_mean_reward': final_mean
    })
# Passing the columns explicitly keeps the expected columns present even if the
# loop above produced no rows.
stats_df = pd.DataFrame(results, columns=metric_columns)

# Aggregate across seeds: mean and standard deviation of AUC and of the final
# reward for every (algorithm, environment) pair. pandas' `std` uses ddof=1, so
# a pair with a single seed has a NaN standard deviation and prints as "nan".
summary = stats_df.groupby(['algorithm', 'env_id']).agg({'auc': ['mean','std'], 'final_mean_reward': ['mean','std']})
print("Performance Summary (mean ± std):")
# Print straight from `summary` rather than re-grouping stats_df and recomputing
# the same four statistics a second time.
for (alg, env), row in summary.iterrows():
    auc_mean, auc_std = row[('auc', 'mean')], row[('auc', 'std')]
    final_mean, final_std = row[('final_mean_reward', 'mean')], row[('final_mean_reward', 'std')]
    print(f"{alg.upper()} on {env}: AUC = {auc_mean:.2f} ± {auc_std:.2f}, Final 10-ep Reward = {final_mean:.2f} ± {final_std:.2f}")

# Example statistical test: compare PPO vs SAC on HalfCheetah for final performance.
# Each sample is the per-seed `final_mean_reward` of one algorithm on `env`.
algo1, algo2 = "ppo", "sac"
env = "HalfCheetah-v5"
on_env = stats_df['env_id'] == env
data1 = stats_df.loc[(stats_df['algorithm'] == algo1) & on_env, 'final_mean_reward']
data2 = stats_df.loc[(stats_df['algorithm'] == algo2) & on_env, 'final_mean_reward']
# Welch's t-test estimates a separate variance for each group, which needs at
# least two observations (seeds) per group; with fewer the statistic is NaN, so
# treat that case as "not enough data" instead of printing t = nan.
if len(data1) >= 2 and len(data2) >= 2:
    t_stat, p_val = stats.ttest_ind(data1, data2, equal_var=False)  # Welch's t-test
    print(f"\nT-test comparing final performance of {algo1.upper()} vs {algo2.upper()} on {env}:")
    print(f"t = {t_stat:.3f}, p-value = {p_val:.3f}")
else:
    print(f"Not enough data to compare {algo1} vs {algo2} on {env}.")
