In [58]:
import pandas as pd
import ast
import os
from pathlib import Path

In [59]:
# Configuration
# Directory searched for evaluation result CSVs. The path is relative, so it
# is resolved against the notebook's current working directory.
RESULTS_DIR = Path('results')

def is_all_black(palette_str):
    """Check if a palette is all black (invalid response).

    Args:
        palette_str: String holding a Python literal for a sequence of hex
            colour strings, e.g. "['#000000', '#1a2b3c']".

    Returns:
        True when every colour equals '#000000' (case-insensitive). An empty
        palette is also True, because all() over no items is True. Anything
        that cannot be parsed or iterated as a sequence of strings (malformed
        text, NaN from an empty CSV cell, non-string items) is treated as
        invalid and returns True as well. False otherwise.
    """
    try:
        palette = ast.literal_eval(palette_str)
        return all(c.lower() == '#000000' for c in palette)
    except Exception:
        # Only ordinary errors mark the palette as invalid. A bare `except:`
        # would also swallow KeyboardInterrupt/SystemExit, so interrupting the
        # kernel mid-apply would silently be recorded as an "invalid" palette.
        return True  # Treat parse errors as invalid

def load_and_filter_csv(filepath):
    """Read a results CSV and separate out the usable rows.

    Args:
        filepath: Path (or anything pd.read_csv accepts) of the results CSV.
            The file must contain a 'generated_palette' column.

    Returns:
        A tuple (all_rows, valid_rows): the full frame as read from disk, and
        the subset of rows whose 'generated_palette' is not flagged by
        is_all_black.
    """
    all_rows = pd.read_csv(filepath)
    flagged_black = all_rows['generated_palette'].apply(is_all_black)
    valid_rows = all_rows[~flagged_black]
    return all_rows, valid_rows

In [60]:
# Collect every result CSV directly inside RESULTS_DIR (sorted by path) and
# list the file names so the reader can see what will be summarised.
csv_files = sorted(RESULTS_DIR.glob('*.csv'))
print(f"Found {len(csv_files)} result files:\n")
for result_path in csv_files:
    print(f"  - {result_path.name}")

Found 7 result files:

  - eval_fireworks_accounts_fireworks_models_gpt-oss-20b_20260103_024651.csv
  - eval_fireworks_accounts_fireworks_models_llama-v3p3-70b-instruct_20260103_021738.csv
  - eval_fireworks_accounts_fireworks_models_qwen2p5-vl-32b-instruct_20260103_133925.csv
  - eval_fireworks_accounts_fireworks_models_qwen3-8b_20260103_135722.csv
  - eval_openai_gpt-4o-mini_20260103_024052.csv
  - eval_openai_gpt-4o_20260103_012027.csv
  - eval_openai_gpt-5-mini_20260103_021759.csv


In [61]:
# Generate summary statistics for each CSV

# (summary key, source column, aggregation), listed in output-column order.
# Keeping one table means the "has valid samples" and "no valid samples"
# paths can never disagree about which keys a summary contains.
SUMMARY_METRICS = [
    ('mean_norm_D', 'norm_D', 'mean'),
    ('mean_norm_S', 'norm_S', 'mean'),
    ('mean_score_R', 'score_R', 'mean'),
    ('std_norm_D', 'norm_D', 'std'),
    ('std_norm_S', 'norm_S', 'std'),
    ('std_score_R', 'score_R', 'std'),
    # Raw scores (new fields)
    ('mean_gt_diversity', 'gt_diversity', 'mean'),
    ('mean_gen_diversity', 'gen_diversity', 'mean'),
    ('mean_raw_dccw', 'raw_dccw', 'mean'),
    ('mean_diversity_diff', 'diversity_diff', 'mean'),
    ('std_raw_dccw', 'raw_dccw', 'std'),
]

# The raw-score columns are labelled "new fields"; presumably result files
# written before they were introduced lack them (TODO confirm). A missing
# optional column is summarised as NaN instead of aborting the whole run.
OPTIONAL_METRIC_COLUMNS = {'gt_diversity', 'gen_diversity', 'raw_dccw', 'diversity_diff'}


def _first_value(df, column, default='unknown'):
    """Return the first value of `column` as a string.

    Falls back to `default` when the frame has no rows or the first value is
    missing (NaN), so callers can safely apply string operations to the result.
    """
    if len(df) == 0:
        return default
    value = df[column].iloc[0]
    return default if pd.isna(value) else str(value)


def summarize_results(csv_file, df_all, df_valid):
    """Build the summary record for a single results file.

    Args:
        csv_file: Path of the results CSV; only its `.name` is recorded.
        df_all: Every row read from the file.
        df_valid: The rows of `df_all` that survived the all-black filter.

    Returns:
        dict with file/provider/model identification, sample counts, and one
        entry per SUMMARY_METRICS key. Metric values are None when there are
        no valid rows, and NaN when an optional raw-score column is absent.

    Raises:
        KeyError: if a required column ('model', 'provider', 'norm_D',
            'norm_S', 'score_R') is missing from a non-empty frame.
    """
    total = len(df_all)
    valid = len(df_valid)

    summary = {
        'file': csv_file.name,
        'provider': _first_value(df_all, 'provider'),
        # Short name: keep only the part after the last '/'. split() returns
        # the whole string when there is no '/', so no separate check is needed.
        'model': _first_value(df_all, 'model').split('/')[-1],
        'total_samples': total,
        'valid_samples': valid,
        'filtered_out': total - valid,
        'valid_pct': valid / total * 100 if total > 0 else 0,
    }

    for key, column, aggregation in SUMMARY_METRICS:
        if valid == 0:
            # No rows to aggregate: keep the None placeholder.
            summary[key] = None
        elif column in OPTIONAL_METRIC_COLUMNS and column not in df_valid.columns:
            summary[key] = float('nan')
        else:
            # aggregation is 'mean' or 'std', i.e. Series.mean() / Series.std()
            summary[key] = getattr(df_valid[column], aggregation)()

    return summary


summaries = []

for csv_file in csv_files:
    df_all, df_valid = load_and_filter_csv(csv_file)
    summaries.append(summarize_results(csv_file, df_all, df_valid))

summary_df = pd.DataFrame(summaries)
summary_df

Unnamed: 0,file,provider,model,total_samples,valid_samples,filtered_out,valid_pct,mean_norm_D,mean_norm_S,mean_score_R,std_norm_D,std_norm_S,std_score_R,mean_gt_diversity,mean_gen_diversity,mean_raw_dccw,mean_diversity_diff,std_raw_dccw
0,eval_fireworks_accounts_fireworks_models_gpt-o...,fireworks,gpt-oss-20b,100,78,22,78.0,0.446542,0.532236,0.397103,0.314288,0.292295,0.282543,28.043912,29.436784,26.113184,13.056241,12.465413
1,eval_fireworks_accounts_fireworks_models_llama...,fireworks,llama-v3p3-70b-instruct,100,88,12,88.0,0.449762,0.502575,0.397874,0.304852,0.27982,0.268686,27.289118,30.55109,26.363621,12.270499,10.388477
2,eval_fireworks_accounts_fireworks_models_qwen2...,fireworks,qwen2p5-vl-32b-instruct,100,100,0,100.0,0.387008,0.463592,0.338416,0.305445,0.276094,0.261424,27.522209,37.501595,27.688208,15.181804,10.141089
3,eval_fireworks_accounts_fireworks_models_qwen3...,fireworks,qwen3-8b,100,100,0,100.0,0.476508,0.440834,0.371412,0.306592,0.292838,0.270379,27.522209,29.770412,29.653078,11.781996,13.571049
4,eval_openai_gpt-4o-mini_20260103_024052.csv,openai,gpt-4o-mini,100,100,0,100.0,0.417831,0.47068,0.373384,0.301792,0.282946,0.267164,27.522209,33.059576,27.946879,13.451164,11.414451
5,eval_openai_gpt-4o_20260103_012027.csv,openai,gpt-4o,100,100,0,100.0,0.426817,0.518969,0.390598,0.290667,0.288068,0.273081,27.522209,31.908932,26.112353,13.084647,11.138738
6,eval_openai_gpt-5-mini_20260103_021759.csv,openai,gpt-5-mini,100,100,0,100.0,0.398556,0.558584,0.384526,0.319115,0.262529,0.29082,27.522209,35.681833,24.692585,14.580646,10.43221


In [62]:
# Display detailed statistics per model
print("DETAILED SUMMARY STATISTICS (excluding all-black palettes)")
print("=" * 50)

for _, row in summary_df.iterrows():
    print(f"\n{'─' * 50}")
    print(f"Model: {row['model']}")
    print(f"Provider: {row['provider']}")
    print(f"{'─' * 50}")
    print(f"Total samples: {row['total_samples']}")
    print(f"Valid samples: {row['valid_samples']} ({row['valid_pct']:.1f}%)")
    print(f"Filtered out:  {row['filtered_out']}")

    # A metric recorded as None becomes NaN as soon as its summary_df column
    # also holds floats from other models, so `is not None` would always be
    # true and print "nan ± nan". pd.notna() rejects both None and NaN.
    if pd.notna(row['mean_score_R']):
        print("\nNormalized Metrics (mean ± std):")
        print(f"  norm_D:  {row['mean_norm_D']:.4f} ± {row['std_norm_D']:.4f}")
        print(f"  norm_S:  {row['mean_norm_S']:.4f} ± {row['std_norm_S']:.4f}")
        print(f"  score_R: {row['mean_score_R']:.4f} ± {row['std_score_R']:.4f}")

        print("\nRaw Scores:")
        print(f"  GT Diversity:  {row['mean_gt_diversity']:.2f}")
        print(f"  Gen Diversity: {row['mean_gen_diversity']:.2f}")
        print(f"  Raw DCCW:      {row['mean_raw_dccw']:.2f} ± {row['std_raw_dccw']:.2f}")
        print(f"  Diversity Δ:   {row['mean_diversity_diff']:.2f}")
    else:
        print("\n  No valid samples to compute metrics.")

DETAILED SUMMARY STATISTICS (excluding all-black palettes)

──────────────────────────────────────────────────
Model: gpt-oss-20b
Provider: fireworks
──────────────────────────────────────────────────
Total samples: 100
Valid samples: 78 (78.0%)
Filtered out:  22

Normalized Metrics (mean ± std):
  norm_D:  0.4465 ± 0.3143
  norm_S:  0.5322 ± 0.2923
  score_R: 0.3971 ± 0.2825

Raw Scores:
  GT Diversity:  28.04
  Gen Diversity: 29.44
  Raw DCCW:      26.11 ± 12.47
  Diversity Δ:   13.06

──────────────────────────────────────────────────
Model: llama-v3p3-70b-instruct
Provider: fireworks
──────────────────────────────────────────────────
Total samples: 100
Valid samples: 88 (88.0%)
Filtered out:  12

Normalized Metrics (mean ± std):
  norm_D:  0.4498 ± 0.3049
  norm_S:  0.5026 ± 0.2798
  score_R: 0.3979 ± 0.2687

Raw Scores:
  GT Diversity:  27.29
  Gen Diversity: 30.55
  Raw DCCW:      26.36 ± 10.39
  Diversity Δ:   12.27

──────────────────────────────────────────────────
Model: qwen

In [63]:
# Side-by-side model comparison: keep the headline columns, drop models that
# have no mean score_R, and order the rest from highest to lowest score_R.
# (The raw-score columns mean_gt_diversity, mean_gen_diversity, mean_raw_dccw
# and mean_diversity_diff are left out of this view.)
comparison = (
    summary_df[[
        'model', 'provider', 'valid_samples',
        'mean_score_R', 'mean_norm_D', 'mean_norm_S',
    ]]
    .copy()
    .dropna(subset=['mean_score_R'])
    .sort_values('mean_score_R', ascending=False)
)

banner = "=" * 80
print("\n" + banner)
print("MODEL COMPARISON (sorted by score_R)")
print(banner)

comparison


MODEL COMPARISON (sorted by score_R)


Unnamed: 0,model,provider,valid_samples,mean_score_R,mean_norm_D,mean_norm_S
1,llama-v3p3-70b-instruct,fireworks,88,0.397874,0.449762,0.502575
0,gpt-oss-20b,fireworks,78,0.397103,0.446542,0.532236
5,gpt-4o,openai,100,0.390598,0.426817,0.518969
6,gpt-5-mini,openai,100,0.384526,0.398556,0.558584
4,gpt-4o-mini,openai,100,0.373384,0.417831,0.47068
3,qwen3-8b,fireworks,100,0.371412,0.476508,0.440834
2,qwen2p5-vl-32b-instruct,fireworks,100,0.338416,0.387008,0.463592
