In [75]:
import pandas as pd
import ast
import os
from pathlib import Path

In [76]:
# Configuration
# Directory holding the evaluation result CSVs; a relative path, so it is
# resolved against the notebook's current working directory.
RESULTS_DIR = Path('results')

def is_all_black(palette_str):
    """Check if a palette is all black (invalid response).

    Args:
        palette_str: Text of a Python literal list/tuple of hex colour
            strings, e.g. "['#000000', '#1A2B3C']".

    Returns:
        True when the palette should be discarded:
          * every colour equals '#000000' (case-insensitive), which also
            covers an empty palette,
          * the value cannot be parsed as a Python literal (including
            non-string input such as NaN from an empty CSV cell),
          * the parsed literal is not a list or tuple,
          * a non-string entry is reached while comparing colours.
        False otherwise.
    """
    try:
        palette = ast.literal_eval(palette_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the errors literal_eval is documented to raise; a bare
        # `except:` would also swallow KeyboardInterrupt / SystemExit.
        return True  # Treat parse errors as invalid

    # A bare string literal would otherwise be iterated character by
    # character and slip through as a "valid" palette.
    if not isinstance(palette, (list, tuple)):
        return True

    try:
        return all(c.lower() == '#000000' for c in palette)
    except AttributeError:
        # An entry without .lower() (e.g. a number) is not a colour string.
        return True

def load_and_filter_csv(filepath):
    """Read a results CSV and split off the rows with usable palettes.

    Returns a pair ``(all_rows, valid_rows)``: the frame exactly as read,
    and the subset whose 'generated_palette' is not flagged by
    ``is_all_black``.
    """
    frame = pd.read_csv(filepath)
    rejected = frame['generated_palette'].apply(is_all_black)
    kept = frame[~rejected]
    return frame, kept

In [77]:
# Collect every result CSV in RESULTS_DIR, in sorted order
csv_files = list(RESULTS_DIR.glob('*.csv'))
csv_files.sort()

print(f"Found {len(csv_files)} result files:\n")
for name in (path.name for path in csv_files):
    print(f"  - {name}")

Found 16 result files:

  - eval_fireworks_accounts_fireworks_models_gpt-oss-20b_20260103_024651.csv
  - eval_fireworks_accounts_fireworks_models_llama-v3p1-8b-instruct_20260109_204850.csv
  - eval_fireworks_accounts_fireworks_models_llama-v3p1-8b-instruct_HSL_20260110_142344.csv
  - eval_fireworks_accounts_fireworks_models_llama-v3p3-70b-instruct_20260103_021738.csv
  - eval_fireworks_accounts_fireworks_models_qwen2p5-vl-32b-instruct_20260103_133925.csv
  - eval_fireworks_accounts_fireworks_models_qwen3-8b_20260103_135722.csv
  - eval_fireworks_accounts_fireworks_models_qwen3-8b_CIELAB_20260110_150041.csv
  - eval_fireworks_accounts_fireworks_models_qwen3-8b_HSL_20260110_142641.csv
  - eval_fireworks_accounts_sidvenkatayogi_deployedModels_rft-fhiaqwgu-0-1-ka2pgv73_20260109_023624.csv
  - eval_fireworks_accounts_sidvenkatayogi_models_artist-qwen2p5-vl-7b-instruct_20260109_195632.csv
  - eval_fireworks_accounts_sidvenkatayogi_models_artist-qwen3-8b-5-5-8_20260109_195746.csv
  - eval_fir

In [78]:
# Build one summary record per result CSV
summaries = []

# (summary key, source column, Series aggregation), listed in the order the
# keys must be inserted so the resulting DataFrame keeps its column order.
metric_specs = [
    ('mean_norm_D', 'norm_D', 'mean'),
    ('mean_norm_S', 'norm_S', 'mean'),
    ('mean_score_R', 'score_R', 'mean'),
    ('std_norm_D', 'norm_D', 'std'),
    ('std_norm_S', 'norm_S', 'std'),
    ('std_score_R', 'score_R', 'std'),
    # Raw scores (new fields)
    ('mean_gt_diversity', 'gt_diversity', 'mean'),
    ('mean_gen_diversity', 'gen_diversity', 'mean'),
    ('mean_raw_dccw', 'raw_dccw', 'mean'),
    ('mean_diversity_diff', 'diversity_diff', 'mean'),
    ('std_raw_dccw', 'raw_dccw', 'std'),
]

for csv_file in csv_files:
    df_all, df_valid = load_and_filter_csv(csv_file)
    n_total = len(df_all)
    n_valid = len(df_valid)

    # Model / provider are taken from the first row of the file.
    if n_total > 0:
        model = df_all['model'].iloc[0]
        provider = df_all['provider'].iloc[0]
    else:
        model = 'unknown'
        provider = 'unknown'

    summary = {
        'file': csv_file.name,
        'provider': provider,
        # Short name: the last '/'-separated component (whole string if no '/').
        'model': model.split('/')[-1],
        'total_samples': n_total,
        'valid_samples': n_valid,
        'filtered_out': n_total - n_valid,
        'valid_pct': n_valid / n_total * 100 if n_total > 0 else 0,
    }

    # With no valid rows every metric is recorded as None instead of being
    # computed; the metric columns are not touched in that case.
    has_valid = n_valid > 0
    for key, column, agg_name in metric_specs:
        if has_valid:
            summary[key] = getattr(df_valid[column], agg_name)()
        else:
            summary[key] = None

    summaries.append(summary)

summary_df = pd.DataFrame(summaries)
summary_df

Unnamed: 0,file,provider,model,total_samples,valid_samples,filtered_out,valid_pct,mean_norm_D,mean_norm_S,mean_score_R,std_norm_D,std_norm_S,std_score_R,mean_gt_diversity,mean_gen_diversity,mean_raw_dccw,mean_diversity_diff,std_raw_dccw
0,eval_fireworks_accounts_fireworks_models_gpt-o...,fireworks,gpt-oss-20b,100,78,22,78.0,0.588642,0.532236,0.475971,0.333715,0.292295,0.297463,28.043912,29.436784,26.113184,13.056241,12.465413
1,eval_fireworks_accounts_fireworks_models_llama...,fireworks,llama-v3p1-8b-instruct,100,100,0,100.0,0.619746,0.462084,0.457575,0.295903,0.295393,0.28009,27.522209,30.399192,28.591253,11.911222,12.67315
2,eval_fireworks_accounts_fireworks_models_llama...,fireworks,llama-v3p1-8b-instruct_HSL,100,99,1,99.0,0.615814,0.478072,0.477468,0.301995,0.281963,0.267287,27.812907,29.735451,27.224976,11.791401,10.453235
3,eval_fireworks_accounts_fireworks_models_llama...,fireworks,llama-v3p3-70b-instruct,100,88,12,88.0,0.60141,0.502575,0.47914,0.309362,0.27982,0.277015,27.289118,30.55109,26.363621,12.270499,10.388477
4,eval_fireworks_accounts_fireworks_models_qwen2...,fireworks,qwen2p5-vl-32b-instruct,100,100,0,100.0,0.529314,0.463592,0.41467,0.336855,0.276094,0.278038,27.522209,37.501595,27.688208,15.181804,10.141089
5,eval_fireworks_accounts_fireworks_models_qwen3...,fireworks,qwen3-8b,100,100,0,100.0,0.623319,0.440834,0.435674,0.317901,0.292838,0.287413,27.522209,29.770412,29.653078,11.781996,13.571049
6,eval_fireworks_accounts_fireworks_models_qwen3...,fireworks,qwen3-8b_CIELAB,100,100,0,100.0,0.605522,0.470463,0.45834,0.302189,0.286058,0.284723,27.535808,26.137633,27.987131,12.24184,11.717665
7,eval_fireworks_accounts_fireworks_models_qwen3...,fireworks,qwen3-8b_HSL,100,99,1,99.0,0.615814,0.478072,0.477468,0.301995,0.281963,0.267287,27.812907,29.735451,27.224976,11.791401,10.453235
8,eval_fireworks_accounts_sidvenkatayogi_deploye...,fireworks,rft-fhiaqwgu-0-1-ka2pgv73,100,100,0,100.0,0.622311,0.488868,0.468576,0.314691,0.285788,0.286824,27.522209,31.585994,27.353852,11.810915,11.791549
9,eval_fireworks_accounts_sidvenkatayogi_models_...,fireworks,artist-qwen2p5-vl-7b-instruct,100,100,0,100.0,0.562037,0.415719,0.388528,0.337948,0.305584,0.306527,27.522209,29.961306,31.177123,14.130257,14.771891


In [79]:
# Display detailed statistics per model
print("DETAILED SUMMARY STATISTICS (excluding all-black palettes)")
print("=" * 50)

for _, row in summary_df.iterrows():
    print(f"\n{'─' * 50}")
    print(f"Model: {row['model']}")
    print(f"Provider: {row['provider']}")
    print(f"{'─' * 50}")
    print(f"Total samples: {row['total_samples']}")
    print(f"Valid samples: {row['valid_samples']} ({row['valid_pct']:.1f}%)")
    print(f"Filtered out:  {row['filtered_out']}")

    # The summary stores None for files without valid samples, but pandas
    # turns None into NaN as soon as the column also holds floats, so an
    # `is not None` test would always pass and print "nan ± nan".
    # pd.notna() is False for both None and NaN.
    if pd.notna(row['mean_score_R']):
        print(f"\nNormalized Metrics (mean ± std):")
        print(f"  norm_D:  {row['mean_norm_D']:.4f} ± {row['std_norm_D']:.4f}")
        print(f"  norm_S:  {row['mean_norm_S']:.4f} ± {row['std_norm_S']:.4f}")
        print(f"  score_R: {row['mean_score_R']:.4f} ± {row['std_score_R']:.4f}")

        print(f"\nRaw Scores:")
        print(f"  GT Diversity:  {row['mean_gt_diversity']:.2f}")
        print(f"  Gen Diversity: {row['mean_gen_diversity']:.2f}")
        print(f"  Raw DCCW:      {row['mean_raw_dccw']:.2f} ± {row['std_raw_dccw']:.2f}")
        print(f"  Diversity Δ:   {row['mean_diversity_diff']:.2f}")
    else:
        print("\n  No valid samples to compute metrics.")

DETAILED SUMMARY STATISTICS (excluding all-black palettes)

──────────────────────────────────────────────────
Model: gpt-oss-20b
Provider: fireworks
──────────────────────────────────────────────────
Total samples: 100
Valid samples: 78 (78.0%)
Filtered out:  22

Normalized Metrics (mean ± std):
  norm_D:  0.5886 ± 0.3337
  norm_S:  0.5322 ± 0.2923
  score_R: 0.4760 ± 0.2975

Raw Scores:
  GT Diversity:  28.04
  Gen Diversity: 29.44
  Raw DCCW:      26.11 ± 12.47
  Diversity Δ:   13.06

──────────────────────────────────────────────────
Model: llama-v3p1-8b-instruct
Provider: fireworks
──────────────────────────────────────────────────
Total samples: 100
Valid samples: 100 (100.0%)
Filtered out:  0

Normalized Metrics (mean ± std):
  norm_D:  0.6197 ± 0.2959
  norm_S:  0.4621 ± 0.2954
  score_R: 0.4576 ± 0.2801

Raw Scores:
  GT Diversity:  27.52
  Gen Diversity: 30.40
  Raw DCCW:      28.59 ± 12.67
  Diversity Δ:   11.91

──────────────────────────────────────────────────
Model: llam

In [80]:
# Rank models by mean score_R, best first. Rows without a score are dropped.
# The raw-score summary columns (mean_gt_diversity, mean_gen_diversity,
# mean_raw_dccw, mean_diversity_diff) are left out of this view.
score_view_columns = [
    'model', 'provider', 'valid_samples',
    'mean_score_R', 'mean_norm_D', 'mean_norm_S',
]
comparison = (
    summary_df[score_view_columns]
    .copy()
    .dropna(subset=['mean_score_R'])
    .sort_values('mean_score_R', ascending=False)
)

print("\n" + "=" * 80)
print("MODEL COMPARISON (sorted by score_R)")
print("=" * 80)

comparison


MODEL COMPARISON (sorted by score_R)


Unnamed: 0,model,provider,valid_samples,mean_score_R,mean_norm_D,mean_norm_S
3,llama-v3p3-70b-instruct,fireworks,88,0.47914,0.60141,0.502575
2,llama-v3p1-8b-instruct_HSL,fireworks,99,0.477468,0.615814,0.478072
7,qwen3-8b_HSL,fireworks,99,0.477468,0.615814,0.478072
12,artist-v3-llama-3p1-8b,fireworks,100,0.476185,0.604483,0.482864
0,gpt-oss-20b,fireworks,78,0.475971,0.588642,0.532236
14,gpt-4o,openai,100,0.475552,0.583893,0.518969
11,artist-v2,fireworks,100,0.472248,0.607634,0.494418
8,rft-fhiaqwgu-0-1-ka2pgv73,fireworks,100,0.468576,0.622311,0.488868
15,gpt-5-mini,openai,100,0.466177,0.533857,0.558584
10,artist-qwen3-8b-5-5-8,fireworks,100,0.464881,0.638413,0.459637


In [81]:
# Rank models by mean norm_S, best first. Rows without a value are dropped.
# The raw-score summary columns (mean_gt_diversity, mean_gen_diversity,
# mean_raw_dccw, mean_diversity_diff) are left out of this view.
norm_s_view_columns = [
    'model', 'provider', 'valid_samples',
    'mean_norm_S', 'mean_score_R', 'mean_norm_D',
]
comparison = (
    summary_df[norm_s_view_columns]
    .copy()
    .dropna(subset=['mean_norm_S'])
    .sort_values('mean_norm_S', ascending=False)
)

print("\n" + "=" * 80)
print("MODEL COMPARISON (sorted by norm_S)")
print("=" * 80)

comparison


MODEL COMPARISON (sorted by norm_S)


Unnamed: 0,model,provider,valid_samples,mean_norm_S,mean_score_R,mean_norm_D
15,gpt-5-mini,openai,100,0.558584,0.466177,0.533857
0,gpt-oss-20b,fireworks,78,0.532236,0.475971,0.588642
14,gpt-4o,openai,100,0.518969,0.475552,0.583893
3,llama-v3p3-70b-instruct,fireworks,88,0.502575,0.47914,0.60141
11,artist-v2,fireworks,100,0.494418,0.472248,0.607634
8,rft-fhiaqwgu-0-1-ka2pgv73,fireworks,100,0.488868,0.468576,0.622311
12,artist-v3-llama-3p1-8b,fireworks,100,0.482864,0.476185,0.604483
2,llama-v3p1-8b-instruct_HSL,fireworks,99,0.478072,0.477468,0.615814
7,qwen3-8b_HSL,fireworks,99,0.478072,0.477468,0.615814
13,gpt-4o-mini,openai,100,0.47068,0.453859,0.568782
