In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import json

In [12]:
# Load the combined benchmarks CSV into a DataFrame.
combined = pd.read_csv(
    '/Users/pranmodu/Library/Mobile Documents/com~apple~CloudDocs/Desktop/coding projects/apart-sprint/data/processed/combined_benchmarks.csv'
)

# Report the row count and how many rows lack a model / org value.
# Both null counts come from a single isna().sum() over the two columns.
missing_counts = combined[['model', 'org']].isna().sum()
print("Initial loaded data:")
print(f"Total rows: {combined.shape[0]}")
print(f"Rows with missing model: {missing_counts['model']}")
print(f"Rows with missing org: {missing_counts['org']}")

# Bare last expression so the notebook renders the frame as the cell output.
combined

Initial loaded data:
Total rows: 2827
Rows with missing model: 401
Rows with missing org: 595


Unnamed: 0,model,benchmark,date,org,country,training_compute_flops,score,capability
0,gpt-5-2025-08-07_high,aider_polyglot,2025-08-07,OpenAI,United States of America,6.600000e+25,88.000,code_generation
1,gpt-5-2025-08-07_medium,aider_polyglot,2025-08-07,OpenAI,United States of America,6.600000e+25,86.700,code_generation
2,o3-pro-2025-06-10_high,aider_polyglot,2025-06-10,OpenAI,United States of America,,84.900,code_generation
3,gemini-2.5-pro-preview-06-05_32K,aider_polyglot,,Google DeepMind,"United States of America,United Kingdom of Gre...",,83.100,code_generation
4,gpt-5-2025-08-07_low,aider_polyglot,2025-08-07,OpenAI,United States of America,6.600000e+25,81.300,code_generation
...,...,...,...,...,...,...,...,...
2822,opt-13b,hella_swag,2022-05-11,,,,0.699,commonsense_reasoning
2823,gpt-j-6b,hella_swag,2021-08-05,"EleutherAI,LAION","United States of America,Germany",1.500000e+22,0.662,commonsense_reasoning
2824,dolly-v2-12b,hella_swag,2023-04-11,Databricks,United States of America,,0.708,commonsense_reasoning
2825,Cerebras-GPT-13B,hella_swag,2023-03-20,Cerebras Systems,United States of America,2.300000e+22,0.594,commonsense_reasoning


In [11]:
# Parse the date column into datetimes (missing dates become NaT).
combined['date'] = pd.to_datetime(combined['date'])

# Derive the calendar year. The nullable Int64 dtype keeps rows without a date
# as <NA> instead of forcing the whole column to float.
combined['year'] = combined['date'].dt.year.astype('Int64')

# Drop rows that have no model name. The explicit copy makes the result an
# independent frame, so the .loc assignment further down is not at risk of
# SettingWithCopyWarning on pandas versions that flag filtered frames.
print(f"Original rows: {len(combined)}")
combined = combined.dropna(subset=['model']).copy()
print(f"After removing empty models: {len(combined)}")

# Models that have at least one row with no org, in order of first appearance.
org_is_missing = combined['org'].isna()
models_missing_org = combined.loc[org_is_missing, 'model'].unique()
print(f"\nModels with missing org: {len(models_missing_org)}")

# Build a model -> most-common-org lookup in one pass over the rows that do
# have an org, instead of re-scanning the whole frame once per model.
# Each group is non-empty and NaN-free, so mode() always yields a value;
# mode() returns its values sorted, so ties resolve to the first in sort order.
rows_with_org = combined.loc[~org_is_missing & combined['model'].isin(models_missing_org)]
org_by_model = {
    model: orgs.mode().iloc[0]
    for model, orgs in rows_with_org.groupby('model', sort=False)['org']
}

if org_by_model:
    # Fill only rows whose org is missing; models without a known org anywhere
    # are absent from the lookup and therefore stay NaN.
    combined.loc[org_is_missing, 'org'] = combined.loc[org_is_missing, 'model'].map(org_by_model)

    # Report each fill, keeping the order in which the models first appear.
    for model in models_missing_org:
        if model in org_by_model:
            print(f"Filled org '{org_by_model[model]}' for model '{model}'")

# Check remaining missing orgs
remaining_missing = combined['org'].isna().sum()
print(f"\nRemaining rows with missing org: {remaining_missing}")

Original rows: 2827
After removing empty models: 2426

Models with missing org: 61

Remaining rows with missing org: 194


In [None]:
# Write the cleaned frame to disk as CSV.
# index=False keeps the DataFrame's row index out of the file.
output_path = '/Users/pranmodu/Library/Mobile Documents/com~apple~CloudDocs/Desktop/coding projects/apart-sprint/data/processed/combined_benchmarks_cleaned.csv'
combined.to_csv(path_or_buf=output_path, index=False)

# Confirm how many rows were written and where.
print(f"Saved cleaned data with {len(combined)} rows to: {output_path}")