In [1]:
import os
import json
from pathlib import Path
from collections import defaultdict
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import textwrap

In [2]:
# Directory that is searched recursively for evaluation logs.
data_dir = Path('data')

# Auto-vivifying container with three nested mapping levels; a lookup on a
# missing key at the innermost level yields a fresh empty list.
traces_by_task = defaultdict(
    lambda: defaultdict(
        lambda: defaultdict(list)
    )
)

In [22]:
# Human reference improvement per task and split (a percentage, judging by the
# name -- TODO confirm units). Every task carries the same figure for "dev" and
# "test", so the table is expanded from one (task, value) pair per task.
Human_improve_perc = {
    task: {"dev": human_value, "test": human_value}
    for task, human_value in (
        ("base-competition", 0.5),
        ("llm-merging", 68.2),
        ("backdoor-trigger-recovery", 621.3),
        ("perception_temporal_action_loc", 284.6),
        ("machine_unlearning", 61.9),
        ("meta-learning", 304.5),
    )
}

In [3]:
# Baseline score for each task, keyed by split ("dev", "test" and, for some
# tasks, "debug"). The per-task range notes are carried over from the original
# table.
Baselines = {
    "base-competition": {
        "dev": 0.5,
        "test": 0.5,
    },
    # range 0-1
    "llm-merging": {
        "dev": 0.727136371,
        "test": 0.4933333333,
    },
    # range 0-100
    "backdoor-trigger-recovery": {
        "dev": 3.758409347,
        "test": 9.368725447,
        "debug": 2,
    },
    # range 0-1
    "perception_temporal_action_loc": {
        "dev": 0.2370039379,
        "test": 0.1263531695,
        "debug": 0.2,
    },
    # range 0-1
    "machine_unlearning": {
        "dev": 0.05389313916,
        "test": 0.06085605833,
        # NOTE(review): 233 lies outside the stated 0-1 range -- confirm.
        "debug": 233,
    },
    # range 0-1
    "meta-learning": {
        "dev": 0.1821651453,
        "test": 0.1727912574,
        "debug": 0.15,
    },
    "erasing_invisible_watermarks": {
        "dev": 0.2129997074,
        "test": 0.2097184418,
    },
    # range 0-1
    "product-recommendation": {
        "dev": 0.08035839265,
        "test": 0.08039049179,
    },
}

def improve_per(current_performance, task, split='dev'):
    """
    Express a score as a percentage change relative to the stored baseline.

    The baseline is looked up in the module-level ``Baselines`` table under
    ``Baselines[task][split]``.

    Args:
        current_performance (float): Score to compare against the baseline.
        task (str): Task name; must be a key of ``Baselines``.
        split (str): Split name; must be a key of ``Baselines[task]``.
            Defaults to 'dev'.

    Returns:
        float: ``(current_performance - baseline) / baseline * 100``. Scores
        are treated as higher-is-better, so a positive result means the score
        beats the baseline.

    Raises:
        ValueError: If ``task`` or ``split`` has no entry in ``Baselines``.
    """
    if task in Baselines:
        task_baselines = Baselines[task]
    else:
        raise ValueError(f"Task '{task}' not found in Baselines")

    if split in task_baselines:
        reference = task_baselines[split]
    else:
        raise ValueError(f"Split '{split}' not found for task '{task}'")

    # Relative change first, then scaled to percent.
    gain = current_performance - reference
    return gain / reference * 100

In [23]:
# Scan data_dir for evaluation logs and collect one record per implementation
# step into traces_by_task[task][model][iteration]['evals'].
for eval_file in data_dir.rglob('env_log/test_idea_evals.json'):
    try:
        with open(eval_file, 'r', encoding='utf-8') as f:
            trace_data = json.load(f)

        # Task, model and iteration are the first three directories below
        # data_dir. Resolving them relative to data_dir (rather than by
        # absolute index) keeps the lookup right when data_dir has more than
        # one path component.
        path_parts = eval_file.relative_to(data_dir).parts
        if len(path_parts) < 5:
            # Need <task>/<model>/<iteration>/.../env_log/test_idea_evals.json;
            # anything shallower would produce junk names.
            print(f"Skipping {eval_file}: unexpected directory layout")
            continue
        task_name, model_name, iteration = path_parts[:3]

        evals = []
        for step in trace_data['implementations']:
            phase = step['phase']
            performance = step['performance']
            # Called first so an unknown task/split surfaces with
            # improve_per's descriptive ValueError message.
            improvement_perc = improve_per(performance, task_name, phase)
            baseline = Baselines[task_name][phase]
            # Not every task/split in Baselines has a human reference figure
            # (e.g. the "debug" split); record None instead of raising.
            human_improve_perc = Human_improve_perc.get(task_name, {}).get(phase)
            evals.append({
                'step_num': step['step'],
                'phase': phase,
                'baseline': baseline,
                'performance': performance,
                'improvement_perc': improvement_perc,
                'human_improve_perc': human_improve_perc,
                'relative_complexity': step['relative_complexity'],
            })

        # Stored only after every step parsed, so a failing file never leaves
        # a partial entry behind.
        if evals:
            traces_by_task[task_name][model_name][iteration] = {
                'evals': evals,
            }

    except json.JSONDecodeError as e:
        print(f"Error reading {eval_file}: {e}")
    except (KeyError, ValueError) as e:
        # A missing field or an unknown task/split skips this file only; the
        # rest of the scan continues.
        print(f"Skipping {eval_file}: {e!r}")
                    

In [24]:
# Round-trip through a JSON string so the result is built from plain dicts and
# lists rather than nested defaultdicts.
traces_dict = json.loads(json.dumps(traces_by_task))

# Write the plain structure to disk, pretty-printed with a two-space indent.
with open('all_idea_evals.json', 'w') as out_file:
    out_file.write(json.dumps(traces_dict, indent=2))