In [None]:
%load_ext autoreload 
%autoreload 2

In [None]:
import pandas as pd
from ast import literal_eval

import os
from econ_evals.utils.helper_functions import get_base_dir_path

# Load Data

In [None]:
# List every entry under the scheduling logs directory, then map each
# non-hidden entry (name not starting with ".") to the names of its contents.
log_subdirnames = os.listdir(get_base_dir_path() / "experiments/scheduling/logs/")
log_subdirname_to_dirnames = {}
for log_subdirname in log_subdirnames:
    if log_subdirname.startswith("."):
        continue
    log_subdirname_to_dirnames[log_subdirname] = os.listdir(
        get_base_dir_path() / "experiments/scheduling/logs/" / log_subdirname
    )

# Load the precomputed baselines. The file holds a Python literal that is
# indexed below as difficulty -> seed -> baseline record.
baseline_file = (
    get_base_dir_path()
    / "experiments/scheduling/"
    / "sample_scheduling_baseline_data.txt"
)
with open(baseline_file, "r") as baseline_handle:
    difficulty_to_seed_to_baseline = literal_eval(baseline_handle.read())

# Summarize which seeds have a baseline available at each difficulty level.
print(
    "Overview of precomputed baseline data. (If missing a difficulty-seed pair you want to run, first run calculate_scheduling_baseline.py with the appropriate difficulties and seeds.)"
)
for difficulty, baselines_by_seed in difficulty_to_seed_to_baseline.items():
    seeds = sorted(baselines_by_seed)
    print(f"Difficulty {difficulty}: precomputed baseline for seeds {seeds}")

In [None]:
# Build one summary row per run: compare the number of blocking pairs in the
# run's final attempt against the precomputed baseline for the same
# (difficulty, seed) pair.
logs_root = get_base_dir_path() / "experiments/scheduling/logs/"

table = []
for log_subdirname in log_subdirnames:
    # Skip hidden directories
    if log_subdirname.startswith("."):
        continue
    # Drop the first "__"-separated field of the directory name; the first of
    # the remaining fields is used as the difficulty level.
    short_log_subdirname = "__".join(log_subdirname.split("__")[1:])
    difficulty = short_log_subdirname.split("__")[0]

    for dirname in log_subdirname_to_dirnames[log_subdirname]:
        if dirname.startswith("."):
            continue
        run_dir = logs_root / log_subdirname / dirname

        # global_params.csv is consumed as a single record: row 0 becomes a
        # dict keyed by column name.
        global_params = pd.read_csv(run_dir / "global_params.csv").T.to_dict()[0]
        seed = global_params["seed"]
        num_attempts = global_params["num_attempts"]

        # The baseline is precomputed (not calculated here). Look it up once,
        # before reading the run's logs, and skip runs that have none.
        try:
            baseline_data = difficulty_to_seed_to_baseline[difficulty][seed]
        except KeyError:
            print(
                f"Error: (difficulty {difficulty}, seed {seed}) not in baseline data. First, run the script calculate_scheduling_baseline.py, and make sure it is calculating baselines for all difficulty levels and seeds that you have data for."
            )
            continue
        baseline_num_blocking_pairs = baseline_data["baseline_num_blocking_pairs"]
        confidence_interval_low = baseline_data["confidence_interval_low"]
        confidence_interval_high = baseline_data["confidence_interval_high"]

        df = pd.read_csv(run_dir / "logs.csv")

        # The final attempt is the one with attempt_num == num_attempts - 1.
        final_blocking_pairs = df.loc[
            df["attempt_num"] == num_attempts - 1, "blocking_pairs"
        ].dropna()
        if final_blocking_pairs.empty:
            # Without this guard, .iloc[0] raises an IndexError that does not
            # say which run is at fault.
            print(
                f"Error: no blocking_pairs logged for final attempt {num_attempts - 1} in {log_subdirname}/{dirname}. Skipping this run."
            )
            continue
        # blocking_pairs is stored as the text of a Python literal; its length
        # is the number of blocking pairs.
        final_num_blocking_pairs = len(literal_eval(final_blocking_pairs.iloc[0]))

        # Score is the fractional reduction in blocking pairs relative to the
        # baseline: 1 means no blocking pairs remain, 0 means equal to the
        # baseline, negative means worse than the baseline.
        # NOTE(review): a baseline of 0 would raise ZeroDivisionError here.
        final_score = 1 - final_num_blocking_pairs / baseline_num_blocking_pairs
        # Clamp worse-than-baseline runs to 0.
        final_score_floor = max(0, final_score)
        table.append(
            {
                "dirname": dirname,
                "log_subdirname": log_subdirname,
                "short_log_subdirname": short_log_subdirname,
                "final_num_blocking_pairs": final_num_blocking_pairs,
                "baseline_num_blocking_pairs": baseline_num_blocking_pairs,
                "confidence_interval_low": confidence_interval_low,
                "confidence_interval_high": confidence_interval_high,
                "final_score": final_score,
                "final_score_floor": final_score_floor,
                "seed": seed,
                "model": global_params["model"],
                "difficulty": difficulty,
            }
        )
df_table = pd.DataFrame(table)

# Calculate scheduling benchmark scores for each LLM and difficulty level

In [None]:
# Mean final score per (model, difficulty) pair, scaled to a percentage.
df_table.groupby(["model", "difficulty"])[["final_score"]].mean() * 100

# Calculate full solve rate

In [None]:
# A run counts as fully solved when its final score is exactly 1.
df_table["solved"] = df_table["final_score"].eq(1)
# Fraction of solved runs per (model, difficulty): solved count / run count.
solved_by_group = df_table.groupby(["model", "difficulty"])["solved"]
solved_by_group.sum() / solved_by_group.count()