In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import os
from econ_evals.utils.helper_functions import get_base_dir_path

# Load data

In [None]:
# Root of the procurement logs. Presumably it holds one sub-directory per
# experiment batch, each containing one directory per run -- TODO confirm
# against run_procurement_batch.py.
logs_dir = get_base_dir_path() / "experiments/procurement/logs/"

# Keep only real directories: os.listdir also returns plain files (e.g.
# .DS_Store, .gitkeep), and calling os.listdir on one of those raises
# NotADirectoryError. Sorting makes the row order reproducible, since
# os.listdir returns entries in arbitrary order.
# A missing logs directory is treated as "no data" so that the warning printed
# after the table is built can fire instead of this cell crashing.
log_subdirnames = (
    sorted(name for name in os.listdir(logs_dir) if os.path.isdir(logs_dir / name))
    if os.path.isdir(logs_dir)
    else []
)

# Map each batch directory name to the (sorted) names of its entries.
log_subdirname_to_dirnames = {
    log_subdirname: sorted(os.listdir(logs_dir / log_subdirname))
    for log_subdirname in log_subdirnames
}

In [None]:
# Build one summary row per run directory, then collect the rows in `df_table`.
#
# Per run we record:
#   * max_ratio        -- best utility among feasible attempts divided by the
#                         optimal utility stored in global_params.csv
#   * exploration_rate -- number of distinct `alloc` values divided by the
#                         number of complete attempt rows

# Column order of the summary table. Passing it explicitly keeps the columns
# present even when no run could be loaded (pd.DataFrame([]) has none).
TABLE_COLUMNS = [
    "dirname",
    "log_subdirname",
    "short_log_subdirname",
    "max_ratio",
    "seed",
    "model",
    "difficulty",
    "exploration_rate",
]

# Loop-invariant root of all procurement logs.
logs_root = get_base_dir_path() / "experiments/procurement/logs/"

table = []
for log_subdirname in log_subdirnames:
    dirnames = log_subdirname_to_dirnames[log_subdirname]
    for dirname in dirnames:
        run_dir = logs_root / f"{log_subdirname}/{dirname}"
        try:
            df = pd.read_csv(run_dir / "logs.csv")
            # global_params.csv is read as a single-record table; only its
            # first row is used.
            global_params = pd.read_csv(run_dir / "global_params.csv").to_dict(
                orient="records"
            )[0]
        except (FileNotFoundError, NotADirectoryError):
            # Skip runs that have not written their files yet, and stray plain
            # files sitting next to the run directories (opening
            # "<file>/logs.csv" raises NotADirectoryError, which is not a
            # FileNotFoundError).
            continue

        model = global_params["model"]

        # Only rows with every metric present count as attempts.
        plot_df = df[
            ["attempt_num", "alloc", "cost", "utility", "is_feasible"]
        ].dropna()

        # Calculate eval metrics
        # NOTE(review): max_utility (and hence max_ratio) is NaN when a run has
        # no feasible attempt, and the groupby(...).mean() cells further down
        # skip NaN -- confirm such runs should be excluded rather than scored 0.
        max_utility = plot_df[plot_df["is_feasible"]]["utility"].max()

        opt_utility = global_params["opt_utility"]

        max_ratio = max_utility / opt_utility

        # Drop the first "__"-separated token of the batch directory name.
        short_log_subdirname = "__".join(log_subdirname.split("__")[1:])

        # Guard against runs with no complete attempt rows, which previously
        # raised ZeroDivisionError; report NaN for them instead.
        n_attempts = len(plot_df)
        exploration_rate = (
            plot_df["alloc"].nunique() / n_attempts if n_attempts else float("nan")
        )

        table.append(
            {
                "dirname": dirname,
                "log_subdirname": log_subdirname,
                # Truncate the short name right after the model name.
                "short_log_subdirname": short_log_subdirname.split(model)[0] + model,
                "max_ratio": max_ratio,
                "seed": global_params["seed"],
                "model": model,
                "difficulty": short_log_subdirname.split("__")[0],
                "exploration_rate": exploration_rate,
            }
        )

df_table = pd.DataFrame(table, columns=TABLE_COLUMNS)

if len(df_table) == 0:
    print(
        "Warning: no data in procurement/logs/. First collect benchmark data with run_procurement_batch.py."
    )

In [None]:
# Display the per-run summary table (one row per loaded run directory).
df_table

# Calculate procurement benchmark scores for each LLM and difficulty level

In [None]:
# Average max_ratio for every (model, difficulty) pair, shown as a percentage.
(
    df_table.groupby(["model", "difficulty"])[["max_ratio"]]
    .mean()
    .mul(100)
)

In [None]:
# Average exploration_rate for every (model, difficulty) pair, shown as a
# percentage.
(
    df_table.groupby(["model", "difficulty"])[["exploration_rate"]]
    .mean()
    .mul(100)
)

# Calculate full solve rate

In [None]:
# A run counts as solved when its best feasible utility equals the optimum
# exactly (max_ratio == 1). The flag is stored on df_table as a new column.
df_table["solved"] = df_table["max_ratio"].eq(1)

# Fraction of solved runs per (model, difficulty): solved count / run count.
solved_by_group = df_table.groupby(["model", "difficulty"])["solved"]
solved_by_group.sum() / solved_by_group.count()