In [None]:
import json
from pathlib import Path
# Names of the experiment directories to analyse. Only summary files whose
# grandparent directory name appears in this list are used further down.
run_names = [
    # "gpt-4o-mini-2024-07-18_baseline_temp=0.0",
]

# Collect every summary_info.json found at any depth below the results folder.
results_root = Path("../webarena_results")
paths = [*results_root.rglob("summary_info.json")]

In [None]:
import webarena
import importlib.resources
# Read the "test.raw.json" resource bundled inside the `webarena` package and
# build a lookup table: task id -> value of that task's "sites" entry.
all_configs_str = (importlib.resources.files(webarena) / "test.raw.json").read_text()
all_configs = json.loads(all_configs_str)
all_webarena_configs = {conf["task_id"]: conf["sites"] for conf in all_configs}

def get_webarena_website(task_id):
    """Return the site label used to group a WebArena task.

    Args:
        task_id: Key into the module-level ``all_webarena_configs`` mapping.

    Returns:
        The single site name when the task lists exactly one site, or the
        string ``"multisite"`` when it lists exactly two.

    Raises:
        KeyError: If ``task_id`` is not present in ``all_webarena_configs``.
        ValueError: If the task lists any number of sites other than 1 or 2.
    """
    task_sites = all_webarena_configs[task_id]
    n_sites = len(task_sites)
    # Reject unexpected configurations up front so the happy path stays flat.
    if n_sites not in (1, 2):
        raise ValueError(f"Expected 1 or 2 sites, got {n_sites} for task {task_id}")
    return task_sites[0] if n_sites == 1 else "multisite"

In [None]:
import pandas as pd

# Number of result files each selected run must contribute; the sanity check
# below fails if a run has missing or duplicated task results.
N_EXPECTED_TASKS = 812

if not run_names:
    # Without this guard an empty selection produces a column-less DataFrame and
    # sort_values(by="experiment_name") below fails with an opaque KeyError.
    raise ValueError(
        "run_names is empty: list at least one experiment directory name to analyse"
    )

# One record per summary_info.json that belongs to a selected run.
results = []
for p in paths:
    # Expected layout: <experiment_name>/<task directory>/summary_info.json
    experiment_name = p.parent.parent.name
    # Skip runs that were not requested and task directories whose name starts with "_".
    if experiment_name not in run_names or p.parent.name.startswith("_"):
        continue
    with open(p, "r", encoding="utf-8") as f:
        data = json.load(f)
    # The task id is the text after the last "." in the task directory name,
    # truncated at the first "_" that follows it.
    task_id = int(p.parent.name.split(".")[-1].split("_")[0])
    results.append({
        "experiment_name": experiment_name,
        "task_id": task_id,
        "site_name": get_webarena_website(task_id),
        "n_steps": data.get("n_steps", None),  # optional in the summary file
        "reward": data["cum_reward"],  # required; a missing key raises KeyError
    })

# `counter` is kept as a module-level name in case later cells refer to it.
counter = len(results)
expected_count = N_EXPECTED_TASKS * len(run_names)
assert counter == expected_count, f"Expected {expected_count} paths, got {counter}"

# Sorting on task_id as well makes the row order independent of glob order.
df_results = (
    pd.DataFrame(results)
    .sort_values(by=["experiment_name", "task_id"], ascending=True)
    .reset_index(drop=True)
)
display(df_results)

# Per-run means of step count and reward.
df_results_agg = (
    df_results
    .groupby("experiment_name")
    .agg({"n_steps": "mean", "reward": "mean"})
    .reset_index()
)
df_results_agg.round(2)

### Performance analysis

In [None]:
# Order in which sites are reported; a site with no rows in a run is filled
# with NaN by the reindex below.
columns = ["gitlab", "shopping_admin", "shopping", "reddit", "map", "multisite"]
for run_name in run_names:
    in_run = df_results["experiment_name"].eq(run_name)
    df_run = df_results.loc[in_run]

    # Average within each task first, then across tasks.
    task_sr = df_run.groupby("task_id")["reward"].mean()
    print(f"Run: {run_name}")
    print(f"Average reward: {task_sr.mean():.2f}")

    # Mean reward per site, arranged in the fixed reporting order.
    site_means = df_run.groupby("site_name", sort=True)["reward"].mean()
    site_sr = site_means.reindex(columns)
    print("Site-wise average reward:")
    print(*site_sr.index, sep="\t")
    print(*(f"{avg_reward:.2f}" for avg_reward in site_sr.values), sep="\t")
    print("-" * 40)