In [None]:
# For licensing see accompanying LICENSE file.
# Copyright (C) 2025 Apple Inc. All Rights Reserved.

# Paper experiments: reproduction instructions

In this notebook we produce the plots for the paper based on experimental runs
in the `project-agent-evaluator` repository.

The general workflow is as follows:

1. Run a set of experiments on a dataset (each configuration corresponds to one or more agents).
2. Generate synthetic agents combining all baselines and the agent runs.
3. Create plots, combining all of these runs.

Note: we do not necessarily recommend running the experiments directly out of this notebook. Instead, it may be more useful to copy the relevant commands into a shell script, e.g. `experiments.sh`, and then run the script in the background, e.g. `nohup bash experiments.sh &`. The purpose of this notebook is to provide all relevant commands in a single place. Good luck with the experiments!

In [None]:
import pathlib
import os
import re
from typing import List, Literal
import shutil
from tqdm import tqdm
from functools import partial

In [None]:
# Root directory for all experiment outputs. The path is relative to the
# notebook's working directory; the command builder below resolves it with
# `.absolute()` when composing `hydra.sweep.dir`.

DIR = pathlib.Path("./paper_results/")
# Optional safety guard for a fresh run: uncomment the two lines below to abort
# when the results directory already exists, and to create it otherwise.
# assert not DIR.exists(), f"Results directory already exist, this will overwrite them! ({DIR.absolute()})"
# os.makedirs(DIR.absolute())

---
#### Main experiments on specific domains

In [None]:
def to_snake_case(s):
    """Return `s` lower-cased, with "_" inserted before each ASCII capital.

    Every `A`-`Z` character except one at the very start of the string gets an
    underscore placed in front of it, e.g. "LongFact" -> "long_fact".
    Consecutive capitals are split individually ("ABC" -> "a_b_c"), and
    strings without capitals are returned unchanged apart from lower-casing.
    """
    # Zero-width match: any position other than the string start that is
    # immediately followed by an uppercase ASCII letter.
    capital_boundary = re.compile(r'(?<!^)(?=[A-Z])')
    with_underscores = capital_boundary.sub('_', s)
    return with_underscores.lower()

def print_ageval_cmd(
    exp_config_path: str,
    experiment_name: str,
    *,
    output_file: str | None = None,
    data_path: str | None = None,
    data_name: str | None = None,
    n_seeds: int | None = None,
    prompt_type: Literal["arena_hard"] | None = None,
    other_args: List[str] | None = None,
) -> None:
    """Build and print the `ageval` command line for one experiment config.

    The command has the form
    `ageval -m -cd <exp_config_path> hydra.sweep.dir=<DIR>/<experiment>/<run>`
    followed by optional overrides. Nothing is executed and nothing is
    returned: the command is only printed so it can be copied into a shell
    script.

    Args:
        exp_config_path: Config directory passed after `-cd`. Its basename
            (snake-cased) is the default run folder name.
        experiment_name: Snake-cased and used as the experiment folder
            under `DIR`.
        output_file: Run folder name to use instead of the config basename
            (used as given, not snake-cased).
        data_path: If set, appends `data_name=null data_path="<data_path>"`.
            Mutually exclusive with `data_name`.
        data_name: If set, appends `data_path=null data_name="<data_name>"`.
        n_seeds: If given, appends `dummy="0,1,...,n_seeds-1"`.
            NOTE(review): presumably the multirun sweeps over `dummy` to
            repeat each run -- confirm against the ageval configs.
        prompt_type: If given, appends
            `evaluator_kwargs.prompt_type="<prompt_type>"`.
        other_args: Extra overrides. For `key=value` entries, double quotes
            are backslash-escaped and the value is wrapped in double quotes
            (only the first "=" separates key from value). Entries without
            "=" (e.g. a `~key` deletion override) are appended as-is after
            escaping instead of being turned into `key=""`.

    Raises:
        AssertionError: If both `data_path` and `data_name` are provided.
    """
    exp_file_name = to_snake_case(os.path.basename(exp_config_path))
    experiment_name = to_snake_case(experiment_name)
    hydra_dir = os.path.join(DIR.absolute(), experiment_name, output_file or exp_file_name)
    cmd = f"ageval -m -cd {exp_config_path} hydra.sweep.dir={hydra_dir}"

    # Exactly one data source is selected; the other key is explicitly nulled.
    if data_path:
        assert data_name is None
        cmd += f" data_name=null data_path=\"{data_path}\""
    elif data_name:
        assert data_path is None
        cmd += f" data_path=null data_name=\"{data_name}\""

    if n_seeds is not None:
        dummy_value = ",".join(str(i) for i in range(n_seeds))
        cmd += f" dummy=\"{dummy_value}\""

    if prompt_type is not None:
        cmd += f" evaluator_kwargs.prompt_type=\"{prompt_type}\""

    for other_arg in other_args or []:
        # Escape embedded double quotes so they survive the shell quoting below.
        escaped_arg = other_arg.replace('"', '\\"')
        key, separator, value = escaped_arg.partition("=")
        if separator:
            cmd += f' {key}="{value}"'
        else:
            # No "=": there is no value to quote, so pass the override through.
            cmd += f" {escaped_arg}"

    print(cmd)

In [None]:
# Experiment 1: LongFact
exp_name = "LongFact"
long_fact = partial(print_ageval_cmd, experiment_name=exp_name)

# One command is printed per config directory, in the order listed.
for exp_config in (
    "exp/configs/2024_08_22_paper/001_longfact/001_gpt4o_baselines",  # gpt4o baselines
    "exp/configs/2024_08_22_paper/001_longfact/002_ae_baseline",  # alpacaeval baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_baseline",  # gpt35 baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_agent",  # gpt35 agent
    "exp/configs/2024_08_22_paper/001_longfact/005_gpt4o_agent",  # gpt4o agent
    # gemini baseline (currently very rate limited, need other key to run these experiments)
    # "exp/configs/2024_08_22_paper/001_longfact/003_gemini_baseline",
):
    long_fact(exp_config)

In [None]:
# Experiment 2: GSM8k maths
# The config directories under 001_longfact are reused; the dataset is
# selected through the bound `data_path`.
exp_name = "gsm8k_cot_hard"
data_path = "./data/external/gsm8k/gsm8k_cot_hard.csv"
gsm8k_math = partial(print_ageval_cmd, experiment_name=exp_name, data_path=data_path)

# One command is printed per config directory, in the order listed.
for exp_config in (
    "exp/configs/2024_08_22_paper/001_longfact/001_gpt4o_baselines",  # gpt4o baselines
    "exp/configs/2024_08_22_paper/001_longfact/002_ae_baseline",  # alpacaeval baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_baseline",  # gpt35 baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_agent",  # gpt35 agent
    "exp/configs/2024_08_22_paper/001_longfact/005_gpt4o_agent",  # gpt4o agent
    # gemini baseline (currently very rate limited, need other key to run these experiments)
    # "exp/configs/2024_08_22_paper/001_longfact/003_gemini_baseline",
):
    gsm8k_math(exp_config)

In [None]:
# Experiment 3: APPS
# The config directories under 001_longfact are reused; the dataset is
# selected through the bound `data_path`.
exp_name = "apps_competitive_gpt4"
data_path = "data/external/code_apps/competitive_gpt4.csv"
apps = partial(print_ageval_cmd, experiment_name=exp_name, data_path=data_path)

# One command is printed per config directory, in the order listed.
for exp_config in (
    "exp/configs/2024_08_22_paper/001_longfact/001_gpt4o_baselines",  # gpt4o baselines
    "exp/configs/2024_08_22_paper/001_longfact/002_ae_baseline",  # alpacaeval baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_baseline",  # gpt35 baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_agent",  # gpt35 agent
    "exp/configs/2024_08_22_paper/001_longfact/005_gpt4o_agent",  # gpt4o agent
    # gemini baseline (currently very rate limited, need other key to run these experiments)
    # "exp/configs/2024_08_22_paper/001_longfact/003_gemini_baseline",
):
    apps(exp_config)

---
#### Experiments on general domains

In [None]:
# Experiment: Rewardbench non-code, non-maths
# The dataset subsets are selected through the bound `data_name`.
data_name = "alpacaeval-easy, alpacaeval-hard, alpacaeval-length, donotanswer, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual, llmbar-adver-neighbor, llmbar-natural, mt-bench-easy, mt-bench-hard, mt-bench-med, refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond"
exp_name = "rewardbench_general"
# This is a large amount of data, so fewer seeds are needed to obtain
# low-variance estimates.
n_seeds = 3
rewardbench_general = partial(print_ageval_cmd, experiment_name=exp_name, data_name=data_name, n_seeds=n_seeds)

# One command is printed per config directory, in the order listed.
for exp_config in (
    "exp/configs/2024_08_22_paper/001_longfact/001_gpt4o_baselines",  # gpt4o baselines
    "exp/configs/2024_08_22_paper/001_longfact/002_ae_baseline",  # alpacaeval baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_baseline",  # gpt35 baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_agent",  # gpt35 agent
    "exp/configs/2024_08_22_paper/001_longfact/005_gpt4o_agent",  # gpt4o agent
    # gemini baseline (currently very rate limited, need other key to run these experiments)
    # "exp/configs/2024_08_22_paper/001_longfact/003_gemini_baseline",
):
    rewardbench_general(exp_config)

In [None]:
# Experiment: TruthfulQA
# The dataset is selected through the bound `data_name`.
data_name = "truthful-qa-v2"
exp_name = "truthfulqa"
truthfulqa = partial(print_ageval_cmd, experiment_name=exp_name, data_name=data_name)

# One command is printed per config directory, in the order listed.
for exp_config in (
    "exp/configs/2024_08_22_paper/001_longfact/001_gpt4o_baselines",  # gpt4o baselines
    "exp/configs/2024_08_22_paper/001_longfact/002_ae_baseline",  # alpacaeval baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_baseline",  # gpt35 baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_agent",  # gpt35 agent
    "exp/configs/2024_08_22_paper/001_longfact/005_gpt4o_agent",  # gpt4o agent
    # gemini baseline (currently very rate limited, need other key to run these experiments)
    # "exp/configs/2024_08_22_paper/001_longfact/003_gemini_baseline",
):
    truthfulqa(exp_config)

In [None]:
# Experiment: Rewardbench code
# The dataset subsets are selected through the bound `data_name`.
data_name = "hep-python, hep-cpp, hep-go, hep-java, hep-js, hep-rust"
exp_name = "rewardbench_code"
rewardbench_code = partial(print_ageval_cmd, experiment_name=exp_name, data_name=data_name)

# One command is printed per config directory, in the order listed.
for exp_config in (
    "exp/configs/2024_08_22_paper/001_longfact/001_gpt4o_baselines",  # gpt4o baselines
    "exp/configs/2024_08_22_paper/001_longfact/002_ae_baseline",  # alpacaeval baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_baseline",  # gpt35 baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_agent",  # gpt35 agent
    "exp/configs/2024_08_22_paper/001_longfact/005_gpt4o_agent",  # gpt4o agent
    # gemini baseline (currently very rate limited, need other key to run these experiments)
    # "exp/configs/2024_08_22_paper/001_longfact/003_gemini_baseline",
):
    rewardbench_code(exp_config)

In [None]:
# Experiment: Rewardbench maths
# The dataset is selected through the bound `data_name`.
data_name = "math-prm"
exp_name = "rewardbench_math"
rewardbench_math = partial(print_ageval_cmd, experiment_name=exp_name, data_name=data_name)

# One command is printed per config directory, in the order listed.
for exp_config in (
    "exp/configs/2024_08_22_paper/001_longfact/001_gpt4o_baselines",  # gpt4o baselines
    "exp/configs/2024_08_22_paper/001_longfact/002_ae_baseline",  # alpacaeval baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_baseline",  # gpt35 baseline
    "exp/configs/2024_08_22_paper/001_longfact/004_gpt35t_agent",  # gpt35 agent
    "exp/configs/2024_08_22_paper/001_longfact/005_gpt4o_agent",  # gpt4o agent
    # gemini baseline (currently very rate limited, need other key to run these experiments)
    # "exp/configs/2024_08_22_paper/001_longfact/003_gemini_baseline",
):
    rewardbench_math(exp_config)

---
#### Ablations

In [None]:
# Experiment: always using all tools (removing the reverting to baseline)
# The same agent config is run on each dataset below with every tool forced
# on; results go to a separate run folder so the main agent runs are kept.
n_seeds = 2
always_tools_override = 'evaluator_kwargs.tools_to_always_run=["fact_check", "code_interpreter", "math_checker"]'

for run_experiment in (
    long_fact,
    gsm8k_math,
    rewardbench_general,
    rewardbench_code,
    rewardbench_math,
    truthfulqa,
):
    run_experiment(
        "exp/configs/2024_08_22_paper/001_longfact/005_gpt4o_agent",
        output_file="005_gpt4o_agent_always_tools",
        other_args=[always_tools_override],
        # Passed explicitly so it also overrides any seed count already bound
        # in the partial (e.g. rewardbench_general).
        n_seeds=n_seeds,
    )

In [None]:
# Experiment: use the arena_hard prompt-type, but in the agent.
# Results go to a separate run folder per dataset.
for run_experiment in (long_fact, gsm8k_math, rewardbench_code, rewardbench_general):
    run_experiment(
        "exp/configs/2024_08_22_paper/001_longfact/005_gpt4o_agent",
        output_file="005_gpt4o_agent_ae_prompt",
        prompt_type="arena_hard",
        n_seeds=2,
    )

In [None]:
# Experiment: don't fallback to the baseline, just run without tool output.
# Results go to a separate run folder per dataset.
for run_experiment in (long_fact, gsm8k_math, rewardbench_code, rewardbench_general):
    run_experiment(
        "exp/configs/2024_08_22_paper/001_longfact/005_gpt4o_agent",
        output_file="005_gpt4o_agent_nofallback",
        other_args=['evaluator_kwargs.baseline=None'],
        n_seeds=2,
    )

In [None]:
# not implemented yet
# Experiment: replacing external validation with feedback from the same LLM (chain-of-thought vs external validation)

In [None]:
# Experiments: running the GPT-4o OpenAI assistant as baseline (test of whether
# our scaffolding adds anything). Two variants are printed, each for every
# dataset below: first all commands of the first variant, then the second.
config = "exp/configs/2024_08_22_paper/001_longfact/006_gpt4o_assistant_baseline"
assistant_variants = (
    # code interpreter and web search activated
    ("006_gpt4o_assistant_baseline", '{tools: ["code_interpreter", "search"]}'),
    # code interpreter activated only
    ("006_gpt4o_assistant_baseline_codeonly", '{tools: ["code_interpreter"]}'),
)

for output_file, model_kwargs in assistant_variants:
    for run_experiment in (long_fact, gsm8k_math, rewardbench_code, rewardbench_general):
        run_experiment(
            config,
            output_file=output_file,
            other_args=['evaluator_kwargs.baseline=None', f'+evaluator_kwargs.model_kwargs={model_kwargs}'],
            n_seeds=1,
        )

---
#### Post-processing

Create 'virtual agents'. Nothing needs to be re-run: we combine the existing data to replace the baseline models where fallback is used.

In [None]:
import pandas as pd
import ageval.analysis.post_processing
import ageval.analysis.data_loader

In [None]:
# When True, an existing synthetic-agent folder is removed before regenerating.
DELETE_EXISTING_SYNTHETIC_AGENTS = True

# Experiment folders (relative to DIR) that each receive a synthetic agent.
experiment_dirs = (
    "rewardbench_general",
    "rewardbench_code",
    "rewardbench_math",
    "truthfulqa",
    "long_fact",
    "gsm8k_cot_hard",
    "apps_competitive_gpt4",
)
# Run folders combined for every experiment, in the order they are passed on.
source_runs = (
    "005_gpt4o_agent",
    "001_gpt4o_baselines",
    "002_ae_baseline",
    "004_gpt35t_baseline",
    "004_gpt35t_agent",
)
# Maps each output folder (relative to DIR) to the run folders it is built from.
multirun_paths = {
    f"{experiment_dir}/agent_synth": [DIR / experiment_dir / run for run in source_runs]
    for experiment_dir in experiment_dirs
}
agent_name = "agent_gpt-4o-2024-05-13_fact_check_code_interpreter_math_checker_base-basic"

progress_bar = tqdm(multirun_paths.items())
for output_dir, paths in progress_bar:
    progress_bar.set_description(output_dir)
    save_dir = DIR / output_dir
    if DELETE_EXISTING_SYNTHETIC_AGENTS and save_dir.exists():
        shutil.rmtree(save_dir)
    # The run folders are handed over as one ", "-separated string.
    multirun_path = ", ".join(map(str, paths))
    ageval.analysis.post_processing.generate_synth_agent_from_multirun(
        multirun_path=multirun_path,
        save_dir=save_dir,
        agent_name=agent_name,
    )

---
#### Plotting

For each folder that has been created, create the plots for the paper.

In [None]:
import ageval.analysis
import ageval.analysis.data_loader

In [None]:
# Create human data mapping.
# Keys are experiment folder names; each value is one extra result row that
# the plotting cell wraps in a DataFrame for the matching experiment.
# NOTE(review): "Not agreed (%)" is 0.0 although "Agreed (%)" is below 100 --
# confirm this is intended for the plot.
human_data = {
    "long_fact": {
        "model": "Human (3 annotators)",
        # Rates and raw counts are not filled in for the human row (None).
        **dict.fromkeys(
            (
                "Agreement rate",
                "Agreement rate (incl NAs)",
                "Agreed",
                "Not agreed",
                "Num text_a evaluator pref",
                "Num text_b evaluator pref",
                "No evaluator annotation available",
                "Not avail.",
                "Sum",
            )
        ),
        "Agreed (%)": 76.82926829268293,
        "Not agreed (%)": 0.0,
        "Not avail. (%)": 0.0,
    }
}

In [None]:
# `analyze_combined_results` is accessed via `ageval.experiments.logging`, which
# no earlier cell imports explicitly; import it here so the attribute lookup
# does not depend on another module having loaded it as a side effect.
import ageval.experiments.logging

# Model names passed to the plotting routine as `models_to_hide`.
models_to_hide = [
    "agent_gpt-4o-2024-05-13_synthetic_base-5-turbo-0125",
    "agent_gpt-4o-2024-05-13_synthetic_base-basic_gpt-4o-2024-05-13",
    "agent_gpt-4o-2024-05-13_synthetic_base-basic_gpt-3.5-turbo-0125",
]

# Every sub-directory of DIR is one experiment, except the plot output folder.
experiments_toplevel = [
    os.path.join(DIR, d)
    for d in os.listdir(DIR)
    if os.path.isdir(os.path.join(DIR, d)) and d != "plots"
]
for experiment_toplevel in experiments_toplevel:
    exp_name_base = os.path.basename(experiment_toplevel)
    # Add the human reference row only where one is defined for this experiment.
    human_df = pd.DataFrame([human_data[exp_name_base]]) if exp_name_base in human_data else None
    individual_exps = [
        str(run_dir)
        for run_dir in ageval.analysis.data_loader.get_nested_valid_run_parent_dirs(
            pathlib.Path(experiment_toplevel)
        )
    ]
    ageval.experiments.logging.analyze_combined_results(
        multirun_path=",".join(individual_exps),
        save_path=DIR / "plots" / exp_name_base,
        additional_results=human_df,
        models_to_hide=models_to_hide,
    )
