In [None]:
# For licensing see accompanying LICENSE file.
# Copyright (C) 2025 Apple Inc. All Rights Reserved.
# Notebook for re-plotting the plots from the presentation in a light-background
# version for the paper

import pathlib

import pandas as pd

import ageval.experiments.logging

# Location of the local clone of the results repo
# (project-agent-evaluator-results), given relative to the working directory.
exp_parent_dir = pathlib.Path(
    "../../../project-agent-evaluator-results/2024_08_22_paper_v1"
)

In [None]:
# LongFact and TruthfulQA results

# Result directories that are analysed together in this cell.
all_result_dirs = [
    exp_parent_dir / "0930_fact_agent_synth/longfact_v8",  # all synth agent runs for longfact
    exp_parent_dir / "0930_fact_agent_synth/truthful_qa_400_v2",  # all synth agent runs for truthfulqa
    exp_parent_dir / "0030_fact_baselines",  # baseline runs
    exp_parent_dir / "0031_fact_baselines_alpacaeval",  # alpacaeval baseline runs
]
# The directories are handed over as one string, separated by ", ".
multirun_path = ", ".join(map(str, all_result_dirs))


# human annotations
# Hand-written results row for the human baseline. Only the percentage
# columns carry values; every count/rate column is left as None.
human_data_point = {
    "model": "Human (3 annotators)",
    "Agreement rate": None,
    "Agreement rate (incl NAs)": None,
    "Agreed": None,
    "Not agreed": None,
    "Num text_a evaluator pref": None,
    "Num text_b evaluator pref": None,
    "No evaluator annotation available": None,
    "Not avail.": None,
    "Sum": None,
    # 76.829...% equals 63/82; the origin of this figure is not recorded in
    # this notebook -- TODO confirm against the annotation data.
    "Agreed (%)": 76.82926829268293,
    # NOTE(review): 0.0 here although "Agreed (%)" is ~76.8 -- confirm this is
    # intentional (e.g. to show only the agreed share for the human row).
    "Not agreed (%)": 0.0,
    "Not avail. (%)": 0.0,
}

# Single-row frame passed to the analysis as an additional result.
human_df = pd.DataFrame([human_data_point])

# NOTE(review): the GSM8k and APPS cells use the same save_path -- confirm
# that their outputs do not overwrite the ones written here.
ageval.experiments.logging.analyze_combined_results(
    multirun_path=multirun_path,
    save_path=exp_parent_dir / "paper_replotting",
    additional_results=human_df,
    num_seeds=5,
)

In [None]:
# Maths results (GSM8k)

# Result directories (relative to exp_parent_dir) analysed together here.
all_result_dirs = [
    exp_parent_dir / run_dir
    for run_dir in (
        "0950_gsm8k_agent_synth/gsm8k_cot_hard",
        "0400_gsm8k_agent",
        "0410_gsm8k_baseline_alpacaeval",
    )
]
# The directories are handed over as one string, separated by ", ".
multirun_path = ", ".join(map(str, all_result_dirs))

# NOTE(review): the fact and APPS cells use the same save_path -- confirm
# that their outputs do not overwrite each other.
ageval.experiments.logging.analyze_combined_results(
    multirun_path=multirun_path,
    save_path=exp_parent_dir / "paper_replotting",
)

In [None]:
# Coding (APPS)

# Only a single result directory is analysed for the coding task.
all_result_dirs = [exp_parent_dir / "0320_apps_agent_initial"]
# The directories are handed over as one string, separated by ", ".
multirun_path = ", ".join(map(str, all_result_dirs))

# NOTE(review): the fact and GSM8k cells use the same save_path -- confirm
# that their outputs do not overwrite each other.
ageval.experiments.logging.analyze_combined_results(
    multirun_path=multirun_path,
    save_path=exp_parent_dir / "paper_replotting",
)

In [None]:
# General rewardbench (non-coding, non-maths)

def get_subdirs(path):
    """Return the immediate subdirectories of ``path`` as a sorted list.

    ``Path.glob("*/")`` only restricts matches to directories on
    Python >= 3.11; older interpreters also return regular files for that
    pattern. Filtering with ``is_dir()`` gives the same result on every
    version, and sorting makes the order independent of the file system.

    Args:
        path: ``pathlib.Path`` of the directory to inspect.

    Returns:
        List of ``pathlib.Path`` objects, one per direct child directory.
        Empty if ``path`` has no subdirectories or does not exist.
    """
    return sorted(child for child in path.glob("*") if child.is_dir())

# The plain baselines sit directly in one directory, while the AlpacaEval
# baselines and the synth agent results are collected from subdirectories
# (agent results: 1 subdirectory per dataset).
all_result_dirs = [exp_parent_dir / "0010_general_baselines"]
all_result_dirs += get_subdirs(exp_parent_dir / "0011_general_baselines_alpacaeval")
all_result_dirs += get_subdirs(exp_parent_dir / "0960_general_agent_synth")
# The directories are handed over as one string, separated by ", ".
multirun_path = ", ".join(map(str, all_result_dirs))

ageval.experiments.logging.analyze_combined_results(
    multirun_path=multirun_path,
    save_path=exp_parent_dir / "paper_replotting/general/",
)