In [None]:
# For licensing see accompanying LICENSE file.
# Copyright (C) 2025 Apple Inc. All Rights Reserved.
import ageval.analysis.data_loader
import pathlib

# Path to a test multirun; assigned here but not used further down in this cell.
multirun_path = "../../data/testing/result_0211_truthfulqa_test"

# Parent directory that holds the individual experiment result folders.
exp_parent_dir = pathlib.Path("../../../project-agent-evaluator-results/2024_08_19_presentation_v3")

# The loader is handed all result directories as one ", "-separated string.
all_result_dirs = ", ".join(
    str(exp_parent_dir / run_name)
    for run_name in ("0110_general_agent", "0010_general_baselines")
)

# Single loader call; its result is unpacked into the three notebook globals.
annotation_dfs, metric_dfs, results_dict = ageval.analysis.data_loader.load_experiments_from_multirun(
    all_result_dirs
)

In [None]:
import pandas as pd

def get_average_metric_df(metric_dfs: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Average the numeric metric columns across datasets, model by model.

    Every frame is sorted by its ``model`` column; the numeric columns are then
    averaged row by row (by position) across all frames. Non-numeric columns of
    the result are taken from the first frame.

    Args:
        metric_dfs: Mapping from dataset name to that dataset's metrics frame.
            Each frame needs a ``model`` column, the same number of rows and
            the same ``model`` values. The numeric columns of the first frame
            must be present in every frame.

    Returns:
        A new frame with the rows and index of the first frame sorted by
        ``model``, where each numeric column holds the mean over all datasets.
        The input frames are not modified.

    Raises:
        ValueError: If ``metric_dfs`` is empty.
        AssertionError: If the frames differ in length or in their ``model``
            values, or if any averaged ``"Agreement rate"`` is not below 1.
    """
    if not metric_dfs:
        raise ValueError("metric_dfs must contain at least one dataset")

    initial_df = next(iter(metric_dfs.values())).sort_values(["model"])
    num_datasets = len(metric_dfs)
    ncols = initial_df.select_dtypes("number").columns

    totals = None
    for i, (data_name, df) in enumerate(metric_dfs.items()):

        df = df.sort_values(["model"])

        # check same number of trials for each dataset
        assert len(df) == len(initial_df), f"Problem: {len(df)} vs {len(initial_df)} ({data_name})"

        # check unique model names is identical
        assert sorted(initial_df.model.unique()) == sorted(df.model.unique()), f"Problem: {sorted(initial_df.model.unique())} != {sorted(df.model.unique())}({data_name})"

        assert (initial_df.model.values == df.model.values).all(), f"Not the same models tested ({i})\n{initial_df.model.values}\n{df.model.values}"

        # Drop the index before summing: pandas aligns `+` on index labels, and
        # after sorting, the labels of different frames no longer follow the
        # model order. Summing by position pairs up rows of the same model.
        values = df[ncols].reset_index(drop=True)
        totals = values if totals is None else totals + values

    # Build the result on a copy of the sorted first frame and give the means
    # that frame's index so the column assignment lines up row for row.
    average_df = initial_df.copy()
    means = totals / num_datasets
    means.index = average_df.index
    average_df[ncols] = means

    # Sanity check kept from the original: every averaged rate must be below 1.
    assert (average_df["Agreement rate"] < 1).all()

    return average_df





In [None]:
# `initial_df` only exists as a local inside get_average_metric_df, so it is
# rebuilt here the same way: the first dataset's metrics frame, sorted by model.
initial_df = list(metric_dfs.values())[0].sort_values(["model"])
initial_df

In [None]:
# `average_df` is a local of get_average_metric_df; compute it at notebook
# level so this cell also works after a kernel restart.
average_df = get_average_metric_df(metric_dfs)
average_df

In [None]:
from ageval.analysis.plotting import plot_runs

# `metric_df` was never assigned anywhere in this notebook, so the call failed
# with a NameError on a fresh kernel. The averaged metrics frame is used here.
# NOTE(review): assumes plot_runs accepts a single metrics DataFrame like the
# one returned by get_average_metric_df — confirm against its signature.
metric_df = get_average_metric_df(metric_dfs)

plot_runs(metric_df)

In [None]:
import ageval.analysis.data_loader

# Location of the test multirun that this cell loads.
multirun_path = "../../data/testing/result_0211_truthfulqa_test"

# Rebinds annotation_dfs, metric_dfs and results_dict to the contents of this multirun.
annotation_dfs, metric_dfs, results_dict = ageval.analysis.data_loader.load_experiments_from_multirun(
    multirun_path
)

In [None]:
# Work on a copy so the frame stored in metric_dfs is left untouched; the
# original assigned into that shared frame in place.
df = metric_dfs["./data/external/truthful_qa/truthful_qa_5.csv"].copy()
num_cols = df.select_dtypes("number").columns
# Average the numeric columns with themselves. Numerically this is a no-op,
# apart from the true division turning integer columns into floats.
df[num_cols] = (df[num_cols] + df[num_cols]) / 2

In [None]:
# Display the `df` frame assigned in the preceding cell.
df