In [1]:
# For licensing see accompanying LICENSE file.
# Copyright (C) 2025 Apple Inc. All Rights Reserved.
# Notebook to check the effect of removing parts of the dataset for publication

# Three nested tiers of row labels to exclude; each tier extends the previous
# one. The lists are later handed to `DataFrame.drop` as index labels.

# Tier 1: excluding more controversial topics.
MINIMAL_REMOVAL = [1, 2, 4, 10, 13, 14, 43, 86, 95, 84]

# Tier 2: tier 1 plus general scandal + many people topics.
LARGE_REMOVAL = [*MINIMAL_REMOVAL, 15, 25, 45, 51, 55, 59, 68, 85, 88, 93, 97, 99]

# Tier 3: tier 2 plus the remaining people topics (i.e. all people topics).
MAXIMAL_REMOVAL = [*LARGE_REMOVAL, 3, 41, 47, 62, 72, 74, 75, 78]

# Report how many rows each tier removes.
for _label, _ids in (
    ("min removal len:", MINIMAL_REMOVAL),
    ("large removal len:", LARGE_REMOVAL),
    ("max removal len:", MAXIMAL_REMOVAL),
):
    print(_label, len(_ids))

min removal len: 10
large removal len: 22
max removal len: 30


In [2]:
import pathlib
import pandas as pd

# Location of the combined-annotations CSV, relative to the directory the
# notebook is run from.
ANNOTATION_PATH = pathlib.Path(
    "../../../project-agent-evaluator-results/2024_08_22_paper_v1/paper_replotting/overall_results/longfact_v8-combined_annotations.csv"
)

# Load the whole table once; later cells read `main_df` without modifying it.
main_df = pd.read_csv(filepath_or_buffer=ANNOTATION_PATH)

In [3]:
def generate_accuracy_for_different_removal(
    agent_col="agent_gpt-4o-2024-05-13_synthetic_base-basic_gpt-4o-2024-05-13_n0",
    baseline_col="basic_gpt-4o-2024-05-13_n0",
    df=None,
    removal_lists=None,
):
    """Print agent and baseline accuracy after dropping each removal list.

    For every removal list, the rows whose index labels appear in that list
    are dropped from the frame, and the fraction of remaining rows where a
    column equals the ``"preferred_text"`` column is printed, first for
    ``agent_col`` and then for ``baseline_col``.

    Args:
        agent_col: Name of the column holding the agent's choice.
        baseline_col: Name of the column holding the baseline's choice.
        df: Frame to evaluate. Defaults to the notebook-level ``main_df``.
            Must contain ``agent_col``, ``baseline_col`` and
            ``"preferred_text"``.
        removal_lists: Iterable of label lists to drop, one report per list.
            Defaults to ``[[], MINIMAL_REMOVAL, LARGE_REMOVAL,
            MAXIMAL_REMOVAL]``.

    Returns:
        None. Results are only printed, so a cell ending with this call shows
        no extra ``Out[]`` value.

    Raises:
        KeyError: If a column is missing, or a removal label is not present in
            the frame's index (``DataFrame.drop`` default behavior).

    Note:
        Labels are matched against the frame's index, not against a topic-id
        column; with the default ``read_csv`` index this means row positions.
        NOTE(review): confirm that row position equals topic id in the CSV.
    """
    source_df = main_df if df is None else df
    if removal_lists is None:
        removal_lists = [[], MINIMAL_REMOVAL, LARGE_REMOVAL, MAXIMAL_REMOVAL]

    def match_rate(frame, col):
        # Mean of the boolean match mask == share of True values. Unlike
        # indexing `value_counts()[True]` / `[False]`, this does not raise
        # when every row (or no row) matches; an empty frame yields nan.
        return float((frame[col] == frame["preferred_text"]).mean())

    for removal_list in removal_lists:
        # `drop` returns a new frame, so the source frame is left untouched
        # and each removal list is evaluated independently.
        kept = source_df.drop(index=list(removal_list))

        print("---")
        print(f"Removal list len: {len(removal_list)} (datalen: {len(kept)})")
        print("AGENT", match_rate(kept, agent_col))
        print("BASELINE", match_rate(kept, baseline_col))

In [4]:
# Compare the agent built on the "basic" baseline against that baseline itself.
# `i` fills the trailing `_n{i}` suffix of both column names.
i = 0
BASELINE_COL = f"basic_gpt-4o-2024-05-13_n{i}"
AGENT_COL = f"agent_gpt-4o-2024-05-13_synthetic_base-basic_gpt-4o-2024-05-13_n{i}"

generate_accuracy_for_different_removal(agent_col=AGENT_COL, baseline_col=BASELINE_COL)

---
Removal list len: 0 (datalen: 100)
AGENT 0.81
BASELINE 0.66
---
Removal list len: 10 (datalen: 90)
AGENT 0.8111111111111111
BASELINE 0.6444444444444445
---
Removal list len: 22 (datalen: 78)
AGENT 0.8076923076923077
BASELINE 0.6410256410256411
---
Removal list len: 30 (datalen: 70)
AGENT 0.8
BASELINE 0.6285714285714286


In [5]:
# The ArenaHard baseline is the closest baseline to the agent in our results,
# so repeat the removal check for that column pair.
# `i` fills the trailing `_n{i}` suffix of both column names.
i = 0
BASELINE_COL = f"arenahard_gpt-4o-2024-05-13_n{i}"
AGENT_COL = f"agent_gpt-4o-2024-05-13_synthetic_base-arenahard_gpt-4o-2024-05-13_n{i}"

generate_accuracy_for_different_removal(agent_col=AGENT_COL, baseline_col=BASELINE_COL)

---
Removal list len: 0 (datalen: 100)
AGENT 0.81
BASELINE 0.74
---
Removal list len: 10 (datalen: 90)
AGENT 0.8111111111111111
BASELINE 0.7333333333333333
---
Removal list len: 22 (datalen: 78)
AGENT 0.8076923076923077
BASELINE 0.7307692307692307
---
Removal list len: 30 (datalen: 70)
AGENT 0.8142857142857143
BASELINE 0.7285714285714285
