In [1]:
# For licensing see accompanying LICENSE file.
# Copyright (C) 2025 Apple Inc. All Rights Reserved.
# get human annotation data as csv

import pandas as pd

# human annotation export for the first annotation round, loaded as a DataFrame
df = pd.read_csv(
    filepath_or_buffer="../../data/human/arduin-long-form-factuality-20240806-2255_(uep-119589).csv"
)

In [2]:
# get original data that was given to annotators
# (this includes the ground-truth preference)

import json

# NOTE: despite the ".jsonl" extension, json.load parses the whole file as a
# single JSON document (not one record per line).
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open("../../data/generated/longfact/individual_generations_v6_addmore/longfact-initial-test-20240805.jsonl", encoding="utf-8") as f:
    data = json.load(f)

# flatten nested objects into dot-separated column names
df_gt = pd.json_normalize(data) # this is a highly nested json, normalize it

In [3]:
# join the annotations with the original generations via their prompt id columns
# (default inner join: only prompt ids present on both sides are kept)
full_df = df.merge(df_gt, left_on="Prompt Id", right_on="prompt_id")

In [4]:
# get human preference from OPR rating

def get_human_pref(row):
    """Map a row's signed OPR rating to the label of the preferred text.

    Reads ``row["Satisfaction OPR text_b,text_a"]``. A positive rating yields
    ``"text_a"`` and a negative rating yields ``"text_b"``. Any other value
    (zero, or something like NaN that compares neither above nor below zero)
    yields ``"tie"``.
    """
    score = row["Satisfaction OPR text_b,text_a"]
    if score > 0:
        return "text_a"
    if score < 0:
        return "text_b"
    return "tie"

# derive each row's human preference label from its OPR rating
full_df["converted_ratings"] = full_df.apply(get_human_pref, axis=1)

# expose both preferences under intuitive column names
full_df["human_pref"] = full_df["converted_ratings"]
full_df["gt_pref"] = full_df["enrichments.meta.longfact-preferred"]

# flag rows where the human preference equals the ground-truth preference
full_df["agreed"] = full_df["human_pref"].eq(full_df["gt_pref"])

In [5]:
# tally how often the human preference matched the ground-truth preference
full_df["agreed"].value_counts()

agreed
True     23
False    17
Name: count, dtype: int64

In [6]:
# drop the rows whose human preference is a tie, then tally agreement on the rest
filtered_df = full_df.copy().loc[lambda frame: frame["human_pref"] != "tie"]
filtered_df["agreed"].value_counts()

agreed
True     23
False     9
Name: count, dtype: int64

---

In [7]:
# Other set of annotations

# get human annotation data as csv

import pandas as pd

# human annotation export for the second annotation round; rebinds `df`
df = pd.read_csv(
    filepath_or_buffer="../../data/human/arduin-long-form-factuality-20240813-20240814-1240_(uep-120627).csv"
)

In [8]:
import json

# NOTE: despite the ".jsonl" extension, json.load parses the whole file as a
# single JSON document (not one record per line).
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open("../../data/generated/longfact/individual_generations_v7_100_old/longfact-40to99-20240813.jsonl", encoding="utf-8") as f:
    data = json.load(f)

# flatten nested objects into dot-separated column names
df_gt = pd.json_normalize(data) # this is a highly nested json, normalize it

In [9]:
# merge both datasets on prompt id
full_df = pd.merge(df, df_gt, left_on="Prompt Id", right_on="prompt_id")
# ensure df is actually in order
full_df = full_df.sort_values(by=["prompt_id"]).reset_index()

# fix known data issue: the ground-truth preference of prompts 70..74 is
# overwritten with "text_2".
FIX_FIRST_PROMPT = 70  # first affected prompt number
FIX_END_PROMPT = 75    # one past the last affected prompt number
# presumably the number of the first prompt in this batch ("40to99") -- verify
PROMPT_OFFSET = 40

# Row positions of the affected prompts, valid only if the sorted frame holds
# exactly one row per prompt starting at prompt 40. Kept because a later cell
# iterates over them to inspect the surrounding rows.
START_IDX = FIX_FIRST_PROMPT - PROMPT_OFFSET
END_IDX = FIX_END_PROMPT - PROMPT_OFFSET

# Select the affected rows by prompt number rather than by row position, so a
# missing or duplicated prompt in the merge cannot shift the patch onto the
# wrong rows.
# NOTE(review): assumes prompt_id ends in the zero-padded prompt number
# (e.g. "...-0000068") -- confirm against the data.
prompt_number = pd.to_numeric(
    full_df["prompt_id"].astype(str).str.extract(r"(\d+)$", expand=False),
    errors="coerce",  # ids without a trailing number become NaN and never match
)
# IMPORTANT: Series.between is inclusive on both ends, hence the -1.
needs_fix = prompt_number.between(FIX_FIRST_PROMPT, FIX_END_PROMPT - 1)
full_df.loc[needs_fix, "enrichments.meta.longfact-preferred"] = "text_2"

# get human preference from OPR rating

def get_human_pref(row):
    """Map a row's signed OPR rating to the label of the preferred text.

    Reads ``row["Satisfaction OPR text_2,text_1"]``. A positive rating yields
    ``"text_1"`` and a negative rating yields ``"text_2"``. Any other value
    (zero, or something like NaN that compares neither above nor below zero)
    yields ``"tie"``.
    """
    score = row["Satisfaction OPR text_2,text_1"]
    if score > 0:
        return "text_1"
    if score < 0:
        return "text_2"
    return "tie"

# derive each row's human preference label from its OPR rating
full_df["converted_ratings"] = full_df.apply(get_human_pref, axis=1)

# expose both preferences under intuitive column names
full_df["human_pref"] = full_df["converted_ratings"]
full_df["gt_pref"] = full_df["enrichments.meta.longfact-preferred"]

# flag rows where the human preference equals the ground-truth preference
full_df["agreed"] = full_df["human_pref"].eq(full_df["gt_pref"])

In [10]:

# Print the patched rows plus two rows on either side for a visual sanity check.
# Each pair is (label to print, column to read); the ground-truth preference is
# shown twice, once from the raw column and once from its "gt_pref" copy.
for idx in range(START_IDX-2, END_IDX + 2):
    print("---")
    for label, column in (
        ("id", "prompt_id"),
        ("PREF:", "enrichments.meta.longfact-preferred"),
        ("PREF:", "gt_pref"),
        ("HUMAN PREF:", "human_pref"),
        ("agreed", "agreed"),
        ("text_1", "responses.text_1.response"),
        ("text_2", "responses.text_2.response"),
    ):
        print(label, full_df.loc[idx, column])


---
id longfact-pairwise-20240813-0000068
PREF: text_1
PREF: text_1
HUMAN PREF: text_1
agreed True
text_1 Kevin Mitnick is a renowned cybersecurity analyst and former hacker, often referred to as one of the most famous computer criminals in history. Born on August 6, 1963, in Van Nuys, California, he gained notoriety in the 1980s and 1990s for his high-profile hacks, including breaching the networks of major corporations like Nokia, IBM, and Motorola. In 1995, he was arrested by the FBI after a well-publicized pursuit, which included a 74-day stint as a fugitive, and he was subsequently sentenced to five years in prison. After his release in 2000, Mitnick transitioned to a career in cybersecurity, becoming a consultant and author, and he published books such as "The Art of Deception" and "The Art of Intrusion." Today, he is the Chief Hacking Officer at KnowBe4, a security awareness training company, where he shares his expertise to help organizations improve their cybersecurity defense

In [11]:
# tally how often the human preference matched the ground-truth preference
full_df["agreed"].value_counts()

agreed
True     40
False    20
Name: count, dtype: int64

In [12]:
# drop the rows whose human preference is a tie, then tally agreement on the rest
filtered_df = full_df.copy().loc[lambda frame: frame["human_pref"] != "tie"]
filtered_df["agreed"].value_counts()

agreed
True     40
False    10
Name: count, dtype: int64

In [14]:
# human agreement (on non-ties), pooled over both annotation rounds.
# The counts are copied by hand from the tie-filtered value_counts outputs:
# 23 agreed / 9 disagreed in the first round, 40 agreed / 10 disagreed in the second.
agreed_first, disagreed_first = 23, 9
agreed_second, disagreed_second = 40, 10
(agreed_first + agreed_second) / (disagreed_second + disagreed_first + agreed_first + agreed_second)

0.7682926829268293