In [1]:
import pandas as pd
from datasets import load_dataset

import io
import json
import requests

from tqdm import tqdm

from pprint import pprint


pd.set_option('max_colwidth', 100)


In [2]:
!ls ../data/test-data/

AggreFact		     HaluEval
benchmark_llm_summarization  llm_aggrefact.json
benchmark_test_data.json     llm_aggrefact_partial.json
BUMP			     test.json
dadc-nli-test.jsonl	     tofueval
DiaSumFact		     trueteacher_unique_sampled.json
GUMSum4EVAL


In [3]:
import glob

# List every path exactly two levels below ../data/test-data/ to see which benchmark assets exist locally.
glob.glob('../data/test-data/*/*')

['../data/test-data/benchmark_llm_summarization/likert_evaluation_results.json',
 '../data/test-data/BUMP/task1_dataset.json',
 '../data/test-data/GUMSum4EVAL/data',
 '../data/test-data/HaluEval/summarization_data.json',
 '../data/test-data/DiaSumFact/annotations.json',
 '../data/test-data/tofueval/mediasum_dev_doc.csv',
 '../data/test-data/tofueval/meetingbank_dev_doc.csv',
 '../data/test-data/tofueval/mediasum_test_doc.csv',
 '../data/test-data/tofueval/news_dialogue.json',
 '../data/test-data/tofueval/meetingbank_test_doc.csv',
 '../data/test-data/tofueval/document_ids_dev_test_split.json',
 '../data/test-data/tofueval/news_dialogue.zip',
 '../data/test-data/AggreFact/aggre_fact_sota.csv']

In [4]:
# Module-level accumulator: every benchmark frame built below is stacked into it.
all_test_dfs = pd.DataFrame()


def append_df(df):
    """Stack ``df`` underneath the global ``all_test_dfs`` accumulator.

    The global name is rebound to the concatenated result; the row indices of
    both frames are kept as they are (no re-indexing).
    """
    global all_test_dfs
    stacked = [all_test_dfs, df]
    all_test_dfs = pd.concat(stacked, axis=0)

In [5]:
# AggreFact: read the aggre_fact_sota.csv file directly from the AggreFact GitHub repository.


aggrefact = pd.read_csv("https://raw.githubusercontent.com/Liyan06/AggreFact/main/data/aggre_fact_sota.csv")
# Preview the first two rows.
aggrefact.head(2)

Unnamed: 0,dataset,origin,id,doc,summary,model_name,label,cut,DAE_score,DAE_label,QuestEval_score,QuestEval_label,SummaC-ZS_score,SummaC-ZS_label,SummaC-Conv_score,SummaC-Conv_label,QAFactEval_score,QAFactEval_label
0,Polytope,cnndm,b383-10,looking after elderly parents can be difficult at the best of times .\nbut this man takes caring...,lu xincai takes his 84-year-old mother to work with him on the back of his motorbike every day ....,BART,1,val,0.912608,,0.556987,,0.827759,,0.966453,,4.856083,
1,Polytope,cnndm,b364-10,tokyo ( cnn ) a bizarre and alarming discovery is raising concerns in japan about the potential ...,a drone carrying traces of a radioactive material was found on the rooftop of japan 's equivalen...,BART,1,val,0.780049,,0.518639,,0.887718,,0.84848,,4.721094,


In [6]:
# Keep only the shared benchmark columns, expose 'origin' as 'source',
# and tag every row with the benchmark name before accumulating it.
aggrefact = (
    aggrefact[['doc', 'summary', 'label', 'origin']]
    .rename(columns={'origin': 'source'})
    .assign(benchmark='aggrefact')
)

append_df(aggrefact)

In [7]:
# BUMP: read task1_dataset.json (a regular JSON document, not JSON lines) from the BUMP GitHub repository.

bump = pd.read_json("https://raw.githubusercontent.com/dataminr-ai/BUMP/main/data/task1_dataset.json", lines=False)

# Show how many rows there are per error type, then preview two random rows.
print(bump['error_type'].value_counts())
bump.sample(2)


error_type
Extrinsic Circumstance Error    99
Extrinsic Entity Error          99
Intrinsic Circumstance Error    99
Intrinsic Entity Error          99
Extrinsic Predicate Error       99
Intrinsic Predicate Error       99
Coreference Error               99
Name: count, dtype: int64


Unnamed: 0,id,article_id,article,reference_summary,edited_summary,error_type,corrected_error_type,scores
636,636,1337,Wigan climbed up to third place in the First Utility Super League after exacting sweet revenge o...,Wigan exact revenge on St Helens in Super League Grand-Final rematch . Warriors come out on top ...,Wigan exact revenge on Warriors in Super League Grand-Final rematch . Warriors come out on top i...,Intrinsic Circumstance Error,Intrinsic Entity Error,"{'QAFactEval_edited': 0.42573932670000003, 'CoCo_edited': 0.35365559999999996, 'DAE_edited': 0.5..."
78,78,1587,A Chinese villager who was desperate to become a grandfather has been arrested for buying a wife...,"The man, known only as Xu, was keen to have a grandchild . He bought the 'daughter-in-law' for £...","The man, known only as Xu, was keen to have a grandchild . He left the 'daughter-in-law' for £1,...",Extrinsic Predicate Error,Extrinsic Predicate Error,"{'QAFactEval_edited': 0.6213724862000001, 'CoCo_edited': 0.39436847, 'DAE_edited': 0.6608075, 'F..."


In [8]:
# Reference summaries are labelled 1, edited summaries are labelled 0.
bump_correct = (
    bump[['article', 'reference_summary']]
    .rename(columns={'article': 'doc', 'reference_summary': 'summary'})
    .assign(label=1)
)

bump_incorrect = (
    bump[['article', 'edited_summary']]
    .rename(columns={'article': 'doc', 'edited_summary': 'summary'})
    .assign(label=0)
)

# Stack both halves and keep one row per (doc, summary) pair.
bump = pd.concat([bump_correct, bump_incorrect], axis=0).drop_duplicates(subset=['doc', 'summary'])
bump = bump.assign(benchmark='bump', source='cnndm')

append_df(bump)
bump.head(2)

Unnamed: 0,doc,summary,label,benchmark,source
0,"(CNN)As the model for Norman Rockwell's ""Rosie the Riveter,"" Mary Doyle Keefe became the symbol ...","Rosie the Riveter appeared on the cover of the Saturday Evening Post on May 29, 1943 . Mary Doyl...",1,bump,cnndm
7,"(CNN)Famed cosmologist Stephen Hawking has proved his comedy chops on shows like ""The Big Bang T...","Stephen Hawking is a famed cosmologist and mathematician . He sings Monty Python's ""Galaxy Song""...",1,bump,cnndm


In [9]:
# HaluEval

halueval = pd.read_json('https://raw.githubusercontent.com/RUCAIBox/HaluEval/main/data/summarization_data.json', lines=True)

# Every input record produces two examples that share one document:
# the right summary (label 1) and the hallucinated summary (label 0).
# Fields are read positionally; position 0 of each tuple is the index.
rows = []
for record in halueval.itertuples():
    document, right_summary, hallucinated_summary = record[1], record[2], record[3]
    rows.extend([
        [document, right_summary, 1],
        [document, hallucinated_summary, 0],
    ])

halueval = pd.DataFrame(rows, columns=['doc', 'summary', 'label']).assign(
    source='cnndm',
    benchmark='halueval',
)

print(halueval.label.value_counts())
append_df(halueval)
halueval.head(2)

label
1    10000
0    10000
Name: count, dtype: int64


Unnamed: 0,doc,summary,label,source,benchmark
0,"Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwi...","Marseille prosecutor says ""so far no videos were used in the crash investigation"" despite media ...",1,cnndm,halueval
1,"Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwi...",A video showing the final moments of Germanwings Flight 9525 has been recovered by investigators...,0,cnndm,halueval


In [10]:
# FIB: load the 'test' split of "r-three/fib" via datasets.load_dataset.

fib = load_dataset("r-three/fib")['test']
fib_df = fib.to_pandas()
# Explode 'list_choices' so every candidate summary gets its own row.
fib_df = fib_df.explode('list_choices')

# Earlier vectorised labelling attempt, kept for reference (superseded by fix() below):
# fib_df.loc[fib_df['correct_choice'] == fib_df['list_choices'], 'label'] = 1
# fib_df.loc[fib_df['correct_choice'] != fib_df['list_choices'], 'label'] = 0

def fix(row):
    """Label one exploded FIB row.

    Sets ``row['label']`` to 1 when ``row['list_choices']`` equals
    ``row['correct_choice']`` and to 0 otherwise, then returns the row.
    """
    is_correct = row['list_choices'] == row['correct_choice']
    row['label'] = 1 if is_correct else 0
    return row

fib_df = fib_df.apply(fix, axis=1)

column_names = {'input': 'doc', 'list_choices': 'summary', 'dataset': 'source'}
fib_df = fib_df[['input', 'list_choices', 'label', 'dataset']].rename(columns=column_names)

# Turn the <t>/</t> markers into spaces and collapse every whitespace run to a single space.
fib_df['summary'] = fib_df['summary'].apply(
    lambda text: ' '.join(text.replace('<t>', ' ').replace('</t>', ' ').split())
)
fib_df['label'] = fib_df['label'].astype(int)

fib_df['benchmark'] = 'fib'
# Use the same spelling for the CNN/DM source as the other benchmarks in this notebook.
fib_df['source'] = fib_df['source'].replace({"cnn_dm": "cnndm"})

print(fib_df.label.value_counts())
append_df(fib_df)
fib_df.sample(10)




label
0    3579
1    3579
Name: count, dtype: int64


Unnamed: 0,doc,summary,label,source,benchmark
2889,Strikes planned for Christmas were suspended after the new offer was made but members rejected i...,"British Airways has suspended a further 2,000 customers from its contingency plan in a row of st...",0,xsum,fib
1009,"For its 3-2 victory over a team backed by phone-maker Samsung, SKT's five members shared a prize...",SKT has won the League of Legends (LoL) final game.,1,xsum,fib
2374,"That was just over over half of last year's haul, when the website auctioned a coffee meeting wi...",A lunch date with Mr Cook has sold on the auction website CharityBuzz.,1,xsum,fib
2491,He and his son Gorka were arrested earlier this month as part of a corruption investigation.\nVi...,Suspended Spanish Football Federation head Villar has resigned from his role in Uefa.,1,xsum,fib
1535,"The court in Novorossiysk gave two of the dancers 10 days in jail each, a third 15 days and two ...","A Russian court has sentenced three teenagers to jail for performing an ""erotic"" twerk dance at ...",0,xsum,fib
553,Media playback is unsupported on your device\n16 December 2014 Last updated at 08:58 GMT\nDr And...,Hospitals in Wales may have to cut back on non-urgent operations to cope with pressures over the...,0,xsum,fib
3401,"( cnn ) a 32-year-old massachusetts man is facing murder charges , authorities said wednesday , ...","earlier this week , colina was arraigned on charges of assault and battery causing serious bodil...",0,cnndm,fib
1861,"The 26-year-old, who has been linked with a move to Real Madrid, fractured his right ankle while...",Chelsea's Hazard will miss the start of the Premier League season after having surgery on a brok...,1,xsum,fib
2842,"The sturgeon, named Steve, swam out of World of Water in Romsey, Hampshire when it was inundated...",A fish that escaped from an aquatic shop during flooding has been found.,1,xsum,fib
163,"In 2014, a report by the public protector said Mr Zuma had ""benefited unduly"" from the upgrades....",President Jacob Zuma has agreed to repay at least some of the money controversially spent on upg...,1,xsum,fib


In [11]:
# Zhang et al. (LLM Summaries) https://arxiv.org/abs/2301.13848

llm_summaries_raw = pd.read_json('https://raw.githubusercontent.com/Tiiiger/benchmark_llm_summarization/main/likert_evaluation_results.json', lines=False)
llm_summaries_raw = llm_summaries_raw.rename(columns={'article': 'doc', 'summary': 'summary', 'faithfulness': 'label', 'dataset': 'source'})

# Keep the shared columns, drop incomplete rows, and keep one row per distinct summary text.
llm_summaries = (
    llm_summaries_raw[['doc', 'summary', 'label', 'source']]
    .dropna()
    .drop_duplicates(subset=['summary'])
)

# Discard summaries that contain the phrase below.
remove = "original article"
keep_mask = ~llm_summaries['summary'].str.contains(remove)
llm_summaries = llm_summaries[keep_mask].assign(benchmark='llm_summaries')

print(llm_summaries.label.value_counts())
append_df(llm_summaries)
llm_summaries.head(2)

label
1    2992
0     564
Name: count, dtype: int64


Unnamed: 0,doc,summary,label,source,benchmark
0,"(CNN)Two years ago, the storied Boston Marathon ended in terror and altered the lives of runners...",The two bombers who killed 3 and injured 264 were sentenced to death.\n\nThe two bombers who kil...,1,cnndm,llm_summaries
12,She might be a new mother to twins but that didn't prevent a fresh-faced Charlene of Monaco from...,What is the main idea of the article?\n\nWhat is the main idea of the article?\n\nWhat is the ma...,0,cnndm,llm_summaries


In [12]:
# GUMSUM

path = "../data/test-data/GUMSum4EVAL/data/GUMSum_final.json"

# Context manager so the JSON file handle is closed once the content is parsed.
with io.open(path) as gumsum_file:
    gumsum_dict = json.load(gumsum_file)
labels = pd.read_csv('../data/test-data/GUMSum4EVAL/data/labels.csv')

cols = ['doc', 'summary', 'label']
# Summary fields of each GUMSum entry, listed in the same order as the label
# values read positionally from labels.csv (tuple positions 2-5).
summary_fields = ['brio', 'simcls', 'gpt3', 'human1']

# Collect plain rows first and build the frame once; concatenating inside the
# loop copies the whole frame on every iteration.
gumsum_rows = []
for row in labels.itertuples():
    # Position 0 is the index; position 1 is the key into gumsum_dict.
    key = row[1]
    entry = gumsum_dict[key]
    fulltext = entry['fulltext']
    system_labels = [row[2], row[3], row[4], row[5]]
    for field, system_label in zip(summary_fields, system_labels):
        gumsum_rows.append([fulltext, entry[field], system_label])

# The resulting index is a plain 0..n-1 range.
gumsum = pd.DataFrame(gumsum_rows, columns=cols)

# NOTE(review): 'yes' maps to 0 and 'no' to 1 — presumably labels.csv flags
# summaries that contain an error; confirm against the GUMSum4EVAL docs.
gumsum = gumsum.replace({'label': {'yes': 0, 'no': 1}})
gumsum['source'] = 'gumsum'
gumsum['benchmark'] = 'gumsum'

print(gumsum.label.value_counts())

append_df(gumsum)

gumsum.head(2)

label
1    60
0    36
Name: count, dtype: int64


Unnamed: 0,doc,summary,label,source,benchmark
0,The prevalence of discrimination across racial groups in contemporary America: Results from a na...,A new study by Kessler and colleagues estimates the prevalence of discrimination across racial g...,0,gumsum,gumsum
1,The prevalence of discrimination across racial groups in contemporary America: Results from a na...,This study is the first to estimate the prevalence of discrimination across racial groups in con...,0,gumsum,gumsum


# NEW 

In [13]:
# InstruSum: load the "human_eval" configuration of "Salesforce/InstruSum" via datasets.load_dataset.
instrusum = load_dataset("Salesforce/InstruSum", "human_eval")

In [14]:

# One output row per annotated summary: the article, the summary text, and the
# integer-cast 'factual' score used as the label.
append_rows = []

for row in tqdm(instrusum['data']):
    doc = row['article']
    annotations = row['annotations']
    append_rows.extend(
        [doc, annotations[name]['summary'], int(annotations[name]['score']['factual'])]
        for name in annotations
    )

x = pd.DataFrame(append_rows, columns=['doc', 'summary', 'label']).assign(
    source='instrusum',
    benchmark='instrusum',
)

print(x.label.value_counts())

append_df(x)

x.head(2)

100%|██████████| 100/100 [00:00<00:00, 12750.58it/s]

label
1    387
0    113
Name: count, dtype: int64





Unnamed: 0,doc,summary,label,source,benchmark
0,"""I was shaking with rage and stress, I couldn't believe this had happened."" By Dan WhitworthMone...","Lloyds Bank has refunded more than £14,000 stolen from the account of a dementia sufferer after ...",1,instrusum,instrusum
1,"""I was shaking with rage and stress, I couldn't believe this had happened."" By Dan WhitworthMone...","Lloyds Bank has apologized and refunded over £14,000, plus interest and £600 in compensation, to...",1,instrusum,instrusum


In [15]:
#TofuEval - MediaSum

def obtain_dialogue_mediasum(dialogue_selected):
    """Flatten dialogues into a frame of document ids and transcripts.

    Parameters
    ----------
    dialogue_selected : iterable of dict
        Each dialogue must provide ``'id'``, ``'speaker'`` and ``'utt'``.
        Speakers and utterances are paired with ``zip``, so unmatched
        trailing items of the longer sequence are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``doc_id`` and ``source``; ``source`` holds one
        ``"<speaker>: <utterance>"`` line per turn, joined with newlines and
        stripped of leading/trailing whitespace.
    """
    # Gather all records first and build the frame once, instead of enlarging
    # a DataFrame row by row.
    records = []
    for dialogue in dialogue_selected:
        turns = [
            f"{speaker}: {utt}"
            for speaker, utt in zip(dialogue['speaker'], dialogue['utt'])
        ]
        records.append([dialogue['id'], "\n".join(turns).strip()])
    return pd.DataFrame(records, columns=['doc_id', 'source'])

base_path = "../data/test-data/tofueval/"

# Document ids of the dev and test splits for both TofuEval sources.
with open(base_path+"document_ids_dev_test_split.json") as file:
    document_mapping = json.load(file)

meetingbank_dev_ids = document_mapping['dev']['meetingbank']
meetingbank_test_ids = document_mapping['test']['meetingbank']
mediasum_dev_ids = document_mapping['dev']['mediasum']
mediasum_test_ids = document_mapping['test']['mediasum']


# MeetingBank: write the selected transcripts to CSV, then read them back.
meetingbank = pd.DataFrame(load_dataset("lytang/MeetingBank-transcript")['test'])
meetingbank[meetingbank.meeting_id.isin(meetingbank_dev_ids)][['meeting_id', 'source']].reset_index(drop=True).to_csv(base_path+"meetingbank_dev_doc.csv", index=False)
meetingbank[meetingbank.meeting_id.isin(meetingbank_test_ids)][['meeting_id', 'source']].reset_index(drop=True).to_csv(base_path+"meetingbank_test_doc.csv", index=False)

meetingbank_dev = pd.read_csv(base_path+"meetingbank_dev_doc.csv")
meetingbank_test = pd.read_csv(base_path+"meetingbank_test_doc.csv")


# MediaSum: select the dialogues whose id belongs to each split.
with open(base_path+"news_dialogue.json") as file:
    news_dialogue = json.load(file)

# Set lookups avoid scanning the id lists once per dialogue in the dump.
# Assumes the ids are hashable scalars (e.g. strings) — TODO confirm.
mediasum_dev_id_set = set(mediasum_dev_ids)
mediasum_test_id_set = set(mediasum_test_ids)
dialogue_dev = [dialogue for dialogue in news_dialogue if dialogue['id'] in mediasum_dev_id_set]
dialogue_test = [dialogue for dialogue in news_dialogue if dialogue['id'] in mediasum_test_id_set]

obtain_dialogue_mediasum(dialogue_dev).to_csv(base_path+"mediasum_dev_doc.csv", index=False)
obtain_dialogue_mediasum(dialogue_test).to_csv(base_path+"mediasum_test_doc.csv", index=False)

mediasum_dev = pd.read_csv(base_path+"mediasum_dev_doc.csv")
mediasum_test = pd.read_csv(base_path+"mediasum_test_doc.csv")

In [16]:
# TofuEval factual-consistency annotations (dev and test) for MediaSum, read from the tofueval GitHub repository.
mediasum_eval_dev = pd.read_csv("https://raw.githubusercontent.com/amazon-science/tofueval/main/factual_consistency/mediasum_factual_eval_dev.csv")
mediasum_eval_test = pd.read_csv("https://raw.githubusercontent.com/amazon-science/tofueval/main/factual_consistency/mediasum_factual_eval_test.csv")

# The same annotation files for MeetingBank.
meetingbank_eval_dev = pd.read_csv("https://raw.githubusercontent.com/amazon-science/tofueval/main/factual_consistency/meetingbank_factual_eval_dev.csv")
meetingbank_eval_test = pd.read_csv("https://raw.githubusercontent.com/amazon-science/tofueval/main/factual_consistency/meetingbank_factual_eval_test.csv")

In [17]:
def _build_tofueval_split(docs, evals, doc_key, source_name):
    """Join documents with their TofuEval annotations in the benchmark layout.

    Parameters
    ----------
    docs : pandas.DataFrame
        Documents; must contain ``doc_key`` and a ``source`` column holding
        the document text.
    evals : pandas.DataFrame
        Annotations; must contain ``doc_id``, ``summ_sent`` and ``sent_label``.
    doc_key : str
        Column of ``docs`` that is matched against ``evals['doc_id']``.
    source_name : str
        Value written to the output ``source`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``doc``, ``summary``, ``label``, ``source``, ``benchmark``;
        ``label`` is 1 where ``sent_label`` equals ``'yes'`` and 0 otherwise.
    """
    merged = docs.merge(evals, left_on=doc_key, right_on='doc_id', how='inner')
    merged['label'] = merged['sent_label'].apply(lambda x: 1 if x == 'yes' else 0)
    # Copy the document text out of 'source' before that column is reused
    # for the dataset name.
    merged['doc'] = merged['source']
    merged['summary'] = merged['summ_sent']
    merged['source'] = source_name
    merged['benchmark'] = 'tofueval'
    return merged[['doc', 'summary', 'label', 'source', 'benchmark']]


mediasum_dev_final = _build_tofueval_split(mediasum_dev, mediasum_eval_dev, 'doc_id', 'mediasum')
mediasum_test_final = _build_tofueval_split(mediasum_test, mediasum_eval_test, 'doc_id', 'mediasum')

meetingbank_dev_final = _build_tofueval_split(meetingbank_dev, meetingbank_eval_dev, 'meeting_id', 'meetingbank')
meetingbank_test_final = _build_tofueval_split(meetingbank_test, meetingbank_eval_test, 'meeting_id', 'meetingbank')

# Only the test splits are added to the accumulated benchmark data.
append_df(mediasum_test_final)
append_df(meetingbank_test_final)

meetingbank_test_final.head(2)

Unnamed: 0,doc,summary,label,source,benchmark
0,Speaker 0: Thank you very much. That concludes public comment. We're going on to Iowa. We have t...,"Reduction in conditional use permit fees for development services in Long Beach, California.",1,meetingbank,tofueval
1,Speaker 0: Thank you very much. That concludes public comment. We're going on to Iowa. We have t...,Fee reduction of $950 and establishment of a new tiered fee structure for public noticing.,1,meetingbank,tofueval


In [18]:
# Label distribution per TofuEval split, printed in this order:
# MeetingBank dev, MeetingBank test, MediaSum dev, MediaSum test.
for split_frame in (
    meetingbank_dev_final,
    meetingbank_test_final,
    mediasum_dev_final,
    mediasum_test_final,
):
    print(split_frame.label.value_counts())

label
1    1334
0     290
Name: count, dtype: int64
label
1    627
0    150
Name: count, dtype: int64
label
1    1459
0     354
Name: count, dtype: int64
label
1    561
0    172
Name: count, dtype: int64


In [19]:
# #Diversumm

# diversumm = pd.read_csv("https://raw.githubusercontent.com/HJZnlp/Infuse/main/DiverSumm.csv")[['doc','summary','label','origin']]

# diversumm = diversumm[diversumm['origin'].isin(['multinews','qmsum'])]
# diversumm = diversumm.rename(columns={'origin':'source'})

# diversumm['benchmark'] = 'unisumm'


# append_df(diversumm)

# diversumm.label.value_counts()

In [20]:
# DiaSumFact

# https://aclanthology.org/2023.acl-long.377/

import json 
import requests


url = 'https://raw.githubusercontent.com/731935354/Dia-Sum-Fact/main/annotations.json'
# Bound the wait and surface HTTP errors, rather than hanging indefinitely or
# trying to JSON-decode an error page.
response = requests.get(url, timeout=60)
response.raise_for_status()
annotations = response.json()


# One row per annotated summary sentence: [document, sentence, error class, dataset name].
diasumfact = []
for v in annotations.values():
    utterances = v['utterances']
    dataset_name = v['dataset_name']
    # The document is the list of utterances joined with newlines.
    doc = "\n".join(utterances)
    predictions = v['annotations']
    for prediction in predictions.values():
        for s in prediction['sentence_annotations']:
            sentence = s['sentence']
            # Only the first entry of the sentence's 'annotation' list is used.
            label = s['annotation'][0]['error_class']
            diasumfact.append([doc, sentence, label, dataset_name])

In [21]:
diasumfact = pd.DataFrame(
    diasumfact,
    columns=['doc', 'summary', 'label', 'source'],
).assign(benchmark='diasumfact')

# Binarise the error class: "No Error" becomes 1, every other value becomes 0.
diasumfact['label'] = diasumfact['label'].map(lambda error_class: int(error_class == "No Error"))

append_df(diasumfact)
diasumfact.label.value_counts()

label
1    853
0    487
Name: count, dtype: int64

In [22]:
# # zero_shot_faceval_domains

# cols = ['article', 'summary', 'score', 'task']

# zero_shot_faceval_billsum = pd.read_csv("https://raw.githubusercontent.com/sanjanaramprasad/zero_shot_faceval_domains/main/datasets/annotations/billsum_annotation_scores.csv")[cols]
# zero_shot_faceval_news = pd.read_csv("https://raw.githubusercontent.com/sanjanaramprasad/zero_shot_faceval_domains/main/datasets/annotations/news_annotation_scores.csv")[cols]
# zero_shot_faceval_pubmed = pd.read_csv("https://raw.githubusercontent.com/sanjanaramprasad/zero_shot_faceval_domains/main/datasets/annotations/pubmed_annotation_scores.csv")[cols]

# # replace scores < 1 with 0 and 1.0 with 1
# zero_shot_faceval_billsum['score'] = zero_shot_faceval_billsum['score'].apply(lambda x: 1 if x == 1.0 else 0)
# zero_shot_faceval_news['score'] = zero_shot_faceval_news['score'].apply(lambda x: 1 if x == 1.0 else 0)
# zero_shot_faceval_pubmed['score'] = zero_shot_faceval_pubmed['score'].apply(lambda x: 1 if x == 1.0 else 0)

# zero_shot_faceval_billsum = zero_shot_faceval_billsum[['article', 'summary', 'score', 'task']].rename(columns={'article': 'doc', 'task': 'source', 'score': 'label'})
# zero_shot_faceval_news = zero_shot_faceval_news[['article', 'summary', 'score', 'task']].rename(columns={'article': 'doc', 'task': 'source', 'score': 'label'})
# zero_shot_faceval_pubmed = zero_shot_faceval_pubmed[['article', 'summary', 'score', 'task']].rename(columns={'article': 'doc', 'task': 'source', 'score': 'label'})

# zero_shot_faceval = pd.concat([zero_shot_faceval_billsum, zero_shot_faceval_news, zero_shot_faceval_pubmed], axis=0)
# zero_shot_faceval['benchmark'] = 'zero_shot_faceval'

# append_df(zero_shot_faceval)

In [23]:
# Count total words


def count_words(row):
    """Store the combined word count of ``doc`` and ``summary`` in ``row['length']``.

    Words are the whitespace-separated tokens returned by ``str.split()``.
    Returns the same row with the added ``length`` entry.
    """
    row['length'] = sum(len(row[field].split()) for field in ('doc', 'summary'))
    return row

# Add a 'length' column by applying count_words to every accumulated row.
all_test_dfs = all_test_dfs.apply(count_words, axis=1)
all_test_dfs.head(2)

Unnamed: 0,doc,summary,label,source,benchmark,length
0,looking after elderly parents can be difficult at the best of times .\nbut this man takes caring...,lu xincai takes his 84-year-old mother to work with him on the back of his motorbike every day ....,1,cnndm,aggrefact,507
1,tokyo ( cnn ) a bizarre and alarming discovery is raising concerns in japan about the potential ...,a drone carrying traces of a radioactive material was found on the rooftop of japan 's equivalen...,1,cnndm,aggrefact,460


In [24]:
# Total word count (documents plus summaries) over all accumulated rows.
all_test_dfs['length'].sum()

21705375

In [25]:
# Switch to the output naming: the document becomes 'source', the summary
# becomes 'target', and the former 'source' column becomes 'source_dataset'.
output_names = {
    'doc': 'source',
    'summary': 'target',
    'source': 'source_dataset',
}
all_test_dfs = all_test_dfs.rename(columns=output_names)


all_test_dfs

Unnamed: 0,source,target,label,source_dataset,benchmark,length
0,looking after elderly parents can be difficult at the best of times .\nbut this man takes caring...,lu xincai takes his 84-year-old mother to work with him on the back of his motorbike every day ....,1,cnndm,aggrefact,507
1,tokyo ( cnn ) a bizarre and alarming discovery is raising concerns in japan about the potential ...,a drone carrying traces of a radioactive material was found on the rooftop of japan 's equivalen...,1,cnndm,aggrefact,460
2,more than 25 women have been airlifted from royal navy ships because of pregnancy .\none ship - ...,more than 25 women have been airlifted from royal navy ships because of pregnancy . \none ship -...,1,cnndm,aggrefact,606
3,breast cancer patients may be spared chemotherapy thanks to new tests that pinpoint genetic ` ma...,breast cancer patients may be spared chemotherapy thanks to new tests . \ntests pinpoint genetic...,1,cnndm,aggrefact,641
4,amazing pictures have emerged of blue fluorescent algae lighting up australia 's east coast .\nt...,amazing pictures have emerged of blue fluorescent algae lighting up australia 's east coast . \n...,1,cnndm,aggrefact,358
...,...,...,...,...,...,...
1335,"david hopkins: Yes, sure. The delegation levels are already very high in most authority areas, a...","we're trying to agree with partners, including estyn and the welsh government, a broader range o...",1,QMSum,diasumfact,409
1336,"david hopkins: Yes, sure. The delegation levels are already very high in most authority areas, a...","david hopkins thought that the delegation levels were already very high in most authority areas,...",1,QMSum,diasumfact,432
1337,"david hopkins: Yes, sure. The delegation levels are already very high in most authority areas, a...","on the additional learning needs side, although the minister had currently made some more money ...",1,QMSum,diasumfact,459
1338,"david hopkins: Yes, sure. The delegation levels are already very high in most authority areas, a...","the delegation levels were already very high in most authority areas, and they had got agreement...",1,QMSum,diasumfact,429


In [26]:
# Replace the numeric labels with text in the label column: 1 -> 'Yes', 0 -> 'No'.

all_test_dfs['label'] = all_test_dfs['label'].replace({1: 'Yes', 0: 'No'})

In [27]:
# Row counts per benchmark, per source dataset, and per label, displayed as one tuple.
all_test_dfs['benchmark'].value_counts(), all_test_dfs['source_dataset'].value_counts(), all_test_dfs.label.value_counts()

(benchmark
 halueval         20000
 fib               7158
 llm_summaries     3556
 aggrefact         2353
 tofueval          1510
 diasumfact        1340
 bump               785
 instrusum          500
 gumsum              96
 Name: count, dtype: int64,
 source_dataset
 cnndm          24546
 xsum            9306
 meetingbank      777
 SAMSum           757
 mediasum         733
 QMSum            583
 instrusum        500
 gumsum            96
 Name: count, dtype: int64,
 label
 Yes    20739
 No     16559
 Name: count, dtype: int64)

In [28]:
# Combine benchmark and source_dataset into one column called 'subset', formatted as "<benchmark>_<source_dataset>".
all_test_dfs['subset'] =  all_test_dfs['benchmark'] + "_" + all_test_dfs['source_dataset'] 

In [2]:
# Columns that are written to every exported JSON-lines file.
output_columns = ['source', 'target', 'label', 'subset']

In [29]:
other_testset = pd.read_json("../data/combined-test.jsonl", lines=True)

# Stack the benchmark rows (output columns only) on top of the rows read from combined-test.jsonl.
benchmark_rows = all_test_dfs[output_columns]
all_test_dfs = pd.concat([benchmark_rows, other_testset], axis=0)

In [30]:
# Number of rows per subset after adding the rows from combined-test.jsonl.
all_test_dfs['subset'].value_counts()

subset
halueval_cnndm          20000
fib_xsum                 6244
alisawuffles/WANLI       5000
Seahorse                 4138
anli                     3200
scitail                  2126
DeFacto                  1848
llm_summaries_cnndm      1829
llm_summaries_xsum       1727
FoolMeTwice              1379
aggrefact_xsum           1335
aggrefact_cnndm          1018
fib_cnndm                 914
bump_cnndm                785
tofueval_meetingbank      777
DADC-NLI                  766
diasumfact_SAMSum         757
tofueval_mediasum         733
diasumfact_QMSum          583
instrusum_instrusum       500
WiCE                      358
gumsum_gumsum              96
Name: count, dtype: int64

In [31]:
# Write the full benchmark as JSON lines, keeping non-ASCII characters unescaped.
all_test_dfs[output_columns].to_json(
    '../data/test-data/benchmark_test_data.json',
    orient='records',
    lines=True,
    force_ascii=False,
)

In [32]:
# Write a 10% sample as JSON lines. The fixed random_state makes the sample
# reproducible, so re-running the notebook yields the same test.json.
all_test_dfs[output_columns].sample(frac=0.1, random_state=42).to_json(
    '../data/test-data/test.json',
    orient='records',
    lines=True,
    force_ascii=False,
)

In [3]:
# Deduplicate the data: start by reloading the benchmark JSON-lines file from disk.

test_data = pd.read_json("../data/test-data/benchmark_test_data.json", lines=True)


import hashlib

def get_hash(row):
    """Attach an MD5 fingerprint of the (source, target) pair as ``row['hash']``.

    The length of the source text is prefixed to the hashed payload so the
    boundary between the two fields is unambiguous. Without it, pairs such as
    ('ab', 'c') and ('a', 'bc') concatenate to the same text, share a digest,
    and would be treated as duplicates when rows are de-duplicated on 'hash'.

    Returns the same row with the added ``hash`` entry (32-character hex string).
    """
    source = str(row['source'])
    target = str(row['target'])
    payload = f"{len(source)}:{source}{target}"
    row['hash'] = hashlib.md5(payload.encode()).hexdigest()

    return row


# Fingerprint every row, then keep the first row for each fingerprint.
test_data = (
    test_data
    .apply(get_hash, axis=1)
    .drop_duplicates(subset=['hash'])
)

len(test_data)



52853

In [7]:
# Overwrite the benchmark file with the de-duplicated rows (output columns only).
test_data[output_columns].to_json(
    '../data/test-data/benchmark_test_data.json',
    orient='records',
    lines=True,
    force_ascii=False,
)

In [34]:
# Reload the benchmark file from disk.
all_test_dfs = pd.read_json('../data/test-data/benchmark_test_data.json', lines=True)

# Sanity check: display any rows whose 'subset' value is missing.
all_test_dfs[all_test_dfs['subset'].isnull()]

Unnamed: 0,source,target,label,subset


### Add LLM-AggreFact to the benchmark

In [17]:
from datasets import load_dataset
# Load "lytang/LLM-AggreFact" and convert its 'test' split to a pandas DataFrame.
llm_aggrefact = load_dataset("lytang/LLM-AggreFact")
llm_aggrefact = llm_aggrefact['test'].to_pandas()
# Preview two random rows.
llm_aggrefact.sample(2)

Unnamed: 0,dataset,doc,claim,label
11275,Lfqa,"[1] When it comes to tire design, the most important factor is the ability of air to be compress...","This makes air-filled tires particularly useful for high-speed automobiles, since they are able ...",1
9486,ExpertQA,"Community-based tourism approach for The Bahamas: The fundamentals, pt.1 - The Nassau Guardian t...",This type of tourism contributes to the well-being of communities.,0


In [19]:
# Align column names with the benchmark layout: doc -> source, claim -> target, dataset -> subset ('label' is unchanged).
llm_aggrefact = llm_aggrefact.rename(columns={'doc': 'source', 'claim': 'target', 'label': 'label', 'dataset': 'subset'})

# Convert the numeric label of a row to text.
def replace_label(row):
    """Rewrite ``row['label']`` as 'Yes' when it equals 1 and as 'No' for any other value.

    Returns the same row.
    """
    row['label'] = 'Yes' if row['label'] == 1 else 'No'
    return row
# Report the row count before and after converting labels and dropping repeated (source, target) pairs.
print(len(llm_aggrefact))
llm_aggrefact = llm_aggrefact.apply(replace_label, axis=1)
llm_aggrefact = llm_aggrefact.drop_duplicates(subset=['source', 'target'])
print(len(llm_aggrefact))

12949
12939


In [20]:
# Label value counts for each subset, in order of first appearance.
subset_column = llm_aggrefact['subset']
for subset in subset_column.unique():
    print(subset)
    print(llm_aggrefact.loc[subset_column == subset, 'label'].value_counts())
    print()

# Two partial views of llm_aggrefact:
#   * llm_aggrefact_partial   — everything except "AggreFact-CNN", "AggreFact-XSum",
#     "TofuEval-MediaS", "TofuEval-MeetB" and "Wice"
#   * llm_aggrefact_partial_2 — only "AggreFact-CNN" and "AggreFact-XSum"
to_remove = ["AggreFact-CNN", "AggreFact-XSum", "TofuEval-MediaS", "TofuEval-MeetB", "Wice"]
aggrefact_keep = ["AggreFact-CNN", "AggreFact-XSum"]

is_removed = subset_column.isin(to_remove)
is_aggrefact = subset_column.isin(aggrefact_keep)
llm_aggrefact_partial = llm_aggrefact[~is_removed]
llm_aggrefact_partial_2 = llm_aggrefact[is_aggrefact]

llm_aggrefact_partial['subset'].value_counts(), llm_aggrefact_partial_2['subset'].value_counts()

AggreFact-CNN
label
Yes    501
No      57
Name: count, dtype: int64

AggreFact-XSum
label
Yes    285
No     273
Name: count, dtype: int64

TofuEval-MediaS
label
Yes    553
No     172
Name: count, dtype: int64

TofuEval-MeetB
label
Yes    620
No     150
Name: count, dtype: int64

Wice
label
No     247
Yes    111
Name: count, dtype: int64

Reveal
label
No     1307
Yes     398
Name: count, dtype: int64

ClaimVerify
label
Yes    789
No     298
Name: count, dtype: int64

FactCheck-GPT
label
No     1189
Yes     376
Name: count, dtype: int64

ExpertQA
label
Yes    2971
No      731
Name: count, dtype: int64

Lfqa
label
Yes    1121
No      790
Name: count, dtype: int64



(subset
 ExpertQA         3702
 Lfqa             1911
 Reveal           1705
 FactCheck-GPT    1565
 ClaimVerify      1087
 Name: count, dtype: int64,
 subset
 AggreFact-CNN     558
 AggreFact-XSum    558
 Name: count, dtype: int64)

In [13]:
# Add llm_aggrefact_partial to the final benchmark test data

df = pd.read_json('../data/test-data/benchmark_test_data.json', lines=True)
final_combined_df = pd.concat([df, llm_aggrefact_partial[output_columns]], axis=0)

# Separate statements: ending the cell with a tuple of print() calls would
# make the notebook display their return values, i.e. (None, None, None).
print(len(df))
print(len(llm_aggrefact_partial))
print(len(final_combined_df))

52853
9970
62823


(None, None, None)

In [14]:
# Keep one row per (source, target) pair in the combined data, then report the resulting size.
final_combined_df = final_combined_df.drop_duplicates(subset=['source', 'target'])
print(len(final_combined_df))

62823


In [15]:
# Export each frame (output columns only) as JSON lines with non-ASCII characters kept unescaped.
json_lines_options = dict(orient='records', lines=True, force_ascii=False)

llm_aggrefact[output_columns].to_json(
    '../data/test-data/llm_aggrefact.json', **json_lines_options
)
llm_aggrefact_partial[output_columns].to_json(
    '../data/test-data/llm_aggrefact_partial.json', **json_lines_options
)
final_combined_df[output_columns].to_json(
    '../data/test-data/final_combined_test_data.json', **json_lines_options
)
llm_aggrefact_partial_2[output_columns].to_json(
    '../data/test-data/llm_aggrefact_partial_2.json', **json_lines_options
)

In [16]:
# !head -n 2 ../data/test-data/benchmark_test_data.json

{"source":"looking after elderly parents can be difficult at the best of times .\nbut this man takes caring for his alzheimer 's - suffering mother to another level .\na security guard from china has touched hearts across the country because he takes his 84-year-old mother with him to work on the back of his motorbike every single day , reported the people 's daily online .\nlu xincai , who lives in zhejiang province in eastern china , says that he is scared his mother will get lost if he leaves her at home by herself because she suffers from the degenerative disease .\ndevoted : lu xincai takes his 84-year-old mother to work with him on the back of his motorbike every day .\nhe ties a sash around both of their waists to make sure she does n't fall off\nshe would often go up to the mountains to collect firewood and there were a few occasions when she got lost after dark .\nwhen mr lu 's father passed away earlier this year , he decided to take his mother with him to work because there 

In [3]:
# # test loading

# df = pd.read_json('../data/test-data/final_combined_test_data.json', lines=True)
# print(len(df))
# print(df['subset'].value_counts())
# print(df['label'].value_counts())

62823
subset
halueval_cnndm          19998
alisawuffles/WANLI       5000
Seahorse                 4135
ExpertQA                 3702
fib_xsum                 3534
anli                     3200
scitail                  2126
Lfqa                     1911
DeFacto                  1836
llm_summaries_cnndm      1829
llm_summaries_xsum       1726
Reveal                   1705
FactCheck-GPT            1565
FoolMeTwice              1379
aggrefact_xsum           1335
ClaimVerify              1087
aggrefact_cnndm          1017
bump_cnndm                785
tofueval_meetingbank      770
DADC-NLI                  766
tofueval_mediasum         725
diasumfact_SAMSum         669
diasumfact_QMSum          569
fib_cnndm                 543
instrusum_instrusum       458
WiCE                      358
gumsum_gumsum              95
Name: count, dtype: int64
label
Yes    31469
No     31354
Name: count, dtype: int64
