In [1]:
# This notebook runs the NND experiment from
# Section 5.2 of the paper: "Fine-Grained Model Comparison".
# BART-Large-cnn is compared to the PEGASUS-cnn model,
# using SummEval NND for a broad comparison
# and FRANK NND for a more specific comparison on factual consistency.
# [UPDATE] Added the BRIO model to the notebook.

In [1]:
import os

import pandas as pd

from utils_nnd_dataset import load_frank_nnd, load_summeval_nnd
from utils_nnd import GeneratorHF, run_nnd

In [2]:
# Root folder holding the SummaC benchmark data (one sub-folder per dataset).
# The default is the original cluster location; set the SUMMAC_BENCHMARK_DIR
# environment variable to run the notebook against a copy stored elsewhere
# without editing the code. An unset or empty variable falls back to the default.
SUMMAC_BENCHMARK_DIR = (
    os.environ.get("SUMMAC_BENCHMARK_DIR") or "/export/share/plaban/data/summac_benchmark"
).rstrip("/")

# Load the two NND test sets. The loaders are handed the dataset folder with a
# trailing slash, exactly as the original hard-coded paths were written.
summeval_nnd_test = load_summeval_nnd(SUMMAC_BENCHMARK_DIR + "/summeval/")
frank_nnd_test = load_frank_nnd(SUMMAC_BENCHMARK_DIR + "/frank/")

# Sanity check: report how many NND test pairs each dataset provides.
print("SummEval has %d NND test pairs; FRANK has %d" % (len(summeval_nnd_test), len(frank_nnd_test)))

Reusing dataset cnn_dailymail (/export/home/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


SummEval has 3613 NND test pairs; FRANK has 848


In [3]:
# Summarizers under comparison: each entry pairs the label used in the result
# tables ("model_name") with a GeneratorHF built from a model card identifier.
summarizers = [
    {"model_name": label, "model": GeneratorHF(model_card=card)}
    for label, card in (
        ("BART-L", "facebook/bart-large-cnn"),
        ("PEGASUS", "google/pegasus-cnn_dailymail"),
        ("BRIO", "Yale-LILY/brio-cnndm-cased"),
    )
]



In [4]:
# Broad comparison: run NND on the SummEval test pairs for every summarizer,
# collecting one accuracy report per model.
results = [
    run_nnd(
        summeval_nnd_test,
        entry["model"],
        entry["model_name"],
        no_error_label="no error",
        report_type="accuracy",
    )
    for entry in summarizers
]

# Print the reports as a single plain-text table; the display options are
# widened only for this statement so the row is not wrapped or truncated.
with pd.option_context(
    'display.max_rows', 300,
    'display.max_columns', 7,
    'display.expand_frame_repr', False,
):
    print(pd.DataFrame(results))

NND BART-L: 100%|██████████| 3613/3613 [00:37<00:00, 95.13it/s]  
NND PEGASUS: 100%|██████████| 3613/3613 [00:48<00:00, 73.84it/s]  
NND BRIO: 100%|██████████| 3613/3613 [00:37<00:00, 97.33it/s]  

  model_name   accuracy  A_consistency  A_no error  A_coherence  A_fluency  A_relevance
0     BART-L  71.104345       0.713274    0.711043     0.747475   0.688830     0.685921
1    PEGASUS  72.986438       0.766372    0.729864     0.744108   0.727394     0.697653
2       BRIO  73.290894       0.711504    0.732909     0.737374   0.753989     0.724729





In [5]:
# Factual-consistency comparison: run NND on the FRANK test pairs for every
# summarizer. no_error_label is not passed here, so run_nnd's default applies.
results = [
    run_nnd(
        frank_nnd_test,
        entry["model"],
        entry["model_name"],
        report_type="accuracy",
    )
    for entry in summarizers
]

# Trailing bare expression so the notebook renders the table as the cell output.
pd.DataFrame(results)

NND BART-L: 100%|██████████| 848/848 [00:36<00:00, 23.54it/s]
NND PEGASUS: 100%|██████████| 848/848 [00:47<00:00, 17.83it/s]
NND BRIO: 100%|██████████| 848/848 [00:35<00:00, 23.64it/s]


Unnamed: 0,model_name,accuracy,A_NoE,A_CorefE,A_GramE,A_CircE,A_LinkE,A_EntE,A_RelE,A_OtherE,A_OutE
0,BART-L,80.778302,0.807783,0.701299,0.81203,0.820896,0.72973,0.83758,0.891892,0.875,0.822222
1,PEGASUS,81.485849,0.814858,0.707792,0.864662,0.776119,0.783784,0.843949,0.878378,0.875,0.777778
2,BRIO,77.004717,0.770047,0.779221,0.804511,0.731343,0.567568,0.786624,0.783784,0.833333,0.688889
