# [Understanding Factual Errors in Summarization: Errors, Summarizers, Datasets, Error Detectors](https://arxiv.org/abs/2205.12854)



```
@misc{tang2023understanding,
      title={Understanding Factual Errors in Summarization: Errors, Summarizers, Datasets, Error Detectors}, 
      author={Liyan Tang and Tanya Goyal and Alexander R. Fabbri and Philippe Laban and Jiacheng Xu and Semih Yavuz and Wojciech Kryściński and Justin F. Rousseau and Greg Durrett},
      year={2023},
      eprint={2205.12854},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

```

In [1]:
import warnings

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics  # `import sklearn` alone does not load the `metrics` submodule used below

from utils import choose_best_threshold
from utils import SOTA, XFORMER, OLD, MAPPING

# NOTE: this silences every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

In [2]:
# Load the benchmark CSV; its first column is used as the row index.
dataset_path = "data/aggre_fact_final.csv"
df = pd.read_csv(dataset_path, index_col=0)

# Partition on the `cut` column, and keep a view of each partition restricted
# to rows whose `model_name` is listed in SOTA.
df_val = df.loc[df['cut'] == 'val']
df_val_sota = df_val.loc[df_val['model_name'].isin(SOTA)]
df_test = df.loc[df['cut'] == 'test']
df_test_sota = df_test.loc[df_test['model_name'].isin(SOTA)]

# Axes of the evaluation grid iterated over by the cells below.
dataset_list = [
    'XSumFaith', 'Polytope', 'FactCC', 'SummEval', 'FRANK',
    'Wang20', 'CLIFF', 'Goyal21', 'Cao22',
]
systems = [
    'DAE', 'QuestEval', 'SummaC-ZS', 'SummaC-Conv', 'QAFactEval',
    'AlignScore', 'FActScore', 'FIZZ', 'FIZZ-wo-GE', 'FIZZ-wo-AF',
]
origins = ['cnndm', 'xsum']

In [3]:
# Evaluate every factuality system on every (origin, dataset, model category)
# slice: choose a decision threshold on the validation cut, apply it to the
# matching test cut, and record the balanced accuracy and raw predictions.
model_categories = [SOTA, XFORMER, OLD]  # position i is named by MAPPING[i]
main_df_columns = ['system', 'origin', 'count', 'dataset', 'category', 'bl_acc']

# Rows are collected in a plain list and turned into `main_df` once at the end;
# appending to a DataFrame with `.loc[len(df)]` copies the frame on every row.
main_rows = []
results = []

# Start every prediction column empty; only rows that belong to an evaluated
# test slice are filled in below.
for system in systems:
    df[f'{system}_label'] = None

for system in systems:
    for origin in origins:
        for dataset in dataset_list:
            for i, model_novelty in enumerate(model_categories):
                df_val_temp = df_val[(df_val.dataset == dataset) & (df_val.origin == origin) & (df_val.model_name.isin(model_novelty))]
                df_test_temp = df_test[(df_test.dataset == dataset) & (df_test.origin == origin) & (df_test.model_name.isin(model_novelty))]

                # A slice is only evaluated when it has both validation rows
                # (to tune the threshold) and test rows (to score).
                if len(df_val_temp) == 0 or len(df_test_temp) == 0:
                    continue

                # Threshold is tuned on validation data only.
                # NOTE(review): `best_f1` is unused here; its exact meaning is
                # defined by `utils.choose_best_threshold` — confirm there.
                best_thresh, best_f1 = choose_best_threshold(df_val_temp.label.values, df_val_temp[f'{system}_score'].values)

                scores_test = df_test_temp[f'{system}_score'].values
                # A score strictly above the threshold is predicted as label 1.
                preds_test = [1 if score > best_thresh else 0 for score in scores_test]
                df.loc[df_test_temp.index, f'{system}_label'] = preds_test

                balanced_acc = sklearn.metrics.balanced_accuracy_score(df_test_temp.label.values, preds_test)

                main_rows.append([
                    system, origin, len(preds_test), dataset, MAPPING[i], balanced_acc
                ])

                results.append({"system": system, "dataset_name": dataset, 'origin': origin,
                                'count': len(scores_test), 'cat': MAPPING[i], "labels": df_test_temp.label.values,
                                "preds": preds_test, "scores": scores_test})

# Passing `columns` keeps the expected schema even when no slice was evaluated.
main_df = pd.DataFrame(main_rows, columns=main_df_columns)

# Put the metadata columns first, followed by each system's score/label pair.
# The pairs are derived from `systems` so that no evaluated system's columns
# are dropped by the reindex.
base_columns = ['dataset', 'origin', 'id', 'doc', 'summary', 'model_name', 'label', 'cut']
system_columns = [
    column
    for system_name in systems
    for column in (f'{system_name}_score', f'{system_name}_label')
]
df = df.reindex(columns=base_columns + system_columns)

# Dataset-wise comparison between factuality systems

In [4]:
# Table 8
# One row per (origin, dataset, category, count) slice and one column per
# system, holding that system's balanced accuracy on the slice. Columns are
# put in the order of `systems`.
main_df_pivot_bacc = (
    main_df
    .pivot(
        index=['origin', 'dataset', 'category', 'count'],
        columns='system',
        values='bl_acc',
    )
    .reindex(columns=systems)
)
main_df_pivot_bacc.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,system,DAE,QuestEval,SummaC-ZS,SummaC-Conv,QAFactEval,AlignScore,FActScore,FIZZ,FIZZ-wo-GE,FIZZ-wo-AF
origin,dataset,category,count,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
cnndm,CLIFF,SOTA,150,0.73,0.74,0.646,0.649,0.716,0.685,0.645,0.735,0.785,0.647
cnndm,FRANK,OLD,523,0.704,0.67,0.692,0.727,0.773,0.759,0.684,0.714,0.73,0.748
cnndm,FRANK,SOTA,175,0.699,0.626,0.57,0.601,0.547,0.592,0.542,0.692,0.68,0.68
cnndm,FRANK,XFORMER,175,0.574,0.556,0.631,0.634,0.646,0.637,0.628,0.649,0.651,0.579
cnndm,FactCC,OLD,503,0.704,0.655,0.835,0.891,0.843,0.859,0.76,0.786,0.802,0.861
cnndm,Goyal21,OLD,25,0.188,0.146,0.375,0.354,0.271,0.5,0.271,0.479,0.479,0.271
cnndm,Polytope,OLD,450,0.779,0.687,0.802,0.791,0.824,0.887,0.746,0.815,0.817,0.862
cnndm,Polytope,SOTA,34,0.294,0.176,0.971,0.735,0.324,0.324,0.882,0.794,0.735,0.912
cnndm,Polytope,XFORMER,150,0.774,0.733,0.97,0.811,0.726,0.838,0.831,0.666,0.672,0.753
cnndm,SummEval,OLD,548,0.661,0.649,0.773,0.801,0.814,0.732,0.791,0.751,0.785,0.793


# AggreFact-CNN

In [5]:
# Table 4
# Count-weighted mean balanced accuracy over the 'cnndm'-origin slices:
# one row per model category, one column per system.
categories = list(MAPPING.values())  # row labels come from the same source as the loop

scores = []
for cat in categories:
    score = []
    for system in systems:
        system_df = main_df[(main_df.system == system) & (main_df.category == cat) & (main_df.origin == 'cnndm')]
        total_count = sum(system_df['count'])
        if total_count == 0:
            # No evaluated slice for this system/category: report NaN instead
            # of failing on a division by zero.
            score.append(np.nan)
            continue
        # Each slice's balanced accuracy is weighted by its number of test examples.
        value = sum(system_df['count'] * system_df['bl_acc']) / total_count
        score.append(round(value, 3))
    scores.append(score)

weighted_df = pd.DataFrame(
    scores,
    columns=systems,
    index=categories
)
weighted_df

Unnamed: 0,DAE,QuestEval,SummaC-ZS,SummaC-Conv,QAFactEval,AlignScore,FActScore,FIZZ,FIZZ-wo-GE,FIZZ-wo-AF
SOTA,0.594,0.637,0.633,0.703,0.616,0.534,0.699,0.732,0.68,0.724
XFORMER,0.679,0.643,0.765,0.698,0.691,0.731,0.716,0.673,0.671,0.67
OLD,0.697,0.652,0.763,0.79,0.803,0.802,0.739,0.76,0.776,0.804


# AggreFact-XSum

In [6]:
# Table 4
# Count-weighted mean balanced accuracy over the 'xsum'-origin slices:
# one row per model category, one column per system.
categories = list(MAPPING.values())  # row labels come from the same source as the loop

scores = []
for cat in categories:
    score = []
    for system in systems:
        system_df = main_df[(main_df.system == system) & (main_df.category == cat) & (main_df.origin == 'xsum')]
        total_count = sum(system_df['count'])
        if total_count == 0:
            # No evaluated slice for this system/category: report NaN instead
            # of failing on a division by zero.
            score.append(np.nan)
            continue
        # Each slice's balanced accuracy is weighted by its number of test examples.
        value = sum(system_df['count'] * system_df['bl_acc']) / total_count
        score.append(round(value, 3))
    scores.append(score)

weighted_df = pd.DataFrame(
    scores,
    columns=systems,
    index=categories
)
weighted_df

Unnamed: 0,DAE,QuestEval,SummaC-ZS,SummaC-Conv,QAFactEval,AlignScore,FActScore,FIZZ,FIZZ-wo-GE,FIZZ-wo-AF
SOTA,0.731,0.616,0.561,0.67,0.66,0.702,0.68,0.697,0.694,0.649
XFORMER,0.855,0.601,0.514,0.646,0.596,0.801,0.635,0.724,0.715,0.671
OLD,0.834,0.597,0.533,0.675,0.605,0.637,0.668,0.685,0.698,0.661
