# [Understanding Factual Errors in Summarization: Errors, Summarizers, Datasets, Error Detectors](https://arxiv.org/abs/2205.12854)



```
@misc{tang2023understanding,
      title={Understanding Factual Errors in Summarization: Errors, Summarizers, Datasets, Error Detectors}, 
      author={Liyan Tang and Tanya Goyal and Alexander R. Fabbri and Philippe Laban and Jiacheng Xu and Semih Yavuz and Wojciech Kryściński and Justin F. Rousseau and Greg Durrett},
      year={2023},
      eprint={2205.12854},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}

```

In [1]:
import pandas as pd
import numpy as np
import sklearn
from utils import choose_best_threshold
import warnings
warnings.filterwarnings('ignore')

from utils import SOTA, XFORMER, OLD, MAPPING

In [2]:
# Location of the benchmark CSV; its first column is used as the row index.
dataset_path = "data/aggre_fact_sota.csv"
# dataset_path="data/aggre_fact_sota_granularity.csv"
df = pd.read_csv(dataset_path, index_col=0)

# Split on the `cut` column, then narrow each split to rows whose
# `model_name` is one of the summarizers listed in SOTA.
df_val = df.loc[df["cut"] == "val"]
df_val_sota = df_val.loc[df_val["model_name"].isin(SOTA)]
df_test = df.loc[df["cut"] == "test"]
df_test_sota = df_test.loc[df_test["model_name"].isin(SOTA)]

dataset_list = [
    'XSumFaith', 'Polytope', 'FactCC', 'SummEval', 'FRANK',
    'Wang20', 'CLIFF', 'Goyal21', 'Cao22',
]
# Each entry here is looked up later as a `<system>_score` column of `df`.
systems = [
    'DAE', 'QuestEval', 'SummaC-ZS', 'SummaC-Conv', 'QAFactEval',
    'FaCED', 'FaCED_wo_GE', 'AlignScore',
]
# systems = ['DAE', 'QuestEval', 'SummaC-ZS', 'SummaC-Conv', 'QAFactEval', 'FaCED-1G', 'FaCED-2G', 'FaCED-3G', 'FaCED-4G']
origins = ['cnndm', 'xsum']

# AggreFact-CNN/XSum-SOTA

In [3]:
from utils import resample_balanced_acc

# Score every detector in `systems` on the SOTA test split, separately for each
# entry of `origins`. For each (system, origin) pair the decision threshold is
# chosen on the matching validation subset and then applied unchanged to the
# test subset.
#
# Produces:
#   results      - list of dicts (system, origin, labels, preds, scores), one
#                  per pair, consumed by the resampling cell further down.
#   main_sota_df - one row per pair with columns system / origin / bl_acc.
results = []
score_rows = []  # collected first and converted to a DataFrame once, after the loop

for system in systems:
    score_col = f'{system}_score'
    for origin in origins:
        df_val_temp = df_val_sota[df_val_sota.origin == origin]
        df_test_temp = df_test_sota[df_test_sota.origin == origin]

        # Only the threshold is used; the second returned value is ignored.
        best_thresh, _unused_val_score = choose_best_threshold(
            df_val_temp.label.values, df_val_temp[score_col].values
        )

        labels_test = df_test_temp.label.values
        scores_test = df_test_temp[score_col].values
        # A score strictly above the threshold maps to 1, anything else to 0.
        preds_test = [1 if score > best_thresh else 0 for score in scores_test]

        bl_acc = sklearn.metrics.balanced_accuracy_score(labels_test, preds_test)

        score_rows.append({'system': system, 'origin': origin, 'bl_acc': bl_acc})
        results.append({"system": system, 'origin': origin, "labels": labels_test,
                        "preds": preds_test, "scores": scores_test})

# Passing `columns` keeps the expected schema even when no pair was evaluated.
main_sota_df = pd.DataFrame(score_rows, columns=['system', 'origin', 'bl_acc'])

In [4]:
# Table 5
# The second printed number comes from random resampling, so it may differ
# slightly from run to run.

# from https://github.com/tingofurro/summac/
# Both percentages are halved as a correction for running 2 tests with the same data.
P5 = 5 / 2
P1 = 1 / 2

for origin in origins:
    # Fresh, empty per-system container for this origin's resampled scores.
    sampled_batch_preds = {entry["system"]: [] for entry in results}

    for res in results:
        if res['origin'] != origin:
            continue

        samples = resample_balanced_acc(res["preds"], res["labels"])
        sampled_batch_preds[res["system"]].append(samples)

        # Percentile bounds of the resampled scores at the two corrected levels.
        low5 = np.percentile(samples, P5)
        high5 = np.percentile(samples, 100 - P5)
        low1 = np.percentile(samples, P1)
        high1 = np.percentile(samples, 100 - P1)
        bacc = sklearn.metrics.balanced_accuracy_score(res["labels"], res["preds"])

        # Row layout: origin, system, balanced accuracy, and how far the lower
        # P5 percentile sits below that balanced accuracy.
        print(res['origin'].center(6), res["system"].center(20), f" - {bacc:.3f}, {bacc - low5:.3f}")
    print()

cnndm          DAE           - 0.654, 0.034
cnndm       QuestEval        - 0.702, 0.030
cnndm       SummaC-ZS        - 0.640, 0.033
cnndm      SummaC-Conv       - 0.610, 0.032
cnndm       QAFactEval       - 0.678, 0.033
cnndm         FaCED          - 0.726, 0.029
cnndm      FaCED_wo_GE       - 0.722, 0.029
cnndm       AlignScore       - 0.625, 0.033

 xsum          DAE           - 0.702, 0.019
 xsum       QuestEval        - 0.595, 0.021
 xsum       SummaC-ZS        - 0.564, 0.014
 xsum      SummaC-Conv       - 0.650, 0.018
 xsum       QAFactEval       - 0.639, 0.020
 xsum         FaCED          - 0.693, 0.020
 xsum      FaCED_wo_GE       - 0.663, 0.019
 xsum       AlignScore       - 0.696, 0.017

