In [1]:
import math 
import string 
import re

import json 

import pandas as pd 
import sqlite3
import spacy
import numpy as np
from rouge import Rouge 
# Load spaCy's small English pipeline once at import time.
# NOTE(review): `nlp` is not referenced in any of the cells visible here — confirm it is used later before keeping this load.
nlp = spacy.load('en_core_web_sm')

In [2]:
def read_annotations(task, annotations_dir='/home/ramprasad.sa/factual_annotation_llm_summaries/annotations'):
    """Load the two annotation CSV files that belong to one task.

    Args:
        task: File-name prefix of the task (this notebook uses 'news',
            'billsum' and 'pubmed').
        annotations_dir: Directory containing `{task}_annotations.csv` and
            `{task}_annotation_scores.csv`. Defaults to the location that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        A `(df_ann, df_ann_scores)` tuple of DataFrames, in that order.
    """
    def _load(suffix):
        # keep_default_na=False: empty cells and strings such as 'NA' stay as
        # text instead of being converted to NaN.
        return pd.read_csv(f'{annotations_dir}/{task}_{suffix}.csv', keep_default_na=False)

    df_ann = _load('annotations')
    df_ann_scores = _load('annotation_scores')
    return df_ann, df_ann_scores

In [3]:
# Read the (annotations, scores) pair of every task, in the order news -> billsum -> pubmed,
# and bind each pair to its task-specific names.
(
    (df_ann_news, df_ann_scores_news),
    (df_ann_billsum, df_ann_scores_billsum),
    (df_ann_pubmed, df_ann_scores_pubmed),
) = [read_annotations(task) for task in ('news', 'billsum', 'pubmed')]

In [6]:
# from QAFactEval.qafacteval import QAFactEval
# from questeval.questeval_metric import QuestEval
from summac.model_summac import SummaCZS, SummaCConv

def get_questeval_score(articles, summaries):
    """Score summaries against their source articles with QuestEval.

    Args:
        articles: Sequence of source documents, passed as `sources`.
        summaries: Sequence of summaries, passed as `hypothesis`.

    Returns:
        The `'ex_level_scores'` entry of the QuestEval corpus result.

    Raises:
        ImportError: If the `questeval` package is not installed.
    """
    # Imported lazily: the notebook-level import is commented out, so without
    # this line the name `QuestEval` is undefined on a fresh kernel. A local
    # import also means the package is only required when this metric is used.
    from questeval.questeval_metric import QuestEval

    questeval = QuestEval(no_cuda=False)
    score = questeval.corpus_questeval(hypothesis=summaries, sources=articles)
    return score['ex_level_scores']

def get_summac_score(articles, summaries, device='cuda'):
    """Score summaries against their articles with both SummaC variants.

    Args:
        articles: Sequence of source documents.
        summaries: Sequence of summaries, aligned with `articles`.
        device: Device string handed to both SummaC models. Defaults to
            'cuda' (the previously hard-coded value); pass 'cpu' to run
            without a GPU.

    Returns:
        A `(zero_shot_scores, conv_scores)` tuple: the `'scores'` entries
        returned by `SummaCZS.score` and `SummaCConv.score`, in that order.
    """
    model_zs = SummaCZS(granularity="sentence", model_name="vitc", device=device)
    model_conv = SummaCConv(
        models=["vitc"],
        bins='percentile',
        granularity="sentence",
        nli_labels="e",
        device=device,
        start_file="default",
        agg="mean",
    )
    zs_result = model_zs.score(articles, summaries)
    conv_result = model_conv.score(articles, summaries)
    return zs_result['scores'], conv_result['scores']

def get_qafacteval_score(
    articles,
    summaries,
    model_folder="/home/ramprasad.sa/factual_annotation_llm_summaries/analysis/notebook/QAFactEval/models",
):
    """Score summaries against their articles with QAFactEval (LERC-QuIP).

    Args:
        articles: Sequence of source documents.
        summaries: Sequence of summaries, aligned with `articles`. If the two
            sequences differ in length, the extra items of the longer one are
            ignored (pairs are formed with `zip`).
        model_folder: Directory with the models downloaded by QAFactEval's
            `download_models.sh`. Defaults to the previously hard-coded path.

    Returns:
        A list with one `'lerc_quip'` score per (article, summary) pair.

    Raises:
        ImportError: If the `QAFactEval` package is not importable.
    """
    # Imported lazily: the notebook-level import is commented out, so without
    # this line the name `QAFactEval` is undefined on a fresh kernel.
    from QAFactEval.qafacteval import QAFactEval

    kwargs = {
        "cuda_device": 0,
        "use_lerc_quip": True,
        "verbose": True,
        "generation_batch_size": 32,
        "answering_batch_size": 32,
        "lerc_batch_size": 8,
    }
    metric = QAFactEval(
        lerc_quip_path=f"{model_folder}/quip-512-mocha",
        generation_model_path=f"{model_folder}/generation/model.tar.gz",
        answering_model_dir=f"{model_folder}/answering",
        lerc_model_path=f"{model_folder}/lerc/model.tar.gz",
        lerc_pretrained_model_path=f"{model_folder}/lerc/pretraining.tar.gz",
        **kwargs
    )

    pairs = list(zip(articles, summaries))
    docs = [art for art, _ in pairs]
    # Each document gets a one-element list of summaries, as in the original call.
    summs = [[summ] for _, summ in pairs]

    results = metric.score_batch_qafacteval(docs, summs, return_qa_pairs=True)
    # The first element of each per-document result holds the metrics dict.
    return [each[0]['qa-eval']['lerc_quip'] for each in results]


In [7]:
from scipy import stats
# from bart_score import BARTScorer
def make_summac(df_errors):
    """Attach SummaC scores to `df_errors` and print their correlation with `1 - score`.

    Reads the 'article', 'summary' and 'score' columns. Adds the columns
    'SummaC_Conv' and 'SummaC_ZS' to the frame in place, prints one line of
    Spearman results and one line of Pearson results (zero-shot first, conv
    second on each line), and returns the same frame.
    """
    source_docs = list(df_errors['article'].values)
    candidate_summaries = list(df_errors['summary'].values)
    zs_scores, conv_scores = get_summac_score(source_docs, candidate_summaries)

    # The Conv column is written before the ZS column to keep the column order.
    df_errors['SummaC_Conv'] = conv_scores
    df_errors['SummaC_ZS'] = zs_scores

    # NOTE(review): presumably 'score' is an error rate, so 1 - score is "higher is better" — confirm.
    reference = [1 - value for value in df_errors['score']]
    for correlation in (stats.spearmanr, stats.pearsonr):
        print(correlation(zs_scores, reference), correlation(conv_scores, reference))
    return df_errors

def make_questeval(df_errors):
    """Attach QuestEval scores to `df_errors` and print their correlation with `1 - score`.

    Reads the 'article', 'summary' and 'score' columns. Adds a 'QuestEval'
    column to the frame in place, prints the Spearman result followed by the
    Pearson result, and returns the same frame.
    """
    source_docs = list(df_errors['article'].values)
    candidate_summaries = list(df_errors['summary'].values)
    metric_values = get_questeval_score(source_docs, candidate_summaries)
    df_errors['QuestEval'] = metric_values

    reference = [1 - value for value in df_errors['score']]
    spearman = stats.spearmanr(metric_values, reference)
    print(spearman)
    pearson = stats.pearsonr(metric_values, reference)
    print(pearson)
    return df_errors

def make_qafacteval(df_errors):
    """Attach QAFactEval scores to `df_errors` and print their correlation with `1 - score`.

    Reads the 'article', 'summary' and 'score' columns. Adds a 'QAFactEval'
    column to the frame in place, prints the Spearman result followed by the
    Pearson result, and returns the same frame.
    """
    inputs = {column: list(df_errors[column].values) for column in ('article', 'summary')}
    metric_values = get_qafacteval_score(inputs['article'], inputs['summary'])
    df_errors['QAFactEval'] = metric_values

    reference = [1 - value for value in df_errors['score']]
    for correlation in (stats.spearmanr, stats.pearsonr):
        print(correlation(metric_values, reference))
    return df_errors


In [8]:
# df_ann_scores_news = make_qafacteval(df_ann_scores_news)
# df_ann_scores_billsum = make_qafacteval(df_ann_scores_billsum)
# df_ann_scores_pubmed = make_qafacteval(df_ann_scores_pubmed)

In [9]:
# df_ann_scores_news = make_questeval(df_ann_scores_news)
# df_ann_scores_billsum = make_questeval(df_ann_scores_billsum)
# df_ann_scores_pubmed = make_questeval(df_ann_scores_pubmed)

In [13]:
# df_ann_scores_news = make_summac(df_ann_scores_news)
# df_ann_scores_billsum = make_summac(df_ann_scores_billsum)
# Only the pubmed frame is scored in this run; the news and billsum calls above are disabled.
# make_summac adds the SummaC_Conv / SummaC_ZS columns, prints the Spearman line
# followed by the Pearson line, and returns the same frame it was given.
df_ann_scores_pubmed = make_summac(df_ann_scores_pubmed)

<All keys matched successfully>
SignificanceResult(statistic=0.1147427565379258, pvalue=0.25563834223345394) SignificanceResult(statistic=0.06192422559742772, pvalue=0.5405082149295017)
PearsonRResult(statistic=0.007647056425181448, pvalue=0.9398087557446342) PearsonRResult(statistic=-0.04615249113414532, pvalue=0.6484147889125043)
