In [1]:
import math 
import string 
import re

import json 

import pandas as pd 
import sqlite3
import spacy
import numpy as np
# Load the spaCy pipeline once, at module level; compute_score() calls
# nlp(summary).sents to split each summary into sentences.
nlp = spacy.load('en_core_web_sm')

In [9]:
# User ids of the annotators whose labels are kept, keyed by task name.
# The order inside each list matters: make_aggr_df() writes the first
# annotator into the *_ann1 columns and the second into the *_ann2 columns.
qualified_annotators_dict = {
    'pubmed': ['ann_jclvzw', 'ann_eftpco'],
    'news': ['ann_japq', 'ann_tpfo'],
    'billsum': ['ann_krcnbm', 'ann_hguilf'],
}


def connect_to_db(db_path):
    """Open the SQLite database at ``db_path``.

    Returns:
        A ``(connection, cursor)`` pair. The connection is left open; the
        caller is responsible for closing it.
    """
    connection = sqlite3.connect(db_path)
    return connection, connection.cursor()


def read_annotations(db_path, task, parent_path='/home/ramprasad.sa'):
    """Load the round-1 and round-2 annotation tables for ``task``.

    Args:
        db_path: Dataset directory below ``parent_path``, e.g.
            ``'/human_annotations_factuality/XSUM_CNN'``. Leading and
            trailing slashes are ignored.
        task: Task name; used as the prefix of both database file names.
        parent_path: Root directory that contains ``db_path``.

    Returns:
        ``(df_annotations_round1, df_annotations_round2)``: every row of the
        ``label`` table in the round-1 database and every row of the
        ``error_label`` table in the round-2 database.
    """
    def _read_table(db_file, table):
        # Always release the connection, even when the query raises.
        conn = sqlite3.connect(db_file)
        try:
            return pd.read_sql(f'SELECT * from {table}', conn)
        finally:
            conn.close()

    # db_path is normally passed with a leading '/', which would otherwise
    # yield '<parent>//<db_path>/...'; normalise to single separators.
    dataset_dir = f"{parent_path.rstrip('/')}/{db_path.strip('/')}"

    db_path_round1 = f'{dataset_dir}/set1/annotated/{task}_summaries_set1_final.db'
    print(db_path_round1)
    df_annotations_round1 = _read_table(db_path_round1, 'label')

    db_path_round2 = (
        f'{dataset_dir}/set1_round2/annotated/'
        f'{task}_nonfactual_annotated_generated_summaries_fin.db'
    )
    df_annotations_round2 = _read_table(db_path_round2, 'error_label')
    return df_annotations_round1, df_annotations_round2
    
    
def remove_duplicates(df, task, qualified_annotators=None):
    """Keep only the last row per (qualified annotator, ``summary_uuid``).

    Args:
        df: Annotation rows; must contain ``user_id`` and ``summary_uuid``.
        task: Key into the module-level ``qualified_annotators_dict``; only
            consulted when ``qualified_annotators`` is not supplied.
        qualified_annotators: Optional explicit list of annotator ids that
            overrides the per-task lookup.

    Returns:
        A new DataFrame holding, for each qualified annotator in the order
        listed, the last row of every ``summary_uuid`` they annotated (rows
        keep their original relative order within an annotator). Rows from
        any other annotator are dropped. When no row qualifies, an empty
        frame with the same columns is returned.
    """
    if qualified_annotators is None:
        # Single source of truth: reuse the module-level table rather than
        # carrying a second copy of it inside this function.
        qualified_annotators = qualified_annotators_dict[task]

    kept = []
    for annotator in qualified_annotators:
        annotator_rows = df[df['user_id'] == annotator]
        # keep='last': a later row for the same summary supersedes earlier ones.
        latest = annotator_rows.drop_duplicates(subset='summary_uuid', keep='last')
        if len(latest):
            kept.append(latest)

    if not kept:
        # pd.concat([]) raises ValueError; hand back an empty, same-shaped frame.
        return df.iloc[0:0].copy()
    return pd.concat(kept)

def make_pids(df_annotations):
    """Add a ``pid`` column derived from ``summary_uuid``.

    The pid is the part of ``summary_uuid`` before the first ``'_ann_'``
    marker, with surrounding whitespace stripped. The frame is modified in
    place and also returned.
    """
    df_annotations['pid'] = [
        uuid.split('_ann_')[0].strip()
        for uuid in df_annotations['summary_uuid']
    ]
    return df_annotations

def refine_df(df_annotations, task):
    """De-duplicate the annotations for ``task`` and attach the ``pid`` column."""
    deduplicated = remove_duplicates(df_annotations, task)
    return make_pids(deduplicated)

def test_annotation_rounds(df_annotations_round1, df_annotations_round2, task):
    """Check that round 2 has one row per sentence flagged in round 1.

    For every qualified annotator of ``task``, the round-1 rows labelled
    ``'non_factual'`` list their flagged sentences in ``nonfactual_sentences``
    joined by ``'<new_annotation>'``. The total number of such sentences must
    equal that annotator's number of round-2 rows.

    Raises:
        AssertionError: if the counts differ for any annotator; the message
            names the annotator and both counts.
    """
    for qual_ann in qualified_annotators_dict[task]:
        round1_rows = df_annotations_round1[df_annotations_round1['user_id'] == qual_ann]
        flagged = round1_rows[round1_rows['label_type'] == 'non_factual']['nonfactual_sentences'].values
        expected_rows = sum(len(sentences.split('<new_annotation>')) for sentences in flagged)

        round2_rows = df_annotations_round2[df_annotations_round2['user_id'] == qual_ann]
        actual_rows = len(round2_rows)

        # A bare assert gives no clue which annotator is out of sync.
        assert expected_rows == actual_rows, (
            f'annotator {qual_ann!r} ({task}): round 1 flags {expected_rows} '
            f'non-factual sentence(s) but round 2 has {actual_rows} row(s)'
        )



def make_sentence_error_maps(task_annotators, df_round2_article_summ):
    """Group round-2 error annotations by non-factual sentence.

    Args:
        task_annotators: Annotator ids; an annotator's position in this list
            is the slot (0 or 1) their annotation is stored under.
        df_round2_article_summ: Round-2 rows for a single article/summary.

    Returns:
        ``{sentence: {0: ann, 1: ann}}`` where each ``ann`` is the tuple
        ``(inaccuracy_severity, error_type, error_factuality, comments)``.
        A slot with no annotation holds four ``None`` values; falsy
        ``error_factuality`` / ``comments`` values are stored as ``None``.
        If the frame has no rows at all, the result is a single ``None`` key
        with both slots blank.
    """
    blank = (None, None, None, None)

    if len(df_round2_article_summ) == 0:
        return {None: {0: blank, 1: blank}}

    per_sentence = {}
    for annotator in task_annotators:
        annotator_rows = df_round2_article_summ[df_round2_article_summ['user_id'] == annotator]
        for _, record in annotator_rows.iterrows():
            slot = task_annotators.index(record['user_id'])
            annotation = (
                record['inaccuracy_severity'],
                record['error_type'],
                record['error_factuality'] or None,
                record['comments'] or None,
            )
            sentence_slots = per_sentence.setdefault(
                record['nonfactual_sentence'], {0: blank, 1: blank}
            )
            sentence_slots[slot] = annotation
    return per_sentence
    


def rectified_annotations(df_aggr, ann_idx):
    """Reset a summary to 'factual' when the annotator's comment retracts the error.

    If any row of a summary has a ``comments_ann{ann_idx}`` text containing
    one of the retraction keywords (case-insensitive substring match), then
    for *every* row sharing that ``summary_uuid`` the annotator's
    ``label_type`` becomes ``'factual'`` and their ``inaccuracy_severity``,
    ``error_type`` and ``error_factuality`` become ``None``.

    Args:
        df_aggr: Aggregated frame with ``summary_uuid`` and the
            ``*_ann{ann_idx}`` columns. Any index is accepted.
        ann_idx: Annotator number used in the column suffix (1 or 2).

    Returns:
        ``df_aggr`` itself; the four annotator columns are overwritten in place.
    """
    # NOTE(review): these are plain substring tests, so 'accurate' also hits
    # 'inaccurate' and 'factual' also hits 'nonfactual' -- confirm intended.
    keywords = ('accurate', 'factual', 'this is correct', 'misidentified')

    label_col = f'label_type_ann{ann_idx}'
    cleared_cols = [
        f'inaccuracy_severity_ann{ann_idx}',
        f'error_type_ann{ann_idx}',
        f'error_factuality_ann{ann_idx}',
    ]

    summary_uuids = list(df_aggr['summary_uuid'].values)
    comments = list(df_aggr[f'comments_ann{ann_idx}'].values)

    # Only real strings are inspected: None and NaN comments are skipped
    # instead of crashing on .lower().
    retracted_uuids = {
        uuid
        for uuid, comment in zip(summary_uuids, comments)
        if isinstance(comment, str)
        and any(keyword in comment.lower() for keyword in keywords)
    }

    # Work on plain lists addressed by *position* so the result does not
    # depend on the frame's index labels, and so None is stored as None
    # (compute_score tells None apart from other values).
    columns = {col: list(df_aggr[col].values) for col in [label_col] + cleared_cols}
    for position, uuid in enumerate(summary_uuids):
        if uuid in retracted_uuids:
            columns[label_col][position] = 'factual'
            for col in cleared_cols:
                columns[col][position] = None

    for col, values in columns.items():
        df_aggr[col] = values
    return df_aggr

def make_aggr_df(df_annotations_round1, df_annotations_round2, task):
    """Merge both annotation rounds into one row per (summary, flagged sentence).

    For every distinct article and every distinct summary of that article in
    round 1, the round-2 error annotations are grouped by non-factual sentence
    (via make_sentence_error_maps) and one output row is emitted per sentence,
    carrying both annotators' round-1 label and round-2 error fields.

    Args:
        df_annotations_round1: Round-1 rows; the columns read here are
            article, summary, user_id, label_type, summary_uuid, summ_id and
            system_id.
        df_annotations_round2: Round-2 rows; the columns read here are
            article, summary and pid, plus whatever make_sentence_error_maps
            reads.
        task: Key into the module-level qualified_annotators_dict.

    Returns:
        A new DataFrame with the columns listed in df_processed_dict below.
    """
    
    # Column-oriented accumulator: every emitted row appends exactly one value
    # to each list, so all lists stay the same length.
    df_processed_dict = {
    'summary_uuid': [],
    'pid': [],
    'summary': [],
    'article': [],
    'summ_id': [],
    'system_id': [],
    'nonfactual_sentence': [],
    'label_type_ann1': [],
    'label_type_ann2': [],
    'inaccuracy_severity_ann1': [],
    'inaccuracy_severity_ann2': [],
    'error_type_ann1': [],
    'error_type_ann2': [],  
    'error_factuality_ann1': [],
    'error_factuality_ann2': [],
    'comments_ann1': [],
    'comments_ann2': [],

    }
    
    # set() drops duplicates; the resulting iteration order is arbitrary.
    unique_articles = list(set(df_annotations_round1['article'].values))
    task_annotators = qualified_annotators_dict[task]

    for article in unique_articles:
        df_round1_article = df_annotations_round1[df_annotations_round1['article'] == article]
        df_round2_article = df_annotations_round2[df_annotations_round2['article'] == article]

        article_summaries = list(set(df_round1_article['summary'].values))

        for article_summary in article_summaries:

            df_round1_article_summ = df_round1_article[df_round1_article['summary'] == article_summary]
            df_round2_article_summ = df_round2_article[df_round2_article['summary'] == article_summary]
            # Summaries with fewer than two round-1 rows are printed and skipped.
            if len(df_round1_article_summ) <2 :
                print(article_summary)
            else:
                # Round-1 label of each annotator, in task_annotators order.
                # .values[0] raises IndexError if an annotator has no row here.
                label_type_annotations = []
                for annotator in task_annotators:
                    df_round1_article_summ_annotator = df_round1_article_summ[df_round1_article_summ['user_id'] == annotator]

                    label_type_annotations.append(df_round1_article_summ_annotator['label_type'].values[0])

                # With no round-2 rows this map holds a single None key, so the
                # summary still yields one row with nonfactual_sentence == None.
                sentence_error_category_map = make_sentence_error_maps(task_annotators,  df_round2_article_summ)



                for nonfactual_sentence, nonfactual_sentence_annotations in sentence_error_category_map.items():
                    # Identifiers are taken from the first round-1 row of this summary.
                    summary_uuid = df_round1_article_summ['summary_uuid'].values[0]
                    summ_id = df_round1_article_summ['summ_id'].values[0]
                    system_id = df_round1_article_summ['system_id'].values[0]
                    # pid is stored as a list of the distinct round-2 pids
                    # (empty when the summary has no round-2 rows).
                    pid = list(set(df_round2_article_summ['pid'].values))
                    summary = df_round1_article_summ['summary'].values[0]

                    df_processed_dict['summary_uuid'].append(summary_uuid)
                    df_processed_dict['summ_id'].append(summ_id)
                    # Any system_id other than 'gpt3' is reported as 'Flan T5'.
                    system_id_append = 'GPT 3.5' if system_id == 'gpt3' else 'Flan T5'
                    df_processed_dict['system_id'].append(system_id_append)
                    df_processed_dict['pid'].append(pid)
                    # df_processed_dict['pid_2'].append(pid)
                    df_processed_dict['summary'].append(summary)
                    df_processed_dict['article'].append(article)

                    df_processed_dict['nonfactual_sentence'].append(nonfactual_sentence)
                    # The same round-1 labels are repeated on every sentence row.
                    for lid, label_annotations in enumerate(label_type_annotations):
                        df_processed_dict[f'label_type_ann{lid+1}'] += [label_annotations]

                    # ann_vals is (inaccuracy_severity, error_type, error_factuality, comments);
                    # the 0-based annotator slot maps to the 1-based column suffix.
                    for ann_id, ann_vals in nonfactual_sentence_annotations.items():
                        ann_id = ann_id + 1
                        df_processed_dict[f'inaccuracy_severity_ann{ann_id}'].append(ann_vals[0])
                        df_processed_dict[f'error_type_ann{ann_id}'].append(ann_vals[1])
                        df_processed_dict[f'error_factuality_ann{ann_id}'].append(ann_vals[2])
                        df_processed_dict[f'comments_ann{ann_id}'].append(ann_vals[3])
                        
    return pd.DataFrame(df_processed_dict)

            
    


def compute_score(df_aggr, task, sentence_splitter=None):
    """Compute a per-summary error rate for each summarisation system.

    For one summary the score is::

        mean(#rows flagged by annotator 1, #rows flagged by annotator 2)
        / number of sentences in the summary

    where a row counts as flagged by an annotator when their
    ``inaccuracy_severity`` value is not null.

    Args:
        df_aggr: Aggregated frame with ``summary_uuid``, ``system_id``,
            ``summary``, ``article``, ``inaccuracy_severity_ann1`` and
            ``inaccuracy_severity_ann2`` columns.
        task: Task name copied into the ``task`` column of the result.
        sentence_splitter: Optional callable mapping a summary string to a
            sequence of sentences. Defaults to the module-level spaCy
            pipeline (``list(nlp(text).sents)``).

    Returns:
        DataFrame with one row per summary and the columns ``summary_uuid``,
        ``score``, ``model``, ``task``, ``summary`` and ``article``. Rows are
        grouped by model ('Flan T5' first, then 'GPT 3.5') and, within a
        model, follow the order in which summaries first appear in ``df_aggr``.
    """
    def _spacy_sentences(text):
        return list(nlp(text).sents)

    split_sentences = sentence_splitter if sentence_splitter is not None else _spacy_sentences

    df_errors_scores = {'summary_uuid': [], 'score': [], 'model': [], 'task': [], 'summary': [], 'article': []}
    for model in ['Flan T5', 'GPT 3.5']:
        df_model = df_aggr[df_aggr['system_id'] == model]
        # sort=False keeps first-appearance order, so output is reproducible
        # (iterating a set of uuids is not).
        summaries = list(df_model.groupby('summary_uuid', sort=False))
        print(len(summaries))

        model_sentbased_scores = []
        for uuid, df_uuid in summaries:
            # All rows of one summary_uuid describe the same summary/article.
            last_row = df_uuid.iloc[-1]
            summary = last_row['summary']
            # Segment once per summary instead of once per flagged row.
            n_sentences = len(split_sentences(summary))

            # notnull() treats both None and NaN as "no error recorded";
            # comparing against None alone would count NaN as an error.
            ann1_score_uuid = int(df_uuid['inaccuracy_severity_ann1'].notnull().sum())
            ann2_score_uuid = int(df_uuid['inaccuracy_severity_ann2'].notnull().sum())

            sentbased_score = np.mean([ann1_score_uuid, ann2_score_uuid]) / n_sentences
            model_sentbased_scores.append(sentbased_score)

            df_errors_scores['summary_uuid'].append(uuid)
            df_errors_scores['score'].append(sentbased_score)
            df_errors_scores['model'].append(model)
            df_errors_scores['task'].append(task)
            df_errors_scores['summary'].append(summary)
            df_errors_scores['article'].append(last_row['article'])
        print(model.upper(), np.mean(model_sentbased_scores))

    return pd.DataFrame(df_errors_scores)
    
    
    
    
    


In [10]:
def run_scoring(db_path, task,
                write_path='/home/ramprasad.sa/factual_annotation_llm_summaries/annotations'):
    """Run the whole pipeline for one task and write both result tables to CSV.

    Steps: read both annotation rounds, de-duplicate them (round 2 also gets
    its ``pid`` column), check that the rounds agree, aggregate them, apply
    comment-based rectification for both annotators, and score each summary.

    Args:
        db_path: Dataset directory handed to ``read_annotations``.
        task: Task name ('news', 'billsum' or 'pubmed').
        write_path: Existing directory that receives
            ``{task}_annotations_scores.csv`` and ``{task}_annotations.csv``.

    Returns:
        ``(df_errors, df_aggr)``: the per-summary scores and the aggregated
        annotation table.
    """
    df_annotations_round1, df_annotations_round2 = read_annotations(db_path, task)
    print(len(df_annotations_round1), len(df_annotations_round2))
    df_annotations_round2 = refine_df(df_annotations_round2, task)
    df_annotations_round1 = remove_duplicates(df_annotations_round1, task)
    print('removed duplicates...', len(df_annotations_round1), len(df_annotations_round2))

    test_annotation_rounds(df_annotations_round1, df_annotations_round2, task)
    df_aggr = make_aggr_df(df_annotations_round1, df_annotations_round2, task)
    # Rectify each annotator independently (column suffixes _ann1 / _ann2).
    df_aggr = rectified_annotations(df_aggr, ann_idx = 1)
    df_aggr = rectified_annotations(df_aggr, ann_idx = 2)
    df_errors = compute_score(df_aggr, task)

    df_errors.to_csv(f'{write_path}/{task}_annotations_scores.csv')
    df_aggr.to_csv(f'{write_path}/{task}_annotations.csv')
    return df_errors, df_aggr

In [None]:
# Score each task in turn; every call also writes that task's two CSV files.

# News (XSUM / CNN)
task = 'news'
db_path_news = '/human_annotations_factuality/XSUM_CNN'
df_errors_news, df_aggr_news = run_scoring(db_path_news, task)

# BillSum
task = 'billsum'
db_path_billsum = '/human_annotations_factuality/billsum'
df_errors_billsum, df_aggr_billsum = run_scoring(db_path_billsum, task)

# PubMed
task = 'pubmed'
db_path_pubmed = '/human_annotations_factuality/pubmed'
df_errors_pubmed, df_aggr_pubmed = run_scoring(db_path_pubmed, task)



/home/ramprasad.sa//human_annotations_factuality/XSUM_CNN/set1/annotated/news_summaries_set1_final.db
219 118
removed duplicates... 201 114
Delhi Dynamos has expressed an interest in signing Leicester City's midfielder, Esteban Cambiasso, for India's I-League. Nigel Pearson desires to keep the 34-year-old, who signed a one-year deal with Leicester last summer, for another season, however, this is dependent on Leicester's status in the Premier League. Leicester is currently in 18th place in the Premier League and faces a critical relegation encounter against Burnley at Turf Moor on Saturday.
50
FLAN T5 0.27
50
GPT 3.5 0.18593290043290042
/home/ramprasad.sa//human_annotations_factuality/billsum/set1/annotated/billsum_summaries_set1_final.db
233 123
removed duplicates... 200 117
50
FLAN T5 0.19
50
