In [127]:
import pandas as pd
from GPTModel import GPTInference
from tqdm import tqdm
import re, string
import numpy as np
import math
from tqdm import tqdm 

In [569]:
def make_incontext_examples(df, num_samples, instruction):
    each_sample = int(num_samples/2)
    num_errors = []
    print('Before sampling', len(df))
    for ann in list(df['Annotations'].values):
        ann = eval(ann)
        num_errors += [sum([len(spans) for sent, spans in ann.items()])]
    df['Errors'] = num_errors
    df_fewshot_spans_one = df[df['Errors'] == 1].sample(each_sample )
    df_fewshot_spans_two = df[df['Errors'] > 1].sample(each_sample )
    df_fewshot_nospans = df[df['Errors'] == 0].sample(1)
    df_fewshot =pd.concat([df_fewshot_spans_one, df_fewshot_spans_two, df_fewshot_nospans])
    df_fewshot = df_fewshot.sample(len(df_fewshot))
    df_fewshot.to_csv('fewshot_examples.csv')

    prompt_strs = []
    for idx, row in df_fewshot.iterrows():
        df = df.drop(idx)
        dialogue = row['Dialogue']
        inconsistent_spans = eval(row['Annotations'])
        inconsistent_spans = []
        for sent, sent_spans in eval(row['Annotations']).items():
            # sent_spans = ', '.join([each[0] for each in sent_spans])
            sent_spans = [each[0] for each in sent_spans]
            # inconsistent_spans += [f'SENT {sent} SPANS {sent_spans}']
            inconsistent_spans += sent_spans
            print(sent_spans)
        inconsistent_spans = '\n'.join(inconsistent_spans)
        inconsistent_spans = inconsistent_spans if inconsistent_spans else "None"
        prompt_strs.append(f'Dialogue: {dialogue}\nSummary: {row["Summary"]}\n{instruction}\nInconsistent Spans: {inconsistent_spans}')
    fewshot_prompt_str = '\n\n'.join(prompt_strs)
    print('After sampling', len(df))
    return fewshot_prompt_str, df 

In [570]:
'''
Scoring code
'''

def postprocess(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text
    

def get_candidate_comparison(gpt_response, annotations):
    scores = []
    comparisons = []
    for k ,v in annotations.items():
        gpt_response_processed = set(postprocess(gpt_response).split())
        k_processed = set(postprocess(k).split())
        sentence_overlap = 0 
        if len(gpt_response_processed):
            sentence_overlap = len(gpt_response_processed.intersection(k_processed))/len(gpt_response_processed)
        scores += [sentence_overlap]
    
    # max_score = max(scores)
    if scores:
        max_scores_idx = [idx for idx, each in enumerate(scores) if each == max(scores)]     
        v_items = list(annotations.values())
        comparisons = v_items[max_scores_idx[0]]
    # comparisons = [ each for idx in max_scores_idx for each in v_items[idx]]
    return comparisons

def span_scores(gpt_response, annotations):

    # comparisons = get_candidate_comparison(gpt_response, annotations)
    
    span_accuracies = []
    span_f1_scores = []
    # print('COMPARISONS', comparisons, gpt_response)
    comparisons = []
    for each in annotations.values():
        comparisons += each
        
    for comp in comparisons:
        accuracy = 0
        f1_score = 0
        
        y_text = comp[0]
        gpt_response_processed = set(postprocess(gpt_response).split())
        y_text_processed = set(postprocess(y_text).split())
        
        overlap = gpt_response_processed.intersection(y_text_processed)
        
        precision = len(overlap)/len(gpt_response_processed) if len(gpt_response_processed) else 0
        recall = len(overlap)/len(y_text_processed)

        # print(gpt_response, y_text, overlap, precision, recall)
        
        if precision + recall > 0:
            f1_score = (2 * precision * recall)/(precision + recall)
    
        if len(gpt_response_processed) and len(overlap)/len(gpt_response_processed) == 1:
            accuracy = 1
            
        span_accuracies += [accuracy]
        span_f1_scores += [f1_score]

    span_accuracies = span_accuracies if span_accuracies else [0]
    span_f1_scores = span_f1_scores if span_f1_scores else [0]
    # if comparisons:
    # print('MOST SIM', comparisons[span_accuracies.index(max(span_accuracies))], gpt_response)
    return max(span_accuracies), max(span_f1_scores)

In [571]:
'''
Zero shot dialogue and atomic facts prompting
'''

def gpt_text_inconsistent(gpt_response):
    if not gpt_response.strip():
        return True
    elif [each  for each in ['no inconsistent', 'none', '[]'] if each in gpt_response.lower()]:
        return True
    return False
        
def get_gpt_span_scores(prompt, annotations, few_shot = True):
    gpt_response = GPTInference().get_chatgpt_response(prompt)
    
    # gpt_response = eval(gpt_response.split('\n')[0])
    # print('GPT RESPONSE', gpt_response)
    # print('ANNOTATIONS', annotations)
    all_spans = gpt_response.split('\n')
    # if few_shot:
    #     all_spans = []
    #     for sent_spans in gpt_response.split('\n'):
    #         all_spans += sent_spans.split('SPANS')[1].strip().split(', ')
    # print('GPT RESPONSE', gpt_response, all_spans)
    row_f1_scores = []
    row_acc_scores = []
    
    if gpt_text_inconsistent(gpt_response) and not annotations:
                row_f1_scores += [1]
                row_acc_scores += [1]
        
    else:
        for gpt_span in all_spans:
                gpt_span_scores = span_scores(gpt_span, annotations)
                
                row_f1_scores += [gpt_span_scores[1]]
                row_acc_scores += [gpt_span_scores[0]]
                # print('SCORES', gpt_span, gpt_span_scores)
    # print('=' * 13)
    return np.mean(row_f1_scores), np.mean(row_acc_scores), gpt_response

def get_zero_shot_scores(df, source_type = 'Dialogue'):
    span_scores_acc = []
    span_scores_f1 = []
    gpt_responses = []
    # df_sample = df.sample(20)
    df_sample = df[df['Errors'] > 1]
    # df_sample = df_sample.sample(10)
    for idx, row in tqdm(df.iterrows(), total = df.shape[0]):
    # for idx, row in df_sample.iterrows():
        # print(idx)
        dlg = row['Dialogue']
        # dlg_atomic = get_atomic_facts_gpt(dlg)
        summ = row['Summary']
        
        annotations = eval(row['Annotations'])
        # print(annotations)
        
        instruction = f"Identify and list the inconsistent phrases or words in the dialogue summary. Note that consistency means all information in the summary is supported by the {source_type}. List each span in a new line"
        prompt_dlg = f'{instruction}\n{source_type}: {dlg}\nSummary: {summ}\nInconsistent Spans:'

        # source_type = 'Source'
        # instruction = f"Extract only the inconsistent phrases or words in the summary. Note that consistency means all information in the summary is supported by the {source_type}. List each span in a new line"
        # prompt_dlg_atomic = f'{instruction}\n{source_type}: {dlg_atomic}\nSummary: {summ}\nInconsistent Spans:'
        # print(prompt_dlg)
        span_f1, span_acc, gpt_response= get_gpt_span_scores(prompt_dlg, annotations, few_shot=False)
        span_scores_acc.append(span_acc)
        span_scores_f1.append(span_f1)
        gpt_responses.append(gpt_response)
        # print('*' * 20)
        # print(span_f1, span_acc)
    return span_scores_f1, span_scores_acc, gpt_responses
        



# get_zero_shot_scores(df)



In [572]:
def get_few_shot_scores(df, fewshot_str, instruction):
    df_sample = df.sample(1)
    span_scores_acc = []
    span_scores_f1 = []
    gpt_responses = []
    for idx, row in tqdm(df.iterrows(), total = df.shape[0]):
        dlg = row['Dialogue']
        # dlg_atomic = get_atomic_facts_gpt(dlg)
        summ = row['Summary']
        # print('SUMMARY', summ)
        annotations = eval(row['Annotations'])
        
        prompt_dlg = f'{fewshot_str}\n\nDialogue: {dialogue}\nSummary: {row["Summary"]}\n{instruction}\nInconsistent Spans:'
        # print(prompt_dlg)
        span_f1, span_acc, gpt_response= get_gpt_span_scores(prompt_dlg, annotations)
        span_scores_acc.append(span_acc)
        span_scores_f1.append(span_f1)
        gpt_responses.append(gpt_response)
    return span_scores_f1, span_scores_acc, gpt_responses

In [573]:
# get_few_shot_scores(df, ic_dlg, ic_summary_span)

In [589]:
df = pd.read_csv('/home/sanjana/factual_evaluation_source_based/datasets/sota_annotations/dialogue_finegrained_aggrefact.csv')
source_type = "dialogue"
instruction =  f"Identify and list inconsistent phrases or words from the dialogue summary. Note inconsistency here refers to any information in the summary not supported by the {source_type}. List each span in a new line"

fewshot_prompt_str, df  = make_incontext_examples(df, 4, instruction)

Before sampling 295
['and needs']
['borrow']
['Chris and Ann']
['Mason']
['a woman']
['will watch']
After sampling 290


In [595]:
print(fewshot_prompt_str)

Dialogue: Derek McCarthy: Filip - are you around? Would you have an Android cable I could borrow for an hour? I'm almost out of charge and I have a power pack  but forgot my cable😭
Tommy: I am in Poland but can ring my wife and she will give you one
Tommy: Do you want me to?
Tommy: 67 glenoaks close
Derek McCarthy: That would be great if you could!! Otherwise I'm sitting here in the dark for an hour <emoticon_smile>
Tommy: Put it in gps and start driving
Derek McCarthy: <emoticon_thumbup>
Tommy: She might be at work for next 15 min but will help you for sure
Derek McCarthy: Thanks a lot mate
Tommy: Sent her msg. She will give it to you. Approx time when she will be at home is 8:15 pm
Derek McCarthy: Thanks again!! What's your wife's name  again??
Tommy: Paulina
Summary: Tommy is in Poland and needs an Android cable to power his power pack. Tommy will call his wife Paulina to borrow one.
Identify and list inconsistent phrases or words from the dialogue summary. Note inconsistency here r

In [591]:
# print(fewshot_prompt_str)

In [592]:
# span_scores_f1_fewshot, span_scores_acc_fewshot = get_few_shot_scores(df, fewshot_prompt_str, instruction)
span_scores_f1, span_scores_acc = get_zero_shot_scores(df)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 290/290 [04:32<00:00,  1.07it/s]


ValueError: too many values to unpack (expected 2)

In [561]:
np.mean(span_scores_f1), np.mean(span_scores_acc)

(0.5386548711701515, 0.35535714285714287)

In [593]:
span_scores_f1_fewshot, span_scores_acc_fewshot, gpt_responses_fewshot = get_few_shot_scores(df, fewshot_prompt_str, instruction)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 290/290 [18:08<00:00,  3.75s/it]


In [594]:
np.mean(span_scores_f1_fewshot), np.mean(span_scores_acc_fewshot)

(0.12469500482870277, 0.012471264367816091)

In [582]:
df['fewshot_span_f1'] = span_scores_f1_fewshot
df['fewshot_span_acc'] = span_scores_acc_fewshot
df['fewshot_gpt_text'] = gpt_responses_fewshot

In [583]:
span_scores_f1, span_scores_acc, gpt_responses = get_zero_shot_scores(df)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 278/278 [03:26<00:00,  1.35it/s]


In [584]:
df['zeroshot_span_f1'] = span_scores_f1
df['zeroshot_span_acc'] = span_scores_acc
df['zeroshot_gpt_text'] = gpt_responses

In [585]:
df.to_csv('finegrained_gpteval.csv')

In [588]:
df

Unnamed: 0.1,Unnamed: 0,Dialogue,Model,Summary,origin,Annotations,Reference_Summary,Errors,fewshot_span_f1,fewshot_span_acc,fewshot_gpt_text,zeroshot_span_f1,zeroshot_span_acc,zeroshot_gpt_text
0,0,"Natacha: hi, i can come and pick you up at the...",gpt3_finetune,Charles will probably arrive at the train stat...,SAMSum,{'Charles will probably arrive at the train st...,Charles has just landed and he will be at RER ...,1,0.153846,0.000000,Charles\nwill probably arrive at the train sta...,0.000000,0.0,No inconsistent spans.
1,1,"Natacha: hi, i can come and pick you up at the...",pegasus,Natacha will pick Charles up at the RER at 5:3...,SAMSum,{'Natacha will pick Charles up at the RER at 5...,Charles has just landed and he will be at RER ...,2,0.232323,0.000000,Natacha will pick Charles up at the RER at 5:3...,0.348485,0.0,"""Natacha will pick Charles up at the RER at 5:..."
2,2,"Natacha: hi, i can come and pick you up at the...",structure_aware_bart,Natacha will pick Charles up at the RER at 5:3...,SAMSum,{'Natacha will pick Charles up at the RER at 5...,Charles has just landed and he will be at RER ...,1,0.461538,0.000000,Natacha will pick Charles up at the RER at 5:3...,0.461538,0.0,"""Natacha will pick Charles up at the RER at 5:..."
3,3,"Natacha: hi, i can come and pick you up at the...",condigsum,Natacha will pick Charles up at the RER statio...,SAMSum,{'Natacha will pick Charles up at the RER stat...,Charles has just landed and he will be at RER ...,1,0.500000,0.000000,Natacha will pick Charles up at the RER statio...,0.000000,0.0,"""at 5:30 pm."""
4,4,"Natacha: hi, i can come and pick you up at the...",bart,Natacha will pick Charles up at the station Ve...,SAMSum,{'Natacha will pick Charles up at the station ...,Charles has just landed and he will be at RER ...,1,0.166667,0.000000,Natacha will pick Charles up at the station Ve...,0.000000,0.0,"""at 5:30 pm"""
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,290,"Mark: So, we've got our where and when. Packag...",gpt3_finetune,"Mark, Anna, George and Julia are going on a pa...",SAMSum,{},They are going to do some research on holiday ...,0,0.000000,0.000000,"Mark, Anna, George and Julia are going on a pa...",0.000000,0.0,"""George: Self-organised. Cheaper."""
291,291,"Mark: So, we've got our where and when. Packag...",condigsum,"Mark, Julia, Anna, George and George are going...",SAMSum,"{'Mark, Julia, Anna, George and George are goi...",They are going to do some research on holiday ...,1,0.173913,0.000000,"Mark, Julia, Anna, George and George are going...",0.153846,0.0,"""Mark, Julia, Anna, George and George are goin..."
292,292,"Mark: So, we've got our where and when. Packag...",bart,"Mark, Anna, Julia, George and Anna are going o...",SAMSum,"{'Mark, Anna, Julia, George and Anna are going...",They are going to do some research on holiday ...,1,0.111111,0.166667,Mark\nAnna\nJulia\nGeorge\npackage tour\ngoing...,0.153846,0.0,"""Mark, Anna, Julia, George and Anna are going ..."
293,293,"Mark: So, we've got our where and when. Packag...",pegasus,"Mark, Anna, George and Julia are looking for a...",SAMSum,{},They are going to do some research on holiday ...,0,0.000000,0.000000,"Mark, Anna, George and Julia\nlooking for a pa...",1.000000,1.0,


In [221]:
import re, string
import numpy as np
import math
from tqdm import tqdm 


    

def get_atomic_facts_gpt( text, text_type = 'dialogue'):
        instr = f'Segment the following {text_type.lower()} into atomic facts without introducing any unsupported information'
        prompt = f'{instr}\nDialogue: {text}'
        # print(prompt)
        gpt_response = GPTInference().get_chatgpt_response(prompt)
        return gpt_response

    

span_scores_acc = []
span_scores_f1 = []
# df_sample = df.sample(100)
for idx, row in tqdm(df.iterrows(), total = df.shape[0]):
    dlg = row['Dialogue']
    dlg = get_atomic_facts_gpt(dlg)
    summ = row['Summary']
    annotations = eval(row['Annotations'])
    instruction = "Extract only the inconsistent phrases or words in the summary. Note that consistency means all information in the summary is supported by the source. List each span in a new line"
    
    # prompt = f'{instruction}\nDialogue: {dlg}\nSummary: {summ}\nInconsistent Spans:'
    prompt = f'{instruction}\nSource: {dlg}\nSummary: {summ}\nInconsistent Spans:'
    # print(prompt)
    gpt_response = GPTInference().get_chatgpt_response(prompt)
    # print('Model: ', row['Model'])
    # print('Dialogue', row['Dialogue'])
    # print('SUMMARY: ', row['Summary'])
    # print('GPT RESPONSE: ', gpt_response)
    # print('ANNOTATIONS: ', annotations)
    all_spans = gpt_response.split('\n')
    row_f1_scores = []
    row_acc_scores = []

    if [each  for each in ['no inconsistent', 'none', '[]'] if each in gpt_response.lower()] and not annotations:
            # if not annotations:
            row_f1_scores += [1]
            row_acc_scores += [1]
    
    else:
        for gpt_span in all_spans:
            gpt_span_scores = span_scores(gpt_span, annotations)
            
            row_f1_scores += gpt_span_scores[1]
            row_acc_scores += gpt_span_scores[0]
            # print('SPAN', gpt_span)
            
    # print('Accuracy', row_acc_scores)
    # print('F1', row_f1_scores)
    
    span_scores_acc += [np.mean(row_acc_scores)]
    span_scores_f1 += [np.mean(row_f1_scores)]
    
    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 295/295 [50:10<00:00, 10.21s/it]


In [224]:
np.mean(span_scores_acc)

0.1944632768361582

In [210]:
len(df)

295

In [164]:
gpt_spans = gpt_response.split(',')
for gpt_span in gpt_spans:
    print(postprocess(gpt_span), annotations, span_scores(gpt_span, annotations))

[('is sleeping', 'intrinsic', 'PredE')]
Catherine is sleeping is sleeping
Catherine is sleeping {'Catherine is sleeping.': [('is sleeping', 'intrinsic', 'PredE')], 'Ana and Catherine are sleeping well.': [('are sleeping well', 'extrinsic', 'PredE')]} ([1], [0.8])
[('is sleeping', 'intrinsic', 'PredE'), ('are sleeping well', 'extrinsic', 'PredE')]
 Ana is going to visit grandma tomorrow is sleeping
 Ana is going to visit grandma tomorrow are sleeping well
 Ana is going to visit grandma tomorrow {'Catherine is sleeping.': [('is sleeping', 'intrinsic', 'PredE')], 'Ana and Catherine are sleeping well.': [('are sleeping well', 'extrinsic', 'PredE')]} ([0, 0], [0.22222222222222224, 0])
[('are sleeping well', 'extrinsic', 'PredE')]
 Ana and Catherine are sleeping well are sleeping well
 Ana and Catherine are sleeping well {'Catherine is sleeping.': [('is sleeping', 'intrinsic', 'PredE')], 'Ana and Catherine are sleeping well.': [('are sleeping well', 'extrinsic', 'PredE')]} ([1], [0.666666666

In [101]:
import spacy

nlp = spacy.load('en_core_web_sm')
text = "How are you today? I hope you have a great day"
tokens = nlp(text)
list(tokens.sents)

[How are you today?, I hope you have a great day]

In [102]:
summ = '''"Natalie told them in confidence that she\'s pregnant","Henriette knows who\'s the father."'''
import nltk
nltk.sent_tokenize(summ)
list(nlp(summ).sents)

[",
 Natalie told them in confidence that she's pregnant","Henriette knows who's the father."]

In [105]:
import math
2 == math.nan

False

In [50]:
#case 1: gpt_response --> span & annotation --> span 
#case 2: 


    
        

In [7]:
check_if_inconsistent(gpt_response)

True

In [8]:
gpt_response

'"Hans will bring the ball"'

In [166]:
''' Score strategy 
1) If annotations are empty but GPT found something --> overlap spans, overlap_sents
2) If annotation is not empty and GPT did not find something --> no overlap spans, no overlap_sents
3) If both are present --> Find most consistent ovelrap sentence, check if in annotation
For the sentence, check span overlap 
'''

In [167]:
get_score(annotations, gpt_response)

(1.0, 1.0)

1.0 ['vesinet'] "Charles will be at the station Vesinet at 5:30 pm."


In [101]:
annotations

{'Natacha will pick Charles up at the RER at 5:30 pm.': [('5:30 pm',
   'intrinsic',
   'CirE')],
 'Charles will be at the station Vesinet at 5:30 pm.': [('Vesinet',
   'intrinsic',
   'CirE')]}

In [72]:
gpt_response.split()

['"Charles',
 'will',
 'be',
 'at',
 'the',
 'station',
 'Vesinet',
 'at',
 '5:30',
 'pm."']

In [75]:
list(annotations.keys())[1].split()

['Charles',
 'will',
 'be',
 'at',
 'the',
 'station',
 'Vesinet',
 'at',
 '5:30',
 'pm.']