In [83]:
import pandas as pd
from tqdm import tqdm
import re, string
import numpy as np
import math
from tqdm import tqdm 
from experiments.RQ1.utils import get_chatgpt_response, get_atomic_facts_gpt
import spacy
nlp = spacy.load('en_core_web_sm')

In [84]:
model_filter = {
    'FacEval': ['bart_large', 'co-ref bart large', 'condigsum bart large','gpt4-32k-0613','mv-bart_large', 'alpaca-13b'],
    'SAMSum': ['BART', 'CODS', 'MV-BART', 'UniLM', 'gpt4-32k-0613', 'alpaca-13b'],
    'DialogueSum': ['BART', 'CODS', 'MV-BART', 'UniLM', 'gpt4-32k-0613', 'alpaca-13b']

}
def read_filter(filename):
    df = pd.read_csv(filename)
    df_datasets = []
    unique_datasets = list(set(df['origin'].values))
    for dataset in unique_datasets: 
        df_origin = df[df['origin'] == dataset]
        df_origin = df_origin[df_origin['model'].isin(model_filter[dataset])]
        # print(len(df_origin))
        unique_docids = list(set(df_origin['docid'].values))
        #### test ###
        num_models = []
        for udocid in unique_docids:
            df_docid = df_origin[df_origin['docid'] == udocid]
            num_models.append(len(list(set(df_docid['model'].values))))
        # print(df_docid)
        assert(len(set(num_models)) == 1) 
        df_datasets.append(df_origin)
    df_filtered = pd.concat(df_datasets)
    assert(len(df_filtered) <= len(df))
    return df_filtered


In [85]:
df = read_filter('/home/ramprasad.sa/factual_evaluation_source_based/annotations/xformer_llm_annotated.csv')
df = df[df['origin'] != 'FacEval']
df[df['factual_error'] == 1][:1]

Unnamed: 0.1,Unnamed: 0,docid,model,nonfactual_spans,evidence,summary,factual_error,error_type,dialogue,origin,dialogue_atomic_facts
7,7,test_133,BART,['They '],[],#Person1# and #Person2# talk about the heavy s...,1,['Intrinsic_Error'],"#Person1#: It was a heavy storm last night, wa...",DialogueSum,There was a heavy storm last night.\nThe wind ...


In [100]:
import nltk
import re
import string 
import spacy 
import nltk
from nltk.corpus import stopwords
import numpy as np
from random import sample
import spacy
from thefuzz import fuzz

nltk.download('stopwords')
stop_words = stopwords.words('english')

nlp = spacy.load('en_core_web_sm')


def postprocess(text):
    text = text.strip(string.punctuation).lower()
    return text
    

def calculate_f1(matched, pred_spans, annotated_spans):
    found_pred_spans = set([each[0] for each in matched])
    found_rec_spans = set([each[1] for each in matched])
    
    precision = len(found_pred_spans)/len(pred_spans)
    recall  = len(found_rec_spans)/len(annotated_spans)
    
    if precision + recall > 0:
        f1_score = (2 * precision * recall)/(precision + recall)
    else:
        f1_score = 0
    return f1_score
    
def get_f1_scores(all_pred_spans, nonfactual_spans):
    
    matched_pred_fuzzy = []
    matched_pred_exact = []
    for pred_span in all_pred_spans:
        for ref_span in nonfactual_spans:
            fuzzy_score = fuzz.partial_ratio(pred_span, ref_span)
            # print(pred_span, ref_span, fuzzy_score)
            if fuzzy_score > 80:
                matched_pred_fuzzy.append((pred_span, ref_span))
            if pred_span == ref_span:
                matched_pred_exact.append((pred_span, ref_span))
            
    matched_pred_fuzzy = list(set(matched_pred_fuzzy))
    matched_pred_exact = list(set(matched_pred_exact))
    # print('MATCHED LEN AND STRICT', matched_pred_fuzzy, matched_pred_exact)
    # print('PRED, ANN', all_pred_spans, nonfactual_spans)
    
    f1_score_lenient = calculate_f1(matched_pred_fuzzy, all_pred_spans, nonfactual_spans)
    f1_score_strict = calculate_f1(matched_pred_exact, all_pred_spans, nonfactual_spans)
    return f1_score_lenient, f1_score_strict
    

def make_masked_sentences(summ):
    # summ = summ.lower()
    instruction = 'Extract all grammatical units including subjects in the sentence. List each in a new line'
    prompt_template = f"{{instruction}}:\nSentence:{{sent}}\nAnswer:"
    
    summ = nlp(summ)
    summ_sentences = [each.text.lower() for each in summ.sents]
    
    summ_sent_mask_infill = []
    for sent_idx, sent in enumerate(summ_sentences):
        prompt = prompt_template.format(instruction = instruction, sent = sent)
        # print(prompt)
        response = get_chatgpt_response(prompt)
        spans = response.split('\n')
        # print(sent, spans)
        for span in spans:
            masked_sent = sent.replace(' '+span+ ' ', ' ___ ')
            if '___' in masked_sent:
                summ_blank = summ_sentences[:sent_idx] + [masked_sent] + summ_sentences[sent_idx + 1:]
                summ_sent_mask_infill.append((' '.join(summ_blank), masked_sent, span))
    return summ_sent_mask_infill


def process_gpt_response_infill(response):
    if "None" in response:
        response = [response]
        
    elif len(response.split('\n')) > 1:
        response = response.split('\n')
        
    elif len(response.split(',')) > 1:
        response = response.split(',')
        
    else:
        response = [response]
    # response = [re.sub('[1-9]', '', each) for each in response]
    # response = [each.strip(string.punctuation).strip() for each in response]
    return response

class GPTPrompttInfill():
    def __init__(self):
        self.gpt_instructions = {
            'infill': '''Given below is a dialogue snippet and a its summary. Complete the summary using information from the dialogue.\nLimit your answer to 1-3 words. If the summary cannot be completed accurately, respond with "None".''',
            'entailment': 'Decide if the hypothesis is consistent with the corresponding premise.'
            # The dialogue snippet above is provided for context',
    
        }
        self.gpt_prompt_templates = {
            'infill': f"{self.gpt_instructions['infill']}\nDialogue: {{source}}\nSummary: {{summary}}\nAnswer Span:",
            "infill_fewshot" : f"{self.gpt_instructions['infill']}\nSource: {{source}}\n{{fewshot_str}}\nSummary: {{summary}}\nAnswer:",
            'entailment': f"{self.gpt_instructions['entailment']}\nPremise: {{premise}}\nHypothesis: {{hypothesis}}\nRate on a scale of 1-100:"
        }

    def check_entailment_score_gpt(self, source ,premise, hypothesis):
        
        prompt = self.gpt_prompt_templates['entailment'].format(
                                       premise = premise,
                                       hypothesis = hypothesis)
        # print(prompt)
        answer = get_chatgpt_response(prompt)
        answer = re.findall(r'\d+', answer)
        if not answer:
            return 0
        else:
            answer = answer[0]
        # if eval(answer) < 90:
        #     print(prompt)
        return eval(answer)

    def phrase_similarity_infilled(self, summary_span, infill_answers, masked_sentence, dlg):
        summary_sentence = masked_sentence.replace('___', summary_span)
    
        all_scores = []
        for infill in infill_answers:
            if infill.lower().strip() == 'none':
                return True
            infill_sentence = masked_sentence.replace('___', infill)
            entailment_score = self.check_entailment_score_gpt(source = dlg, 
                                                               premise = infill_sentence, 
                                                               hypothesis= summary_sentence)

            overlap = set(summary_span.lower().split(' ')).intersection(set(infill.lower().split(' '))) 
            precision = len(overlap)/len(summary_span.lower().split(' '))
            recall = len(overlap)/len(infill.lower().split(' '))
            f1_score = 0
            if precision + recall > 0:
                f1_score = (2* (precision * recall)) / (precision + recall)

            
            # if (f1_score < 0.6) and (entailment_score < 90):
                
            #     print('SENT', masked_sentence)
            #     print('SPAN', summary_span)
            #     print('RECC', infill)
            #     print(f1_score, entailment_score)
            #     print('**')
            if (f1_score >= 0.6) or (entailment_score >= 90):
                # print('SENT', masked_sentence)
                # print('SPAN', summary_span)
                # print('RECC', infill)
                # print(f1_score, entailment_score)
                # print('**')
                return True
        # print('='*13)
        return False

    def infill_spans(self, dlg, summ, ):
        all_masked_results = make_masked_sentences(summ)
        answers = []
        nonfactual_sent_span = []
        print_sample = 1
        for summary, sentence, answer in all_masked_results:   
            
            prompt = self.gpt_prompt_templates['infill'].format(source = dlg, 
                                                                summary = sentence)
                
            if print_sample < 1:
                print('PROMPT', prompt)
                # print_sample = False
                print_sample += 1
            # print(sentence, answer)
            gpt_response = get_chatgpt_response(prompt)
            response = process_gpt_response_infill(gpt_response)

            if not self.phrase_similarity_infilled(answer, response, sentence
                                                   , dlg):
                answers += [answer]
                # print('===')

        return answers
    


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ramprasad.sa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [101]:
# ### sentence level and entailment score 90 

# df= df[df['factual_error'] == 1]
# for idx, row in df[:30].iterrows():
#     dlg = row['dialogue']
#     summ = row['summary']
#     pred_spans = GPTPrompttInfill().infill_spans(dlg, summ)
#     print('Dialogue', dlg)
#     print('SUmmary', summ)
#     print('PRED', pred_spans)
#     print('ANN', row['nonfactual_spans'])
#     print('='*13)

def score(all_pred_spans, nonfactual_spans):
    if not all_pred_spans:
        if not nonfactual_spans:
            f1_score_lenient = 1
            f1_score_strict = 1
        else:
            f1_score_lenient = 0
            f1_score_strict = 0
            
    else:
        if not nonfactual_spans:
            f1_score_lenient = 0
            f1_score_strict = 0
        else:
            f1_score_lenient, f1_score_strict = get_f1_scores(all_pred_spans, nonfactual_spans)
    return f1_score_lenient, f1_score_strict

In [102]:
# df= df[df['factual_error'] == 1]
span_f1_lenient_scores = []
span_f1_strict_scores = []
for idx, row in tqdm(df.iterrows(), total = len(df)):
    dlg = row['dialogue']
    summ = row['summary']
    nonfactual_spans = row['nonfactual_spans']
    pred_spans = GPTPrompttInfill().infill_spans(dlg, summ)
    span_f1_lenient, span_f1_strict = score(list(set(pred_spans)), eval(nonfactual_spans))
    span_f1_lenient_scores.append(span_f1_lenient)
    span_f1_strict_scores.append(span_f1_strict)
    # f1_scores_len += [get_f1_scores(pred_spans, nonfactual_spans)]
    # print('Dialogue', dlg)
    # print('SUmmary', summ)
    # print('PRED', pred_spans)
    # print('ANN', row['nonfactual_spans'])
    # print(span_f1_lenient)
    # print('='*13)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1050/1050 [4:27:00<00:00, 15.26s/it]


In [93]:
np.mean(span_f1_lenient_scores)

0.47826199071686437

In [31]:
make_masked_sentences(summ)

[('bill ___ happy because he made a move and tells #person1# his roommate, brain locker, ___ happy today.',
  'bill ___ happy because he made a move and tells #person1# his roommate, brain locker, ___ happy today.',
  'is'),
 ('bill is ___ because he made a move and tells #person1# his roommate, brain locker, is ___ today.',
  'bill is ___ because he made a move and tells #person1# his roommate, brain locker, is ___ today.',
  'happy'),
 ('bill is happy ___ he made a move and tells #person1# his roommate, brain locker, is happy today.',
  'bill is happy ___ he made a move and tells #person1# his roommate, brain locker, is happy today.',
  'because'),
 ('bill is happy because ___ made a move and tells #person1# his roommate, brain locker, is happy today.',
  'bill is happy because ___ made a move and tells #person1# his roommate, brain locker, is happy today.',
  'he'),
 ('bill is happy because he ___ a move and tells #person1# his roommate, brain locker, is happy today.',
  'bill is ha

In [None]:
summ

In [None]:
# def get_masked_cand(all_masked_results, seen_last_idx):
#     masked_check_cand_ind = -1
#     masked_check_cand = None
    
#     for each_idx, each in enumerate(all_masked_results):
#         blank_idx = [widx for widx, word in enumerate(each[0].split(' ')) if '<BLANK>' in word][0]
#         # print(blank_idx)
#         if blank_idx >= seen_last_idx:
#             masked_check_cand_ind = each_idx
#             masked_check_cand = each
#             break
#     return masked_check_cand, masked_check_cand_ind
        
# def get_mask_infill_errors(dlg, summ):
#     seen_last_idx = 0
#     updated_summ = summ
    
#     all_masked_results = make_masked_sentences( updated_summ)
#     masked_check_cand, masked_check_cand_ind = get_masked_cand(all_masked_results, seen_last_idx) 
#     # print(masked_check_cand)
#     answers = []
#     while masked_check_cand:
#         print('SEEN', seen_last_idx)
#         masked_summ = masked_check_cand[0]
#         masked_sent = masked_check_cand[1]
#         masked_span = masked_check_cand[2]
        
#         prompt = GPTPrompttInfill().gpt_prompt_templates['infill'].format(source = dlg,suggested_span = masked_span,
#                                                                 summary = updated_summ)
#         # print(prompt)
#         gpt_response = get_chatgpt_response(prompt)
#         response = process_gpt_response_infill(gpt_response)
#         if not GPTPrompttInfill().phrase_similarity_infilled(masked_span, response[:1], masked_summ):
#             updated_summ = masked_summ.replace('<BLANK>', response[0])
#             answers.append(masked_span)
#         seen_last_idx = [idx for idx, word in enumerate(masked_summ.split(' ')) if '<BLANK>' in word][0]
#         seen_last_idx += 1
#         all_masked_results = make_masked_sentences( updated_summ)
#         masked_check_cand, masked_check_cand_ind = get_masked_cand(all_masked_results, seen_last_idx)
        
#     print(answers , updated_summ)



In [23]:
for idx, row in df[9:13].iterrows():
    dlg = row['dialogue']
    summ = row['summary']  
    get_mask_infill_errors(dlg, summ)

# all_masked_results = make_masked_sentences(summ)
# for summary, sentence, answer in all_masked_results:   
#     prompt = GPTPrompttInfill().gpt_prompt_templates['infill'].format(source = dlg, summary = summary)
    
#     gpt_response = get_chatgpt_response(prompt)
#     response = process_gpt_response_infilled(gpt_response)
#     print(GPTPrompttInfill().phrase_similarity_infilled(answer, response, sentence))

# get_mask_infill_errors(dlg, summ)
                

SEEN 0
SENT <BLANK> drives person2 to the french garden restaurant . person3 orders a bottle of water , a tuna fish sandwich , and vegetable soup .
SPAN person1
RECC person2
0 1
**
SEEN 1
SENT person2 <BLANK> person2 to the french garden restaurant . person3 orders a bottle of water , a tuna fish sandwich , and vegetable soup .
SPAN drives
RECC None
0 2
**
SEEN 2
SEEN 4
SENT person2 none person2 to the french garden restaurant . <BLANK> orders a bottle of water , a tuna fish sandwich , and vegetable soup .
SPAN person3
RECC person2
0 1
**
SEEN 10
SENT person2 none person2 to the french garden restaurant . person2 <BLANK> a bottle of water , a tuna fish sandwich , and vegetable soup .
SPAN orders
RECC went
0 2
**
SEEN 11
SEEN 12
SENT person2 none person2 to the french garden restaurant . person2 went a bottle of water , <BLANK> , and vegetable soup .
SPAN a tuna fish sandwich
RECC a bottle of water
0.25 1
**
SEEN 17
SENT person2 none person2 to the french garden restaurant . person2 wen


KeyboardInterrupt



In [211]:
row['nonfactual_spans']

"['she her she ', 'having a lawyer ']"

In [213]:
def get_masked_cand(all_masked_results, seen_last_idx):
    masked_check_cand_ind = -1
    masked_check_cand = None
    
    for each_idx, each in enumerate(all_masked_results):
        blank_idx = [widx for widx, word in enumerate(each[0].split(' ')) if '<BLANK>' in word][0]
        # print(blank_idx)
        if blank_idx >= seen_last_idx:
            masked_check_cand_ind = each_idx
            masked_check_cand = each
            break
    return masked_check_cand, masked_check_cand_ind
        
def get_mask_infill_errors(dlg, summ):
    seen_last_idx = 0
    updated_summ = summ
    
    all_masked_results = make_masked_sentences( updated_summ)
    masked_check_cand, masked_check_cand_ind = get_masked_cand(all_masked_results, seen_last_idx) 
    # print(masked_check_cand)
    answers = []
    while masked_check_cand:
        print('SEEN', seen_last_idx)
        masked_summ = masked_check_cand[0]
        masked_sent = masked_check_cand[1]
        masked_span = masked_check_cand[2]
        prompt = GPTPrompttInfill().gpt_prompt_templates['infill'].format(source = dlg, summary = masked_summ)
        # print(prompt)
        gpt_response = get_chatgpt_response(prompt)
        response = process_gpt_response_infilled(gpt_response)
        if not GPTPrompttInfill().phrase_similarity_infilled(masked_span, response[:1], masked_summ):
            updated_summ = masked_summ.replace('<BLANK>', response[0])
            answers.append(masked_span)
        seen_last_idx = [idx for idx, word in enumerate(masked_summ.split(' ')) if '<BLANK>' in word][0]
        seen_last_idx += 1
        all_masked_results = make_masked_sentences( updated_summ)
        masked_check_cand, masked_check_cand_ind = get_masked_cand(all_masked_results, seen_last_idx)
        
    print(answers , updated_summ)



'vet tells person1 she usually eats a cucumber and go to bed to deal with stress and depression . her favorite part of having a daughter is having a lawyer . vet tells person1 she wants to be a lawyer and wants to start small .'