In [61]:
import pandas as pd
import re, string
import numpy as np
import math
from tqdm import tqdm 
import nltk
from experiments.RQ1.utils import get_chatgpt_response, get_atomic_facts_gpt
from experiments.models.AlpacaModel import AlpacaInference

In [62]:
def get_fewshot_sample(df, num_samples = 4):
    #### nonfactual examples ###
    print('Before sampling', len(df))
    df_nonfactual = df[df['factual_error'] == 1]
    df_sample_nonfactual = df_nonfactual.sample(num_samples)
    drop_idx = list(df_sample_nonfactual.index)
    df = df.drop(index=drop_idx)

    df_factual = df[df['factual_error'] == 0]
    df_sample_factual = df_factual.sample(num_samples)
    drop_idx = list(df_sample_factual.index)
    df = df.drop(index=drop_idx)
    print('After Sampling', len(df))
    df_fewshot = pd.concat([df_sample_nonfactual, df_sample_factual])
    df_fewshot = df_fewshot.sample(len(df_fewshot))
    return df, df_fewshot

In [63]:
read_path = '/home/ramprasad.sa/factual_evaluation_source_based/annotations/xformer_llm_annotated.csv'

df = pd.read_csv(read_path)
df.head()[:1]

Unnamed: 0.1,Unnamed: 0,docid,model,nonfactual_spans,evidence,summary,factual_error,error_type,dialogue,origin,dialogue_atomic_facts
0,0,test_312,CODS,[],[],#Person1# asks #Person2# to go to China with #...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,Person1 wants to go to China.\nThe purpose of ...


In [64]:
df, df_fewshot = get_fewshot_sample(df, num_samples = 2)

Before sampling 2034
After Sampling 2030


In [65]:
df_fewshot

Unnamed: 0.1,Unnamed: 0,docid,model,nonfactual_spans,evidence,summary,factual_error,error_type,dialogue,origin,dialogue_atomic_facts
1560,1560,13681560,gpt4-32k-0613,['stressed'],['Not found'],Hayden is stressed about deciding what to stud...,1,['Extrinsic_Error'],Hayden: Anyway I have 1 month to write my thes...,FacEval,Hayden has 1 month left to write his thesis.\n...
1061,1061,13729704,alpaca-13b,['Ewan congratulated his uncle Jayson on his g...,['Ewan : Uncle I graduated!'],Ewan congratulated his uncle Jayson on his gra...,1,['Intrinsic_Error'],Ewan: Uncle I graduated!\r\nUncle Jayson: My n...,FacEval,Ewan graduated.\nJayson is Ewan's uncle.\nUncl...
1026,1026,test_191,CODS,[],[],#Person2# helps #Person1# get #Person1#'s test...,0,[],"#Person1#: Hi, I was wondering if I could get ...",DialogueSum,Person1 wants to get their test results from t...
458,458,test_266,CODS,[],[],#Person1#'s dad tells #Person1# about #Person1...,0,[],"#Person1#: Dad, you keep talking about family ...",DialogueSum,Person2 has family in New Zealand.\nUncle Bill...


In [66]:
def process_atomic_facts(afact):
    sents = nltk.sent_tokenize(afact)
    sents = [each for each in sents if not each.strip(string.punctuation).isdigit()]
    return '\n'.join(sents)
    
class PromptBaselines():
    def __init__(self):
        self.instructions = {
            "direct_assesment_instruction1": f'''Decide if the Summary is consistent with the corresponding {{source_type}}. Note that consistency means all information in the summary is supported by the {{source_type}}.\nAnswer "yes" for consistent and "no" for inconsistent.''',
            "direct_assesment_instruction2": f"""Verify if the Summary aligns with the {{source_type}} for consistency. Consistency ensures that every detail in the Summary is substantiated by the {{source_type}}.\nAnswer "yes" for consistent and "no" for inconsistent. """,
            "direct_assesment_instruction3": f'''Evaluate the Summary's consistency with the {{source_type}} by confirming if all information in the summary is supported by the {{source_type}}.\nRespond with a yes or no.'''
        }

        self.prompt_templates_fewshot = {
            'Alpaca': f'### Instruction:\n{{instruction}} \n\n### Input:\n{{fewshot_str}}\n\n{{source_type}}: {{source}}\nSummary: {{summary}}\n\n### Response:\nAnswer:',
            'GPT': f'{{instruction}}\n\n{{fewshot_str}}\n\n{{source_type}}: {{source}}\nSummary: {{summary}}\nAnswer:'
        }


        self.prompt_template_examples = {
            'Alpaca': f'{{source_type}}: {{source}}\nSummary: {{summary}}\nAnswer:{{factual_label}}',
            'GPT': f'{{source_type}}: {{source}}\nSummary: {{summary}}\nAnswer:{{factual_label}}'
            
        }


    def get_fewshot_str(self, 
                        fewshot_examples, 
                        model,
                       source_type,
                        ):
        model_type = 'Alpaca' if model else 'GPT'
        fewshot_strs = []

        for source, summary, factual_label in fewshot_examples:
            if source_type != 'Dialogue':
                source = process_atomic_facts(source)
            factual_label = 'No' if factual_label else 'Yes'
            fewshot_strs += [self.prompt_template_examples[model_type].format(
                                source_type = source_type, 
                                source = source,
                                summary = summary,
                                factual_label = factual_label
                                )]
        return fewshot_strs

    def get_response_fewshot(self, 
                             instruction_template,
                             source_type,
                             source,
                             summary,
                             fewshot_strs,
                             model,
                             print_prompt = False):
        
        model_type = 'Alpaca' if model else 'GPT'
        instruction = instruction_template.format(source_type = source_type)

        fewshot_str = '\n\n'.join(fewshot_strs) 
        
        prompt = self.prompt_templates_fewshot[model_type].format(
                instruction = instruction,
                fewshot_str = fewshot_str,
                source_type = source_type,
                source = source,
                summary = summary
            )

        if print_prompt:
            print('PROMPT', prompt)
            print('***')
    

        if not model:
                response = get_chatgpt_response(prompt, 'gpt-4-0613')
        else:
                response = model.get_response(prompt, max_len = None)
    
        
        return response
        

    def direct_assessment_fewshot(self,
                                 source,
                                 summary,
                                 fewshot_examples,
                                 source_type  = 'Dialogue',
                                 print_prompt = False,
                                 model = None):

        responses = []
        labels = []
        
        if source_type != 'Dialogue':
            source = process_atomic_facts(source)

        fewshot_strs = self.get_fewshot_str(
                fewshot_examples,
                model,
                source_type,
                        )

        
        for inst, instruction_template in list(self.instructions.items()):
            res = self.get_response_fewshot(
                                    instruction_template,
                                     source_type,
                                     source,
                                     summary,
                                     fewshot_strs,
                                     model,
                                     print_prompt = print_prompt)
            pred_label = 0 if 'yes' in res.lower().strip() else 1
            
            responses += [res]
            labels += [pred_label]
        return responses, labels



    
            


    

    

In [67]:
def get_score(df, 
              df_fewshot,
              afacts = False, 
              model = None,):

    
    if not afacts:
        source_key = 'dialogue'
        source_type = 'Dialogue'
        
    else:
        source_key = 'dialogue_atomic_facts'
        source_type = 'Source'
        
    sources = list(df[source_key].values)
    summaries = list(df['summary'].values)

    response_instruction_dict = {
        'response_instr1': [],
        'response_instr2': [],
        'response_instr3': [],
        'labels_instr1': [],
        'labels_instr2': [],
        'labels_instr3': [],
        
    }

    fewshot_sources = list(df_fewshot[source_key].values)
    fewshot_summaries = list(df_fewshot['summary'].values)
    fewshot_labels = list(df_fewshot['factual_error'].values)
    fewshot_examples = list(zip(fewshot_sources, fewshot_summaries, fewshot_labels))
    # print(len(fewshot_examples))

    index = 0
    for source, summary in tqdm(list(zip(sources, summaries))):
        print_prompt = False
        if index%100 == 0:
            print_prompt = True
        else:
            print_prompt = False
            
        responses, labels = PromptBaselines().direct_assessment_fewshot(source,
                                 summary,
                                 fewshot_examples,
                                 source_type  = source_type,
                                 print_prompt = print_prompt,
                                 model = model)
        response_instruction_dict['response_instr1'] +=  [responses[0]]
        response_instruction_dict['response_instr2'] += [responses[1]]
        response_instruction_dict['response_instr3'] += [responses[2]]

        response_instruction_dict['labels_instr1'] +=  [labels[0]]
        response_instruction_dict['labels_instr2'] += [labels[1]]
        response_instruction_dict['labels_instr3'] += [labels[2]]

        index += 1
        if print_prompt:
            print('******')
            print(labels)
            print('RESPONSE', responses ,' ---')
            print('*****')
   
    
    
    return response_instruction_dict
        

In [68]:
# alpaca_model = AlpacaInference()

In [69]:
model = alpaca_model
afacts = False

df_sample = df[:5]
response_instruction_dict = get_score(df_sample, 
          df_fewshot,
        afacts = afacts, 
                  model = model )

model_name = 'GPT' if not model else 'Alpaca'
afacts_str = 'Afact' if afacts else 'Dlg'

for k ,v in response_instruction_dict.items():
    df_sample[f'{k}_{afacts_str}_{model_name}'] = v
df_sample

  0%|                                                                                                                                                                                           | 0/5 [00:00<?, ?it/s]

PROMPT ### Instruction:
Decide if the Summary is consistent with the corresponding Dialogue. Note that consistency means all information in the summary is supported by the Dialogue.
Answer "yes" for consistent and "no" for inconsistent. 

### Input:
Dialogue: Hayden: Anyway I have 1 month to write my thesis. And then I need to decide what studies I should choose and I have a problem because I don''t know what I can do in the future to make good money
Margaret: You''ll find something
Hayden: And the only studies I''m interested in are African studies but I''m not sure I can make big money later on haha except for working in the embassy or something like that. I was thinking about working as a flight attendant. It would be easy for me to get that job since I can swim (and here it''s obligatory) I''m even a water rescuer. I know English italian and polish and a bit of german.
Margaret: So go ahead for it
Hayden: But to be honest , I don''t think so that job is so great. I can''t work ther

 20%|███████████████████████████████████▊                                                                                                                                               | 1/5 [00:13<00:55, 13.87s/it]

******
[1, 1, 1]
RESPONSE ['No', 'No', 'No']  ---
*****


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:10<00:00, 14.18s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample[f'{k}_{afacts_str}_{model_name}'] = v
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample[f'{k}_{afacts_str}_{model_name}'] = v
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

Unnamed: 0.1,Unnamed: 0,docid,model,nonfactual_spans,evidence,summary,factual_error,error_type,dialogue,origin,dialogue_atomic_facts,response_instr1_Dlg_Alpaca,response_instr2_Dlg_Alpaca,response_instr3_Dlg_Alpaca,labels_instr1_Dlg_Alpaca,labels_instr2_Dlg_Alpaca,labels_instr3_Dlg_Alpaca
0,0,test_312,CODS,[],[],#Person1# asks #Person2# to go to China with #...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,Person1 wants to go to China.\nThe purpose of ...,No,No,No,1,1,1
1,1,test_312,UniLM,[],[],person1 wants to go to china for sight - seein...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,Person1 wants to go to China.\nThe purpose of ...,No,No,No,1,1,1
2,2,test_312,BART,[],[],##Person1# wants to go to China but #Person2#'...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,Person1 wants to go to China.\nThe purpose of ...,No,No,No,1,1,1
3,3,test_312,MV-BART,[],[],#Person1# wants to go to China for sight-seein...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,Person1 wants to go to China.\nThe purpose of ...,No,No,No,1,1,1
4,4,test_312,gpt4-32k-0613,[],[],Person1 expresses a desire to visit China for ...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,Person1 wants to go to China.\nThe purpose of ...,No,No,No,1,1,1


In [70]:
df_sample

Unnamed: 0.1,Unnamed: 0,docid,model,nonfactual_spans,evidence,summary,factual_error,error_type,dialogue,origin,dialogue_atomic_facts,response_instr1_Dlg_Alpaca,response_instr2_Dlg_Alpaca,response_instr3_Dlg_Alpaca,labels_instr1_Dlg_Alpaca,labels_instr2_Dlg_Alpaca,labels_instr3_Dlg_Alpaca
0,0,test_312,CODS,[],[],#Person1# asks #Person2# to go to China with #...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,Person1 wants to go to China.\nThe purpose of ...,No,No,No,1,1,1
1,1,test_312,UniLM,[],[],person1 wants to go to china for sight - seein...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,Person1 wants to go to China.\nThe purpose of ...,No,No,No,1,1,1
2,2,test_312,BART,[],[],##Person1# wants to go to China but #Person2#'...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,Person1 wants to go to China.\nThe purpose of ...,No,No,No,1,1,1
3,3,test_312,MV-BART,[],[],#Person1# wants to go to China for sight-seein...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,Person1 wants to go to China.\nThe purpose of ...,No,No,No,1,1,1
4,4,test_312,gpt4-32k-0613,[],[],Person1 expresses a desire to visit China for ...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,Person1 wants to go to China.\nThe purpose of ...,No,No,No,1,1,1
