In [1]:
import pandas as pd
import re, string
import numpy as np
import math
from tqdm import tqdm 
import nltk
from experiments.RQ1.utils import get_chatgpt_response, get_atomic_facts_gpt
from experiments.models.AlpacaModel import AlpacaInference

In [2]:
direct_assesment_instruction1 = f'''Decide if the Summary is consistent with the corresponding {{source_type}}. Note that consistency means all information in the summary is supported by the {{source_type}}.\nAnswer "yes" for consistent and "no" for inconsistent.'''

direct_assesment_instruction2 = f"""Verify if the Summary aligns with the {{source_type}} for consistency. Consistency ensures that every detail in the Summary is substantiated by the {{source_type}}.\nAnswer "yes" for consistent and "no" for inconsistent. """

direct_assesment_instruction3 = f'''Evaluate the Summary's consistency with the {{source_type}} by confirming if all information in the summary is supported by the {{source_type}}.\nRespond with a yes or no.'''

instruction_template = {
'Alpaca': f'### Instruction: {{instruction}}\n\n### Input:\n{{source_type}}: {{source}}\nSummary: {{summary}}\n\n### Response:\n',
'GPT': f'{{instruction}}\n{{source_type}}: {{source}}\nSummary: {{summary}}\nAnswer:'
}


In [3]:
def process_atomic_facts(afact):
    sents = nltk.sent_tokenize(afact)
    sents = [each for each in sents if not each.strip(string.punctuation).isdigit()]
    return '\n'.join(sents)
    
class PromptBaselines():
    def __init__(self):
        self.instructions = {
            "direct_assesment_instruction1": f'''Decide if the Summary is consistent with the corresponding {{source_type}}. Note that consistency means all information in the summary is supported by the {{source_type}}.\nAnswer "yes" for consistent and "no" for inconsistent.''',
            "direct_assesment_instruction2": f"""Verify if the Summary aligns with the {{source_type}} for consistency. Consistency ensures that every detail in the Summary is substantiated by the {{source_type}}.\nAnswer "yes" for consistent and "no" for inconsistent. """,
            "direct_assesment_instruction3": f'''Evaluate the Summary's consistency with the {{source_type}} by confirming if all information in the summary is supported by the {{source_type}}.\nRespond with a yes or no.'''
        }

        self.prompt_templates_zeroshot = {
            'Alpaca': f'### Instruction: {{instruction}} \n\n### Input:\n{{source_type}}: {{source}}\nSummary: {{summary}}\n\n### Response:\nAnswer:',
            'GPT': f'{{instruction}}\n{{source_type}}: {{source}}\nSummary: {{summary}}\nAnswer:'
        }


    def get_response_zeroshot(self,
                                     instruction_template,
                                   source_type,
                                   source,
                                   summary,
                                    model,
                                   print_prompt = False):

        model_type = 'Alpaca' if model else 'GPT'
        instruction = instruction_template.format(source_type = source_type)
            
        prompt = self.prompt_templates_zeroshot[model_type].format(
                instruction = instruction,
                source_type = source_type,
                source = source,
                summary = summary
            )
        
        if print_prompt:
            print('PROMPT', prompt)
            print('***')
    

        if not model:
                response = get_chatgpt_response(prompt, 'gpt-4-0613')
        else:
                response = model.get_response(prompt, max_len = None)
    
        
        return response

    def direct_assessment_zeroshot(self, 
                         source, 
                         summary,
                         source_type  = 'Dialogue',
                         print_prompt = False,
                         model = None):

        responses = []
        labels = []
        
        if source_type != 'Dialogue':
            source = process_atomic_facts(source)

        

        
        for inst, instruction_template in self.instructions.items():
            res = self.get_response_zeroshot(
                instruction_template,
                                   source_type,
                                   source,
                                   summary,
                                    model,
                                   print_prompt )
            pred_label = 0 if 'yes' in res.lower().strip() else 1
            
            responses += [res]
            labels += [pred_label]
        return responses, labels

    

In [4]:
read_path = '/home/ramprasad.sa/factual_evaluation_source_based/annotations/xformer_llm_annotated.csv'

df = pd.read_csv(read_path)



In [5]:
alpaca_model = AlpacaInference()

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 136.00 MiB. GPU 0 has a total capacty of 44.48 GiB of which 63.31 MiB is free. Process 623 has 28.62 GiB memory in use. Including non-PyTorch memory, this process has 15.80 GiB memory in use. Of the allocated memory 15.64 GiB is allocated by PyTorch, and 2.99 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
def get_score(df, 
              eval_type, 
              afacts = False, 
              model = None, 
              fewshot_examples = None , 
              fewshot_idx = 0):

    
    if not afacts:
        source_key = 'dialogue'
        source_type = 'Dialogue'
        
    else:
        source_key = 'dialogue_atomic_facts'
        source_type = 'Source'
        
    sources = list(df[source_key].values)
    summaries = list(df['summary'].values)

    response_instruction_dict = {
        'response_instr1': [],
        'response_instr2': [],
        'response_instr3': [],
        'labels_instr1': [],
        'labels_instr2': [],
        'labels_instr3': [],
        
    }
    
    
    index = 0
    for src, summ in tqdm(list(zip(sources, summaries))):
        print_prompt = False
        if index%100 == 0:
            print_prompt = True
        else:
            print_prompt = False
        if eval_type == 'direct_assesment':
            responses, labels = PromptBaselines().direct_assessment_zeroshot(src, 
                                                            summ, 
                                                            source_type  = source_type, 
                                                            model = model,
                                                            print_prompt = False,
                                                            )

            response_instruction_dict['response_instr1'] +=  [responses[0]]
            response_instruction_dict['response_instr2'] += [responses[1]]
            response_instruction_dict['response_instr3'] += [responses[2]]

            response_instruction_dict['labels_instr1'] +=  [labels[0]]
            response_instruction_dict['labels_instr2'] += [labels[1]]
            response_instruction_dict['labels_instr3'] += [labels[2]]

            
        
        
        index += 1
        if print_prompt:
            print('******')
            print(labels)
            print('RESPONSE', responses ,' ---')
            print('*****')
            
    return response_instruction_dict
        

In [None]:
df.head()[:1]

In [None]:


afacts = False
model = alpaca_model
while True:
    response_instruction_dict_da_alpaca_zs = get_score(df, 
                eval_type = 'direct_assesment', 
                  afacts = afacts, 
                  model = model, 
                  fewshot_examples = None , 
                  fewshot_idx = 0)

    model_name = 'GPT' if not model else 'Alpaca'
    afacts_str = 'Afact' if afacts else 'Dlg'
    
    # for k ,v in response_instruction_dict_da_alpaca_zs.items():
    #     df[f'{k}_{afacts_str}_{model_name}'] = v
df

In [None]:
# df_sample = df[:5]
afacts = True
model = alpaca_model

response_instruction_dict_da_alpaca_zs = get_score(df, 
            eval_type = 'direct_assesment', 
              afacts = afacts, 
              model = model, 
              fewshot_examples = None , 
              fewshot_idx = 0)

model_name = 'GPT' if not model_name else 'Alpaca'
afacts_str = 'Afact' if afacts else 'Dlg'

for k ,v in response_instruction_dict_da_alpaca_zs.items():
    df_sample[f'{k}_{afacts_str}_{model_name}'] = v
df_sample

In [40]:
# df_sample = df[:5]
afacts = False
model = None

response_instruction_dict_da_alpaca_zs = get_score(df_sample, 
            eval_type = 'direct_assesment', 
              afacts = afacts, 
              model = model, 
              fewshot_examples = None , 
              fewshot_idx = 0)

model_name = 'GPT' if not model else 'Alpaca'
afacts_str = 'Afact' if afacts else 'Dlg'

for k ,v in response_instruction_dict_da_alpaca_zs.items():
    df_sample[f'{k}_{afacts_str}_{model_name}'] = v
df_sample

  0%|                                                                                                                                                                                           | 0/5 [00:00<?, ?it/s]

PROMPT Decide if the Summary is consistent with the corresponding Dialogue. Note that consistency means all information in the summary is supported by the Dialogue.
Answer "yes" for consistent and "no" for inconsistent.
Dialogue: #Person1#: I want to go to china for sight-seeing. What do you think of it, Mum?
#Person2#: Why not? China is a wonderful country.
#Person1#: Will you go with me, too?
#Person2#: No, I'm afraid not now. I'm too busy.
Summary: #Person1# asks #Person2# to go to China with #Person1#.
Answer:
***
PROMPT Verify if the Summary aligns with the Dialogue for consistency. Consistency ensures that every detail in the Summary is substantiated by the Dialogue.
Answer "yes" for consistent and "no" for inconsistent. 
Dialogue: #Person1#: I want to go to china for sight-seeing. What do you think of it, Mum?
#Person2#: Why not? China is a wonderful country.
#Person1#: Will you go with me, too?
#Person2#: No, I'm afraid not now. I'm too busy.
Summary: #Person1# asks #Person2# t

 20%|███████████████████████████████████▊                                                                                                                                               | 1/5 [00:01<00:04,  1.20s/it]

******
[0, 1, 0]
RESPONSE ['Yes', 'No', 'Yes']  ---
*****


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.26s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample[f'{k}_{afacts_str}_{model_name}'] = v
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample[f'{k}_{afacts_str}_{model_name}'] = v
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

Unnamed: 0.1,Unnamed: 0,docid,model,nonfactual_spans,evidence,summary,factual_error,error_type,dialogue,origin,...,response_instr3_Afact_Alpaca,labels_instr1_Afact_Alpaca,labels_instr2_Afact_Alpaca,labels_instr3_Afact_Alpaca,response_instr1_Dlg_GPT,response_instr2_Dlg_GPT,response_instr3_Dlg_GPT,labels_instr1_Dlg_GPT,labels_instr2_Dlg_GPT,labels_instr3_Dlg_GPT
0,0,test_312,CODS,[],[],#Person1# asks #Person2# to go to China with #...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,...,No,1,1,1,Yes,No,Yes,0,1,0
1,1,test_312,UniLM,[],[],person1 wants to go to china for sight - seein...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,...,Yes,1,0,0,Yes,Yes,Yes,0,0,0
2,2,test_312,BART,[],[],##Person1# wants to go to China but #Person2#'...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,...,Yes,1,0,0,Yes,Yes,Yes,0,0,0
3,3,test_312,MV-BART,[],[],#Person1# wants to go to China for sight-seein...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,...,Yes,1,0,0,Yes,Yes,Yes,0,0,0
4,4,test_312,gpt4-32k-0613,[],[],Person1 expresses a desire to visit China for ...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,...,Yes,0,0,0,Yes,Yes,Yes,0,0,0


In [41]:
# df_sample = df[:5]
afacts = True
model = None

response_instruction_dict_da_alpaca_zs = get_score(df_sample, 
            eval_type = 'direct_assesment', 
              afacts = afacts, 
              model = model, 
              fewshot_examples = None , 
              fewshot_idx = 0)

model_name = 'GPT' if not model else 'Alpaca'
afacts_str = 'Afact' if afacts else 'Dlg'

for k ,v in response_instruction_dict_da_alpaca_zs.items():
    df_sample[f'{k}_{afacts_str}_{model_name}'] = v
df_sample

  0%|                                                                                                                                                                                           | 0/5 [00:00<?, ?it/s]

PROMPT Decide if the Summary is consistent with the corresponding Source. Note that consistency means all information in the summary is supported by the Source.
Answer "yes" for consistent and "no" for inconsistent.
Source: Person1 wants to go to China.
The purpose of Person1's trip to China is sight-seeing.
Person1 asks Mum for her opinion about the trip to China.
Person2 thinks China is a wonderful country.
Person1 wants Person2/Mum to go with him/her.
Person2/Mum cannot go with Person1.
The reason Person2/Mum cannot go is because she is too busy.
Summary: #Person1# asks #Person2# to go to China with #Person1#.
Answer:
***
PROMPT Verify if the Summary aligns with the Source for consistency. Consistency ensures that every detail in the Summary is substantiated by the Source.
Answer "yes" for consistent and "no" for inconsistent. 
Source: Person1 wants to go to China.
The purpose of Person1's trip to China is sight-seeing.
Person1 asks Mum for her opinion about the trip to China.
Perso

 20%|███████████████████████████████████▊                                                                                                                                               | 1/5 [00:01<00:04,  1.19s/it]

******
[0, 0, 0]
RESPONSE ['Yes', 'Yes', 'Yes']  ---
*****


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.12s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample[f'{k}_{afacts_str}_{model_name}'] = v
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample[f'{k}_{afacts_str}_{model_name}'] = v
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

Unnamed: 0.1,Unnamed: 0,docid,model,nonfactual_spans,evidence,summary,factual_error,error_type,dialogue,origin,...,response_instr3_Dlg_GPT,labels_instr1_Dlg_GPT,labels_instr2_Dlg_GPT,labels_instr3_Dlg_GPT,response_instr1_Afact_GPT,response_instr2_Afact_GPT,response_instr3_Afact_GPT,labels_instr1_Afact_GPT,labels_instr2_Afact_GPT,labels_instr3_Afact_GPT
0,0,test_312,CODS,[],[],#Person1# asks #Person2# to go to China with #...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,...,Yes,0,1,0,Yes,Yes,Yes,0,0,0
1,1,test_312,UniLM,[],[],person1 wants to go to china for sight - seein...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,...,Yes,0,0,0,Yes,Yes,Yes,0,0,0
2,2,test_312,BART,[],[],##Person1# wants to go to China but #Person2#'...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,...,Yes,0,0,0,Yes,Yes,Yes,0,0,0
3,3,test_312,MV-BART,[],[],#Person1# wants to go to China for sight-seein...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,...,Yes,0,0,0,Yes,Yes,Yes,0,0,0
4,4,test_312,gpt4-32k-0613,[],[],Person1 expresses a desire to visit China for ...,0,[],#Person1#: I want to go to china for sight-see...,DialogueSum,...,Yes,0,0,0,Yes,Yes,Yes,0,0,0


In [42]:
df_sample.keys()

Index(['Unnamed: 0', 'docid', 'model', 'nonfactual_spans', 'evidence',
       'summary', 'factual_error', 'error_type', 'dialogue', 'origin',
       'dialogue_atomic_facts', 'response_instr1_Dlg_Alpaca',
       'response_instr2_Dlg_Alpaca', 'response_instr3_Dlg_Alpaca',
       'labels_instr1_Dlg_Alpaca', 'labels_instr2_Dlg_Alpaca',
       'labels_instr3_Dlg_Alpaca', 'response_instr1_Afact_Alpaca',
       'response_instr2_Afact_Alpaca', 'response_instr3_Afact_Alpaca',
       'labels_instr1_Afact_Alpaca', 'labels_instr2_Afact_Alpaca',
       'labels_instr3_Afact_Alpaca', 'response_instr1_Dlg_GPT',
       'response_instr2_Dlg_GPT', 'response_instr3_Dlg_GPT',
       'labels_instr1_Dlg_GPT', 'labels_instr2_Dlg_GPT',
       'labels_instr3_Dlg_GPT', 'response_instr1_Afact_GPT',
       'response_instr2_Afact_GPT', 'response_instr3_Afact_GPT',
       'labels_instr1_Afact_GPT', 'labels_instr2_Afact_GPT',
       'labels_instr3_Afact_GPT'],
      dtype='object')