In [16]:
import pandas as pd 
from GPTModel import GPTInference
from tqdm import tqdm
import re

In [29]:
class ChatGPTBaselines():

    def __init__(self):
        self.GPTmodel = GPTInference()
        self.model_snapshot = 'gpt-4-32k-0613'

    def direct_assessment(self, source, summary, source_type  = 'Dialogue'):
        instruction = f'''Decide if the following {source_type.lower()} summary is consistent with the corresponding {source_type.lower()}. Note that consistency means all information in the summary is supported by the {source_type.lower()}.'''

        prompt = f'{instruction}\n{source_type}: {source}\nSummary: {summary}\nAnswer(yes or no):'
        # print(prompt)
        response = self.GPTmodel.get_chatgpt_response(prompt, self.model_snapshot)
        label = 1 if response.lower().strip() == 'no' else 0
        return response, label

    def direct_assessment_score(self, source, summary, source_type):
        instruction = f'''Score the following summary given the corresponding {source_type.lower()} with respect to consistency on a continuous scale from 0 to 100, where a score of zero means “inconsistency” and score of one hundred means “perfect consistency”. Note that consistency measures whether the facts in the summary are consistent with the facts in the {source_type.lower()}. Consider whether the summary does reproduce all facts accurately and does not make up untrue information.'''
        prompt = f'{instruction}\n{source_type}: {source}\nSummary: {summary}\nScore:'
        # print(prompt)
        response = self.GPTmodel.get_chatgpt_response(prompt, self.model_snapshot)
        label = re.findall(r'\d+', response)[0]
        # print(response, label)
        label = eval(label)
        
        return response, label
        
    def direct_assessment_stars(self , source, summary, source_type):
        instruction = f'''Score the following {source_type.lower()} summarization given the corresponding {source_type.lower()} with respect to consistency with one to five stars, where one star means “inconsistency” and five stars means “perfect consistency”. Note that consistency measures whether the facts in the summary are consistent with the facts in the original article. Consider whether the summary does reproduce all facts accurately and does not make up untrue information.'''
        prompt = f'{instruction}\n{source_type}: {source}\nSummary: {summary}\nStars:'
        # print(prompt)
        response = self.GPTmodel.get_chatgpt_response(prompt, self.model_snapshot)
        label = re.findall(r'\d+', response)[0]
        # print(response, label)
        label = eval(label)
        return response, label

    def direct_assessment_cot(self, source, summary, source_type):
        instruction = f'''Decide if the following summary is consistent with the corresponding {source_type.lower()}. Note that consistency means all information in the summary is supported by the article.'''
        prompt = f'{instruction}\n{source_type}: {source}\nSummary: {summary}\nExplain your reasoning step by step then answer (yes or no) the question' 
        # print(prompt)
        response = self.GPTmodel.get_chatgpt_response(prompt, self.model_snapshot)
        label = 1
        if 'summary is consistent' in response:
            label = 0
        
        return response, label
        
    def get_atomic_facts_gpt(self, text, text_type):
        instr = f'Segment the following {text_type.lower()} into atomic facts without introducing any unsupported information'
        prompt = f'{instr}\nDialogue: {text}'
        # print(prompt)
        gpt_response = self.GPTmodel.get_chatgpt_response(prompt)
        return gpt_response

    def run_evaluation(self, dlg, summary, type = 'all'):
        response_dict = {
            
        }
        
        dlg_atomic_facts = self.get_atomic_facts_gpt(dlg, 'Dialogue')
        response_dict['atomic_facts'] = dlg_atomic_facts
        
        response_zs = self.direct_assessment(dlg, summary, source_type = 'Dialogue')
        response_zs_afacts = self.direct_assessment(dlg_atomic_facts, summary, source_type = 'Source')
        response_dict['text_zs'] = response_zs[0]
        response_dict['text_zs_afacts'] = response_zs_afacts[0]
        response_dict['label_zs'] = response_zs[1]
        response_dict['label_zs_afacts'] = response_zs_afacts[1]
        

        response_da = self.direct_assessment_score(dlg, summary, source_type = 'Dialogue')
        response_da_afacts = self.direct_assessment_score(dlg_atomic_facts, summary, source_type = 'Source')
        response_dict['text_da_score'] = response_da[0]
        response_dict['text_da_score_afacts'] = response_da_afacts[0]
        response_dict['label_da_score'] = response_da[1]
        response_dict['label_da_score_afacts'] = response_da_afacts[1]

        response_da_stars = self.direct_assessment_stars(dlg, summary, source_type = 'Dialogue')
        response_da_stars_afacts = self.direct_assessment_stars(dlg_atomic_facts, summary, source_type = 'Source')
        response_dict['text_da_stars'] = response_da_stars[0]
        response_dict['text_da_stars_afacts'] = response_da_stars_afacts[0]
        response_dict['label_da_stars'] = response_da_stars[1]
        response_dict['label_da_stars_afacts'] = response_da_stars_afacts[1]

        response_da_cot = self.direct_assessment_cot(dlg, summary, source_type = 'Dialogue')
        response_da_cot_afacts = self.direct_assessment_cot(dlg_atomic_facts, summary, source_type = 'Source')
        response_dict['text_da_cot'] = response_da_cot[0]
        response_dict['text_da_cot_afacts'] = response_da_cot_afacts[0]
        response_dict['label_da_cot'] = response_da_cot[1]
        response_dict['label_da_cot_afacts'] = response_da_cot_afacts[1]
        
        return response_dict


        

In [30]:
chat_gpt_baselines = ChatGPTBaselines()
df_transformer = pd.read_csv('/home/sanjana/factual_evaluation_source_based/datasets/scored/dialogue_aggrefact_scored.csv')

In [31]:

df_sample = df_transformer.sample(100)
df_response_dict = {}

for idx, row in tqdm(df_sample.iterrows(), total=df_sample.shape[0]):
    dlg = row['Dialogue']
    summary = row['Summary']
    
    eval_dict = chat_gpt_baselines.run_evaluation(dlg, summary)
    for k , v in eval_dict.items():
        if k not in df_response_dict:
            df_response_dict[k] = []
        df_response_dict[k] += [v] 

    

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [32:26<00:00, 19.47s/it]


In [None]:
for k , v in df_response_dict.items():
    df_sample[k] = v

In [38]:
from sklearn.metrics import roc_auc_score

def calculate_auc(y, predictions):
    print(roc_auc_score(y, predictions))

In [42]:
y = df_sample['w/ Error'].values
predictions = [each for each in df_sample['label_zs'].values]
calculate_auc(y, predictions)

0.5663956639566395


In [43]:
y = df_sample['w/ Error'].values
predictions = [each for each in df_sample['label_zs_afacts'].values]
calculate_auc(y, predictions)

0.5880758807588076


In [48]:
# label_da_cot
y = df_sample['w/ Error'].values
predictions = [each for each in df_sample['label_da_cot_afacts'].values]
calculate_auc(y, predictions)

0.5948509485094851


In [50]:
# label_da_cot
y = df_sample['w/ Error'].values
predictions = [100 - each for each in df_sample['label_da_score'].values]
calculate_auc(y, predictions)

0.6307588075880759


In [52]:
# label_da_cot
y = df_sample['w/ Error'].values
predictions = [100 - each for each in df_sample['label_da_score_afacts'].values]
calculate_auc(y, predictions)

0.5924796747967479


In [54]:
# label_da_cot
y = df_sample['w/ Error'].values
predictions = [5 - each for each in df_sample['label_da_stars'].values]
calculate_auc(y, predictions)

0.660230352303523


In [55]:
# label_da_cot
y = df_sample['w/ Error'].values
predictions = [5 - each for each in df_sample['label_da_stars_afacts'].values]
calculate_auc(y, predictions)

0.5975609756097561


In [46]:
df_sample.keys()

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'DocID', 'Dialogue',
       'Model', 'Summary', 'w/ Error', 'CorefE', 'CorefE_text', 'CircE',
       'CircE_text', 'OutE', 'OutE_text', 'GramE', 'GramE_text', 'PredE',
       'PredE_text', 'SubjObjE', 'SubjObjE_text', 'OtherE', 'OtherE_text',
       'LinkE', 'LinkE_text', 'origin', 'SummaC-ZS_score', 'SummaC-Conv_score',
       'QuestEval_score', 'atomic_facts', 'text_zs', 'text_zs_afacts',
       'label_zs', 'label_zs_afacts', 'text_da_score', 'text_da_score_afacts',
       'label_da_score', 'label_da_score_afacts', 'text_da_stars',
       'text_da_stars_afacts', 'label_da_stars', 'label_da_stars_afacts',
       'text_da_cot', 'text_da_cot_afacts', 'label_da_cot',
       'label_da_cot_afacts'],
      dtype='object')

In [51]:
# df_sample['label_da_score']

216      80
1068    100
1511     70
829     100
998      95
       ... 
147     100
740     100
1438    100
604     100
1171     70
Name: label_da_score, Length: 100, dtype: int64