In [36]:
import json
import pandas as pd 

def read_data(filename):
    data = []
    with open(filename, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data

In [64]:
import os



from nltk import word_tokenize, sent_tokenize
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff


from transformers import GPT2Tokenizer
class GPTInference():
    def __init__(self):
        

    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def get_chatgpt_response(self, prompt):
        # query = f'Article: {doc}\n{instruction}\n{prompt}'
        response = openai.ChatCompletion.create(model="gpt-4-32k-0613",
                                       messages=[
                                           
                        {"role": "user", "content": f'{prompt}'},   
                        ], 
                        )
#         print(query)
#         print(response)
        return response['choices'][0]['message']['content']

In [76]:
def make_gpt4_summaries(read_file, out_file):
    df = pd.read_csv(read_file)
    print(df.head())
    gpt_model = GPTInference()
    df_dict = {
        'DocID': [],
        'Dialogue': [],
        'Model': [],
        'Summary': [],
        'origin':[]
        
    }
    for dialogue in list(set(df['Dialogue'].values)):
        doc_id = df[df['Dialogue'] == dialogue]['DocID'].values[0]
        instr = 'Generate a concise summary of the conversation given below among multiple speakers'
        prompt = f'{instr}\n{dialogue}\nSummary:'
        summary = gpt_model.get_chatgpt_response(prompt)
        df_dict['DocID'].append(doc_id)
        df_dict['Dialogue'].append(dialogue)
        df_dict['Model'].append('GPT4')
        df_dict['Summary'].append(summary)
        df_dict['origin'].append('AMI')
    df = pd.DataFrame(df_dict)
    df.to_csv(out_file)
    return df

In [77]:
df_reference = pd.read_csv('/home/sanjana/explainable_factual_evaluation/datasets/fact_annotated/old_model_annotated/dialogue_aggrefact.csv')

df_reference.head()



Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,DocID,Dialogue,Model,Summary,w/ Error,SubjObjE,SubjObjE_text,...,CorefE_text,CircE,CircE_text,PredE,PredE_text,OutE,OutE_text,SummaC-Conv,SummaC-ZS,QuestEval
0,0,0,0,13809941,Thelma: i dont have anything to wear\nLouisa: ...,human_ref,Louisa will lend Thelma her red velvet dress.,0,,,...,,,,,,,,0.224066,0.608673,0.642865
1,1,1,1,13809941,Thelma: i dont have anything to wear\nLouisa: ...,bart_large,Thelma doesn't have anything to wear. Louisa w...,0,,,...,,,,,,,,0.335007,0.271927,0.513751
2,2,2,2,13809941,Thelma: i dont have anything to wear\nLouisa: ...,mv-bart_large,Thelma doesn't have anything to wear. Louisa w...,0,,,...,,,,,,,,0.335007,0.271927,0.513751
3,3,3,3,13809941,Thelma: i dont have anything to wear\nLouisa: ...,co-ref bart large,Thelma doesn't have anything to wear. Louisa w...,0,,,...,,,,,,,,0.329921,0.09021,0.403632
4,4,4,4,13809941,Thelma: i dont have anything to wear\nLouisa: ...,condigsum bart large,Louisa will bring Thelma her red velvet dress.,0,,,...,,,,,,,,0.224066,0.374329,0.651179


In [78]:
list(df_reference.keys())[3:]

['DocID',
 'Dialogue',
 'Model',
 'Summary',
 'w/ Error',
 'SubjObjE',
 'SubjObjE_text',
 'LinkE',
 'LinkE_text',
 'OtherE',
 'OtherE_text',
 'GramE',
 'GramE_text',
 'CorefE',
 'CorefE_text',
 'CircE',
 'CircE_text',
 'PredE',
 'PredE_text',
 'OutE',
 'OutE_text',
 'SummaC-Conv',
 'SummaC-ZS',
 'QuestEval']

In [85]:
filename = "/home/sanjana/modular-summarization/dataset_ami/full_summarization"
write_path = "/home/sanjana/explainable_factual_evaluation/datasets/long_dialogue/raw_dataset/"

train_data = read_data(f'{filename}/train.jsonl')
dev_data = read_data(f'{filename}/val.jsonl')
test_data = read_data(f'{filename}/test.jsonl')



In [80]:

def make_data(dataset, ):
    df_dict = {'DocID': [],
               'Dialogue': [],
               'Model': [],
               'Summary': [],
              }
    for dat in dataset:
        doc_id = dat['case_id']
        summary = '\n'.join(dat['summary_lines'])
        dialogue = "\n".join(dat['article_lines'])
        model = "human_ref"
        
        df_dict['DocID'] += [doc_id]
        df_dict['Dialogue'] += [dialogue]
        df_dict['Model'] += [model]
        df_dict['Summary'] += [summary]

    df = pd.DataFrame(df_dict)
    return df


In [81]:
df_train = make_data(train_data)
df_train.to_csv(f'{write_path}/train.csv')

df_dev = make_data(dev_data)
df_dev.to_csv(f'{write_path}/dev.csv')

df_test = make_data(test_data)
df_test.to_csv(f'{write_path}/test.csv')

In [84]:
model_path = '/home/sanjana/explainable_factual_evaluation/datasets/long_dialogue/model_generated'
df_train_model = make_gpt4_summaries(f'{write_path}/test.csv', f'{model_path}/test.csv')

   Unnamed: 0    DocID                                           Dialogue  \
0           0  ES2014d  PM So is\nPM Why not save that .\nME No , you'...   
1           1  TS3007b  PM Hello .\nME Hey guys .\nID Hi .\nUI Hi .\nP...   
2           2  IS1009c  PM Okay .\nPM Hello everyone .\nID Hi .\nME Hi...   
3           3  ES2004a  UI Hmm hmm hmm .\nPM Are we\nPM we're not allo...   
4           4  TS3003c  PM Okay . Uh\nPM good afternoon .\nME Good aft...   

       Model                                            Summary  
0  human_ref  @@abstract@@\nThe first prototype for the remo...  
1  human_ref  @@abstract@@\nWhen this functional design meet...  
2  human_ref  @@abstract@@\nThe project manager opens the me...  
3  human_ref  @@abstract@@\nThe Project Manager gave an intr...  
4  human_ref  @@abstract@@\nThe project manager opened the m...  



KeyboardInterrupt



In [83]:
df_train_model

Unnamed: 0,DocID,Dialogue,Model,Summary,origin
0,IS1009a,PM Okay .\nME Okay .\nPM Everybody ready ?\nID...,GPT4,"The speakers, including a project manager (PM)...",AMI
1,ES2014c,PM Okay .\nPM Right .\nPM Conceptual design me...,GPT4,The speakers are having a discussion about a r...,AMI
2,TS3003b,PM Okay .\nPM Everybody found his place again ...,GPT4,"During their conversation, the group discussed...",AMI
3,TS3003c,PM Okay . Uh\nPM good afternoon .\nME Good aft...,GPT4,In the third meeting of a group discussing a c...,AMI
4,TS3007b,PM Hello .\nME Hey guys .\nID Hi .\nUI Hi .\nP...,GPT4,The discussion begins with an overview of the ...,AMI
5,ES2014d,"PM So is\nPM Why not save that .\nME No , you'...",GPT4,"In this meeting, the team is discussing criter...",AMI
6,ES2004c,PM I'll wait until you're all um hooked up .\n...,GPT4,"During the meeting, the group discusses potent...",AMI
7,TS3003a,PM So uh\nPM good morning .\nUI Morning .\nME ...,GPT4,"The project manager ""Bart"" leads a meeting inv...",AMI
8,IS1009b,"PM Okay , is everybody ready ?\nID Yeah ?\nME ...",GPT4,The conversation started with the Project Mana...,AMI
9,ES2014a,"PM Right , so\nPM start of the first meeting ....",GPT4,"In their first meeting, project leader Alastai...",AMI


In [86]:
train_data

[{'case_id': 'TS3010c',
  'article_lines': ['PM Okay .',
   "PM Well , let's start .",
   'PM What are we doing ?',
   'PM Oops .',
   'UI Hmm .',
   'ID Ah , pinball .',
   'PM Okay . Okay . Not doing .',
   'ME Mm . Ah . Hey . Ah .',
   'PM Uh',
   'UI Oh .',
   'ME Now I have my screen back too .',
   'PM Very good .',
   'PM Okay .',
   'ME Yeah .',
   'PM we have presentations .',
   "PM So first , it's your turn .",
   'UI Mine . Oh , great .',
   "PM Yeah . Isn't it amazing .",
   'ID Huh .',
   'ME Yeah .',
   'ME Very interesting .',
   'UI Uh',
   'PM Industrial Designer .',
   'PM Interface concept .',
   'UI Yes , well uh',
   "UI let's uh talk about the interface uh concept .",
   "UI Uh , first I'll uh I'll uh discuss the buttons we just chose ,",
   'UI uh show you some samples , uh',
   'UI uh discuss some colours and design maybe , already .',
   'UI And uh my personal preferences .',
   'UI Well we chose the power button to switch the television on and off .',
   'UI 