In [42]:
import re, string
from random import sample

import nltk
import pandas as pd
import spacy

from GPTModel import GPTInference
nlp = spacy.load("en_core_web_sm")

In [43]:


def mask_all_errtypes(text):
    """Produce one masked copy of ``text`` per maskable token.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is maskable when it falls into one of these error types (first match wins):

    * ``'subjobj'``      - its dependency label contains ``'subj'`` or ``'obj'``
    * ``'predicate'``    - its part of speech is ``VERB``
    * ``'circumstance'`` - its part of speech is ``ADV`` or ``ADP``
    * ``'coreference'``  - its part of speech is ``PRON``

    Returns a list of ``(masked_text, original_token_text, error_type)``
    tuples. ``masked_text`` is the token texts joined by single spaces with the
    selected token replaced by ``'<BLANK>'``. Results are grouped by error
    type, in the order each type is first encountered in the text.
    """
    pos_to_type = {
        'VERB': 'predicate',
        'ADV': 'circumstance',
        'ADP': 'circumstance',
        'PRON': 'coreference',
    }

    tokens = list(nlp(text))
    words = [tok.text for tok in tokens]

    # error type -> token positions; dict insertion order fixes the output order.
    positions_by_type = {}
    for position, tok in enumerate(tokens):
        if 'subj' in tok.dep_ or 'obj' in tok.dep_:
            label = 'subjobj'
        else:
            label = pos_to_type.get(tok.pos_)
            if label is None:
                continue
        positions_by_type.setdefault(label, []).append(position)

    return [
        (' '.join(words[:position] + ['<BLANK>'] + words[position + 1:]), words[position], label)
        for label, positions in positions_by_type.items()
        for position in positions
    ]


def make_masked_sentences(atomic_facts):
    """Mask every atomic fact and bucket the results by error type.

    Each fact is passed to ``mask_all_errtypes``. The returned dict maps an
    error type to a list of ``(atomic_fact, masked_sentence, answer, err_type)``
    tuples, in the order they were produced.
    """
    grouped = {}
    for fact in atomic_facts:
        for masked_sentence, answer, err_type in mask_all_errtypes(fact):
            record = (fact, masked_sentence, answer, err_type)
            grouped.setdefault(err_type, []).append(record)
    return grouped


import string 
import nltk
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' corpus, then load the English stop-word list.
# NOTE(review): stop_words is not referenced by any cell visible here - confirm it is still needed.
nltk.download('stopwords')
stop_words = stopwords.words('english')


def make_corrupted_summary(gpt_model_corruptor, dlg, atomic_fact_blank, answer, err_type, facts,
                           sentence_checker=None, nli_model=None):
    """Fill the blank of a masked fact with model-proposed alternatives and keep
    the ones that are not entailed by the dialogue.

    Args:
        gpt_model_corruptor: model handle forwarded to ``get_predicate_answer`` /
            ``get_generic_answer`` to propose replacement words.
        dlg: the source dialogue text.
        atomic_fact_blank: the fact with one token replaced by ``'<BLANK>'``.
        answer: the original token that was masked out.
        err_type: error type of the masked token; ``'predicate'`` selects the
            predicate-specific prompt, anything else the generic one.
        facts: atomic facts forwarded to the helper prompts.
        sentence_checker: optional model handle for ``check_sentence_validity``.
            Defaults to the notebook-level ``gpt_model_sentence_checker``.
        nli_model: optional model handle for ``check_entailment``.
            Defaults to the notebook-level ``gpt_model_nli``.

    Returns:
        A list of ``(corrected_summary, replaced_word)`` tuples, one per kept
        candidate.
    """
    max_options = 8

    # Fall back to the notebook-level models so existing six-argument calls keep working.
    if sentence_checker is None:
        sentence_checker = gpt_model_sentence_checker
    if nli_model is None:
        nli_model = gpt_model_nli

    if err_type == 'predicate':
        corrupted_answer = get_predicate_answer(gpt_model_corruptor, dlg, facts, atomic_fact_blank, answer)
    else:
        corrupted_answer = get_generic_answer(gpt_model_corruptor, dlg, facts, atomic_fact_blank, answer)

    # The helper's answer is treated as a comma-separated list. Strip all digits
    # (including 0, so "10" does not leave a stray "0") and drop empty entries
    # before sampling, so blanks never use up one of the sampled slots.
    options = [re.sub('[0-9]', '', each).strip() for each in corrupted_answer.split(',')]
    options = [each for each in options if each]
    if len(options) >= max_options:
        options = sample(options, max_options)

    blank_lower = atomic_fact_blank.lower()
    corrupted_err_type = []
    for option in options:
        corrupted_summary = atomic_fact_blank.replace('<BLANK>', option)
        corrected_summary = check_sentence_validity(sentence_checker, corrupted_summary)

        corrected_tokens = [each.strip(string.punctuation) for each in corrected_summary.split(' ')]

        # Words of the corrected sentence that do not occur in the masked fact.
        # This is a substring test against the whole masked fact, not a
        # token-level comparison.
        replaced_word = ' '.join(each for each in corrected_tokens if each.lower() not in blank_lower)
        corrected_summary = ' '.join(corrected_tokens)

        # NOTE(review): only the literal answer 'no' is treated as "not entailed";
        # confirm check_entailment returns exactly that string.
        entailment = check_entailment(nli_model, dlg, facts, corrected_summary)

        if entailment == 'no' and replaced_word in corrected_summary:
            corrupted_err_type.append((corrected_summary, replaced_word))
    return corrupted_err_type

In [98]:



def get_atomic_facts_gpt(gpt_model_atomic, model, text, text_type):
    """Ask the GPT wrapper to rewrite ``text`` as a list of facts.

    Args:
        gpt_model_atomic: object exposing ``get_chatgpt_response(prompt, model=...)``.
        model: model identifier forwarded as the ``model`` keyword.
        text: the text to convert.
        text_type: what ``text`` is (e.g. ``'dialogue'``); used both in the
            instruction and, capitalised, as the label in front of the text.

    Returns:
        Whatever ``get_chatgpt_response`` returns for the built prompt.
    """
    instr = f'Convert the {text_type} into facts without adding any unsupported details.'
    # Label the payload with the actual text type instead of a fixed "Dialogue",
    # so the instruction and the label agree for non-dialogue inputs.
    prompt = f'{instr}\n{text_type.capitalize()}: {text}'
    print(prompt)
    return gpt_model_atomic.get_chatgpt_response(prompt, model=model)
    
class SyntheticPrompt:
    """Bundles the GPT handles and helpers used to build synthetic corruptions.

    One ``GPTInference`` instance is created per role: corruptor, sentence
    checker, atomic-fact extractor and NLI.
    """

    # Leading list marker such as "1.", "2)", "(3)" or "4:" followed by
    # whitespace or end of string. Digits elsewhere in a fact are left alone.
    _ENUMERATION_PREFIX = re.compile(r'^\s*\(?\d+[.):]\)?(?:\s+|$)')

    def __init__(self):
        self.gpt_model_corruptor = GPTInference()
        self.gpt_model_sentence_checker = GPTInference()
        self.gpt_model_atomic = GPTInference()
        self.gpt_model_nli = GPTInference()

    def make_masked_sentences(self, atomic_facts):
        """Mask every atomic fact and group the results by error type.

        Returns a dict mapping error type to a list of
        ``(atomic_fact, masked_sentence, answer, err_type)`` tuples as produced
        by ``mask_all_errtypes``.
        """
        masked_atomic_facts_map = {}
        for atomic_fact in atomic_facts:
            for sent, ans, err_type in mask_all_errtypes(atomic_fact):
                masked_atomic_facts_map.setdefault(err_type, []).append((atomic_fact, sent, ans, err_type))
        return masked_atomic_facts_map

    def get_atomic_facts(self, gpt_model_type, dlg):
        """Extract atomic facts from a dialogue as a list of clean sentences.

        The GPT response is split into sentences with ``nltk.sent_tokenize``.
        A leading enumeration marker is removed from each sentence, surrounding
        punctuation and whitespace are stripped, and empty or digit-only
        leftovers are dropped. Digits inside a fact (times, amounts) are kept.
        """
        response = get_atomic_facts_gpt(self.gpt_model_atomic, gpt_model_type, dlg, 'dialogue')
        atomic_facts = []
        for sentence in nltk.sent_tokenize(response):
            fact = self._ENUMERATION_PREFIX.sub('', sentence)
            fact = fact.strip(string.punctuation).strip()
            # A bare list number with no punctuation ("2") is not a fact.
            if fact and not fact.isdigit():
                atomic_facts.append(fact)
        return atomic_facts
        

In [99]:
# Read the fine-grained dialogue annotation CSV and preview its first row.
# NOTE(review): the path is absolute and machine-specific; consider a configurable data directory.
df = pd.read_csv('/home/sanjana/factual_evaluation_source_based/datasets/sota_annotations/dialogue_finegrained_aggrefact.csv')
df.head(1)

Unnamed: 0.1,Unnamed: 0,Dialogue,Model,Summary,origin,Annotations,Reference_Summary
0,0,"Natacha: hi, i can come and pick you up at the...",gpt3_finetune,Charles will probably arrive at the train stat...,SAMSum,{'Charles will probably arrive at the train st...,Charles has just landed and he will be at RER ...


In [100]:
# Select the row at position 13 as a one-row DataFrame and pull out its fields.
row = df.iloc[[13]]
dialogue = row['Dialogue'].values[0]
summary = row['Summary'].values[0]
# NOTE(review): eval() executes whatever text is stored in the CSV cell. If the
# column only holds Python literals, ast.literal_eval would be the safer parser - confirm.
annotations = eval(row['Annotations'].values[0])

# Creates the four GPTInference handles used by the generation helpers.
synthetic_gen = SyntheticPrompt()

In [101]:
# Model identifier forwarded to the GPT wrapper; extract atomic facts from the selected dialogue.
gpt_model_type = 'gpt-4-32k-0613'
atomic_facts = synthetic_gen.get_atomic_facts(gpt_model_type, dialogue)

Convert the dialogue into facts without adding any unsupported details.
Dialogue: Alan: <file_photo>
Alan: look what I just found :)
Robert: dude, that's just nasty and you know it :)
Robert: it has no sugar, no taste, and additional cinnamon flavoring
Alan: yeah, I know - that's awesome :)
Robert: you sir have a very strange tastes :P
Alan: well, and I found a perfect company for it <file_photo>
Robert: oh, that's more like it!
Robert: but does the whiskey go well with the cinnamon? flavored whiskey is the worst...
Alan: Actually it does taste surprisingly well. The cinnamon is not overpowering. If you put enough whiskey that is :)
Rob: Lol, thought so :)
Rob: I just wish the brought the old cherry flavor back...
Rob: not the useless no-sugar stuff
Alan: Ah, that is true :)


In [102]:
# Show the extracted atomic facts.
atomic_facts

['Alan found something which he seemed excited about',
 "Robert thought that Alan's finding was unappealing and explicitly mentioned it had no sugar, no taste, and an additional cinnamon flavor",
 'Alan admitted that he is aware of these attributes and finds them appealing',
 "Robert commented on Alan's unique taste preference",
 'Alan found a whisky that he thinks goes well with the item he found',
 'Robert questioned the compatibility of the whisky and cinnamon flavor and expressed his dislike for flavored whisky',
 'Alan reassured Robert that the combination tasted good, with the cinnamon not being overpowering if enough whisky is used',
 'Robert expressed his longing for the old cherry flavor and his dislike for no-sugar options',
 "Alan agreed with Robert's sentiment about the old cherry flavor"]

In [104]:
# Mask only the first atomic fact and show the masked variants grouped by error type.
masked_atomic_facts_map = synthetic_gen.make_masked_sentences(atomic_facts[:1])
masked_atomic_facts_map

{'subjobj': [('Alan found something which he seemed excited about',
   '<BLANK> found something which he seemed excited about',
   'Alan',
   'subjobj'),
  ('Alan found something which he seemed excited about',
   'Alan found <BLANK> which he seemed excited about',
   'something',
   'subjobj'),
  ('Alan found something which he seemed excited about',
   'Alan found something <BLANK> he seemed excited about',
   'which',
   'subjobj'),
  ('Alan found something which he seemed excited about',
   'Alan found something which <BLANK> seemed excited about',
   'he',
   'subjobj')],
 'predicate': [('Alan found something which he seemed excited about',
   'Alan <BLANK> something which he seemed excited about',
   'found',
   'predicate'),
  ('Alan found something which he seemed excited about',
   'Alan found something which he <BLANK> excited about',
   'seemed',
   'predicate')],
 'circumstance': [('Alan found something which he seemed excited about',
   'Alan found something which he seeme