In [1]:
import torch
import json
import re

from datasets import load_dataset
from tqdm import tqdm
from data.discovery_con import LABELS

from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# Directory the model and tokenizer are loaded from by the from_pretrained calls below.
model_path = "/home/nlp/apex/experiment/ctrl/"

In [3]:
# Load every split of the 'discovery' configuration via the local loading script.
dataset = load_dataset(path='data/discovery.py', name='discovery')

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


In [277]:
# Validation and test splits are taken from the `dataset` object loaded earlier.
discovery_valid_ds = dataset["validation"]
discovery_test_ds = dataset["test"]

# The training portion is loaded separately as the 5%-40% slice of the train split.
discovery_train_ds = load_dataset(
    'data/discovery.py',
    'discovery',
    split='train[5%:40%]',
)

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


In [278]:
# Number of rows in the training slice selected above.
len(discovery_train_ds)

548100

In [281]:
# Row count of the first 7% of the train split, for comparison with the slice used above.
len(load_dataset('data/discovery.py', 'discovery', split='train[:7%]'))

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


109620

In [5]:
# Load the causal language model from the checkpoint directory in `model_path`.
model = AutoModelForCausalLM.from_pretrained(model_path)

In [212]:
# Load the tokenizer stored alongside the model in `model_path`.
# NOTE(review): max_length / padding / return_length / add_special_tokens look
# like encode-time options; confirm they have any effect when passed to
# from_pretrained rather than to the tokenizer call itself.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    add_special_tokens=True,
    return_length=True,
    padding='max_length',
    max_length=64,
)

In [163]:
# Register [PAD], [CLS] and [SEP] as the tokenizer's pad / cls / sep tokens.
# The value displayed by the cell is the return value of the last call only.
# NOTE(review): no model.resize_token_embeddings(...) call is visible after
# these additions -- confirm the embedding matrix is resized before any of the
# newly added token ids are fed to the model.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
tokenizer.add_special_tokens({'sep_token': '[SEP]'})

1

In [268]:
class DiscoveryDatasetGenerate():
    """Generate synthetic continuations for dataset rows with a causal LM.

    For a given row, the discourse marker (looked up in ``labels`` through the
    row's 'label' value) is prepended to one of the row's sentences, the prompt
    is passed to ``model.generate`` under three decoding configurations, and
    the decoded outputs are cleaned into plain strings.

    Args:
        dataset: indexable collection of rows; each row is read through the
            keys 'label', 'sentence1' and 'sentence2'.
        labels: sequence mapping a row's 'label' value to its marker string.
        tokenizer: callable tokenizer that also provides ``decode``.
        model: model providing ``to(device)`` and ``generate``.
        decoding_options: sequence of three keyword dicts for ``generate``,
            used positionally for the greedy, beam and top-p/top-k outputs.
    """

    def __init__(self, dataset, labels, tokenizer, model, decoding_options):
        self.dataset = dataset
        self.labels = labels
        self.tokenizer = tokenizer
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = model.to(self.device)
        self.decoding_options = decoding_options

    def get_sentence_as_context(self, idx, sentence_order):
        """Build and tokenize the prompt "<marker> <sentence>" for row ``idx``.

        Args:
            idx: index of the row in ``self.dataset``.
            sentence_order: key of the sentence to use as context
                (e.g. 'sentence1').

        Returns:
            Tuple ``(tokenized_context, marker, original_text_length)`` where
            ``original_text_length`` is the character length of the prompt.
        """
        row = self.dataset[idx]
        marker = self.labels[row['label']]
        # Build the prompt once so the tokenized text and the reported length
        # cannot drift apart.
        prompt = marker + ' ' + row[sentence_order]
        tokenized_context = self.tokenizer(prompt, return_tensors="pt")
        return tokenized_context, marker, len(prompt)

    def check_model_output(self, output_from_model, original_text_length):
        """Return True when the decoded continuation is shorter than 5 characters.

        Args:
            output_from_model: generated ids; ``squeeze(0)`` is applied before
                decoding.
            original_text_length: number of leading characters of the decoded
                text that belong to the prompt.
        """
        # Use the instance's tokenizer rather than a notebook-level global, so
        # rebinding `tokenizer` in a later cell cannot change this object's
        # behaviour.
        decoded = self.tokenizer.decode(output_from_model.squeeze(0), skip_special_tokens=True)
        if len(decoded[original_text_length:]) < 5:
            print("detected an empty greedy output")
            return True
        return False

    def generate_from_model(self, tokenized_context, original_text_length):
        """Run the three decoding configurations on the tokenized prompt.

        Args:
            tokenized_context: mapping with an 'input_ids' tensor.
            original_text_length: character length of the prompt, used to
                detect an (almost) empty greedy continuation.

        Returns:
            List ``[greedy_output, beam_output, top_p_k_output]``.
        """
        input_ids = tokenized_context['input_ids'].to(self.device)

        # Generate with the model owned by this instance (already moved to
        # self.device in __init__), not with whatever `model` currently names
        # at notebook level.
        greedy_output = self.model.generate(input_ids=input_ids,
                                            **self.decoding_options[0])
        beam_output = self.model.generate(input_ids=input_ids,
                                          **self.decoding_options[1])
        top_p_k_output = self.model.generate(input_ids=input_ids,
                                             **self.decoding_options[2])

        # Sometimes the greedy output is empty, so replace it with a fresh
        # top-p/top-k generation.
        if self.check_model_output(greedy_output, original_text_length):
            greedy_output = self.model.generate(input_ids=input_ids,
                                                **self.decoding_options[2])

        return [greedy_output, beam_output, top_p_k_output]

    def cleanup_generated_examples(self, outputs, idx, len_context, marker):
        """Decode and clean the generated outputs for row ``idx``.

        Args:
            outputs: iterable of generated id tensors (``squeeze(0).tolist()``
                is applied to each before decoding).
            idx: index of the row in ``self.dataset``.
            len_context: number of leading characters dropped from each
                decoded text.
            marker: marker string removed from the decoded text.

        Returns:
            Dict with 'ground_truth' (the row's 'sentence2') and one
            'option_<i>' entry per element of ``outputs``.
        """
        example = {'ground_truth': self.dataset[idx]['sentence2']}
        for i, sample_output in enumerate(outputs):
            decoded = self.tokenizer.decode(sample_output.squeeze(0).tolist(),
                                            skip_special_tokens=True)
            cleaned_text = decoded[len_context:]
            cleaned_text = (cleaned_text.replace('\n', '')
                                        .replace('\xa0', '')
                                        .replace('\\', '')
                                        .replace(marker, '')
                                        .strip())

            # Drop everything up to and including the first period, which
            # removes what is left of the context sentence. When no period is
            # present the text is kept whole instead of raising ValueError.
            first_period = cleaned_text.find('.')
            if first_period != -1:
                cleaned_text = cleaned_text[first_period + 1:]
            cleaned_text = cleaned_text.replace('@@', '').strip()

            # Strip stray punctuation left over from generation.
            for unwanted in ('`', '"', ']', '-'):
                cleaned_text = cleaned_text.replace(unwanted, '').strip()

            example['option_' + str(i)] = cleaned_text

        return example

    def generate_synthetic_options(self, idx, context='sentence1'):
        """Generate and clean the three synthetic options for row ``idx``.

        Args:
            idx: index of the row in ``self.dataset``.
            context: key of the sentence used as the prompt context.

        Returns:
            The dict produced by ``cleanup_generated_examples``.
        """
        tokenized_context, marker, original_text_length = self.get_sentence_as_context(idx, context)
        len_context = len(self.dataset[idx][context])
        outputs = self.generate_from_model(tokenized_context, original_text_length)
        return self.cleanup_generated_examples(outputs, idx, len_context, marker)

In [269]:
# Keyword arguments for model.generate, one dict per decoding strategy.

# Strategy 0: no sampling or beam options are set.
decoding_options_0 = {
    'max_length': 64,
    'repetition_penalty': 1.2,
    'temperature': 0,
}

# Strategy 1: beam search with 5 beams, blocking repeated bigrams.
decoding_options_1 = {
    'max_length': 64,
    'num_beams': 5,
    'no_repeat_ngram_size': 2,
    'early_stopping': True,
}

# Strategy 2: sampling restricted by top-k and top-p.
# The literal used to declare 'max_length' twice (64, then 50). Python keeps
# the last value, so 50 was the effective setting and is the one kept here.
# NOTE(review): the other two strategies use 64 -- confirm 50 is intended.
decoding_options_2 = {
    'max_length': 50,
    'do_sample': True,
    'top_k': 50,
    'top_p': 0.95,
}

In [270]:
# Order matters: the generator reads these positionally (index 0, 1, 2).
decoding_options = [
    decoding_options_0,
    decoding_options_1,
    decoding_options_2,
]

In [271]:
# Wire the training slice, marker labels, tokenizer, model and decoding
# configurations into a single generator object.
discovery_ds = DiscoveryDatasetGenerate(
    dataset=discovery_train_ds,
    labels=LABELS,
    tokenizer=tokenizer,
    model=model,
    decoding_options=decoding_options,
)

In [272]:
# Build one record per training row: the context sentence, its marker, and the
# ground truth plus generated options returned by the generator.
synthetic_dataset = []

for i in tqdm(range(len(discovery_train_ds))):
    values = discovery_train_ds[i]
    example = {
        'context': values['sentence1'],
        'marker': LABELS[values['label']],
    }
    generated_options = discovery_ds.generate_synthetic_options(i)
    example.update(generated_options)
    synthetic_dataset.append(example)

  0%|          | 5/1566000 [00:20<1836:19:38,  4.22s/it]

detected an empty greedy output


  0%|          | 12/1566000 [00:58<2111:35:02,  4.85s/it]


KeyboardInterrupt: 

In [274]:
# Inspect one generated record (index 6) to eyeball the cleaned options.
synthetic_dataset[6]

{'context': 'The first test contains all the information necessary to understand the intent and implementation of the test.',
 'marker': 'on the other hand',
 'ground_truth': 'The second test pulled the creation of the validation instance and the stub into a setup method.',
 'option_0': '.  A test is a set of instructions that are executed by an application when it runs its tests.  To be able to run your own tests you need to have',
 'option_1': 'The second test is used to determine whether or not the test has been properly implemented. the purpose of two tests is the same :. is a test that is designed to measure the effectiveness of application of a',
 'option_2': 'A second test is used to make sure that the system is working as designed, and does not create any errors. ['}

In [174]:
# Probe prompt in the "<marker> <sentence>" form: the marker 'slowly' followed by the sentence.
text = "slowly Although I longed for his touch, I moved from him, standing."

In [188]:
# Character length of the probe prompt.
len(text)

67

In [175]:
# Tokenize the probe prompt and place the ids on the same device as the model.
# A hard-coded 'cuda' fails on CPU-only hosts and can disagree with the device
# the model was actually moved to.
input_ids = tokenizer(text, return_tensors="pt")['input_ids'].to(model.device)

In [181]:
# Generate from the probe prompt with the first decoding configuration.
output = model.generate(input_ids=input_ids, **decoding_options_0)

In [182]:
# Shape of the generated id tensor.
output.shape

torch.Size([1, 64])

In [190]:
# Show only what was generated after the prompt. The offset is derived from
# the prompt itself, so it stays correct if `text` is edited.
tokenizer.decode(output[0], skip_special_tokens=True)[len(text):]

''

In [83]:
# Persist the generated records as a single JSON array.
with open('data/synthetic_discovery_100.json', 'w') as json_file:
    json.dump(obj=synthetic_dataset, fp=json_file)

In [8]:
# Name of the first column of the training slice.
discovery_train_ds.column_names[0]

'idx'

In [None]:
# NOTE(review): unexecuted scratch notes, kept as comments so that running the
# notebook top to bottom does not raise NameError on these bare names.
# verb
# coreference

In [56]:
from transformers import GPT2Config, GPT2Tokenizer, GPT2ForSequenceClassification

In [58]:
# Sequence-classification setup with one output label per entry in LABELS.
# Note that this rebinds the notebook-level names `tokenizer` and `model`
# that earlier cells assigned to the objects loaded from `model_path`.
dialogpt_checkpoint = 'microsoft/DialoGPT-large'

config = GPT2Config.from_pretrained(dialogpt_checkpoint, num_labels=len(LABELS))
tokenizer = GPT2Tokenizer.from_pretrained(dialogpt_checkpoint)
model = GPT2ForSequenceClassification.from_pretrained(dialogpt_checkpoint, config=config)

Some weights of the model checkpoint at microsoft/DialoGPT-large were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at microsoft/DialoGPT-large and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
# Wrap the example's label value in a tensor to pass to the model as `labels`.
# NOTE(review): the `example` dicts built by the generation loop carry no
# 'label' key, so this presumably relies on an `example` bound by a cell that
# is not shown -- confirm before re-running from a fresh kernel.
label = torch.tensor(example['label'])

In [54]:
# Tokenize the example's first sentence into PyTorch tensors.
inputs = tokenizer(example['sentence1'], return_tensors="pt")

In [69]:
# Forward pass with the label passed alongside the tokenized inputs.
outputs = model(**inputs, labels=label)

In [73]:
# Index of the highest-scoring entry along dimension 1 of the second output element.
outputs[1].argmax(dim=1)

tensor([82])