In [184]:
import torch
import json
import re

from collections import OrderedDict

from datasets import load_dataset
from tqdm import tqdm
from data.discovery_con import LABELS

from transformers import AutoModelForCausalLM, AutoTokenizer

In [143]:
# Directory of the checkpoint that the model and tokenizer cells load from.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
model_path = '/home/nlp/apex/experiment/ctrl/'

In [11]:
# Load the 'discovery' configuration through the local loading script
# data/discovery.py; no split is given, and the result is indexed by split name later.
dataset = load_dataset('data/discovery.py', 'discovery')

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


In [12]:
# Training data is restricted to the first 7% of the 'train' split (loaded separately);
# validation and test are taken whole from `dataset`.
discovery_train_ds = load_dataset('data/discovery.py', 'discovery', split='train[:7%]')
discovery_valid_ds = dataset["validation"]
discovery_test_ds = dataset["test"]

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


In [13]:
# Number of examples in the 7% training slice.
len(discovery_train_ds)

109620

In [144]:
# Load the causal language model from the checkpoint directory in model_path.
model = AutoModelForCausalLM.from_pretrained(model_path)

In [31]:
# Load the tokenizer stored alongside the model checkpoint.
# NOTE(review): max_length / padding / return_length / add_special_tokens look like
# call-time encoding options; the tokenizer is later called without them, so
# confirm that passing them to from_pretrained has the intended effect.
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                         max_length=64,
                                         padding='max_length',
                                         return_length=True,
                                         add_special_tokens=True)

In [32]:
# Register '[PAD]', '[CLS]' and '[SEP]' as the tokenizer's pad / cls / sep tokens.
# Only the value of the last call is displayed by the cell.
# NOTE(review): if any of these tokens is new to the vocabulary, the model's
# embedding matrix presumably needs resizing before the token is used — confirm.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
tokenizer.add_special_tokens({'sep_token': '[SEP]'})

1

In [198]:
# Scratch check of the clean-up pattern: every run of characters other than
# ASCII letters, digits and spaces is removed.
re.sub('[^A-Za-z0-9 ]+', '', 'dasdsa\xad dasas2321 }{[]`')

'dasdsa dasas2321 '

In [237]:
class DiscoveryDatasetGenerate():
    """Generate synthetic continuations for Discovery examples with a causal LM.

    The prompt for an example is its discourse marker followed by one of its
    sentences. Three continuations are generated per prompt (one per decoding
    configuration) and reduced to plain alphanumeric text.

    All tokenizer and model access goes through the instances handed to the
    constructor, never through notebook-level globals, so the object keeps
    working when the names ``tokenizer`` / ``model`` are rebound elsewhere.
    """

    def __init__(self, dataset, labels, tokenizer, model, decoding_options):
        """Store the collaborators and put the model on the available device.

        Args:
            dataset: indexable collection of examples; each example is read
                through the keys 'label', 'sentence1' and 'sentence2'.
            labels: lookup from an example's 'label' value to its marker string.
            tokenizer: callable returning a mapping with 'input_ids'; must
                also provide ``decode``.
            model: object providing ``to``, ``eval`` and ``generate``.
            decoding_options: sequence of ``generate`` keyword-argument dicts,
                indexed as 0 = greedy, 1 = beam search, 2 = top-p/top-k
                sampling, 3 = fallback. The fallback must produce exactly two
                sequences because its result is unpacked into two outputs.
        """
        self.dataset = dataset
        self.labels = labels
        self.tokenizer = tokenizer
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = model.to(self.device).eval()
        self.decoding_options = decoding_options
        # Number of prompts whose greedy continuation was too short and got replaced.
        self.fixed_sequences = 0

    def get_sentence_as_context(self, idx, sentence_order):
        """Build and tokenize the prompt ``"<marker> <sentence>"`` for one example.

        Args:
            idx: index of the example in ``self.dataset``.
            sentence_order: key of the sentence used as context (e.g. 'sentence1').

        Returns:
            Tuple ``(tokenized_prompt, marker, prompt_length_in_characters)``.
        """
        example = self.dataset[idx]
        marker = self.labels[example['label']]
        prompt = marker + ' ' + example[sentence_order]
        tokenized_context = self.tokenizer(prompt, return_tensors="pt")
        return tokenized_context, marker, len(prompt)

    def check_model_output(self, output_from_model, original_text_length):
        """Report whether a generated sequence adds fewer than 5 characters to the prompt.

        The sequence is decoded and the first ``original_text_length`` characters
        are skipped. When what remains is shorter than 5 characters the
        ``fixed_sequences`` counter is incremented and a message is printed.

        Returns:
            True when the continuation is considered empty, False otherwise.
        """
        decoded = self.tokenizer.decode(output_from_model.squeeze(0), skip_special_tokens=True)
        if len(decoded[original_text_length:]) < 5:
            self.fixed_sequences += 1
            print("Detected an empty greedy output. Count: ", self.fixed_sequences)
            return True
        return False

    def generate_from_model(self, tokenized_context, original_text_length):
        """Run the three decoding strategies on one tokenized prompt.

        Args:
            tokenized_context: mapping holding the prompt's 'input_ids' tensor.
            original_text_length: character length of the prompt, used to
                detect an empty greedy continuation.

        Returns:
            List ``[greedy_output, beam_output, top_p_k_output]``.
        """
        input_ids = tokenized_context['input_ids'].to(self.device)

        greedy_output = self.model.generate(input_ids=input_ids,
                                            **self.decoding_options[0])
        beam_output = self.model.generate(input_ids=input_ids,
                                          **self.decoding_options[1])
        top_p_k_output = self.model.generate(input_ids=input_ids,
                                             **self.decoding_options[2])

        # When the greedy continuation is (almost) empty, BOTH the greedy and
        # the sampled outputs are replaced by the two sequences returned by the
        # fallback configuration; the beam output is kept as is.
        if self.check_model_output(greedy_output, original_text_length):
            greedy_output, top_p_k_output = self.model.generate(
                input_ids=input_ids, **self.decoding_options[3])

        return [greedy_output, beam_output, top_p_k_output]

    def cleanup_generated_examples(self, outputs, idx, len_context, marker):
        """Turn raw generated sequences into a dict of cleaned option strings.

        Args:
            outputs: generated token-id tensors, one per decoding strategy.
            idx: index of the source example; its 'sentence2' is stored as
                'ground_truth'.
            len_context: number of leading characters to drop from each decoded text.
            marker: marker string removed from the cleaned text.

        Returns:
            Dict with 'ground_truth' and one 'option_<i>' entry per output.
        """
        example = {}
        example['ground_truth'] = self.dataset[idx]['sentence2']

        for i, sample_output in enumerate(outputs):
            decoded = self.tokenizer.decode(sample_output.squeeze(0).tolist(),
                                            skip_special_tokens=True)
            # NOTE(review): the prompt is marker + ' ' + sentence, while the
            # len_context passed by generate_synthetic_options counts only the
            # sentence, so a tail of the prompt is still present after this
            # slice. The cut at the first '.' below presumably relies on that
            # tail ending with the context's final '.' — confirm before changing.
            text = decoded[len_context:]
            if '.' in text:
                text = text[text.index('.'):]  # drop what precedes the first '.'
            text = re.sub('[^A-Za-z0-9 ]+', '', text)  # remove special characters
            text = text.replace(marker, '')  # remove marker
            example['option_' + str(i)] = text

        return example

    def generate_synthetic_options(self, idx, context='sentence1'):
        """Generate and clean the synthetic options for example ``idx``.

        Args:
            idx: index of the example in ``self.dataset``.
            context: key of the sentence used as prompt context.

        Returns:
            The dict produced by ``cleanup_generated_examples``.
        """
        tokenized_context, marker, original_text_length = self.get_sentence_as_context(idx, context)
        len_context = len(self.dataset[idx][context])
        outputs = self.generate_from_model(tokenized_context, original_text_length)
        clean_examples = self.cleanup_generated_examples(outputs, idx, len_context, marker)
        return clean_examples

In [238]:
# Keyword arguments forwarded to `generate`, one dict per decoding strategy.

# Greedy decoding with a repetition penalty.
# NOTE(review): 'temperature' is a sampling parameter and sampling is not
# enabled here — confirm the installed transformers version accepts 0.
decoding_options_0 = {'max_length': 64,
                      'repetition_penalty': 1.2,
                      'temperature': 0}

# Beam search with 5 beams and no repeated bigrams.
decoding_options_1 = {'max_length': 64,
                      'num_beams': 5,
                      'no_repeat_ngram_size': 2,
                      'early_stopping': True}

# Top-k / top-p sampling ('max_length' used to be listed twice in this literal).
decoding_options_2 = {'max_length': 64,
                      'do_sample': True,
                      'top_k': 50,
                      'top_p': 0.95}

# Fallback: wide beam search returning two sequences, which are unpacked into
# the greedy and sampled slots when the greedy continuation is empty.
# NOTE(review): 'temperature' without 'do_sample' — confirm it has any effect.
fallback_decoding = {'max_length': 64,
                     'num_beams': 25,
                     'no_repeat_ngram_size': 2,
                     'num_return_sequences': 2,
                     'temperature': 0.7,
                     'early_stopping': True}

In [239]:
# Order matters: the generator indexes this list as
# 0 = greedy, 1 = beam search, 2 = top-k/top-p sampling, 3 = fallback.
decoding_options = [
    decoding_options_0,
    decoding_options_1,
    decoding_options_2,
    fallback_decoding,
]

In [240]:
# Generator over the 7% training slice, using the four decoding configurations.
discovery_ds = DiscoveryDatasetGenerate(discovery_train_ds, LABELS, tokenizer, model, decoding_options)

In [None]:
# One record per training example: the context sentence, its marker, then the
# ground truth and the generated options returned by the generator.
synthetic_dataset = []

for i in tqdm(range(len(discovery_train_ds))):
    values = discovery_train_ds[i]
    example = {
        'context': values['sentence1'],
        'marker': LABELS[values['label']],
        **discovery_ds.generate_synthetic_options(i),
    }
    synthetic_dataset.append(example)

  0%|          | 4/109620 [00:17<139:42:31,  4.59s/it]

Detected an empty greedy output. Count:  1


  0%|          | 7/109620 [00:37<172:29:53,  5.67s/it]

In [None]:
# Inspect the first generated record.
synthetic_dataset[0]

In [147]:
# NOTE(review): `nlp` is not defined in any visible cell (the recorded output
# shows a TextGenerationPipeline); unless it is defined elsewhere, this cell
# raises NameError on a fresh kernel.
nlp

<transformers.pipelines.text_generation.TextGenerationPipeline at 0x7f250aa96cd0>

In [154]:
# Move the model to the GPU; the trailing ';' suppresses the cell output.
model.cuda();

In [159]:
# NOTE(review): this cell redefines decoding_options_0..2 from the earlier
# options cell, without 'temperature' for greedy decoding and with top_p=0.85
# for sampling — whichever cell ran last wins.

# Greedy decoding with a repetition penalty.
decoding_options_0 = {'max_length': 64,
                      'repetition_penalty': 1.2}

# Beam search with 5 beams and no repeated bigrams.
decoding_options_1 = {'max_length': 64,
                      'num_beams': 5,
                      'no_repeat_ngram_size': 2,
                      'early_stopping': True}

# Top-k / top-p sampling ('max_length' used to be listed twice in this literal).
decoding_options_2 = {'max_length': 64,
                      'do_sample': True,
                      'top_k': 50,
                      'top_p': 0.85}

In [138]:
# Sample prompt used for the manual generation checks.
text = "curiously they continued to dig noting that there were pick marks on the walls of the pit where someone before them had dug out the pit."

In [139]:
def put_on_cuda(tok_output, device='cuda'):
    """Move every tensor of a tokenizer output onto ``device`` (GPU by default).

    Args:
        tok_output: mutable mapping of name -> tensor; it is updated in place.
        device: target passed to ``Tensor.to``. Defaults to 'cuda', which keeps
            the original behaviour; pass 'cpu' (or a ``torch.device``) to run
            without a GPU.

    Returns:
        The same ``tok_output`` object, with its values replaced.
    """
    # Snapshot the items so the mapping is not iterated while being assigned to.
    for key, tensor in list(tok_output.items()):
        tok_output[key] = tensor.to(device)
    return tok_output
# Tokenize the sample prompt and move the resulting tensors to the GPU.
cuda_token = put_on_cuda(tokenizer(text, return_tensors="pt"))

In [169]:
# Decode the first sequence generated for the sample prompt with the
# beam-search options (decoding_options_1).
tokenizer.decode(model.generate(input_ids = cuda_token['input_ids'], **decoding_options_1).tolist()[0], skip_special_tokens=True)

"curiously they continued to dig noting that there were pick marks on the walls of the pit where someone before them had dug out the pit. `` The pit was filled in with earth and there was no sign of anyone having been in it before they filled it in. '' `'was told that the"

In [181]:
# Run the fallback configuration on the sample prompt; the raw result is kept in `k`.
k = model.generate(input_ids = cuda_token['input_ids'], **fallback_decoding)

In [47]:
# Persist the generated records as a single JSON document.
with open('data/synthetic_discovery_1500.json', 'w') as json_file:
    json.dump(synthetic_dataset, json_file)

In [8]:
# Name of the first column of the training split.
discovery_train_ds.column_names[0]

'idx'

In [None]:
# (scratch notes, kept as comments so the cell does not raise NameError)
# verb
# coreference

In [56]:
from transformers import GPT2Config, GPT2Tokenizer, GPT2ForSequenceClassification

In [58]:
# Sequence-classification setup on top of DialoGPT-large, one output per marker label.
# NOTE(review): this rebinds the notebook-level names `tokenizer` and `model`,
# which earlier cells bound to the checkpoint loaded from model_path.
DIALOGPT_CHECKPOINT = 'microsoft/DialoGPT-large'

config = GPT2Config.from_pretrained(DIALOGPT_CHECKPOINT, num_labels=len(LABELS))
tokenizer = GPT2Tokenizer.from_pretrained(DIALOGPT_CHECKPOINT)
model = GPT2ForSequenceClassification.from_pretrained(DIALOGPT_CHECKPOINT, config=config)

Some weights of the model checkpoint at microsoft/DialoGPT-large were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at microsoft/DialoGPT-large and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
# NOTE(review): `example` must come from kernel state not shown here — the
# records built in the generation loop have no 'label' key.
label = torch.tensor(example['label'])

In [54]:
# Tokenize the example's first sentence into PyTorch tensors.
inputs = tokenizer(example['sentence1'], return_tensors="pt")

In [69]:
# Forward pass with the label supplied.
# NOTE(review): presumably outputs[0] is the loss and outputs[1] the logits — confirm.
outputs = model(**inputs, labels=label)

In [73]:
# Index of the highest-scoring entry along dimension 1 of the second model output.
torch.argmax(outputs[1], dim=1)

tensor([82])