In [1]:
import torch
import json
import re

from datasets import load_dataset
from tqdm import tqdm
from data.discovery_con import LABELS

from transformers import AutoModelForCausalLM, AutoTokenizer

In [43]:
model_path = '/home/nlp/apex/experiment/ctrl/checkpoint-8000'

In [31]:
dataset = load_dataset('data/discovery.py', 'discovery')

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


In [32]:
discovery_train_ds = load_dataset('data/discovery.py', 'discovery', split='train[:7%]')
discovery_valid_ds = dataset["validation"]
discovery_test_ds = dataset["test"]

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


In [33]:
len(discovery_train_ds)

109620

In [44]:
model = AutoModelForCausalLM.from_pretrained(model_path)

In [45]:
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                         max_length=64,
                                         padding='max_length',
                                         return_length=True,
                                         add_special_tokens=True)

In [46]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
tokenizer.add_special_tokens({'sep_token': '[SEP]'})

1

In [52]:
class DiscoveryDatasetGenerate():
    
    def __init__(self, dataset, labels, tokenizer, model, decoding_options):
        self.dataset = dataset
        self.labels = labels
        self.tokenizer = tokenizer
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = model.to(self.device)
        self.decoding_options = decoding_options
        
    def get_sentence_as_context(self, idx, sentence_order):
        context = self.dataset[idx]
        tokenized_context = self.tokenizer(self.labels[context['label']] + ' ' + context[sentence_order], 
                                           return_tensors="pt")
        original_text_length = len(self.labels[context['label']] + ' ' + context[sentence_order])
        return tokenized_context, self.labels[context['label']], original_text_length
    
    def check_model_output(self, output_from_model, original_text_length):
        if len(tokenizer.decode(output_from_model.squeeze(0), skip_special_tokens=True)[original_text_length:])<5:
            print("detected an empty greedy output")
            return True
        return False
    
    def generate_from_model(self, tokenized_context, original_text_length):
        input_ids = tokenized_context['input_ids'].to(self.device)
        
        outputs = []
        
        greedy_output = model.generate(input_ids=input_ids, 
                                 **self.decoding_options[0])
        beam_output = model.generate(input_ids=input_ids, 
                                 **self.decoding_options[1])
        top_p_k_output = model.generate(input_ids=input_ids, 
                                 **self.decoding_options[2])
 
        # Sometimes the greedy output is empty, so replace it with top-p-k
        if self.check_model_output(greedy_output, original_text_length):
            greedy_output = model.generate(input_ids=input_ids, 
                                **self.decoding_options[2])
        
        
        outputs.append(greedy_output)
        outputs.append(beam_output)
        outputs.append(top_p_k_output)
        
        return outputs
    
    def cleanup_generated_examples(self, outputs, idx, len_context, marker):
        example = {}
        example['ground_truth'] = self.dataset[idx]['sentence2']
        for i, sample_output in enumerate(outputs):
            uncleaned_text = tokenizer.decode(sample_output.squeeze(0).tolist(), skip_special_tokens=True)[len_context:]
            cleaned_text = uncleaned_text.replace('\n', '').replace('\xa0', '').replace('\\', '').replace(marker, '').strip()
            
            if '.' in cleaned_text:
                prev_input_end_index = cleaned_text.index('.') # to remove context in generated output
                cleaned_text = cleaned_text[prev_input_end_index:]

            #           if '.' or '@@' in cleaned_text:
            cleaned_text = .replace('.','', 1).replace('@@','').strip()
            
            if '`' in cleaned_text:
                cleaned_text = cleaned_text.replace('`', '').strip()
            if '"' in cleaned_text:
                cleaned_text = cleaned_text.replace('"', '').strip()
            if ']' in cleaned_text:
                cleaned_text = cleaned_text.replace(']', '').strip()
            if '-' in cleaned_text:
                cleaned_text = cleaned_text.replace('-', '').strip()
            if ')' in cleaned_text:
                cleaned_text = cleaned_text.replace(')', '').strip()
            if '}' in cleaned_text:
                cleaned_text = cleaned_text.replace('}', '').strip()
                
            example['option_' + str(i)] = cleaned_text
        
        return example
    
    def generate_synthetic_options(self, idx, context='sentence1'):
        tokenized_context, marker, original_text_length = self.get_sentence_as_context(idx, context)
        len_context = len(self.dataset[idx][context])
        outputs = self.generate_from_model(tokenized_context, original_text_length)
        clean_examples = self.cleanup_generated_examples(outputs, idx, len_context, marker)        
        return clean_examples

SyntaxError: invalid syntax (<ipython-input-52-7be6b9f7f762>, line 60)

In [48]:
decoding_options_0 = {'max_length': 64,
                    'repetition_penalty': 1.2,
                    'temperature': 0}

decoding_options_1 = {'max_length': 64,
                      'num_beams':5, 
                      'no_repeat_ngram_size':2, 
                      'early_stopping':True}

decoding_options_2 = {'max_length': 64,
                    'do_sample':True, 
                    'max_length':50, 
                    'top_k':50, 
                    'top_p':0.95}

In [49]:
decoding_options = []
decoding_options.append(decoding_options_0)
decoding_options.append(decoding_options_1)
decoding_options.append(decoding_options_2)

In [50]:
discovery_ds = DiscoveryDatasetGenerate(discovery_train_ds, LABELS, tokenizer, model, decoding_options)

In [51]:
synthetic_dataset = []

for i in tqdm(range(len(discovery_train_ds))):
    example = {}
    values = discovery_train_ds[i]
    example['context'] = values['sentence1']
    example['marker'] = LABELS[values['label']]
    generated_options = discovery_ds.generate_synthetic_options(i)
    example.update(generated_options)
    synthetic_dataset.append(example)

  0%|          | 2/109620 [00:13<225:52:39,  7.42s/it]

detected an empty greedy output


  0%|          | 3/109620 [00:24<251:55:15,  8.27s/it]

detected an empty greedy output


  0%|          | 4/109620 [00:33<263:01:46,  8.64s/it]

detected an empty greedy output


  0%|          | 7/109620 [01:04<293:09:03,  9.63s/it]

detected an empty greedy output


  0%|          | 15/109620 [02:17<287:26:56,  9.44s/it]

detected an empty greedy output


  0%|          | 21/109620 [03:16<294:47:23,  9.68s/it]

detected an empty greedy output


  0%|          | 24/109620 [03:46<304:32:23, 10.00s/it]

detected an empty greedy output


  0%|          | 26/109620 [04:06<306:54:39, 10.08s/it]

detected an empty greedy output


  0%|          | 27/109620 [04:13<275:52:09,  9.06s/it]

detected an empty greedy output


  0%|          | 29/109620 [04:33<282:22:59,  9.28s/it]

detected an empty greedy output


  0%|          | 36/109620 [05:57<302:00:13,  9.92s/it]


ValueError: substring not found

In [42]:
synthetic_dataset[4]

{'context': 'They continued to dig noting that there were pick marks on the walls of the pit where someone before them had dug out the pit.',
 'marker': 'curiously',
 'ground_truth': 'Every ten feet they found a layer of logs.',
 'option_0': 'The two men looked at each other and then they both turned their heads back towards the cave. I was surprised by this',
 'option_1': 'S they noticed that the hole in the ceiling was not filled in with the way it had been when they had first entered the room. The only thing that was missing from the room was',
 'option_2': 'It is unclear who built the structures. I guess it must'}

In [106]:
text = "on the other hand should we use plastic"

In [110]:
def put_on_cuda(tok_output):
    for k, v in tok_output.items():
        tok_output[k] = v.cuda()
    return tok_output
cuda_token = put_on_cuda(tokenizer(text, return_tensors="pt"))

In [113]:
tokenizer.decode(model.generate(input_ids = cuda_token['input_ids'], **decoding_options_2).tolist()[0], skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'on the other hand should we use plastic any The When When other as other the other hand up other hand other hand other hand other the other hand up other hand up other hand other the other the other the other the other the other as other the other'

In [83]:
with open('data/synthetic_discovery_100.json', 'w') as fout:
    json.dump(synthetic_dataset , fout)

In [8]:
discovery_train_ds.column_names[0]

'idx'

In [None]:
verb
coreference

In [56]:
from transformers import GPT2Config, GPT2Tokenizer, GPT2ForSequenceClassification

In [58]:
config = GPT2Config.from_pretrained('microsoft/DialoGPT-large', num_labels = len(LABELS))
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-large')
model = GPT2ForSequenceClassification.from_pretrained('microsoft/DialoGPT-large', config=config)

Some weights of the model checkpoint at microsoft/DialoGPT-large were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at microsoft/DialoGPT-large and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
label = torch.tensor(example['label'])

In [54]:
inputs = tokenizer(example['sentence1'], return_tensors="pt")

In [69]:
outputs = model(**inputs, labels=label)

In [73]:
torch.argmax(outputs[1], axis=1)

tensor([82])