In [1]:
import torch
import json
import re

from collections import OrderedDict

from datasets import load_dataset
from tqdm import tqdm
from data.discovery_con import LABELS

from transformers import AutoModelForCausalLM, AutoTokenizer
from generate import DatasetGenerate

In [2]:
# Checkpoint directory; the model and the tokenizer are both loaded from it below.
model_path = "/home/nlp/apex/experiment/ctrl_discovery_1/"

In [3]:
import os

# Expose only GPU 0 to CUDA (devices ordered by PCI bus id) and disable
# Weights & Biases reporting for this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "0",
    "WANDB_DISABLED": "true",
})

In [4]:
# dataset = load_dataset('data/discovery.py', 'discovery')

In [5]:
# Three splits of the 'discovery' dataset (config 'discovery'): the first 2% of
# train, plus the full validation and test splits.
discovery_train_ds = load_dataset(path='discovery', name='discovery', split='train[:2%]')  #7
discovery_valid_ds = load_dataset(path='discovery', name='discovery', split='validation')
discovery_test_ds = load_dataset(path='discovery', name='discovery', split='test')

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)
Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)
Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


In [6]:
# Load the causal language model from the checkpoint directory in model_path.
model = AutoModelForCausalLM.from_pretrained(model_path)

In [7]:
# Load the tokenizer from the same checkpoint directory as the model.
# NOTE(review): max_length, padding, return_length and add_special_tokens are
# normally arguments of the tokenizer call itself rather than of from_pretrained
# -- confirm they take effect here; the tokenizer(...) call further down in this
# notebook passes only return_tensors.
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                         max_length=96,
                                         padding='max_length',
                                         return_length=True,
                                         add_special_tokens=True)

In [8]:
# Register '[PAD]' as the tokenizer's padding token. The call returns the number
# of tokens newly added to the vocabulary; the recorded output of this cell is 0.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# tokenizer.add_special_tokens({'cls_token': '[CLS]'})
# tokenizer.add_special_tokens({'sep_token': '[SEP]'})

0

In [9]:
# Keyword-argument sets for text generation. They are gathered into the
# `decoding_options` list and passed to DatasetGenerate further down.

# Option 0: no beams and no sampling requested, only a repetition penalty.
# The former `'temperature': 0` entry was removed: temperature only rescales
# logits when sampling, it must be strictly positive, and the later
# redefinition of this dict in this notebook omits it as well.
decoding_options_0 = {'max_length': 64,
                      'repetition_penalty': 1.2}

# Option 1: beam search with 5 beams, forbidding repeated 2-grams.
decoding_options_1 = {'max_length': 64,
                      'num_beams': 5,
                      'no_repeat_ngram_size': 2,
                      'early_stopping': True}

# Option 2: sampling restricted by top-k and top-p.
# ('max_length' used to appear twice in this literal; the duplicate key is gone.)
decoding_options_2 = {'max_length': 64,
                      'do_sample': True,
                      'top_k': 50,
                      'top_p': 0.95}

# Fallback: 25 beams, asking for 2 returned sequences.
# NOTE(review): 'temperature' is set here without 'do_sample' -- confirm whether
# it is meant to have an effect for this configuration.
fallback_decoding = {'max_length': 64,
                     'num_beams': 25,
                     'no_repeat_ngram_size': 2,
                     'num_return_sequences': 2,
                     'temperature': 0.7,
                     'early_stopping': True}

In [10]:
# All four decoding configurations in one list, with the fallback last.
decoding_options = [
    decoding_options_0,
    decoding_options_1,
    decoding_options_2,
    fallback_decoding,
]

In [11]:
# Bundle the validation split with the model, tokenizer, LABELS and the decoding
# configurations; generate_synthetic_options is called on this object below.
discovery_ds = DatasetGenerate(discovery_valid_ds, model, tokenizer, LABELS, decoding_options)

In [12]:
# One record per validation example: 'context' is the example's sentence1,
# 'marker' is its label looked up in LABELS, and the remaining keys come from
# generate_synthetic_options. Both names below refer to the same list.
synthetic_ctrl_2_dataset = generated_ds = []

for i in tqdm(range(len(discovery_valid_ds))):
    values = discovery_valid_ds[i]
    example = {
        'context': values['sentence1'],
        'marker': LABELS[values['label']],
    }
    generated_options = discovery_ds.generate_synthetic_options(i)
    example.update(generated_options)
    generated_ds.append(example)

  0%|          | 17/87000 [01:34<134:39:22,  5.57s/it]


KeyboardInterrupt: 

In [16]:
# Display the generated record at index 2.
synthetic_ctrl_2_dataset[2]

{'context': "So, I decided to order a Westin bull-bar in chrome plated finish because I wasn't too happy with the stainless steel.",
 'marker': 'luckily',
 'ground_truth': 'The Smittybilt skid plate I had black powder coated fit on the Westin bull-bar.',
 'option_0': 'The bullbar arrived within 2 days of my placing the order It was exactly what we were looking for',
 'option_1': 'It arrived within a couple of days and I was very pleased with how it looked s it was too short for my liking so I had to return it and reorder in the correct length',
 'option_2': 'The bullbar arrived within a couple of days in as in been exactly what I was looking fora polished stainless steel bull bar It was easy to install fit the bill'}

In [72]:
# Display the generated record at index 12.
synthetic_ctrl_2_dataset[12]

{'context': "We hope that Wood won't be crushed, that he'll continue the exotic belly dance he began before the outburst, and soon start a new picture.",
 'marker': 'happily',
 'ground_truth': 'He does both.',
 'option_0': 'Until then We wish him the Best wishes for a speedy recovery and a Happy New Year to all his Brothers and SistersAnd to his Mother and BrotherinLaws and Motherinlaws',
 'option_1': 'Wood is one of the most beautiful men Ive ever laid eyes upon Wood was born and raised in New York City and graduated from the High School of Music  Art in Manhattan New York',
 'option_2': 'Until then We wish him the Best wishes for a speedy recovery and a Happy New Year to all his Brothers and SistersAnd to his Mother and BrotherinLaw and their families'}

In [None]:
# NOTE(review): `synthetic_dataset` is not assigned in any cell visible here (the
# generation loop fills `synthetic_ctrl_2_dataset`) -- confirm which list is meant.
synthetic_dataset[15]

In [None]:
# Move the model to the GPU; the trailing ';' suppresses the cell's output.
# NOTE(review): this cell comes after the generation loop, so on a top-to-bottom
# run that loop executes before the model is moved -- confirm the intended order.
model.cuda();

In [159]:
# Decoding dicts for the interactive generation experiments in the cells below.
# Rebinding these names does not alter the dict objects already stored in the
# `decoding_options` list that was built earlier.

# Option 0: repetition penalty only.
decoding_options_0 = {'max_length': 64,
                      'repetition_penalty': 1.2}

# Option 1: beam search with 5 beams, forbidding repeated 2-grams.
decoding_options_1 = {'max_length': 64,
                      'num_beams': 5,
                      'no_repeat_ngram_size': 2,
                      'early_stopping': True}

# Option 2: sampling with top_k=50 and top_p=0.85.
# ('max_length' used to appear twice in this literal; the duplicate key is gone.)
decoding_options_2 = {'max_length': 64,
                      'do_sample': True,
                      'top_k': 50,
                      'top_p': 0.85}

In [138]:
# Prompt used by the generation experiments in the following cells.
text = (
    "curiously they continued to dig noting that there were pick marks "
    "on the walls of the pit where someone before them had dug out the pit."
)

In [139]:
def put_on_cuda(tok_output, device="cuda"):
    """Move every value of a tokenizer output onto ``device``, in place.

    Args:
        tok_output: Mutable mapping whose values expose ``.to(device)``, such
            as the result of calling the tokenizer with ``return_tensors="pt"``.
        device: Target device. Defaults to ``"cuda"``, which keeps the
            behaviour of the original hard-coded ``.cuda()`` call.

    Returns:
        The same ``tok_output`` object, with each value replaced by its
        counterpart on ``device``.
    """
    # Snapshot the keys so the mapping is not being iterated while its
    # entries are reassigned.
    for key in list(tok_output.keys()):
        tok_output[key] = tok_output[key].to(device)
    return tok_output
# Tokenize the prompt into PyTorch tensors, then move them to the GPU.
encoded_text = tokenizer(text, return_tensors="pt")
cuda_token = put_on_cuda(encoded_text)

In [169]:
# Generate from the prompt with decoding_options_1, then decode the first
# returned sequence with special tokens stripped.
generated_ids = model.generate(input_ids=cuda_token['input_ids'], **decoding_options_1)
tokenizer.decode(generated_ids.tolist()[0], skip_special_tokens=True)

"curiously they continued to dig noting that there were pick marks on the walls of the pit where someone before them had dug out the pit. `` The pit was filled in with earth and there was no sign of anyone having been in it before they filled it in. '' `'was told that the"

In [181]:
# Generate from the same prompt tokens with the fallback configuration; the
# result is kept in `k` and not displayed.
k = model.generate(input_ids = cuda_token['input_ids'], **fallback_decoding)

In [47]:
# Write the generated examples to disk as JSON.
# NOTE(review): `synthetic_dataset` is not assigned in any cell visible here (the
# generation loop fills `synthetic_ctrl_2_dataset`) -- confirm which list is saved.
with open('data/synthetic_discovery_1500.json', 'w') as json_file:
    json.dump(synthetic_dataset, json_file)

In [8]:
# Name of the first column of the train split ('idx' in the recorded output).
discovery_train_ds.column_names[0]

'idx'