In [11]:
import torch

from datasets import load_dataset
from tqdm import tqdm
from data.discovery_con import LABELS

from transformers import GPT2LMHeadModel, GPT2Config, GPT2TokenizerFast

In [2]:
dataset = load_dataset('data/discovery.py', 'discovery')

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


In [3]:
discovery_train_ds = dataset["train"]
discovery_valid_ds = dataset["validation"]
discovery_test_ds = dataset["test"]

In [4]:
model = GPT2LMHeadModel.from_pretrained('gpt2-large')

In [5]:
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

In [63]:
class DiscoveryDatasetAF():
    
    def __init__(self, dataset, labels, tokenizer, model):
        self.dataset = dataset
        self.labels = labels
        self.tokenizer = tokenizer
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = model.to(self.device)
        
    def get_sentence_as_context(self, idx, sentence_order):
        context = self.dataset[idx]
        tokenized_context = self.tokenizer(context[sentence_order] + ' ' + self.labels[context['label']], return_tensors="pt")
        return tokenized_context
    
    def generate_synthetic_options(self, idx, context='sentence1'):
        tokenized_context = self.get_sentence_as_context(idx, context)['input_ids'].to(self.device)
        input_ids, len_context = tokenized_context, len(self.dataset[idx][context])
        outputs = model.generate(input_ids,
                                do_sample = True, 
                                max_length = 50, 
                                top_k = 50, 
                                top_p = 0.95, 
                                num_return_sequences = 3
                                )
        example = {}
        example['ground_truth'] = self.dataset[idx]['sentence2']
        for i, sample_output in enumerate(outputs):
            uncleaned_text = tokenizer.decode(sample_output, skip_special_tokens=True)[len_context:]
            cleaned_text = uncleaned_text.replace('\n', '').replace('\xa0', '').replace('\\', '')
            example['option_' + str(i)] = cleaned_text
        return example

In [64]:
discovery_ds = DiscoveryDatasetAF(discovery_train_ds, LABELS, tokenizer, model)

In [65]:
synthetic_dataset = []

for i in tqdm(range(len(discovery_train_ds))):
    example = {}
    values = discovery_train_ds[i]
    example['context'] = values['sentence1']
    example['marker'] = LABELS[values['label']]
    generated_options = discovery_ds.generate_synthetic_options(i)
    example.update(generated_options)
    synthetic_dataset.append(example)

  0%|          | 0/1566000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/1566000 [00:00<348:12:03,  1.25it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/1566000 [00:01<290:26:41,  1.50it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 3/1566000 [00:01<291:26:54,  1.49it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 4/1566000 [00:02<242:03:56,  1.80it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 5/1566000 [00:02<261:46:09,  1.66it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 6/1566000 [00:03<322:02:00,  1.35it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 7/1566000 [00:04<330:24:41,  1.32it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

KeyboardInterrupt: 

In [66]:
synthetic_dataset[4]

{'context': 'They continued to dig noting that there were pick marks on the walls of the pit where someone before them had dug out the pit.',
 'marker': 'curiously,',
 'ground_truth': 'Every ten feet they found a layer of logs.',
 'option_0': ' curiously, the pits did not seem to have any other human activity as there were no footprints in the dirt, no animals',
 'option_1': ' curiously, she found a "showing that a good one or two of the tools they found in the dig can still',
 'option_2': ' curiously, they found a huge hole and a layer of earth covered with the remains of the ancient people. There was also'}

In [68]:
len(discovery_train_ds)

1566000