In [1]:
import torch
import json
import re

from datasets import load_dataset
from tqdm import tqdm
from data.discovery_con import LABELS

from transformers import AutoModelForCausalLM, AutoTokenizer, CTRLTokenizer

In [2]:
# dataset = load_dataset('discofuse', 'discofuse-wikipedia', split='train[:10%]')

In [3]:
# dataset["train"][0]

In [4]:
# def filtering(example):
#     if "connective_string" in example: return True
#     else: return False

In [5]:
# dataset["train"].filter(filtering)

In [2]:
dataset = load_dataset('data/discovery.py', 'discovery')

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


In [3]:
discovery_train_ds = dataset["train"]
discovery_valid_ds = dataset["validation"]
discovery_test_ds = dataset["test"]

In [7]:
model = AutoModelForCausalLM.from_pretrained('/home/nlp/apex/experiment/ctrl/')

In [8]:
tokenizer = CTRLTokenizer.from_pretrained('ctrl',
                                         padding_side='right',
                                         max_length=64,
                                         padding='max_length',
                                         add_special_tokens=True)

In [9]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
tokenizer.add_special_tokens({'sep_token': '[SEP]'})

1

In [42]:
# training gpt2 on discofuse and discovery doesn't help
# training gpt2 on seq class didn't help
# Control codes; baby

In [10]:
class DiscoveryDatasetAF():
    
    def __init__(self, dataset, labels, tokenizer, model):
        self.dataset = dataset
        self.labels = labels
        self.tokenizer = tokenizer
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = model.to(self.device)
        
    def get_sentence_as_context(self, idx, sentence_order):
        context = self.dataset[idx]
        tokenized_context = self.tokenizer(self.labels[context['label']] + ' ' + context[sentence_order], return_tensors="pt")
        return tokenized_context, self.labels[context['label']]
    
    def generate_synthetic_options(self, idx, context='sentence1'):
        tokenized_context, marker = self.get_sentence_as_context(idx, context)
        tokenized_context = tokenized_context['input_ids'].to(self.device)
        input_ids, len_context = tokenized_context, len(self.dataset[idx][context])
        outputs = model.generate(input_ids,
                                do_sample = True, 
                                max_length = 50, 
                                top_k = 50, 
                                top_p = 0.95, 
                                num_return_sequences = 3,
                                temperature=0.7
                                )
        example = {}
        example['ground_truth'] = self.dataset[idx]['sentence2']
        for i, sample_output in enumerate(outputs):
            uncleaned_text = tokenizer.decode(sample_output, skip_special_tokens=True)[len_context:]
            cleaned_text = uncleaned_text.replace('\n', '').replace('\xa0', '').replace('\\', '').replace(marker, '').strip()
            example['option_' + str(i)] = cleaned_text
        return example

In [11]:
discovery_ds = DiscoveryDatasetAF(discovery_train_ds, LABELS, tokenizer, model)

In [12]:
synthetic_dataset = []

for i in tqdm(range(len(discovery_train_ds))):
    example = {}
    values = discovery_train_ds[i]
    example['context'] = values['sentence1']
    example['marker'] = LABELS[values['label']]
    generated_options = discovery_ds.generate_synthetic_options(i)
    example.update(generated_options)
    synthetic_dataset.append(example)

  0%|          | 124/1566000 [02:52<605:45:50,  1.39s/it]


KeyboardInterrupt: 

In [24]:
synthetic_dataset[21]

{'context': 'After playing in all 82 regular season games during his sophomore season, Bynum has played in 35 , 50 , 65 and 54 games respectively afterward.',
 'marker': 'remarkably',
 'ground_truth': "During the brutal lockout-shortened schedule this season, the Lakers' center has been perhaps the healthiest of his entire career.",
 'option_0': 'fterward. He has not played in a single playoff game during his collegiate career. and the city of phoenix agre@@',
 'option_1': 'fterward. He has played in exactly one full NBA season. the NBA and the Cleveland Cavaliers are on opposing sides',
 'option_2': 'fterward. He has played in all 82 games during each of previous four full seasons. He is one of only two'}

In [90]:
text = 'actually NSA is listening to us'

In [91]:
input_ids = tokenizer(text, return_tensors="pt")['input_ids'].to('cuda')

In [92]:
input_ids

tensor([[ 1669, 76353,     8,  6185,     3,   133]], device='cuda:0')

In [93]:
output = model.generate(input_ids,
                    do_sample = True, 
                    max_length = 100, 
                    top_k = 50, 
                    top_p = 0.95, 
#                     num_return_sequences = 3,
                    temperature=0.7)

In [94]:
tokenizer.decode(output[0])

"actually NSA is listening to us via satellite, but the NSA has never been able to tell us what it is listening to. The NSA does not even know what they are listening to, because they can't even tell us. [PAD] [PAD] [PAD] [PAD] is a registered user. [PAD] is not a registered user. [PAD] is not a registered user. [PAD] is not a registered user. [PAD] is not a registered user. [PAD] is not a registered user. [PAD] is not a registered user. [PAD] is not a registered user. [PAD] is not a registered user. [PAD] is not a"

In [83]:
with open('data/synthetic_discovery_100.json', 'w') as fout:
    json.dump(synthetic_dataset , fout)

In [8]:
discovery_train_ds.column_names[0]

'idx'

In [None]:
verb
coreference

In [56]:
from transformers import GPT2Config, GPT2Tokenizer, GPT2ForSequenceClassification

In [58]:
config = GPT2Config.from_pretrained('microsoft/DialoGPT-large', num_labels = len(LABELS))
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-large')
model = GPT2ForSequenceClassification.from_pretrained('microsoft/DialoGPT-large', config=config)

Some weights of the model checkpoint at microsoft/DialoGPT-large were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at microsoft/DialoGPT-large and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
label = torch.tensor(example['label'])

In [54]:
inputs = tokenizer(example['sentence1'], return_tensors="pt")

In [69]:
outputs = model(**inputs, labels=label)

In [73]:
torch.argmax(outputs[1], axis=1)

tensor([82])