In [86]:
import torch as t

import random
import torchtext
import transformers

from load_bert import BertClassifier, default_config

In [87]:
def load_dataset():
    """Load the WikiText2 corpus and materialise each split as a list.

    Returns:
        tuple: ``(train, test, valid)`` -- three lists holding the items
        yielded by the corresponding torchtext split. The return order is
        kept as (train, test, valid) because callers unpack it that way.
    """
    # torchtext documents the default split order as ('train', 'valid', 'test').
    # Request it explicitly and unpack in that same order so the validation
    # and test splits are not swapped under the wrong names.
    train, valid, test = torchtext.datasets.WikiText2(split=("train", "valid", "test"))
    return list(train), list(test), list(valid)

In [88]:
# Load the WikiText2 splits once; load_dataset() returns a tuple of three lists.
raw_dataset = load_dataset()

In [89]:
def batch_dataset(dataset, batch_size=16):
    """Group every split into equally sized, length-sorted batches.

    Each split is sorted by ``len`` so that items of similar length share a
    batch, the longest items that do not fill a complete batch are dropped,
    and the resulting batches are shuffled.

    Args:
        dataset: iterable of splits; each split is a sequence of sized items.
        batch_size: number of items per batch (must be >= 1).

    Returns:
        tuple: one list of batches per split, in the same order as
        ``dataset``. Every batch is a list of exactly ``batch_size`` items.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    batched_dataset = []
    for split in dataset:
        # sorted() copies, so the caller's split is left untouched and the
        # function can be re-run on the same input.
        ordered = sorted(split, key=len)
        # Number of items that fit into complete batches.
        usable = len(ordered) - len(ordered) % batch_size
        # Step over the items batch by batch; this yields usable // batch_size
        # batches rather than a count tied to batch_size itself.
        batches = [ordered[start:start + batch_size] for start in range(0, usable, batch_size)]
        random.shuffle(batches)
        batched_dataset.append(batches)
    return tuple(batched_dataset)

In [90]:
# Sort each raw split by length and group it into shuffled batches
# (batch_dataset's default batch_size of 16 is used).
dataset = batch_dataset(raw_dataset)

In [91]:
# Tokenizer for the "bert-base-cased" checkpoint, resolved through AutoTokenizer.
# tokenize_dataset() and the mask-token lookup below read this module-level name.
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")

In [92]:
def tokenize_dataset(dataset, max_length=512):
    """Convert every batch of texts into a batch of token-id sequences.

    Uses the module-level ``tokenizer``. Each batch is padded to its longest
    member and truncated to ``max_length``; only the ``"input_ids"`` entry of
    the tokenizer output is kept.

    Args:
        dataset: iterable of splits, each an iterable of text batches.
        max_length: truncation length forwarded to the tokenizer.

    Returns:
        tuple: one list per split containing the ``input_ids`` of each batch.
    """
    def encode_batch(texts):
        # Pad/truncate one batch and keep just the token ids.
        encoding = tokenizer(
            texts,
            padding="longest",
            max_length=max_length,
            truncation=True,
        )
        return encoding["input_ids"]

    return tuple([encode_batch(batch) for batch in split] for split in dataset)

In [93]:
# Replace the text batches with token-id batches. This rebinds `dataset`,
# so the cell is not idempotent: re-run from the batching cell before re-running it.
dataset = tokenize_dataset(dataset)

In [94]:
# Id used to overwrite masked positions. Read it from the tokenizer's own
# special-token attribute instead of indexing into an encoded string, which
# only works while exactly one special token is prepended by encode().
mask_token = tokenizer.mask_token_id

def mask_dataset(dataset, p=.15, mask_id=None, special_ids=None):
    """Randomly overwrite token ids with the mask id, in place.

    Every position is masked independently with probability ``p``, except
    positions holding a special id (e.g. classification, separator and
    padding markers), which are never replaced.

    Args:
        dataset: iterable of splits -> batches -> mutable token-id sequences.
        p: per-token masking probability.
        mask_id: id written at masked positions; defaults to the
            module-level ``mask_token``.
        special_ids: ids that must never be masked; defaults to
            ``tokenizer.all_special_ids`` of the module-level tokenizer.

    Returns:
        The same ``dataset`` object, with its token sequences mutated.
    """
    if mask_id is None:
        mask_id = mask_token
    if special_ids is None:
        special_ids = tokenizer.all_special_ids
    protected = frozenset(special_ids)

    for split in dataset:
        for batch in split:
            for tokens in batch:
                for idx, token_id in enumerate(tokens):
                    # Leave structural tokens alone: masking padding or
                    # sentence markers corrupts the input without adding
                    # any useful prediction target.
                    if token_id in protected:
                        continue
                    if random.random() < p:
                        tokens[idx] = mask_id
    return dataset

In [95]:
dataset = mask_dataset(dataset)

In [96]:
def tensorify_dataset(dataset):
    """Turn every batch of every split into a tensor.

    Args:
        dataset: iterable of splits, each an iterable of batches accepted by
            ``t.tensor``.

    Returns:
        tuple: one list of tensors per split, in the original order.
    """
    return tuple(list(map(t.tensor, split)) for split in dataset)

In [97]:
# Convert each batch of token ids to a tensor. This rebinds `dataset` again,
# so re-running this cell alone feeds tensors back into tensorify_dataset().
dataset = tensorify_dataset(dataset)

In [98]:
# Start from the defaults shipped with load_bert and override the entries
# listed after them.
config = {
    **default_config,
    "num_layers": 2,
    'hidden_size': 256,
    'intermediate_size': 1024,
}

In [99]:
# Instantiate the classifier, passing the merged config as keyword arguments.
bert = BertClassifier(**config)

In [100]:
def _drop_remainder(batches, multiple=16):
    """Return `batches` without the trailing items that exceed a multiple of `multiple`."""
    leftover = len(batches) % multiple
    # Hand back the very same object when nothing has to be trimmed.
    return batches[:-leftover] if leftover else batches


# Unpack the three splits and trim each one to a length divisible by 16.
train, test, valid = (_drop_remainder(split) for split in dataset)

In [101]:
batch_size = 16

formatted = []

# NOTE(review): this loop looks like an unfinished scaffold -- confirm what
# should be appended before relying on train/test/valid after this cell.
for data in [train, test, valid]:
    # Computed but not used anywhere in the visible code.
    num_batches = len(data) // batch_size
    # Sorts the split in place by the length of its elements.
    data.sort(key=len)
    # Placeholder value: nothing derived from `data` is stored.
    formatted.append(-1) # the thing

# `formatted` holds three -1 placeholders at this point, so each of the three
# names is rebound to -1 and the splits they referred to are no longer
# reachable through them.
train, test, valid = formatted

def _encode_labelled(examples):
    """Tokenise labelled examples and order them by token count.

    Each example is read as ``example[0]`` (label: ``'neg'`` becomes 0, any
    other value becomes 1) and ``example[1]`` (raw text), which is encoded
    with the module-level ``tokenizer``.

    Returns:
        list: ``(token_ids, raw_text, label_tensor)`` tuples sorted by
        ascending ``len(token_ids)``.
    """
    encoded = []
    for example in examples:
        raw_text = example[1]
        token_ids = tokenizer.encode(raw_text)
        # TODO: the label tensor is created on the default device; decide
        # where it should live before training.
        label = t.tensor(0 if example[0] == 'neg' else 1)
        encoded.append((token_ids, raw_text, label))
    encoded.sort(key=lambda item: len(item[0]))
    return encoded


# NOTE(review): `data_train` and `data_test` are not defined anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel. Define
# them (or remove this cell) before running.
processed_data = _encode_labelled(data_train)
processed_data_test = _encode_labelled(data_test)

NameError: name 'data_train' is not defined