In [1]:
import time

import torch
from torch.utils.data import DataLoader

from torchtext import datasets
from torchtext.data import Field
from torchtext.data.utils import get_tokenizer

from transformers import AutoTokenizer

from utils import extract_config
from constants import *

import lineflow.datasets as lfds
from transformer_pl import *
from data_pl import *

# config
config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": Dataset.PennTreebank.name,
    "segmentation": Segmentation.Word.name,
    "max_seq_len": 35,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2,
    "n_epochs": 3,
    "learning_rate": 0.0001,
    "adam_b1": 0.9,
    "adam_b2": 0.999,
    "adam_l2_weightdecay": 0.01,
    "loss_criterion": "CrossEntropyLoss"
}

# setup
train_dataloader, val_dataloader, test_dataloader, vocab = load_data_word(config)
ntokens = len(vocab.stoi)

# init model
model = DecoderOnlyTransformer(config, ntokens)
trainer = pl.Trainer(gpus=2)
trainer.fit(model, train_dataloader, val_dataloader)

[Start Load Data]
Fetched Data (0.000014s)
Tokenized and Split Data (0.682181s)
Built Vocab (0.862884s)


KeyboardInterrupt: 

In [1]:
from data_pl import *
from utils import *
config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": Dataset.PennTreebank.name,
    "segmentation": Segmentation.Word.name,
    "max_seq_len": 35,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2,
    "n_epochs": 3,
    "learning_rate": 0.0001,
    "adam_b1": 0.9,
    "adam_b2": 0.999,
    "adam_l2_weightdecay": 0.01,
    "loss_criterion": "CrossEntropyLoss"
}

train_dataloader, val_dataloader, test_dataloader, vocab = load_data(config)
# print(vocab.stoi)

In [9]:
for batch in train_dataloader:
    data, targets = batch
    print("data", data.shape)
    print("targets",targets.shape)
    break

print(emb_to_string(data[0], vocab))
print(emb_to_string(targets[0:20], vocab))

data torch.Size([35, 20])
targets torch.Size([700])
aer of <eos> n capital experience mr they soviet all the yeast incentives <unk> were because awarded <eos> hong were
banknote them <unk> billion cities\/abc made . say state the student <eos> and up a of to if kong going


In [8]:
targets[0:20]

tensor([6138,  132,    0,   53, 6532,  167,   10,  121,  158,    2, 2341,    3,
           9,   56,    7,    5,    6,   73,  793,  320])

In [4]:
print("[Start Load Data]")
ts = time.time()

# get dataset
dataset, batch_size, max_seq_len = extract_config(config, "dataset", "batch_size", "max_seq_len")
dataset = getattr(datasets, dataset)
print(f"Fetched Data ({time.time() - ts:3f}s)")

# # tokenize
tokenizer = get_tokenizer('basic_english')
field_processor = Field(tokenize=tokenizer)

# split dataset
train_dataset, val_dataset, test_dataset = dataset.splits(
    text_field=field_processor)
print(f"Tokenized and Split Data ({time.time() - ts:3f}s)")

# get vocabulary
field_processor.build_vocab(
    train_dataset, val_dataset, test_dataset, min_freq=1)
vocab = field_processor.vocab
print(f"Built Vocab ({time.time() - ts:3f}s)")

# data prep
def data_prep(tt_dataset_split):
    raw_text_iter = tt_dataset_split[0].text
    data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                            dtype=torch.long) for item in raw_text_iter]
    data = torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
    # Divide the dataset into bsz parts.
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the batch_size batches.
    data = data.view(batch_size, -1).t().contiguous()
    return data

# setup dataloaders
print(data_prep(train_dataset).shape)
train_dataloader = TextDataloader(data_prep(train_dataset), max_seq_len)
val_dataloader = TextDataloader(data_prep(val_dataset), max_seq_len)
test_dataloader = TextDataloader(data_prep(test_dataset), max_seq_len)

[Start Load Data]
Fetched Data (0.000115s)
Tokenized and Split Data (0.707397s)
Built Vocab (0.880745s)
torch.Size([48324, 20])
torch.Size([48324, 20])
torch.Size([3835, 20])
torch.Size([4293, 20])


In [16]:
class TextDataloader:
    def __init__(self, dataset, max_seq_len):
        self.max_seq_len = max_seq_len
        self.dataset = dataset
        print(dataset[0:2])
        self.dataset_len = len(dataset)

    def __iter__(self):
        self.index = 0
        return self

    def __next__(self):
        i = self.index
        seq_len = min(self.max_seq_len, self.dataset_len - 1 - i)
        data = self.dataset[i:i+seq_len]
        print(data.size(1))
        target = self.dataset[i:i+1+seq_len].reshape(-1)
        target = target[1: len(target) - data.size(1) + 1]
        # print(target[1: len(target) - data.size(1) + 1].shape)

        print(target[0: seq_len + 1])
        print(data.shape)
        print(target.shape)
        print(seq_len)
        self.index += 1
        return data, target

train_dataloader = TextDataloader(data_prep(train_dataset), max_seq_len)
for batch in train_dataloader:
        data, targets = batch
        print("data", data.shape)
        print("targets", targets.shape)
        break

print(emb_to_string(data[0], vocab))
print(emb_to_string(targets[0:20], vocab))

tensor([[6476,    5,    3,    4,  185, 1583,   26,   41,  413,   74,    2, 7866,
         2689,    0,   50,   82, 2446,    3,  747,   50],
        [6138,  132,    0,   53, 6532,  167,   10,  121,  158,    2, 2341,    3,
            9,   56,    7,    5,    6,   73,  793,  320]])
20
torch.Size([699])
tensor([6476,    5,    3,    4,  185, 1583,   26,   41,  413,   74,    2, 7866,
        2689,    0,   50,   82, 2446,    3,  747,   50, 6138,  132,    0,   53,
        6532,  167,   10,  121,  158,    2, 2341,    3,    9,   56,    7,    5])
torch.Size([35, 20])
torch.Size([720])
35
data torch.Size([35, 20])
targets torch.Size([720])
aer of <eos> n capital experience mr they soviet all the yeast incentives <unk> were because awarded <eos> hong were
aer of <eos> n capital experience mr they soviet all the yeast incentives <unk> were because awarded <eos> hong were
