In [1]:
import time

import torch
from torch.utils.data import DataLoader

from torchtext import datasets
from torchtext.data import Field
from torchtext.data.utils import get_tokenizer

from transformers import AutoTokenizer

from utils import extract_config
from constants import *

import lineflow.datasets as lfds
from transformer_pl import *
from data_pl import *

In [2]:
    config = {
        "embedding_dimension": 200,
        "ff_dimension": 200,
        "n_attention_heads": 2,
        "n_encoder_layers": 0,
        "n_decoder_layers": 2,
        "dataset": Dataset.PennTreebank.name,
        "segmentation": Segmentation.Word.name,
        "max_seq_len": 35,
        "batch_size": 20,
        "eval_batch_size": 10,
        "dropout": 0.2,
        "n_epochs": 3,
        "learning_rate": 0.0001,
        "adam_b1": 0.9,
        "adam_b2": 0.999,
        "adam_l2_weightdecay": 0.01,
        "loss_criterion": "CrossEntropyLoss"
    }

In [3]:
# load word based training data
def load_data_word(config):
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, segmentation, batch_size, max_seq_len  = extract_config(config, "dataset", "segmentation", "batch_size", "max_seq_len")
    dataset = getattr(datasets, dataset)
    print(f"Fetched Data ({time.time() - ts:3f}s)")

    # # tokenize
    tokenizer = get_tokenizer('basic_english')
    field_processor = Field(tokenize=tokenizer)

    # split dataset
    train_dataset, val_dataset, test_dataset = dataset.splits(
        text_field=field_processor)
    print(f"Tokenized and Split Data ({time.time() - ts:3f}s)")

    # get vocabulary
    field_processor.build_vocab(
        train_dataset, val_dataset, test_dataset, min_freq=1)
    vocab = field_processor.vocab
    print(f"Built Vocab ({time.time() - ts:3f}s)")

    # # iterators
    # train_dataset, val_dataset, test_dataset = dataset.iters(device=None)
    # return train_dataloader, val_dataloader, test_dataloader, vocab

    def data_process(tt_dataset_split):
        raw_text_iter = tt_dataset_split[0].text
        data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                                dtype=torch.long) for item in raw_text_iter]
        data =  torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
        # Divide the dataset into bsz parts.
        nbatch = data.size(0) // batch_size
        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, nbatch * batch_size)
        # Evenly divide the data across the bsz batches.
        data = data.view(batch_size, -1).t().contiguous()
        return data
      
        

    # setup dataloaders
    train_dataloader = DataLoader(data_process(train_dataset))
    val_dataloader = DataLoader(data_process(val_dataset))
    test_dataloader = DataLoader(data_process(test_dataset))

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_dataloader, val_dataloader, test_dataloader, vocab

train_dataloader, val_dataloader, test_dataloader, vocab = load_data_word(config)
ntokens = len(vocab.stoi)


[Start Load Data]
Fetched Data (0.000011s)
Tokenized and Split Data (0.687010s)
Built Vocab (0.868656s)
[End Load Data] (13.844115s)


In [20]:
print("[Start Load Data]")
ts = time.time()

# get dataset
dataset, segmentation, batch_size, max_seq_len  = extract_config(config, "dataset", "segmentation", "batch_size", "max_seq_len")
dataset = getattr(datasets, dataset)
print(f"Fetched Data ({time.time() - ts:3f}s)")

# # tokenize
tokenizer = get_tokenizer('basic_english')
field_processor = Field(tokenize=tokenizer)

# split dataset
train_dataset, val_dataset, test_dataset = dataset.splits(
    text_field=field_processor)
print(f"Tokenized and Split Data ({time.time() - ts:3f}s)")

# get vocabulary
field_processor.build_vocab(
    train_dataset, val_dataset, test_dataset, min_freq=1)
vocab = field_processor.vocab
print(f"Built Vocab ({time.time() - ts:3f}s)")

# # iterators
# train_dataset, val_dataset, test_dataset = dataset.iters(device=None)
# return train_dataloader, val_dataloader, test_dataloader, vocab

def data_process(tt_dataset_split):
    raw_text_iter = tt_dataset_split[0].text
    data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                            dtype=torch.long) for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))


# setup dataloaders
processed_train_data = data_process(train_dataset)
print(processed_train_data)
train_dataloader = TextDataloader(processed_train_data)


print(f"[End Load Data] ({time.time() - ts:3f}s)")
# return train_dataloader, val_dataloader, test_dataloader, vocab

[Start Load Data]
Fetched Data (0.000134s)
Tokenized and Split Data (0.682312s)
Built Vocab (0.865465s)
tensor([6476, 6138, 7909,  ...,   10,    0,    3])


TypeError: __init__() missing 1 required positional argument: 'max_seq_len'

In [12]:


def data_process(tt_dataset_split):
    raw_text_iter = tt_dataset_split[0].text
    data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                            dtype=torch.long) for item in raw_text_iter]
    data =  torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
    # print(data[0:100])
    # print("num words", len(data))
    # print("batch_size", batch_size)
    # print("max_seq_len", max_seq_len)
    # # Divide the dataset into bsz parts.
    # nbatch = data.size(0) // batch_size * max_seq_len
    # nbatch = data.size(0) // batch_size
    # # print("nbatch", nbatch)
    # # print(torch.reshape(data, (nbatch, )))

    # # Trim off any extra elements that wouldn't cleanly fit (remainders).
    # data = data.narrow(0, 0, nbatch * batch_size)
    # # Evenly divide the data across the bsz batches.
    # data = data.view(batch_size, -1).t().contiguous()
    return data

processed = data_process(train_dataset)
print(processed.shape)
print(processed[0:2])

tensor([6476, 6138, 7909, 8560, 2349, 8591, 7044, 8749, 8760, 7577, 8133, 8149,
        6309, 8866, 6707, 7196, 7202, 8980, 5771, 6386, 9044, 9054, 9073, 8454,
           3, 9671,    0,    4,   77,  403,   37, 2146,    2,  148,   22,    7,
        9634,  284,  455,   10,    4,    3,   26,   10,    0,   16,  144,    5,
           0,    4,   10, 2520,   10,    2, 2962, 1604,   99,    3, 7752,    0,
           4,   77,  403,    9,  355,  144,    5, 2532,  693, 2323,  956,   27,
         536,    7, 9634,  284,    5,   42,  296,  425, 3717,    3,    7, 1009,
           5, 3085,  514,  274,    6,  143, 6282, 4212, 6615,   34,  928,    7,
         242,  788,    5, 1112])
num words 966480
batch_size 20
max_seq_len 35
nbatch 48324
torch.Size([48324, 20])
tensor([[6476,    5,    3,    4,  185, 1583,   26,   41,  413,   74,    2, 7866,
         2689,    0,   50,   82, 2446,    3,  747,   50],
        [6138,  132,    0,   53, 6532,  167,   10,  121,  158,    2, 2341,    3,
            9,   56,    

In [38]:
# train_dataloader
for batch_ndx, sample in enumerate(train_dataloader):
    print(batch_ndx)
    print(sample)
    print(sample.shape)
    break
# print(train_dataloader.shape)

0
tensor([[6476,    5,    3,    4,  185, 1583,   26,   41,  413,   74,    2, 7866,
         2689,    0,   50,   82, 2446,    3,  747,   50]])
torch.Size([1, 20])


In [4]:
# init model
model = DecoderOnlyTransformer(config, ntokens)
trainer = pl.Trainer(gpus=2, accelerator="dp")
trainer.fit(model, train_dataloader, val_dataloader)

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33mopenai-scholars[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.20 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name         | Type               | Params
----------------------------------------------------
0 | decoder      | TransformerDecoder | 25.2 M
1 | pos_encoder  | PositionalEncoding | 0     
2 | to_embedding | Embedding          | 5.1 M 
3 | linear       | Linear             | 5.1 M 
4 | criterion    | CrossEntropyLoss   | 0     
----------------------------------------------------
35.4 M    Trainable params
0         Non-trainable params
35.4 M    Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

validation_step batch_idx 0
validation_step batch tensor([[1076,    5,   72,    3, 5875,    6, 2682,    6,    0,   71, 5784,   11,
            3,   21,  378,    4,   20,  159,   18,   10]], device='cuda:0')


ValueError: not enough values to unpack (expected 2, got 1)

In [16]:
# from torchtext.datasets import AG_NEWS
# train_iter = AG_NEWS(split='train')

import torch
print(torch.__version__)

1.5.0


In [19]:
from datasets import load_dataset
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')

Downloading and preparing dataset wikitext/wikitext-2-raw-v1 (download: 4.50 MiB, generated: 12.91 MiB, post-processed: Unknown size, total: 17.41 MiB) to /home/gbafa/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4721645.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset wikitext downloaded and prepared to /home/gbafa/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91. Subsequent calls will reuse this data.


In [22]:
dataset[1]

{'text': ' = Valkyria Chronicles III = \n'}

In [23]:
class TextDataloader:
    def __init__(self, dataset, max_seq_len):
        self.max_seq_len = max_seq_len
        self.dataset = dataset
        self.dataset_len = len(dataset)

    def __iter__(self):
        self.index = 0
        return self

    def __next__(self):
        i = self.index
        seq_len = min(self.max_seq_len, self.dataset_len - 1 - i)
        data = self.dataset[i:i+seq_len]
        target = self.dataset[i+1:i+1+seq_len].reshape(-1)
        self.index += 1
        return data, target





In [24]:
processed_train_data = data_process(train_dataset)
print(processed_train_data.shape)
train_dataloader = TextDataloader(processed_train_data, max_seq_len)

torch.Size([966480])


In [26]:
for batch_ndx, batch in enumerate(train_dataloader):
    data, targets = batch
    print(batch_ndx)
    print(data)
    print(targets)
    break

0
tensor([6138, 7909, 8560, 2349, 8591, 7044, 8749, 8760, 7577, 8133, 8149, 6309,
        8866, 6707, 7196, 7202, 8980, 5771, 6386, 9044, 9054, 9073, 8454,    3,
        9671,    0,    4,   77,  403,   37, 2146,    2,  148,   22,    7])
tensor([7909, 8560, 2349, 8591, 7044, 8749, 8760, 7577, 8133, 8149, 6309, 8866,
        6707, 7196, 7202, 8980, 5771, 6386, 9044, 9054, 9073, 8454,    3, 9671,
           0,    4,   77,  403,   37, 2146,    2,  148,   22,    7, 9634])
