In [1]:

# imports
import enum
import io
import time
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from torchtext import datasets, vocab
from torchtext.data import Field, BPTTIterator
from torchtext.utils import download_from_url, extract_archive
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import pytorch_lightning as pl
import matplotlib.pyplot as plt
import numpy as np 

import wandb


In [2]:
# utils 
def extract_config(config, *argv):
    assert len(argv) > 0, "No keys to extract"
    config_values = []
    for key in argv:
        assert key in config, f"Key '{key}' not in config"
        config_values.append(config[key])
    
    return tuple(config_values) if len(argv) > 1 else config_values[0]

def validate_config(config):
    embedding_dimension, n_attention_heads = extract_config(config, "embedding_dimension", "n_attention_heads")
    
    # embedding dimension must be divisible by n_attention_heads
    assert embedding_dimension %  n_attention_heads == 0, f"Embedding dimension ({embedding_dimension}) must be divisible by n_attention_heads ({n_attention_heads})"

def emb_to_string(emb, vocab):
    embeddings = vocab.itos
    words = [ embeddings[item] for item in emb ]
    return ' '.join(words)

In [3]:
# character tokenizer
# ## UTF-8 Encoder
# def char_tokenizer(string):
#     return [x + 2 for x in str.encode(string)]
# def char_decoder(tokens):
#     return "".join([chr(x - 2) if x > 1 else "" for x in tokens])

def char_tokenizer(string):
    return [x for x in string]
def char_decoder(tokens):
    return "".join([x for x in tokens])

# batch functions
def batchify(data, bsz, device):
    # Divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

def get_batch(max_seq_len, source, i):
    seq_len = min(max_seq_len, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

# load training data
def load_data(config):
    return load_data_word(config)
    
# load word based training data
def load_data_word(config):
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, segmentation = extract_config(config, "dataset", "segmentation")
    dataset = getattr(datasets, dataset.name) 
    print(f"Fetched Data ({time.time() - ts:3f}s)")
    
    # tokenize
    tokenizer = get_tokenizer('basic_english')
    field_processor = Field(tokenize=tokenizer)

    # split dataset
    train_dataset, val_dataset, test_dataset = dataset.splits(text_field=field_processor)
    print(f"Tokenized and Split Data ({time.time() - ts:3f}s)")
    print(train_dataset[0:10])

    # get vocabulary
    field_processor.build_vocab(train_dataset, val_dataset, test_dataset, min_freq=1)
    vocab = field_processor.vocab
    print(f"Built Vocab ({time.time() - ts:3f}s)")


    def data_process(tt_dataset_split):
        raw_text_iter = tt_dataset_split[0].text
        data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                            dtype=torch.long) for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_data = data_process(train_dataset)
    val_data = data_process(val_dataset)
    test_data = data_process(test_dataset)

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_data, val_data, test_data, vocab

def load_data_subword(config):
    # load word based training data
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, segmentation = extract_config(config, "dataset", "segmentation")
    dataset = getattr(datasets, dataset.name) 


    tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')
    field_processor = Field(tokenize=tokenizer.encode)


    # tokenizer = get_tokenizer('subword')
    # field_processor = Field(tokenize=tokenizer)
    print(f"Fetched Data ({time.time() - ts:3f}s)")

    # split dataset
    train_dataset, val_dataset, test_dataset = dataset.splits(text_field=field_processor)
    print(f"Split Data ({time.time() - ts:3f}s)")

    print(train_dataset)
    # get vocabulary
    # field_processor.build_vocab(train_dataset, val_dataset, test_dataset, min_freq=1)
    # vocab = field_processor.vocab
    vocab = tokenizer.get_vocab()

    print(f"Build Vocab ({time.time() - ts:3f}s)")


    def data_process(tt_dataset_split):
        raw_text_iter = tt_dataset_split[0].text
        data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                            dtype=torch.long) for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_data = data_process(train_dataset)
    val_data = data_process(val_dataset)
    test_data = data_process(test_dataset)

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_data, val_data, test_data, vocab



def load_data_character(config):
    # load word based training data
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, segmentation = extract_config(config, "dataset", "segmentation")
    dataset = getattr(datasets, dataset.name) 
    # tokenizer = get_tokenizer('basic_english')
    tokenizer = char_tokenizer
    field_processor = Field(tokenize=tokenizer)
    print(f"Fetched Data ({time.time() - ts:3f}s)")

    # split dataset
    train_dataset, val_dataset, test_dataset = dataset.splits(text_field=field_processor)
    print(f"Split Data ({time.time() - ts:3f}s)")

    print(train_dataset[0:10])
    # get vocabulary
    field_processor.build_vocab(train_dataset, val_dataset, test_dataset, min_freq=1)
    vocab = field_processor.vocab
    print(f"Build Vocab ({time.time() - ts:3f}s)")


    def data_process(tt_dataset_split):
        raw_text_iter = tt_dataset_split[0].text
        data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                            dtype=torch.long) for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_data = data_process(train_dataset)
    val_data = data_process(val_dataset)
    test_data = data_process(test_dataset)

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_data, val_data, test_data, vocab


In [4]:
# load word based training data
print("[Start Load Data]")
ts = time.time()

# get dataset
dataset, segmentation = extract_config(config, "dataset", "segmentation")
dataset = getattr(datasets, dataset.name) 

tokenizer = get_tokenizer('subword')
field_processor = Field(tokenize=tokenizer)
print(f"Fetched Data ({time.time() - ts:3f}s)")

# split dataset
train_dataset, val_dataset, test_dataset = dataset.splits(text_field=field_processor)
print(f"Split Data ({time.time() - ts:3f}s)")

print(train_dataset)

[Start Load Data]


NameError: name 'config' is not defined

In [59]:
import sentencepiece as spm
s = spm.SentencePieceProcessor(model_file='spm.model')
s.encode('New York', out_type=str, enable_sampling=True, alpha=0.1, nbest=-1)

OSError: Not found: "spm.model": No such file or directory Error #2

In [5]:
def load_data_subword(config):
    # load word based training data
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, segmentation = extract_config(config, "dataset", "segmentation")
    dataset = getattr(datasets, dataset.name) 


    tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')
    field_processor = Field(tokenize=tokenizer.encode)


    # tokenizer = get_tokenizer('subword')
    # field_processor = Field(tokenize=tokenizer)
    print(f"Fetched Data ({time.time() - ts:3f}s)")

    # split dataset
    train_dataset, val_dataset, test_dataset = dataset.splits(text_field=field_processor)
    print(f"Split Data ({time.time() - ts:3f}s)")

    print(train_dataset)
    # get vocabulary
    # field_processor.build_vocab(train_dataset, val_dataset, test_dataset, min_freq=1)
    # vocab = field_processor.vocab
    vocab = tokenizer.get_vocab()

    print(f"Build Vocab ({time.time() - ts:3f}s)")


    def data_process(tt_dataset_split):
        raw_text_iter = tt_dataset_split[0].text
        data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                            dtype=torch.long) for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_data = data_process(train_dataset)
    val_data = data_process(val_dataset)
    test_data = data_process(test_dataset)

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_data, val_data, test_data, vocab
    
# train_data, val_data, test_data, vocab = load_data_word(config)
train_data, val_data, test_data, vocab = load_data_subword(config) 

# train_data, val_data, test_data, vocab = load_data_character(config)

NameError: name 'config' is not defined

In [12]:
# load word based training data
print("[Start Load Data]")
ts = time.time()

# get dataset
dataset, segmentation = extract_config(config, "dataset", "segmentation")
dataset = getattr(datasets, dataset.name) 


class HuggingFaceField(Field):
    def __init__(self, tokenizer):
        super().__init__(tokenize=tokenizer.encode)
        self.tokenizer = tokenizer

    def numericalize(self, arr):
        arr = [self.tokenizer.convert_tokens_to_ids(x) for x in arr]
        return torch.tensor(arr)

tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased',  eos_token="<eos>")
field_processor = HuggingFaceField(tokenizer=tokenizer)
        

# tokenizer = get_tokenizer('subword')
# field_processor = Field(tokenize=tokenizer)
print(f"Fetched Data ({time.time() - ts:3f}s)")

# split dataset
train_dataset, val_dataset, test_dataset = dataset.splits(text_field=field_processor)
print(f"Split Data ({time.time() - ts:3f}s)")

print(train_dataset)
print([x for x in train_dataset.text])
# print(vocab)
# get vocabulary
# field_processor.build_vocab(train_dataset, val_dataset, test_dataset, min_freq=1)
# vocab = field_processor.vocab
vocab = tokenizer.get_vocab()

print(f"Build Vocab ({time.time() - ts:3f}s)")


def data_process(tt_dataset_split):
    raw_text_iter = tt_dataset_split[0].text
    data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                        dtype=torch.long) for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

train_data = data_process(train_dataset)
val_data = data_process(val_dataset)
test_data = data_process(test_dataset)

print(f"[End Load Data] ({time.time() - ts:3f}s)")
return train_data, val_data, test_data, vocab

22, 1009, 81, 604, 916, 33, 31892, 1068, 4, 3, '<eos>', 17, 299, 98, 27, 774, 22, 1287, 18, 10431, 2028, 9861, 36, 51, 72, 2063, 25, 75, 87, 754, 18709, 1845, 248, 4, 3, '<eos>', 38, 29, 424, 1682, 132, 423, 24, 206, 0, 3411, 4, 3, '<eos>', 24, 24656, 25, 18, 1193, 28, 31892, 132, 50, 7624, 259, 99, 18, 21588, 471, 244, 119, 20, 17, 29816, 146, 9327, 5062, 4, 3, '<eos>', 18, 8373, 23, 1665, 24, 8689, 40, 7015, 70, 100, 578, 578, 20, 31892, 1353, 286, 3616, 4, 3, '<eos>', 52, 27, 48, 19488, 6529, 9327, 1727, 349, 17, 28187, 3084, 17, 20977, 9775, 118, 53, 20, 85, 617, 672, 17, 529, 13439, 17030, 577, 4, 3, '<eos>', 31892, 17, 26, 23, 17, 15916, 6040, 1449, 19473, 55, 76, 578, 3883, 25, 24, 151, 344, 3495, 38, 578, 578, 4, 3, '<eos>', 31892, 17, 26, 23, 24, 7841, 23, 144, 18, 226, 65, 20, 18, 127, 2624, 355, 9114, 5975, 2382, 25, 18, 17, 660, 9, 23, 9, 33, 70, 100, 0, 20, 81, 1353, 2431, 105, 4, 3, '<eos>', 31892, 11000, 18, 0, 649, 28, 18, 17, 660, 9, 23, 9, 95, 13, 305, 13, 17607, 344,

AssertionError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [92]:
print(train_data, val_data, test_data)
print(emb_to_string(train_data[0:100], vocab))
print(emb_to_string(val_data[0:100], vocab))
print(emb_to_string(test_data[0:100], vocab))

vocab.__dict__.keys()



tensor([6476, 6138, 7909,  ...,   10,    0,    3]) tensor([1076,   97,  362,  ...,    8,    4,    3]) tensor([101,  17,  27,  ...,  24, 512,   3])
aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter <eos> pierre <unk> n years old will join the board as a nonexecutive director nov . n <eos> mr . <unk> is chairman of <unk> n . v . the dutch publishing group <eos> rudolph <unk> n years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate <eos> a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer
consumers may want to move their telephones a little closer to the tv set <eos> <unk> <unk> watching abc ' s monday night football can now vote during <unk> for the greatest play in n years from among four or five <unk> <unk> <eos> two weeks ago viewers of

dict_keys(['freqs', 'itos', 'unk_index', 'stoi', 'vectors'])

In [56]:
def char_tokenizer(string):
    return [x for x in string]
def char_decoder(tokens):
    return "".join([x for x in tokens])

char_decoder(char_tokenizer("this is a test"))

'this is a test'

In [89]:
# train_data, val_data, test_data, vocab = load_data(config)

test_string = "This is a test string to test the tokenization process."
dataset, segmentation = extract_config(config, "dataset", "segmentation")
dataset = getattr(datasets, dataset.name) 
tokenizer = get_tokenizer('subword')
tokenizer(test_string)



['\ue302 this ',
 ' is ',
 ' a ',
 ' test ',
 ' string ',
 ' to ',
 ' test ',
 ' the ',
 ' tokenization ',
 ' process ',
 '. ']

In [6]:
from transformers import XLNetTokenizer
from transformers import AutoTokenizer

class Segmentation(enum.Enum):
    Word = 0,
    Subword = 1
    Character = 2
    BYTE = 3
    BBPE = 4

config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": Dataset.PennTreebank,
    "segmentation": Segmentation.Word,
    "max_seq_len": 35,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2,
    "n_epochs": 3,
    "loss_criterion": "CrossEntropyLoss"
}

# character tokenizer
def char_tokenizer(string):
    return [x + 2 for x in str.encode(string)]
def char_decoder(tokens):
    return "".join([chr(x - 2) if x > 1 else "" for x in tokens])

# switch 
def get_tokenizer_config(segmentation): 
    switcher = { 
        Segmentation.Word: get_tokenizer('basic_english'),
        # Segmentation.Subword: XLNetTokenizer.from_pretrained('xlnet-base-cased'), 
        Segmentation.Subword: BertTokenizer.from_pretrained("bert-base-uncased"),
        Segmentation.Character: char_tokenizer, 
        Segmentation.BYTE: "one", 
        Segmentation.BBPE: "one", 
    } 
    # default to word
    return switcher.get(segmentation, get_tokenizer('basic_english'))

# train_data, val_data, test_data, vocab = load_data(config)

test_string = "This is a preconfigured test string to test the tokenization process. Segmentation is like fragmentation."
dataset, segmentation = extract_config(config, "dataset", "segmentation")
dataset = getattr(datasets, dataset.name) 
tokenizer = get_tokenizer('subword')
# tokenizer = get_tokenizer_config(segmentation) 
print(tokenizer)
tokenizer(test_string)

NameError: name 'Dataset' is not defined

In [95]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

print(tokenizer)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=798011.0), HTML(value='')))


PreTrainedTokenizer(name_or_path='xlnet-large-cased', vocab_size=32000, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='left', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '<sep>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': '<mask>', 'additional_special_tokens': ['<eop>', '<eod>']})


In [7]:
from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')
# tokenizer = get_tokenizer('subword')
# model = XLNetModel.from_pretrained('xlnet-base-cased')
inputs = tokenizer.encode("Hello, my dog is cute")
# inputs = tokenizer("Hello, my dog is cute")


print(inputs)
print(tokenizer.__dict__.keys())
# print(tokenizer._convert_id_to_token(inputs[]))
vocab = tokenizer.get_vocab()
print(tokenizer.convert_tokens_to_string([tokenizer._convert_id_to_token(x) for x in inputs]))
# emb_to_string(inputs, vocab)
# load_vocab(tokenizer.vocab_file)

# outputs = model(**inputs)
# last_hidden_states = outputs.last_hidden_state
tokenizer._convert_id_to_token(3)
vocab['<eos>']

[17, 11368, 19, 94, 2288, 27, 10920, 4, 3]
Hello, my dog is cute<sep><cls>


KeyError: '<eos>'

In [62]:

tokens = [x + 2 for x in str.encode("abcdABCD")]
print(tokens)
char_decoder(tokens)


[99, 100, 101, 102, 67, 68, 69, 70]


'abcdABCD'

In [28]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer("Hello world")

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1042301.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=456318.0), HTML(value='')))




{'input_ids': [15496, 995], 'attention_mask': [1, 1]}

In [10]:
# pytorch lightning stuff
def load_data_pl(config): 
    # get dataset
    dataset = extract_config(config, "dataset")
    dataset = getattr(datasets, dataset.name) 
    tokenizer = get_tokenizer('basic_english')
    field_processor = Field(tokenize=tokenizer)

    # split dataset
    train_dataset, val_dataset, test_dataset = dataset.splits(text_field=field_processor)
    
    # get vocabulary
    field_processor.build_vocab(train_dataset, val_dataset, test_dataset, min_freq=1)

    return train_dataset, val_dataset, test_dataset, field_processor




In [8]:
# generate/visualize artifacts
def initalize_artifacts(config, train_data_batches, val_data_batches):
        n_epochs, max_seq_len = extract_config(config, "n_epochs", "max_seq_len")
        training_cel = torch.ones(n_epochs, math.ceil(len(train_data_batches) / max_seq_len)) * float("inf")
        validation_cel = torch.ones(n_epochs, math.ceil(len(val_data_batches) / max_seq_len)) * float("inf")
        artifacts = {
            "training": {
                "CrossEntropyLoss": training_cel
            },
            "validation": {
                "CrossEntropyLoss": validation_cel
            }
        }
        return artifacts

def update_artifact_loss(artifacts, training_stage, metric, epoch, batch, value):
    try:
        artifacts[training_stage][metric][epoch - 1][batch] = value
    except Exception as e:
        print("exception:", e)
        print("epoch", epoch)
        print("batch", batch)
        print(artifacts)

def visualize_artifacts(artifacts):
    flat_loss = artifacts['training']['CrossEntropyLoss'].reshape(-1)
    count = flat_loss.size(0)
    batch_number = np.arange(0, flat_loss.size(0))
    plt.plot(batch_number, flat_loss)
    plt.legend("CrossEntropyLoss")
    None

# artifacts = initalize_artifacts(config, train_data_batches, val_data_batches)
# update_artifact_loss(artifacts, 'training', 'CrossEntropyLoss', 0, 1, 0.5)
# update_artifact_loss(artifacts, 'training', 'CrossEntropyLoss', 0, 2, 3)
# # artifacts['training']['CrossEntropyLoss'].reshape(-1)
# visualize_artifacts(artifacts)
# # visualize_artifacts(artifacts)


In [11]:
# Decoder only transformer implementation
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoderLayer, TransformerDecoder, LayerNorm
from torch import Tensor
from typing import Optional, Any

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

# Decoder Only implmentation without memory for encoder
# Adapted from pytorch implmentation @ https://pytorch.org/docs/stable/_modules/torch/nn/modules/transformer.html#TransformerDecoderLayer
class CustomTransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
        super(CustomTransformerDecoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model) # skip
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout) # skip
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

    def __setstate__(self, state):
        if 'activation' not in state:
            state['activation'] = F.relu
        super(CustomTransformerDecoderLayer, self).__setstate__(state)

    def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None) -> Tensor:

        tgt2 = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        # tgt2 = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
        #                            key_padding_mask=memory_key_padding_mask)[0]
        # tgt = tgt + self.dropout2(tgt2)
        # tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt



def _get_clones(module, N):
    return ModuleList([copy.deepcopy(module) for i in range(N)])


def _get_activation_fn(activation):
    if activation == "relu":
        return F.relu
    elif activation == "gelu":
        return F.gelu

    raise RuntimeError("activation should be relu/gelu, not {}".format(activation))

# decoder only implmentation
# pytorch implmentation for torch ligthning
# class Transformer(pl.LightningModule):
class DecoderOnlyTransformer(nn.Module):
    def __init__(self, ntokens, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", custom_encoder=None, custom_decoder=None):
        super(DecoderOnlyTransformer, self).__init__()
        # model vars
        self.d_model = d_model
        self.nhead = nhead

        # decoder setup 
        decoder_layer = CustomTransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        decoder_norm = LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)

        # embedding setup
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.to_embedding = nn.Embedding(ntokens, d_model)

        # output setup
        self.linear = nn.Linear(d_model, ntokens)

        self._reset_parameters()


    def forward(self, tgt, tgt_mask=None, memory_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
   
        # convert input/targets to embeddings
        tgt = self.to_embedding(tgt) * math.sqrt(self.d_model)

        # add positional encodings
        tgt = self.pos_encoder(tgt)

        # pytorch checks
        # https://pytorch.org/docs/master/generated/torch.nn.Transformer.html#torch.nn.Transformer.forward
        if  tgt.size(2) != self.d_model:
            raise RuntimeError("the feature number of tgt must be equal to d_model")
        
        # decoder pass
        output = self.decoder(tgt, memory=None, tgt_mask=tgt_mask,
                              tgt_key_padding_mask=tgt_key_padding_mask,
                              memory_key_padding_mask=memory_key_padding_mask)
        # return after linear layer
        return self.linear(output)

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

In [9]:
# constants/enums
class Dataset(enum.Enum):
    PennTreebank = 0,
    WikiText2 = 1,
    WikiText103 = 2

class LanguageTask(enum.Enum):
    CausalLanuageModeling = 0,
    MaskedLanuageModeling = 1

class Segmentation(enum.Enum):
    Word = 0,
    Subword = 1
    Character = 2
    BPE = 3
    BBPE = 4
    BYTE = 5

# configure model
config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": Dataset.PennTreebank,
    "segmentation": Segmentation.Word,
    "max_seq_len": 35,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2,
    "n_epochs": 3,
    "loss_criterion": "CrossEntropyLoss"
}
# validate 
validate_config(config)

# extract config vars
embedding_dimension, n_attention_heads, n_encoder_layers, n_decoder_layers, ff_dimension, dropout, batch_size, eval_batch_size = extract_config(config, "embedding_dimension", "n_attention_heads", "n_encoder_layers", "n_decoder_layers", "ff_dimension", "dropout", "batch_size", "eval_batch_size")


# configure device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")


# load training data
train_data, val_data, test_data, vocab = load_data(config)
ntokens = len(vocab.stoi)

# batch data
train_data_batches = batchify(train_data, batch_size, device)
val_data_batches = batchify(val_data, eval_batch_size, device)
test_data_batches = batchify(test_data, eval_batch_size, device)


# instantiate model
model = DecoderOnlyTransformer(ntokens, d_model=embedding_dimension, nhead=n_attention_heads, num_encoder_layers=n_encoder_layers, num_decoder_layers=n_decoder_layers, dim_feedforward=ff_dimension, dropout=dropout).to(device)


# model = Transformer(embedding_dimension).to(device)


# training w/ lightning
# trainer = pl.Trainer(gpus=4, num_nodes=8, precision=16, limit_train_batches=0.5)
# trainer.fit(model, train_loader, val_loader)

# evaluation


[Start Load Data]
Fetched Data (0.000015s)
Tokenized and Split Data (0.695258s)
[<torchtext.data.example.Example object at 0x7f47e95389a0>]
Built Vocab (0.876123s)
[End Load Data] (14.028440s)


NameError: name 'DecoderOnlyTransformer' is not defined

In [25]:
wandb.init(project="words2btyes")
config = wandb.config
print(config)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

{}


In [17]:

lr = 5.0 # learning rate
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


In [18]:
def train(model, optimizer, criterion, config, epoch, artifacts):
    max_seq_len = extract_config(config, "max_seq_len")
    
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(max_seq_len).to(device)
    for batch, i in enumerate(range(0, train_data_batches.size(0) - 1, max_seq_len)):
        data, targets = get_batch(max_seq_len, train_data_batches, i)
        optimizer.zero_grad()
        if data.size(0) != max_seq_len:
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
        # print(data.dtype)
        # output = model(data, targets)
        reshape_seq_len = min(data.size(0), max_seq_len)
        targets_flat = targets.reshape(reshape_seq_len, targets.size(0)//reshape_seq_len)
        output = model(data, src_mask)
        # output = model(data, targets_flat, src_mask)
        # output = model(data, targets_flat, src_mask, src_mask)

        output.view(-1, ntokens)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        update_artifact_loss(artifacts, 'training', 'CrossEntropyLoss', epoch, batch, loss.item())

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data_batches) // max_seq_len, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [19]:

def evaluate(model, data_source, config):
    max_seq_len = extract_config(config, "max_seq_len")
    
    model.eval() # Turn on the evaluation mode
    total_loss = 0.
    src_mask = model.generate_square_subsequent_mask(max_seq_len).to(device)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, max_seq_len):
            data, targets = get_batch(max_seq_len, data_source, i)
            
            # print(data)
            # print(targets)
            if data.size(0) != max_seq_len:
                src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
            # output = model(data, targets)
            reshape_seq_len = min(data.size(0), max_seq_len)
            targets_flat = targets.reshape(reshape_seq_len, targets.size(0)//reshape_seq_len)
            output = model(data, src_mask)
            # output = model(data, targets_flat, src_mask, src_mask)
            # output = model(data, targets_flat, src_mask, src_mask)

            output_flat = output.view(-1, ntokens)
            loss = criterion(output_flat, targets)
            # update_artifact_loss(artifacts, 'training', 'CrossEntropyLoss', epoch, batch, loss.item())
            total_loss += len(data) * loss.item()
    return total_loss / (len(data_source) - 1)

In [26]:
# train loop
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None
artifacts = initalize_artifacts(config, train_data_batches, val_data_batches)

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model, optimizer, criterion, config, epoch, artifacts)
    val_loss = evaluate(model, val_data_batches, config)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

visualize_artifacts(artifacts)

AssertionError: Key 'n_epochs' not in config

In [1]:
visualize_artifacts(artifacts)

NameError: name 'visualize_artifacts' is not defined

In [55]:
test_loss = evaluate(best_model, test_data_batches, config)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| End of training | test loss  5.51 | test ppl   246.15


In [56]:
best_model

DecoderOnlyTransformer(
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): CustomTransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=200, out_features=200, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): Linear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=200, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=200, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
        (dropout3): Dropout(p=0.2, inplace=False)
      )
      (1): CustomTransformerDecoderLayer(
        (sel

In [None]:

# generate experiment configs
n_attention_heads_range = range(2,6)
n_layers_range = range(2,6)
experiment_datasets = [ Dataset.PennTreebank, Dataset.WikiText2, Dataset.WikiText103 ]
max_seq_len_range = range()
embedding_dimension
# datasets
def generateExperiements():
    # for each dataset
        # 
    config = {
        "embedding_dimension": 200,
        "ff_dimension": 200,
        "n_attention_heads": 2,
        "n_encoder_layers": 2,
        "n_decoder_layers": 2,
        "dataset": Dataset.PennTreebank,
        "max_seq_len": 35,
        "batch_size": 20,
        "eval_batch_size": 10,
        "n_epochs": 3
    }
    pass




    


In [None]:
# pytorch lightning experimentation
train_dataset, val_dataset, test_dataset, field_processor = load_data_pl(config)
train_loader = DataLoader(train_dataset, batch_size=config["batch_size"])
val_loader = DataLoader(val_dataset, batch_size=config["batch_size"])



In [None]:
# get scaling laws plots
    # map config values to scaling laws (model size, compute, dataset size)

# scaling laws goals
    # predict test loss
    


In [None]:
# visualize attention in encoder and decoder layers
# visualize

In [None]:
# wandb sweep
# https://docs.wandb.ai/sweeps/python-api

WANDB_ENTITY = ""
WANDB_PROJECT = ""

sweep_config = {
  "name": "My Sweep",
  "method": "grid",
  "parameters": {
        "param1": {
            "values": [1, 2, 3]
        }
    }
}

sweep_id = wandb.sweep(sweep_config)

In [None]:
def train():
    run = wandb.init()
    print("config:", dict(run.config))
    for epoch in range(35):
        print("running", epoch)
        wandb.log({"metric": run.config.param1, "epoch": epoch})
        time.sleep(1)

wandb.agent(sweep_id, function=train)