In [2]:

# imports
import copy
import enum
import io
import math
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from torchtext import datasets, vocab
from torchtext.data import Field
from torchtext.data.utils import get_tokenizer
from torchtext.utils import download_from_url, extract_archive
from torchtext.vocab import build_vocab_from_iterator

import pytorch_lightning as pl


In [3]:
# batch functions
def batchify(data, bsz, device):
    """Lay a 1-D tensor out as ``bsz`` parallel columns.

    The stream is cut into ``bsz`` equally long consecutive chunks; elements
    that do not fill a complete row across all chunks are dropped from the end.

    Args:
        data: 1-D tensor to arrange.
        bsz: number of columns in the result.
        device: accepted for signature compatibility; currently unused because
            the transfer to ``device`` is commented out.

    Returns:
        Contiguous tensor of shape ``(data.size(0) // bsz, bsz)`` in which
        column ``j`` holds the ``j``-th consecutive chunk of ``data``.
    """
    rows = data.size(0) // bsz
    # Keep only the prefix that fills every column completely.
    usable = data.narrow(0, 0, rows * bsz)
    # One chunk per row, then transpose so that each chunk becomes a column.
    chunks = usable.view(bsz, -1)
    return chunks.t().contiguous()  # .to(device)

def batchify_pad(data, batch_size, embedding_dimension, device):
    """Pad a 1-D tensor and reshape it to ``(nbatch, batch_size, embedding_dimension)``.

    Method 1: pad up to the next multiple of ``batch_size * embedding_dimension``
    (a mask may still be needed by the consumer).

    Args:
        data: 1-D tensor to batch.
        batch_size: size of the second output dimension.
        embedding_dimension: size of the last output dimension.
        device: device the result is moved to.

    Returns:
        ``torch.long`` tensor of shape ``(nbatch, batch_size, embedding_dimension)``
        on ``device``; trailing positions that had no source element hold
        ``PAD_VALUE``.
    """
    num_batched_units = batch_size * embedding_dimension

    # TODO: identify the correct pad value (pad token in vocab?)
    PAD_VALUE = 0

    # Number of elements missing to reach the next multiple. This is 0 when the
    # input already fits exactly, so no all-padding batch is appended.
    padcount = (-data.size(0)) % num_batched_units
    if padcount:
        pad = torch.ones(padcount, dtype=data.dtype, device=data.device) * PAD_VALUE
        data = torch.cat((data, pad))

    # Number of complete (batch_size, embedding_dimension) slabs.
    nbatch = data.size(0) // num_batched_units

    # Evenly divide the data across the batches.
    data = data.view((nbatch, batch_size, embedding_dimension)).contiguous()
    data = data.to(torch.long)  # cast to long (integer) dtype
    return data.to(device)


def batchify_trim(data, batch_size, embedding_dimension, device):
    """Trim a 1-D tensor and reshape it to ``(nbatch, batch_size, embedding_dimension)``.

    Method 2: drop the tail that does not fill a complete
    ``batch_size * embedding_dimension`` slab (so no mask is needed for padding).

    Args:
        data: 1-D tensor to batch.
        batch_size: size of the second output dimension.
        embedding_dimension: size of the last output dimension.
        device: device the result is moved to.

    Returns:
        ``torch.float`` tensor of shape ``(nbatch, batch_size, embedding_dimension)``
        on ``device``.
    """
    units_per_batch = batch_size * embedding_dimension
    full_batches = data.size(0) // units_per_batch

    # Keep only as many leading elements as fit into complete slabs.
    kept = data.narrow(0, 0, full_batches * units_per_batch)
    shaped = kept.view((full_batches, batch_size, embedding_dimension)).contiguous()

    # NOTE(review): the result is cast to float while batchify_pad casts to
    # long; the reason for the float cast is not documented - confirm.
    return shaped.to(torch.float).to(device)

# default to trim
# batchify = batchify_trim

def get_batch(max_seq_len, source, i):
    """Slice one input window and its one-step-shifted targets out of ``source``.

    Args:
        max_seq_len: maximum number of rows in the window.
        source: tensor indexed along its first dimension.
        i: index of the first row of the window.

    Returns:
        ``(data, target)`` where ``data`` is ``source[i:i+n]`` and ``target`` is
        ``source[i+1:i+1+n]`` flattened to 1-D, with
        ``n = min(max_seq_len, len(source) - 1 - i)``.
    """
    # The window is shortened near the end so a shifted target row always exists.
    stop = i + min(max_seq_len, len(source) - 1 - i)
    inputs = source[i:stop]
    shifted = source[i + 1:stop + 1]
    return inputs, shifted.reshape(-1)

# utils 
def extract_config(config, *argv):
    """Look up one or more keys in ``config``.

    Args:
        config: mapping to read from.
        *argv: keys to extract; at least one is required.

    Returns:
        The single value when exactly one key is given, otherwise a tuple of
        values in the order the keys were requested.

    Raises:
        AssertionError: if no key is given or a key is missing from ``config``.
    """
    assert len(argv) > 0, "No keys to extract"
    for key in argv:
        assert key in config, f"Key '{key}' not in config"

    values = tuple(config[key] for key in argv)
    return values if len(values) > 1 else values[0]

def validate_config(config):
    """Check that the model configuration is internally consistent.

    Args:
        config: mapping containing ``"embedding_dimension"`` and
            ``"n_attention_heads"``.

    Raises:
        AssertionError: if a required key is missing, or if the embedding
            dimension is not a multiple of the number of attention heads.
    """
    dimension_and_heads = extract_config(config, "embedding_dimension", "n_attention_heads")
    embedding_dimension, n_attention_heads = dimension_and_heads

    # embedding dimension must be divisible by n_attention_heads
    leftover = embedding_dimension % n_attention_heads
    assert leftover == 0, f"Embedding dimension ({embedding_dimension}) must be divisible by n_attention_heads ({n_attention_heads})"

def emb_to_string(emb, vocab):
    """Render a sequence of token indices as a space-separated string.

    Args:
        emb: iterable of indices into ``vocab.itos``.
        vocab: object exposing an ``itos`` index-to-token lookup.

    Returns:
        The looked-up tokens joined with single spaces.
    """
    index_to_token = vocab.itos
    return ' '.join(index_to_token[index] for index in emb)

In [4]:
# load training data
# TODO: change to use config
def load_data():
    """Download WikiText-2 and return it as flat tensors of token indices.

    The vocabulary is built from the training split only and then used to
    encode all three splits. Every file is opened in a ``with`` block so the
    handle is closed as soon as it has been consumed.

    Returns:
        ``(train_data, val_data, test_data, vocab)`` where each ``*_data`` is a
        1-D ``torch.long`` tensor of token indices and ``vocab`` is the
        vocabulary built by ``build_vocab_from_iterator``.
    """
    url = 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip'
    # NOTE(review): relies on extract_archive returning the paths in
    # (test, valid, train) order - confirm against the archive layout.
    test_filepath, valid_filepath, train_filepath = extract_archive(download_from_url(url))
    tokenizer = get_tokenizer('basic_english')

    with io.open(train_filepath, encoding="utf8") as train_file:
        vocab = build_vocab_from_iterator(map(tokenizer, train_file))  # should build from all text

    def data_process(filepath):
        """Encode every line of ``filepath`` and concatenate the non-empty results."""
        with io.open(filepath, encoding="utf8") as raw_text:
            data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                                 dtype=torch.long) for item in raw_text]
        # Lines that tokenize to nothing produce empty tensors; skip them.
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_data = data_process(train_filepath)
    val_data = data_process(valid_filepath)
    test_data = data_process(test_filepath)

    return train_data, val_data, test_data, vocab

def load_data_pl(config):
    """Load the dataset named in ``config`` through torchtext's ``Field`` API.

    Args:
        config: mapping whose ``"dataset"`` entry is an object with a ``name``
            attribute matching an attribute of ``torchtext.datasets``.

    Returns:
        ``(train_dataset, val_dataset, test_dataset, field_processor)``; the
        field's vocabulary has been built from all three splits.
    """
    # Resolve the configured dataset to the torchtext class of the same name.
    dataset_choice = extract_config(config, "dataset")
    dataset_cls = getattr(datasets, dataset_choice.name)

    field_processor = Field(tokenize=get_tokenizer('basic_english'))

    # split dataset
    train_dataset, val_dataset, test_dataset = dataset_cls.splits(text_field=field_processor)

    # get vocabulary
    field_processor.build_vocab(train_dataset, val_dataset, test_dataset, min_freq=1)

    return train_dataset, val_dataset, test_dataset, field_processor




In [5]:
# train_data, val_data, test_data, vocab = load_data()
# batch_size, embedding_dimension = extract_config(config, "batch_size", "embedding_dimension")




In [6]:
# def batchify2(data, batch_size, embedding_dimension):
#     # Divide the dataset into batch_size parts.
#     nbatch = data.size(0) // batch_size
#     print(nbatch)
#     # Trim off any extra elements that wouldn't cleanly fit (remainders).
#     data = data.narrow(0, 0, nbatch * batch_size)
#     # Evenly divide the data across the batch_size batches.
#     data = data.view(batch_size, -1).t().contiguous()
#     return data.to(device)
# train_data_batched = batchify2(train_data, batch_size, embedding_dimension)
# print(train_data_batched.shape)

In [7]:
# batch_size = 4
# embedding_dimension = 5
# src = torch.rand((3, batch_size, embedding_dimension))
# # src = torch.rand((10, 32, 512))
# src.shape
# print(src)


In [8]:
# import random

# batch_size = 4
# embedding_dimension = 5
# batched_units = batch_size* embedding_dimension
# count = (random.randint(1,5) * batched_units) + random.randint(1, embedding_dimension)
# print(count)
# print(count % batched_units)
# print(batched_units - (count % batched_units))
# test = torch.rand(count)

# PAD_VALUE = 0
# padcount = batched_units - (count % batched_units)
# pad = torch.ones(padcount) * PAD_VALUE

# data = torch.cat((test, pad))
# s_len = int(len(data) / batched_units)
# print(s_len)
# data.view((s_len, batch_size, embedding_dimension)).contiguous()




# train_data_batched = batchify(train_data, batch_size, embedding_dimension)
# print(train_data_batched.shape)
# print(train_data_batched)

In [9]:
# Sanity check of nn.CrossEntropyLoss on random data: 3 samples, 5 classes.
# The scores are named `logits` rather than `input` so that the builtin
# `input()` is not shadowed for the rest of the kernel session.
loss = nn.CrossEntropyLoss()
logits = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)  # random class ids in [0, 5)
print(logits.shape)
print(target.shape)
output = loss(logits, target)
output

torch.Size([3, 5])
torch.Size([3])


tensor(2.7819, grad_fn=<NllLossBackward>)

In [26]:
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoderLayer, TransformerDecoder, LayerNorm



# PyTorch implementation for PyTorch Lightning
# class Transformer(pl.LightningModule):
class Transformer(nn.Module):
    """Encoder-decoder transformer operating on token indices.

    ``forward`` embeds ``src`` and ``tgt`` with a shared embedding table, adds
    positional encodings, runs the encoder and decoder stacks and projects the
    decoder output to ``ntokens`` scores per position.

    Args:
        ntokens: vocabulary size (rows of the embedding table and width of the
            output projection).
        d_model: embedding / model dimension.
        nhead: number of attention heads.
        num_encoder_layers: number of layers in the default encoder.
        num_decoder_layers: number of layers in the default decoder.
        dim_feedforward: hidden size of the feed-forward sub-layers.
        dropout: dropout probability, also used by the positional encoding.
        activation: activation name passed to the encoder/decoder layers.
        custom_encoder: optional module used instead of the default encoder.
        custom_decoder: optional module used instead of the default decoder.
    """

    def __init__(self, ntokens, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", custom_encoder=None, custom_decoder=None):
        super().__init__()

        if custom_encoder is not None:
            self.encoder = custom_encoder
        else:
            encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
            encoder_norm = LayerNorm(d_model)
            self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        if custom_decoder is not None:
            self.decoder = custom_decoder
        else:
            decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
            decoder_norm = LayerNorm(d_model)
            self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)

        # Only the encoder/decoder exist at this point, so only their weights
        # are Xavier-initialised; the layers created below keep their defaults.
        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

        self.linear = nn.Linear(d_model, ntokens)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        # Fixed: the original referenced the undefined name `ntoken`.
        self.to_embedding = nn.Embedding(ntokens, d_model)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None,
                memory_mask=None, src_key_padding_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Compute per-position vocabulary scores for ``tgt`` given ``src``.

        Args:
            src: integer token indices of shape ``(S, N)``.
            tgt: integer token indices of shape ``(T, N)``.
            src_mask, tgt_mask, memory_mask, src_key_padding_mask,
            tgt_key_padding_mask, memory_key_padding_mask: forwarded unchanged
                to the encoder / decoder.

        Returns:
            Tensor of shape ``(T, N, ntokens)``.

        Raises:
            RuntimeError: if ``src`` or ``tgt`` is not 2-D, or their batch
                sizes differ.
        """
        # Reject e.g. flattened targets early: a 1-D input would otherwise be
        # broadcast against the positional encoding into a wrong 3-D shape.
        if src.dim() != 2 or tgt.dim() != 2:
            raise RuntimeError("src and tgt must be 2-D tensors of token indices shaped (sequence, batch)")

        # pytorch checks
        # https://pytorch.org/docs/master/generated/torch.nn.Transformer.html#torch.nn.Transformer.forward
        if src.size(1) != tgt.size(1):
            raise RuntimeError("the batch number of src and tgt must be equal")

        # convert input/targets to embeddings
        src = self.to_embedding(src) * math.sqrt(self.d_model)
        tgt = self.to_embedding(tgt) * math.sqrt(self.d_model)

        # add positional encodings
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)

        if src.size(2) != self.d_model or tgt.size(2) != self.d_model:
            raise RuntimeError("the feature number of src and tgt must be equal to d_model")

        # encoder pass
        memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)

        # decoder pass
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
                              tgt_key_padding_mask=tgt_key_padding_mask,
                              memory_key_padding_mask=memory_key_padding_mask)
        # return after linear layer
        return self.linear(output)

    def generate_square_subsequent_mask(self, sz):
        """Return an ``(sz, sz)`` float mask: 0.0 on and below the diagonal, ``-inf`` above it."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def _reset_parameters(self):
        """Xavier-initialise every currently registered parameter with more than one dimension."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    Defined here because ``Transformer`` instantiates it and no definition is
    visible elsewhere in the notebook.

    Args:
        d_model: embedding dimension.
        dropout: dropout probability applied after adding the encoding.
        max_len: longest sequence length supported.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model)
        )
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Shape (max_len, 1, d_model) so it broadcasts over the batch dimension.
        self.register_buffer("pe", pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` of shape ``(seq, batch, d_model)`` with position encodings added."""
        return self.dropout(x + self.pe[: x.size(0)])



In [11]:

# constants/enums
class Dataset(enum.Enum):
    """Corpora selectable through the ``"dataset"`` config entry.

    Member names are looked up as attributes of ``torchtext.datasets`` by
    ``load_data_pl``. Values are plain ints; the original trailing commas
    turned the first two values into 1-tuples.
    """
    PennTreebank = 0
    WikiText2 = 1
    WikiText103 = 2

class LanguageTask(enum.Enum):
    """Language-modeling objective.

    Values are plain ints; the original trailing comma made the first value a
    1-tuple.
    """
    CausalLanguageModeling = 0
    MaskedLanguageModeling = 1

    # Original (misspelled) member names, kept as aliases so existing
    # references keep resolving to the same members.
    CausalLanuageModeling = 0
    MaskedLanuageModeling = 1

class Segmentation(enum.Enum):
    """Text segmentation granularity.

    All values are plain ints; the original trailing comma made ``Word`` a
    1-tuple while every other member was an int.
    """
    Word = 0
    Subword = 1
    Character = 2
    BPE = 3
    BBPE = 4
    BYTE = 5

# configure model
# Baseline experiment configuration.
config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": Dataset.PennTreebank,
    "max_seq_len": 35,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2
}
# validate
validate_config(config)

# extract config vars
(
    embedding_dimension,
    n_attention_heads,
    n_encoder_layers,
    n_decoder_layers,
    ff_dimension,
    dropout,
    batch_size,
    eval_batch_size,
) = extract_config(
    config,
    "embedding_dimension",
    "n_attention_heads",
    "n_encoder_layers",
    "n_decoder_layers",
    "ff_dimension",
    "dropout",
    "batch_size",
    "eval_batch_size",
)


# configure device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")


# load training data
# NOTE: `vocab` rebinds the name imported by `from torchtext import datasets, vocab`;
# train() and evaluate() read this module-level `vocab`.
train_data, val_data, test_data, vocab = load_data()

print(train_data.shape)
print(train_data)
print(device)
# batch data
train_data_batches = batchify(train_data, batch_size, device)
val_data_batches = batchify(val_data, eval_batch_size, device)
test_data_batches = batchify(test_data, eval_batch_size, device)


# instantiate model
# Transformer requires the vocabulary size as its first argument; the original
# call omitted it and raised a TypeError.
ntokens = len(vocab.stoi)
model = Transformer(ntokens, d_model=embedding_dimension, nhead=n_attention_heads, num_encoder_layers=n_encoder_layers, num_decoder_layers=n_decoder_layers, dim_feedforward=ff_dimension, dropout=dropout)#.to(device)

# model = Transformer(embedding_dimension).to(device)


# training w/ lightning
# trainer = pl.Trainer(gpus=4, num_nodes=8, precision=16, limit_train_batches=0.5)
# trainer.fit(model, train_loader, val_loader)

# evaluation


36718lines [00:01, 25222.63lines/s]
torch.Size([2049990])
tensor([  10, 3850, 3870,  ..., 2443, 4811,    4])
cpu


In [12]:
# src.shape
# print(src.shape)
# # print(src[0][0])

In [30]:
# playground
# Run on the CPU for now; swap in the commented line to use CUDA when available.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

# Vocabulary size, passed to the model below as its `ntokens` argument.
ntokens = len(vocab.stoi)
print("ntokens", ntokens)

# Maximum number of rows per window handed to get_batch.
max_seq_len = extract_config(config, "max_seq_len")

def batchify(data, bsz, device):
    """Lay a 1-D tensor out as ``bsz`` parallel columns.

    This repeats the ``batchify`` already defined in the batch-functions cell
    near the top of the notebook; the later definition is the one in effect
    once this cell has run.

    Args:
        data: 1-D tensor to arrange.
        bsz: number of columns in the result.
        device: accepted for signature compatibility; currently unused because
            the transfer to ``device`` is commented out.

    Returns:
        Contiguous tensor of shape ``(data.size(0) // bsz, bsz)`` in which
        column ``j`` holds the ``j``-th consecutive chunk of ``data``.
    """
    rows = data.size(0) // bsz
    # Keep only the prefix that fills every column completely.
    usable = data.narrow(0, 0, rows * bsz)
    # One chunk per row, then transpose so that each chunk becomes a column.
    chunks = usable.view(bsz, -1)
    return chunks.t().contiguous()  # .to(device)

def get_batch(max_seq_len, source, i):
    """Slice one input window and its one-step-shifted targets out of ``source``.

    This repeats the ``get_batch`` already defined in the batch-functions cell
    near the top of the notebook.

    Args:
        max_seq_len: maximum number of rows in the window.
        source: tensor indexed along its first dimension.
        i: index of the first row of the window.

    Returns:
        ``(data, target)`` where ``data`` is ``source[i:i+n]`` and ``target`` is
        ``source[i+1:i+1+n]`` flattened to 1-D, with
        ``n = min(max_seq_len, len(source) - 1 - i)``.
    """
    # The window is shortened near the end so a shifted target row always exists.
    stop = i + min(max_seq_len, len(source) - 1 - i)
    inputs = source[i:stop]
    shifted = source[i + 1:stop + 1]
    return inputs, shifted.reshape(-1)

# bptt = max_seq_len
# def get_batch(source, i):
#     seq_len = min(bptt, len(source) - 1 - i)
#     data = source[i:i+seq_len]
#     target = source[i+1:i+1+seq_len].reshape(-1)
#     return data, target


train_data_batched = batchify(train_data, batch_size, device)
print(train_data_batched.shape)

# model = Transformer().to(device)
model = Transformer(ntokens, d_model=embedding_dimension, nhead=n_attention_heads, num_encoder_layers=n_encoder_layers, num_decoder_layers=n_decoder_layers, dim_feedforward=ff_dimension, dropout=dropout).to(device)
# NOTE(review): the mask is built but never passed to the model - confirm
# whether it should be supplied as src_mask and/or tgt_mask.
src_mask = model.generate_square_subsequent_mask(max_seq_len).to(device)
to_embedding = nn.Embedding(ntokens, embedding_dimension)

# Take only the first window of the training stream.
data, targets = get_batch(max_seq_len, train_data_batched, 0)

print("data", data.shape)
print(targets.shape)

# get_batch returns the targets flattened; restore the (seq, batch) layout of
# `data` so they can serve as the decoder input.
tgt_tokens = targets.reshape(data.shape)

# Stand-alone embedding lookup, kept only to inspect the embedded shapes.
src_embedding = to_embedding(data)#.to(device)
tgt_embedding = to_embedding(tgt_tokens)#.to(device)
print("src_embedding", src_embedding.shape)
print(tgt_embedding.shape)

# The model embeds token indices itself, so it is given the index tensors;
# feeding it the float embeddings above fails inside nn.Embedding.
output = model(data, tgt_tokens)
print(output.shape)

criterion = nn.CrossEntropyLoss()
loss = criterion(output.view(-1, ntokens), targets)
print(loss)


# loss = criterion(output, targets)


print(loss.shape)



ntokens 28783
torch.Size([102499, 20])
data torch.Size([35, 20])
torch.Size([700])
src_embedding torch.Size([35, 20, 200])
torch.Size([35, 20, 200])
torch.Size([35, 20, 28783])
tensor(10.4685, grad_fn=<NllLossBackward>)
torch.Size([35, 20, 28783])


torch.Size([35, 20])

In [None]:

lr = 5.0  # learning rate
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# Multiply the learning rate by 0.95 on every scheduler.step() call.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)


In [None]:
def train(model, optimizer, criterion, config):
    """Run one pass over ``train_data_batches`` and update ``model`` in place.

    Besides its arguments, this function reads the module-level names
    ``vocab``, ``train_data_batches``, ``epoch`` and ``scheduler`` (the last
    two only for the progress line).

    Args:
        model: network called as ``model(data, decoder_input)``.
        optimizer: optimizer stepped once per window.
        criterion: loss called as ``criterion(scores, targets)``.
        config: mapping providing ``"max_seq_len"``.
    """
    max_seq_len = extract_config(config, "max_seq_len")
    ntokens = len(vocab.stoi)
    log_interval = 200  # print a progress line every `log_interval` windows

    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    # NOTE(review): the mask is built (and rebuilt for a shorter last window)
    # but never passed to the model - confirm whether it should be supplied as
    # src_mask and/or tgt_mask.
    src_mask = model.generate_square_subsequent_mask(max_seq_len)#.to(device)
    for batch, i in enumerate(range(0, train_data_batches.size(0) - 1, max_seq_len)):
        data, targets = get_batch(max_seq_len, train_data_batches, i)
        optimizer.zero_grad()
        if data.size(0) != max_seq_len:
            src_mask = model.generate_square_subsequent_mask(data.size(0))#.to(device)

        # get_batch returns flattened targets; the model needs (seq, batch)
        # token indices, so restore the layout of `data` for the decoder input.
        output = model(data, targets.reshape(data.shape))
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            # get_last_lr() reports the rate currently in use; get_lr() is only
            # meant to be called from inside scheduler.step().
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data_batches) // max_seq_len, scheduler.get_last_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [None]:


def evaluate(model, data_source, config):
    """Return the loss of ``model`` over ``data_source``, averaged per row.

    Besides its arguments, this function reads the module-level names
    ``vocab``, ``criterion`` and ``device``.

    Args:
        model: network called as ``model(data, decoder_input)``.
        data_source: batched tensor, windowed with ``get_batch``.
        config: mapping providing ``"max_seq_len"``.

    Returns:
        Sum over windows of ``len(window) * loss``, divided by
        ``len(data_source) - 1``.
    """
    max_seq_len = extract_config(config, "max_seq_len")
    ntokens = len(vocab.stoi)

    model.eval() # Turn on the evaluation mode
    total_loss = 0.
    # NOTE(review): the mask is built but never passed to the model - confirm
    # whether it should be supplied as src_mask and/or tgt_mask.
    src_mask = model.generate_square_subsequent_mask(max_seq_len).to(device)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, max_seq_len):
            data, targets = get_batch(max_seq_len, data_source, i)
            if data.size(0) != max_seq_len:
                src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
            # get_batch returns flattened targets; restore the (seq, batch)
            # layout of `data` for the decoder input.
            output = model(data, targets.reshape(data.shape))
            output_flat = output.view(-1, ntokens)
            # Weight each window by its number of rows (the last may be shorter).
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [None]:
# train loop
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model, optimizer, criterion, config)
    val_loss = evaluate(model, val_data_batches, config)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Snapshot the weights. Assigning `model` itself would only store a
        # reference, so later epochs would silently overwrite the "best" model.
        best_model = copy.deepcopy(model)

    # Decay the learning rate once per epoch.
    scheduler.step()

In [None]:
# Final evaluation of the best checkpoint on the held-out test split.
test_loss = evaluate(best_model, test_data_batches, config)
rule = '=' * 89
print(rule)
# Perplexity is the exponential of the loss.
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print(rule)

In [None]:

# generate experiment configs
n_attention_heads_range = range(2,6)
n_layers_range = range(2,6)
experiment_datasets = [ Dataset.PennTreebank, Dataset.WikiText2, Dataset.WikiText103 ]
# `range()` without arguments raises TypeError. Until the sweep bounds are
# chosen, cover only the baseline window length of 35 used in the config cell.
# TODO: pick the real max_seq_len sweep bounds.
max_seq_len_range = range(35, 36)
# TODO: define the embedding_dimension sweep values.
# datasets
def generateExperiements():
    """Placeholder for generating experiment configurations.

    Currently builds a single baseline configuration locally and discards it.

    Returns:
        None.
    """
    # for each dataset
        #
    config = dict(
        embedding_dimension=200,
        ff_dimension=200,
        n_attention_heads=2,
        n_encoder_layers=2,
        n_decoder_layers=2,
        dataset=Dataset.PennTreebank,
        max_seq_len=35,
        batch_size=20,
        eval_batch_size=10,
    )
    return None

# artifacts?
    # generate/visualize


In [None]:
# PyTorch Lightning experimentation
train_dataset, val_dataset, test_dataset, field_processor = load_data_pl(config)
train_loader = DataLoader(train_dataset, batch_size=config["batch_size"])
# Validation uses the evaluation batch size, matching how val_data_batches is
# built in the setup cell.
val_loader = DataLoader(val_dataset, batch_size=config["eval_batch_size"])



In [None]:
# get scaling laws plots
    # map config values to scaling laws (model size, compute, dataset size)


In [None]:
# visualize attention in encoder and decoder layers
# visualize