In [1]:

# imports
import enum
import io
import time
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from torchtext import datasets, vocab
from torchtext.data import Field, BPTTIterator
from torchtext.utils import download_from_url, extract_archive
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from transformers import AutoTokenizer

import pytorch_lightning as pl
import matplotlib.pyplot as plt
import numpy as np 

import wandb


In [2]:
# utils 
def extract_config(config, *argv):
    assert len(argv) > 0, "No keys to extract"
    config_values = []
    for key in argv:
        assert key in config, f"Key '{key}' not in config"
        config_values.append(config[key])
    
    return tuple(config_values) if len(argv) > 1 else config_values[0]

def validate_config(config):
    embedding_dimension, n_attention_heads = extract_config(config, "embedding_dimension", "n_attention_heads")
    
    # embedding dimension must be divisible by n_attention_heads
    assert embedding_dimension %  n_attention_heads == 0, f"Embedding dimension ({embedding_dimension}) must be divisible by n_attention_heads ({n_attention_heads})"

def emb_to_string(emb, vocab):
    embeddings = vocab.itos
    words = [ embeddings[item] for item in emb ]
    return ' '.join(words)

In [3]:
# constants/enums
class Dataset(enum.Enum):
    PennTreebank = 0,
    WikiText2 = 1,
    WikiText103 = 2

class LanguageTask(enum.Enum):
    CausalLanuageModeling = 0,
    MaskedLanuageModeling = 1

class Segmentation(enum.Enum):
    Word = 0,
    Subword = 1
    Character = 2
    BPE = 3
    BBPE = 4
    BYTE = 5


In [4]:
# character tokenizer
# ## UTF-8 Encoder
# def char_tokenizer(string):
#     return [x + 2 for x in str.encode(string)]
# def char_decoder(tokens):
#     return "".join([chr(x - 2) if x > 1 else "" for x in tokens])

def char_tokenizer(string):
    return [x for x in string]
def char_decoder(tokens):
    return "".join([x for x in tokens])

# batch functions
def batchify(data, bsz, device):
    # Divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

def get_batch(max_seq_len, source, i):
    seq_len = min(max_seq_len, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

# load training data
def load_data(config):
    segmentation = extract_config(config, "segmentation")

    if segmentation == Segmentation.Word.name:
        return load_data_word(config)
    if segmentation == Segmentation.Subword.name:
        return load_data_subword(config)
    if segmentation == Segmentation.Character.name:
        return load_data_character(config)
    else:
        raise ValueError(f'Segementation {segmentation} not supported.')
    
# load word based training data
def load_data_word(config):
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, segmentation = extract_config(config, "dataset", "segmentation")
    dataset = getattr(datasets, dataset) 
    print(f"Fetched Data ({time.time() - ts:3f}s)")
    
    # tokenize
    tokenizer = get_tokenizer('basic_english')
    field_processor = Field(tokenize=tokenizer)

    # split dataset
    train_dataset, val_dataset, test_dataset = dataset.splits(text_field=field_processor)
    print(f"Tokenized and Split Data ({time.time() - ts:3f}s)")

    # get vocabulary
    field_processor.build_vocab(train_dataset, val_dataset, test_dataset, min_freq=1)
    vocab = field_processor.vocab
    print(f"Built Vocab ({time.time() - ts:3f}s)")


    def data_process(tt_dataset_split):
        raw_text_iter = tt_dataset_split[0].text
        data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                            dtype=torch.long) for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_data = data_process(train_dataset)
    val_data = data_process(val_dataset)
    test_data = data_process(test_dataset)

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_data, val_data, test_data, vocab

def load_data_subword(config):
    # load word based training data
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, segmentation = extract_config(config, "dataset", "segmentation")
    dataset = getattr(datasets, dataset) 


    tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')
    field_processor = Field(tokenize=tokenizer.encode)


    # tokenizer = get_tokenizer('subword')
    # field_processor = Field(tokenize=tokenizer)
    print(f"Fetched Data ({time.time() - ts:3f}s)")

    # split dataset
    train_dataset, val_dataset, test_dataset = dataset.splits(text_field=field_processor)
    print(f"Split Data ({time.time() - ts:3f}s)")

    print(train_dataset)
    # get vocabulary
    # field_processor.build_vocab(train_dataset, val_dataset, test_dataset, min_freq=1)
    # vocab = field_processor.vocab
    vocab = tokenizer.get_vocab()

    print(f"Build Vocab ({time.time() - ts:3f}s)")


    def data_process(tt_dataset_split):
        raw_text_iter = tt_dataset_split[0].text
        data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                            dtype=torch.long) for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_data = data_process(train_dataset)
    val_data = data_process(val_dataset)
    test_data = data_process(test_dataset)

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_data, val_data, test_data, vocab



def load_data_character(config):
    # load word based training data
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, segmentation = extract_config(config, "dataset", "segmentation")
    dataset = getattr(datasets, dataset) 
    # tokenizer = get_tokenizer('basic_english')
    tokenizer = char_tokenizer
    field_processor = Field(tokenize=tokenizer)
    print(f"Fetched Data ({time.time() - ts:3f}s)")

    # split dataset
    train_dataset, val_dataset, test_dataset = dataset.splits(text_field=field_processor)
    print(f"Split Data ({time.time() - ts:3f}s)")

    print(train_dataset[0:10])
    # get vocabulary
    field_processor.build_vocab(train_dataset, val_dataset, test_dataset, min_freq=1)
    vocab = field_processor.vocab
    print(f"Build Vocab ({time.time() - ts:3f}s)")


    def data_process(tt_dataset_split):
        raw_text_iter = tt_dataset_split[0].text
        data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                            dtype=torch.long) for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_data = data_process(train_dataset)
    val_data = data_process(val_dataset)
    test_data = data_process(test_dataset)

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_data, val_data, test_data, vocab


In [5]:
# pytorch lightning stuff
def load_data_pl(config): 
    # get dataset
    dataset = extract_config(config, "dataset")
    dataset = getattr(datasets, dataset.name) 
    tokenizer = get_tokenizer('basic_english')
    field_processor = Field(tokenize=tokenizer)

    # split dataset
    train_dataset, val_dataset, test_dataset = dataset.splits(text_field=field_processor)
    
    # get vocabulary
    field_processor.build_vocab(train_dataset, val_dataset, test_dataset, min_freq=1)

    return train_dataset, val_dataset, test_dataset, field_processor




In [6]:

def generateExperiements():
    WANDB_ENTITY = "skgbafa"
    # WANDB_ENTITY = "openai-scholars"
    WANDB_PROJECT = ""

    # experiment_datasets = [ Dataset.PennTreebank.name, Dataset.WikiText2.name, Dataset.WikiText103.name ]
    experiment_datasets = [ Dataset.WikiText2.name ]
    experiment_segmentation = [Segmentation.Word.name, Segmentation.Character.name]
    # for each dataset
        # 
    sweep_parameters = {
        "n_attention_heads": {
            "values": [2, 3, ]
        },
        "n_decoder_layers": {
            "values": [2, 4, 6]
        },
        "dataset": {
            "values": experiment_datasets
        },
        "n_epochs": {
            "values": [3]
        },
        "segmentation": {
            "values": experiment_segmentation
        }
    }

    sweep_config = {
        "name": "Experamental Sweeps",
        "method": "grid",
        "parameters": sweep_parameters
    }
    
    sweep_id = wandb.sweep(sweep_config, entity=WANDB_ENTITY)

    return sweep_id




    


In [7]:
# generate/visualize artifacts
def initalize_artifacts(config, train_data_batches, val_data_batches):
        n_epochs, max_seq_len = extract_config(config, "n_epochs", "max_seq_len")
        training_cel = torch.ones(n_epochs, math.ceil(len(train_data_batches) / max_seq_len)) * float("inf")
        validation_cel = torch.ones(n_epochs, math.ceil(len(val_data_batches) / max_seq_len)) * float("inf")
        artifacts = {
            "training": {
                "CrossEntropyLoss": training_cel
            },
            "validation": {
                "CrossEntropyLoss": validation_cel
            }
        }
        return artifacts

def update_artifact_loss(artifacts, training_stage, metric, epoch, batch, value):
    try:
        artifacts[training_stage][metric][epoch - 1][batch] = value
    except Exception as e:
        print("exception:", e)
        print("epoch", epoch)
        print("batch", batch)
        print(artifacts)

def visualize_artifacts(artifacts):
    flat_loss = artifacts['training']['CrossEntropyLoss'].reshape(-1)
    count = flat_loss.size(0)
    batch_number = np.arange(0, flat_loss.size(0))
    plt.plot(batch_number, flat_loss)
    plt.legend("CrossEntropyLoss")
    None

# artifacts = initalize_artifacts(config, train_data_batches, val_data_batches)
# update_artifact_loss(artifacts, 'training', 'CrossEntropyLoss', 0, 1, 0.5)
# update_artifact_loss(artifacts, 'training', 'CrossEntropyLoss', 0, 2, 3)
# # artifacts['training']['CrossEntropyLoss'].reshape(-1)
# visualize_artifacts(artifacts)
# # visualize_artifacts(artifacts)


In [8]:
# Decoder only transformer implementation
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoderLayer, TransformerDecoder, LayerNorm
from torch import Tensor
from typing import Optional, Any

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

# Decoder Only implmentation without memory for encoder
# Adapted from pytorch implmentation @ https://pytorch.org/docs/stable/_modules/torch/nn/modules/transformer.html#TransformerDecoderLayer
class CustomTransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
        super(CustomTransformerDecoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model) # skip
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout) # skip
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)

    def __setstate__(self, state):
        if 'activation' not in state:
            state['activation'] = F.relu
        super(CustomTransformerDecoderLayer, self).__setstate__(state)

    def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None) -> Tensor:

        tgt2 = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask,
                              key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        # tgt2 = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
        #                            key_padding_mask=memory_key_padding_mask)[0]
        # tgt = tgt + self.dropout2(tgt2)
        # tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt



def _get_clones(module, N):
    return ModuleList([copy.deepcopy(module) for i in range(N)])


def _get_activation_fn(activation):
    if activation == "relu":
        return F.relu
    elif activation == "gelu":
        return F.gelu

    raise RuntimeError("activation should be relu/gelu, not {}".format(activation))

# decoder only implmentation
# pytorch implmentation for torch ligthning
# class Transformer(pl.LightningModule):
class DecoderOnlyTransformer(nn.Module):
    def __init__(self, ntokens, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", custom_encoder=None, custom_decoder=None):
        super(DecoderOnlyTransformer, self).__init__()
        # model vars
        self.d_model = d_model
        self.nhead = nhead

        # decoder setup 
        decoder_layer = CustomTransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        decoder_norm = LayerNorm(d_model)
        self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)

        # embedding setup
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.to_embedding = nn.Embedding(ntokens, d_model)

        # output setup
        self.linear = nn.Linear(d_model, ntokens)

        self._reset_parameters()


    def forward(self, tgt, tgt_mask=None, memory_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
   
        # convert input/targets to embeddings
        tgt = self.to_embedding(tgt) * math.sqrt(self.d_model)

        # add positional encodings
        tgt = self.pos_encoder(tgt)

        # pytorch checks
        # https://pytorch.org/docs/master/generated/torch.nn.Transformer.html#torch.nn.Transformer.forward
        if  tgt.size(2) != self.d_model:
            raise RuntimeError("the feature number of tgt must be equal to d_model")
        
        # decoder pass
        output = self.decoder(tgt, memory=None, tgt_mask=tgt_mask,
                              tgt_key_padding_mask=tgt_key_padding_mask,
                              memory_key_padding_mask=memory_key_padding_mask)
        # return after linear layer
        return self.linear(output)

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

In [9]:
# wandb.init(project="words2btyes")
# config = wandb.config
# print(config)

In [10]:

# lr = 5.0 # learning rate
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


NameError: name 'model' is not defined

In [62]:
def train(model, config, epoch, artifacts):
    max_seq_len = extract_config(config, "max_seq_len")
    
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(max_seq_len).to(device)
    for batch, i in enumerate(range(0, train_data_batches.size(0) - 1, max_seq_len)):
        data, targets = get_batch(max_seq_len, train_data_batches, i)
        optimizer.zero_grad()
        if data.size(0) != max_seq_len:
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
        # print(data.dtype)
        # output = model(data, targets)
        reshape_seq_len = min(data.size(0), max_seq_len)
        targets_flat = targets.reshape(reshape_seq_len, targets.size(0)//reshape_seq_len)
        output = model(data, src_mask)
        # output = model(data, targets_flat, src_mask)
        # output = model(data, targets_flat, src_mask, src_mask)

        output_flat = output.view(-1, ntokens)
        print("output_flat", output_flat.shape)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        # update_artifact_loss(artifacts, 'training', 'CrossEntropyLoss', epoch, batch, loss.item())
        wandb.log({
            # "elapsed_time": start_time - time.time(),
            "epoch": epoch,
            "batch": batch,
            "batch_loss": loss.item(),
            # "current_loss": cur_loss,
            "ppl": math.exp(loss.item()),
            "learning_rate": scheduler.get_lr()[0],
        })

        total_loss += loss.item()
        log_interval = 200
        cur_loss = total_loss / log_interval
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data_batches) // max_seq_len, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [63]:

def evaluate(model, data_source, config):
    max_seq_len = extract_config(config, "max_seq_len")
    
    model.eval() # Turn on the evaluation mode
    total_loss = 0.
    src_mask = model.generate_square_subsequent_mask(max_seq_len).to(device)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, max_seq_len):
            data, targets = get_batch(max_seq_len, data_source, i)
            
            # print(data)
            # print(targets)
            if data.size(0) != max_seq_len:
                src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
            # output = model(data, targets)
            reshape_seq_len = min(data.size(0), max_seq_len)
            targets_flat = targets.reshape(reshape_seq_len, targets.size(0)//reshape_seq_len)
            output = model(data, src_mask)
            # output = model(data, targets_flat, src_mask, src_mask)
            # output = model(data, targets_flat, src_mask, src_mask)

            output_flat = output.view(-1, ntokens)
            loss = criterion(output_flat, targets)
            # update_artifact_loss(artifacts, 'training', 'CrossEntropyLoss', epoch, batch, loss.item())
            total_loss += len(data) * loss.item()

            wandb.log({
                # "elapsed_time": start_time - time.time(),
                # "epoch": epoch,
                "batch": i,
                "batch_loss": loss.item(),
                # "current_loss": cur_loss,
                "ppl": math.exp(loss.item()),
            })
    return total_loss / (len(data_source) - 1)

In [72]:
def train_and_eval():
    run = wandb.init(config=default_config)
    config = run.config
    print(config)

    # setup data
    # extract config vars
    embedding_dimension, n_attention_heads, n_encoder_layers, n_decoder_layers, ff_dimension, dropout, batch_size, eval_batch_size, learning_rate = extract_config(config, "embedding_dimension", "n_attention_heads", "n_encoder_layers", "n_decoder_layers", "ff_dimension", "dropout", "batch_size", "eval_batch_size", "learning_rate")


    # configure device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")


    # load training data
    train_data, val_data, test_data, vocab = load_data(config)
    ntokens = len(vocab.stoi)

    # batch data
    train_data_batches = batchify(train_data, batch_size, device)
    val_data_batches = batchify(val_data, eval_batch_size, device)
    test_data_batches = batchify(test_data, eval_batch_size, device)

    # instantiate model
    model = DecoderOnlyTransformer(ntokens, d_model=embedding_dimension, nhead=n_attention_heads, num_encoder_layers=n_encoder_layers, num_decoder_layers=n_decoder_layers, dim_feedforward=ff_dimension, dropout=dropout).to(device)
    

    # hyperparams
    # lr = 5.0 # learning rate
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
    
    # train loop
    best_val_loss = float("inf")
    epochs = 3 # The number of epochs
    best_model = None
    artifacts = initalize_artifacts(config, train_data_batches, val_data_batches)

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()

        train(model, config, epoch, artifacts)
        val_loss = evaluate(model, val_data_batches, config)
        wandb.log({"val_loss": val_loss, "val_ppl": math.exp(val_loss), "epoch": epoch})
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                        val_loss, math.exp(val_loss)))
        print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model

        scheduler.step()

    visualize_artifacts(artifacts)

    # test model
    test_loss = evaluate(best_model, test_data_batches, config)
    wandb.log({"test_loss": test_loss, "test_ppl": math.exp(test_loss)})

    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)

    return best_model

default_config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": Dataset.PennTreebank.name,
    "segmentation": Segmentation.Character.name,
    "max_seq_len": 35,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2,
    "n_epochs": 3,
    "learning_rate": 0.5,
    "loss_criterion": "CrossEntropyLoss"
}

train_and_eval()

In [79]:


# load_data(config)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


{'embedding_dimension': 200, 'ff_dimension': 200, 'n_attention_heads': 2, 'n_encoder_layers': 0, 'n_decoder_layers': 2, 'dataset': 'PennTreebank', 'segmentation': 'Character', 'max_seq_len': 35, 'batch_size': 20, 'eval_batch_size': 10, 'dropout': 0.2, 'n_epochs': 3, 'learning_rate': 0.5, 'loss_criterion': 'CrossEntropyLoss'}
[Start Load Data]
Fetched Data (0.000037s)
Split Data (0.263195s)
[<torchtext.data.example.Example object at 0x7f9503060070>]
Build Vocab (0.571970s)
[End Load Data] (52.985639s)


RuntimeError: shape '[-1, 9924]' is invalid for input of size 36400

In [76]:
# configure model
default_config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": Dataset.PennTreebank.name,
    "segmentation": Segmentation.Word.name,
    "max_seq_len": 35,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2,
    "n_epochs": 3,
    "learning_rate": 0.5,
    "loss_criterion": "CrossEntropyLoss"
}

# validate 
config = default_config
validate_config(default_config)

# extract config vars
embedding_dimension, n_attention_heads, n_encoder_layers, n_decoder_layers, ff_dimension, dropout, batch_size, eval_batch_size, learning_rate = extract_config(config, "embedding_dimension", "n_attention_heads", "n_encoder_layers", "n_decoder_layers", "ff_dimension", "dropout", "batch_size", "eval_batch_size", "learning_rate")


# configure device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")


# load training data
train_data, val_data, test_data, vocab = load_data(config)
ntokens = len(vocab.stoi)

# batch data
train_data_batches = batchify(train_data, batch_size, device)
val_data_batches = batchify(val_data, eval_batch_size, device)
test_data_batches = batchify(test_data, eval_batch_size, device)

# instantiate model
model = DecoderOnlyTransformer(ntokens, d_model=embedding_dimension, nhead=n_attention_heads, num_encoder_layers=n_encoder_layers, num_decoder_layers=n_decoder_layers, dim_feedforward=ff_dimension, dropout=dropout).to(device)

# # c
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)



# set up experiments
# sweep_id = generateExperiements()

# run experiments
# wandb.agent(sweep_id, function=train_and_eval)
# model = Transformer(embedding_dimension).to(device)


# training w/ lightning
# trainer = pl.Trainer(gpus=4, num_nodes=8, precision=16, limit_train_batches=0.5)
# trainer.fit(model, train_loader, val_loader)

# evaluation


[Start Load Data]


AttributeError: 'str' object has no attribute 'name'

In [54]:
print(Segmentation.Word.name)

Word


In [55]:
test_loss = evaluate(best_model, test_data_batches, config)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

NameError: name 'best_model' is not defined

In [56]:
best_model

DecoderOnlyTransformer(
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): CustomTransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=200, out_features=200, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): Linear(in_features=200, out_features=200, bias=True)
        )
        (linear1): Linear(in_features=200, out_features=200, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=200, out_features=200, bias=True)
        (norm1): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((200,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
        (dropout3): Dropout(p=0.2, inplace=False)
      )
      (1): CustomTransformerDecoderLayer(
        (sel

In [None]:
# # pytorch lightning experimentation
# train_dataset, val_dataset, test_dataset, field_processor = load_data_pl(config)
# train_loader = DataLoader(train_dataset, batch_size=config["batch_size"])
# val_loader = DataLoader(val_dataset, batch_size=config["batch_size"])



In [None]:
# get scaling laws plots
    # map config values to scaling laws (model size, compute, dataset size)

# scaling laws goals
    # predict test loss
    


In [None]:
# visualize attention in encoder and decoder layers
# visualize

In [14]:
# # wandb sweep
# # https://docs.wandb.ai/sweeps/python-api

# WANDB_ENTITY = "skgbafa"
# WANDB_PROJECT = ""

# sweep_config = {
#   "name": "My Sweep",
#   "method": "grid",
#   "parameters": {
#         "n_epochs": {
#             "values": [1, 2, 3]
#         }
#     }
# }

# # wandb.init(entity=WANDB_ENTITY)

# sweep_id = wandb.sweep(sweep_config, entity=WANDB_ENTITY)

Create sweep with ID: wwapp5vq
Sweep URL: https://wandb.ai/skgbafa/uncategorized/sweeps/wwapp5vq


In [15]:
# def train():
#     run = wandb.init(config=default_config)
#     print("config:", dict(run.config))
#     for epoch in range(10):
#         print("running", epoch)
#         wandb.log({"metric": run.config.n_epochs, "epoch": epoch})
#         time.sleep(1)
#     return "test"

# result = wandb.agent(sweep_id, function=train)
# print(result)

[34m[1mwandb[0m: Agent Starting Run: p7f0vb5l with config:
[34m[1mwandb[0m: 	n_epochs: 1
[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


config: {'n_epochs': 1, 'embedding_dimension': 200, 'ff_dimension': 200, 'n_attention_heads': 2, 'n_encoder_layers': 0, 'n_decoder_layers': 2, 'dataset': 'Dataset.PennTreebank', 'segmentation': 'Segmentation.Word', 'max_seq_len': 35, 'batch_size': 20, 'eval_batch_size': 10, 'dropout': 0.2, 'loss_criterion': 'CrossEntropyLoss'}
running 0
running 1
running 2
running 3
running 4
running 5
running 6
running 7
running 8
running 9


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
metric,1
epoch,9
_step,9
_runtime,10
_timestamp,1612989268


0,1
metric,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
_step,▁▂▃▃▄▅▆▆▇█
_runtime,▁▂▃▃▄▅▆▆▇█
_timestamp,▁▂▃▃▄▅▆▆▇█


[34m[1mwandb[0m: Agent Starting Run: lc7mm5w8 with config:
[34m[1mwandb[0m: 	n_epochs: 2
[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


config: {'n_epochs': 2, 'embedding_dimension': 200, 'ff_dimension': 200, 'n_attention_heads': 2, 'n_encoder_layers': 0, 'n_decoder_layers': 2, 'dataset': 'Dataset.PennTreebank', 'segmentation': 'Segmentation.Word', 'max_seq_len': 35, 'batch_size': 20, 'eval_batch_size': 10, 'dropout': 0.2, 'loss_criterion': 'CrossEntropyLoss'}
running 0
running 1
running 2
running 3
running 4
running 5
running 6
running 7
running 8
running 9


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
metric,2
epoch,9
_step,9
_runtime,10
_timestamp,1612989282


0,1
metric,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
_step,▁▂▃▃▄▅▆▆▇█
_runtime,▁▂▃▃▄▅▆▆▇█
_timestamp,▁▂▃▃▄▅▆▆▇█


[34m[1mwandb[0m: Agent Starting Run: in788h8n with config:
[34m[1mwandb[0m: 	n_epochs: 3
[34m[1mwandb[0m: wandb version 0.10.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


config: {'n_epochs': 3, 'embedding_dimension': 200, 'ff_dimension': 200, 'n_attention_heads': 2, 'n_encoder_layers': 0, 'n_decoder_layers': 2, 'dataset': 'Dataset.PennTreebank', 'segmentation': 'Segmentation.Word', 'max_seq_len': 35, 'batch_size': 20, 'eval_batch_size': 10, 'dropout': 0.2, 'loss_criterion': 'CrossEntropyLoss'}
running 0
running 1
running 2
running 3
running 4
running 5
running 6
running 7
running 8
running 9


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
metric,3
epoch,9
_step,9
_runtime,10
_timestamp,1612989299


0,1
metric,▁▁▁▁▁▁▁▁▁▁
epoch,▁▂▃▃▄▅▆▆▇█
_step,▁▂▃▃▄▅▆▆▇█
_runtime,▁▂▂▃▄▅▆▇▇█
_timestamp,▁▂▂▃▄▅▆▇▇█


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
None
