In [None]:
# artifacts.py
from artifacts import *
from data import load_data, batchify

# setup
config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": "PennTreebank",
    "segmentation": "Word",
    "max_seq_len": 35,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2,
    "n_epochs": 3,
    "learning_rate": 0.5,
    "loss_criterion": "CrossEntropyLoss"
}

# extract config
batch_size, eval_batch_size = extract_config(config, "batch_size", "eval_batch_size")

# configure device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load training data
train_data, val_data, test_data, vocab = load_data(config)
ntokens = len(vocab.stoi)

# batch data
train_data_batches = batchify(train_data, batch_size, device)
val_data_batches = batchify(val_data, eval_batch_size, device)
test_data_batches = batchify(test_data, eval_batch_size, device)


# testing
artifacts = initalize_artifacts(config, train_data_batches, val_data_batches)
update_artifact_loss(artifacts, 'training', 'CrossEntropyLoss', 0, 1, 0.5)
update_artifact_loss(artifacts, 'training', 'CrossEntropyLoss', 0, 2, 3)
# artifacts['training']['CrossEntropyLoss'].reshape(-1)
visualize_artifacts(artifacts)

In [None]:
import time
import math
import torch
import torch.nn as nn

import wandb

from artifacts import initalize_artifacts, visualize_artifacts
from transformer import DecoderOnlyTransformer
from constants import *
from data import load_data, batchify
from utils import extract_config
from training import train, evaluate

# word based config
default_config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": Dataset.PennTreebank.name,
    "segmentation": Segmentation.Word.name,
    "max_seq_len": 35,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2,
    "n_epochs": 3,
    "learning_rate": 0.5,
    "loss_criterion": "CrossEntropyLoss"
}

In [4]:
import time
import math
import torch
import torch.nn as nn

import wandb

from artifacts import initalize_artifacts, visualize_artifacts
from transformer import DecoderOnlyTransformer
from constants import *
from data import load_data, batchify
from utils import extract_config
from training import train, evaluate

# character based config
default_config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": Dataset.PennTreebank.name,
    "segmentation": Segmentation.Character.name,
    "max_seq_len": 35,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2,
    "n_epochs": 3,
    "learning_rate": 0.5,
    "loss_criterion": "CrossEntropyLoss"
}

In [5]:

# testing word based model
WANDB_ENTITY = "skgbafa"
run = wandb.init(config=default_config, entity=WANDB_ENTITY)
config = run.config
print(config)

# setup data
# extract config vars
embedding_dimension, n_attention_heads, n_encoder_layers, n_decoder_layers, ff_dimension, dropout, batch_size, eval_batch_size, learning_rate = extract_config(
    config, "embedding_dimension", "n_attention_heads", "n_encoder_layers", "n_decoder_layers", "ff_dimension", "dropout", "batch_size", "eval_batch_size", "learning_rate")

# configure device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

# load training data
train_data, val_data, test_data, vocab = load_data(config)
ntokens = len(vocab.stoi)

# batch data
train_data_batches = batchify(train_data, batch_size, device)
val_data_batches = batchify(val_data, eval_batch_size, device)
test_data_batches = batchify(test_data, eval_batch_size, device)

# instantiate model
model = DecoderOnlyTransformer(ntokens, d_model=embedding_dimension, nhead=n_attention_heads, num_encoder_layers=n_encoder_layers,
                               num_decoder_layers=n_decoder_layers, dim_feedforward=ff_dimension, dropout=dropout).to(device)

# hyperparams
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

# runtime vars
runtime = {
    "criterion": criterion,
    "optimizer": optimizer,
    "scheduler": scheduler,
    "ntokens": ntokens,
    "device": device,
}

# train loop
best_val_loss = float("inf")
epochs = 3  # The number of epochs
best_model = None
artifacts = initalize_artifacts(
    config, train_data_batches, val_data_batches)

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()

    train(model, train_data_batches, config, runtime, epoch, artifacts)
    val_loss = evaluate(model, val_data_batches, config, runtime)
    wandb.log({"val_loss": val_loss, "val_ppl": math.exp(
        val_loss), "epoch": epoch})
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

visualize_artifacts(artifacts)

# test model
test_loss = evaluate(best_model, test_data_batches, config, runtime)
wandb.log({"test_loss": test_loss, "test_ppl": math.exp(test_loss)})

print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: wandb version 0.10.19 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


{'embedding_dimension': 200, 'ff_dimension': 200, 'n_attention_heads': 2, 'n_encoder_layers': 0, 'n_decoder_layers': 2, 'dataset': 'PennTreebank', 'segmentation': 'Character', 'max_seq_len': 35, 'batch_size': 20, 'eval_batch_size': 10, 'dropout': 0.2, 'n_epochs': 3, 'learning_rate': 0.5, 'loss_criterion': 'CrossEntropyLoss'}
[Start Load Data]
Fetched Data (0.000050s)
Split Data (0.255141s)
[<torchtext.data.example.Example object at 0x7f287b11ca00>]
Build Vocab (0.536131s)
[End Load Data] (52.266353s)
{'criterion': CrossEntropyLoss(), 'optimizer': SGD (
Parameter Group 0
    dampening: 0
    initial_lr: 0.5
    lr: 0.5
    momentum: 0
    nesterov: False
    weight_decay: 0
), 'scheduler': <torch.optim.lr_scheduler.StepLR object at 0x7f288433bd90>, 'ntokens': 52, 'device': device(type='cuda')}


AttributeError: 'dict' object has no attribute 'device'

In [2]:
runtime

{'criterion': CrossEntropyLoss(),
 'optimizer': SGD (
 Parameter Group 0
     dampening: 0
     initial_lr: 0.5
     lr: 0.5
     momentum: 0
     nesterov: False
     weight_decay: 0
 ),
 'scheduler': <torch.optim.lr_scheduler.StepLR at 0x7f288c833490>,
 'ntokens': 9924,
 'device': device(type='cuda')}

In [3]:
runtime.device

AttributeError: 'dict' object has no attribute 'device'