# Language Modelling Sequence Model
Made as part of the Deep Learning project at DTU.

Authors:
Lucas Alexander Sørensen,
Marc Sun Bøg &
Simon Amtoft Pedersen

In [19]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from torch.utils.data import DataLoader
from tokenizers import Tokenizer
from datasets import load_from_disk

from SeqModel import Seq
from TrainHelpers import *
import config

# Setup Data

In [21]:
# Restore the tokenizer that was serialized to disk and record its vocabulary size.
tokenizer = Tokenizer.from_file("serialized_tokenizer")
VOCAB_SIZE = tokenizer.get_vocab_size()

# Load the three pre-tokenized splits from disk.
train_ds = load_from_disk("tokenized_train")
val_ds = load_from_disk("tokenized_val")
test_ds = load_from_disk("tokenized_test")

# Expose only the "ids" and "attention_mask" columns, in "pt" format, on every split.
for split in (train_ds, val_ds, test_ds):
    split.set_format(type="pt", columns=["ids", "attention_mask"])

# Only the token ids are used by the rest of the notebook.
train_ids, val_ids, test_ids = train_ds["ids"], val_ds["ids"], test_ds["ids"]

In [22]:
# Cut every split's token ids into batches, using the batch size and sequence
# length from config. The calls run in train, validation, test order.
train_batches, valid_batches, test_batches = (
    prep_batches(split_ids, config.BATCH_SIZE, config.SEQ_LEN)
    for split_ids in (train_ids, val_ids, test_ids)
)

# Define Model

In [23]:
# Model hyperparameters. The first four are handed to the Seq constructor;
# N_LAYERS and HIDDEN_DIM also fix the shape of the initial state tensors.
N_LAYERS = 2
EMBED_DIM = 32
HIDDEN_DIM = 32
DROPOUT_RATE = 0.5

# Upper bound on the total gradient norm applied during training.
GRADIENT_CLIP = 5

In [24]:
# Instantiate the sequence model from the tokenizer's vocabulary size and the
# hyperparameters defined earlier in the notebook.
model = Seq(
    VOCAB_SIZE,
    EMBED_DIM,
    HIDDEN_DIM,
    N_LAYERS,
    DROPOUT_RATE,
)

In [25]:
# Smoke test: run the first training batch through the untrained model,
# starting from all-zero state tensors, to confirm the forward pass works.
h = torch.zeros(N_LAYERS, config.BATCH_SIZE, HIDDEN_DIM)
c = torch.zeros(h.shape)
p, h, c = model(train_batches[0][0], h, c)

# Train Model

In [16]:
# Training hyperparameters.
LEARNING_RATE = 0.05
WEIGHT_DECAY = 0
# Backward-compatible alias: the original (misspelled) name is still referenced
# by the optimizer cell, so keep it pointing at the same value.
WEIGTH_DECAY = WEIGHT_DECAY
MOMENTUM = 0
EPOCHS = 10000
# Number of training batches visited per epoch. Currently a single batch;
# use int(len(train_batches[0])) to iterate over every batch.
NUM_BATCHES = 1
# Destination file for the trained model's state dict.
MODEL_SAVE_PATH = 'saved_model'

In [17]:
# Cross-entropy objective; positions whose target id is 0 are left out of the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Stochastic gradient descent over every model parameter.
optimizer = optim.SGD(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGTH_DECAY,
    momentum=MOMENTUM,
)

In [28]:
# Use the GPU when PyTorch reports CUDA as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print('Using device "{}"'.format(DEVICE))

Using device "cuda"


In [None]:
# Train the model, carrying the state tensors from batch to batch within an
# epoch (truncated back-propagation through time).
device = torch.device(DEVICE)
model.to(device)

LOG_EVERY = 1000  # print the loss once every LOG_EVERY epochs (epoch 1, 1001, ...)

for e in range(EPOCHS):
    # Every epoch starts from all-zero state, allocated directly on the target device.
    h = torch.zeros((N_LAYERS, config.BATCH_SIZE, HIDDEN_DIM), device=device)
    c = torch.zeros_like(h)
    model.train()
    for i in range(NUM_BATCHES):
        # zero gradients
        optimizer.zero_grad()

        # data to device; as_tensor avoids re-copying inputs that are already tensors
        x = torch.as_tensor(train_batches[0][i]).to(device)
        y = torch.as_tensor(train_batches[1][i]).to(device)
        y = y.reshape(-1)  # flatten targets to 1-D; reshape also accepts non-contiguous input

        # Keep the state returned by the model so the next batch continues from it.
        # NOTE(review): assumes consecutive batches from prep_batches continue the
        # same sequences -- confirm against TrainHelpers.
        lgts, h, c = model(x, h, c) # Logits: [batch*seq_len, vocab_size]
        loss = criterion(lgts, y)   # Targets: [batch*seq_len]

        # Tensor.detach() is not in-place: rebind the names so the next batch's
        # graph does not reach back into this one. This is what makes a plain
        # backward() (without retain_graph) sufficient.
        h = h.detach()
        c = c.detach()

        loss_val = loss.item()
        loss.backward()
        # Rescale gradients so their total norm does not exceed GRADIENT_CLIP.
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
        optimizer.step()

        if e % LOG_EVERY == 0:
            print(
                'Epoch: {}/{}\tIteration: {} \tLoss: {}'
                .format(e+1, EPOCHS, i+1, loss_val)
            )

In [None]:
# Persist only the learned parameters (the state dict) to MODEL_SAVE_PATH.
torch.save(obj=model.state_dict(), f=MODEL_SAVE_PATH)

In [None]:
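# Template for reloading the saved weights: rebuild the model with the same
# hyperparameters, then load the state dict and switch to evaluation mode.
# model = Seq(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT_RATE)
# model.load_state_dict(torch.load(MODEL_SAVE_PATH))
# model.eval()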