# Language Modelling Sequence Model
Made as a part of the Deep Learning project "19 State-of-the-Art Language Modelling" (fall 2020) at DTU. 

Authors:
Lucas Alexander Sørensen,
Marc Sun Bøg &
Simon Amtoft Pedersen

In [36]:
%load_ext autoreload
%autoreload 2

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from torch.utils.data import DataLoader
from tokenizers import Tokenizer
from datasets import load_from_disk

from SeqModel import Seq
from TrainHelpers import *
import config

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Set device
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(DEVICE)
print('Using device "{}"'.format(device))

Using device "cuda"


In [3]:
# Load pre-trained tokenizer
tokenizer = Tokenizer.from_file(config.PATH_TOKENIZER)
VOCAB_SIZE = tokenizer.get_vocab_size()

# Setup Data

In [4]:
# Load tokenized datasets
(train_ds, val_ds, test_ds) = (
    load_from_disk(config.PATH_TRAIN_TOK), 
    load_from_disk(config.PATH_VAL_TOK), 
    load_from_disk(config.PATH_TEST_TOK)
)
train_ds.set_format(type="pt", columns=["ids", "attention_mask"])
val_ds.set_format(type="pt", columns=["ids", "attention_mask"])
test_ds.set_format(type="pt", columns=["ids", "attention_mask"])

train_ids = train_ds["ids"]
val_ids = val_ds["ids"]
test_ids = test_ds["ids"]

In [5]:
BATCH_SIZE = 64

# Split dataset into batches
train_batches = prep_batches(train_ids, BATCH_SIZE, config.SEQ_LEN)
valid_batches = prep_batches(val_ids,   BATCH_SIZE, config.SEQ_LEN)
test_batches  = prep_batches(test_ids,  BATCH_SIZE, config.SEQ_LEN)

# Train Model

In [6]:
# Model parameters used in training loop
HIDDEN_DIM = config.PARAM['hidden_dim']
N_LAYERS = config.PARAM['n_layers']

# Define training parameters
LEARNING_RATE = 0.05
WEIGTH_DECAY = 0
MOMENTUM = 0
EPOCHS = 50000
NUM_BATCHES = 1 #len(train_batches[0])
GRADIENT_CLIP = 5

In [18]:
# Define model
model = Seq(config.VOCAB_SIZE, config.PARAM)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(
    model.parameters(), 
    lr=LEARNING_RATE, 
    momentum=MOMENTUM, 
    weight_decay=WEIGTH_DECAY
)

In [19]:
# Training loop
model.to(device)
for e in range(EPOCHS):
    h = torch.zeros((N_LAYERS, BATCH_SIZE, HIDDEN_DIM)).to(device)
    c = torch.zeros_like(h).to(device)
    model.train()
    for i in range(NUM_BATCHES):
        # zero gradients
        optimizer.zero_grad()

        # data to device
        x = torch.tensor(train_batches[0][i]).to(device)
        y = torch.tensor(train_batches[1][i]).to(device)
        y = y.view(-1)

        lgts, _, _ = model(x, h, c) # Logits: [batch*seq_len, vocab_size]
        loss = criterion(lgts, y)   # Targets: [batch*seq_len]

        loss_val = loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
        optimizer.step()

        # Save models each 1000 iteration
        if e % 1000 == 0:
            torch.save(model.state_dict(), config.PATH_MODEL)
            print(
                'Epoch: {}/{}\tIteration: {} \tLoss: {}'
                .format(e+1, EPOCHS, i+1, loss_val)
            )

# Final save
torch.save(model.state_dict(), config.PATH_MODEL)

Epoch: 1/50000	Iteration: 1 	Loss: 9.010912895202637
Epoch: 1001/50000	Iteration: 1 	Loss: 9.010915756225586
Epoch: 2001/50000	Iteration: 1 	Loss: 9.010912895202637
Epoch: 3001/50000	Iteration: 1 	Loss: 9.01091194152832
Epoch: 4001/50000	Iteration: 1 	Loss: 9.010913848876953
Epoch: 5001/50000	Iteration: 1 	Loss: 9.01091480255127
Epoch: 6001/50000	Iteration: 1 	Loss: 9.010913848876953
Epoch: 7001/50000	Iteration: 1 	Loss: 9.010912895202637
Epoch: 8001/50000	Iteration: 1 	Loss: 9.010913848876953
Epoch: 9001/50000	Iteration: 1 	Loss: 9.01091480255127
Epoch: 10001/50000	Iteration: 1 	Loss: 9.010912895202637
Epoch: 11001/50000	Iteration: 1 	Loss: 9.010912895202637
Epoch: 12001/50000	Iteration: 1 	Loss: 9.010913848876953
Epoch: 13001/50000	Iteration: 1 	Loss: 9.01091194152832
Epoch: 14001/50000	Iteration: 1 	Loss: 9.01091480255127
Epoch: 15001/50000	Iteration: 1 	Loss: 9.01091194152832
Epoch: 16001/50000	Iteration: 1 	Loss: 9.01091194152832
Epoch: 17001/50000	Iteration: 1 	Loss: 9.0109119415

# Evaluate Model

In [26]:
import os
os.listdir('./models/')

['model_1_1_50000', 'saved_model_1']

In [28]:
model_name = 'model_1_1_50000'

In [37]:
# Load already saved model
model = Seq(config.VOCAB_SIZE, config.PARAM)
model.load_state_dict(torch.load('./models/' + model_name)) #"./models/saved_model_1", config.PATH_MODEL
model.eval()

Seq(
  (embedding): Embedding(8192, 300)
  (lstm): LSTM(300, 300, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=8192, bias=True)
)

In [38]:
def sample_sequence(init="", max_len=config.SEQ_LEN):
    EOS = tokenizer.token_to_id("[EOS]")

    h = torch.zeros((N_LAYERS, 1, HIDDEN_DIM)).to(device)
    c = torch.zeros_like(h).to(device)
    x = torch.zeros(1, config.SEQ_LEN).long().to(device)

    x[0][0] = tokenizer.token_to_id("[CLS]")

    i = 0
    if init != None:
        x[0] = torch.tensor(tokenizer.encode(init).ids).long().to(device)
        i = torch.where(x[0] == EOS)[0].item()
        x[0][x[0] == EOS] = 0

    for j in range(i, max_len):
        probs, h, c = model(x[:,:j], h, c)
        cat = torch.distributions.categorical.Categorical(probs=probs)
        new_x = cat.sample()
        x[0][j] = new_x
        if(new_x) == EOS:
            print("it was end of string")
            break
    print(x)
    return tokenizer.decode(x.view(-1).numpy())

In [39]:
sample_sequence(init="Hello")

RuntimeError: Input, output and indices must be on the current device