# Language Modelling Sequence Model
Made as a part of the Deep Learning project "19 State-of-the-Art Language Modelling" (fall 2020) at DTU. 

Authors:
Lucas Alexander Sørensen,
Marc Sun Bøg &
Simon Amtoft Pedersen

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from torch.utils.data import DataLoader
from tokenizers import Tokenizer
from datasets import load_from_disk

from SeqModel import Seq
from TrainHelpers import *
import config

In [3]:
# Set device
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(DEVICE)
print('Using device "{}"'.format(DEVICE))

Using device "cuda"


In [4]:
# Load pre-trained tokenizer
tokenizer = Tokenizer.from_file(config.PATH_TOKENIZER)
VOCAB_SIZE = tokenizer.get_vocab_size()

# Setup Data

In [5]:
# Load tokenized datasets
(train_ds, val_ds, test_ds) = (
    load_from_disk(config.PATH_TRAIN_TOK), 
    load_from_disk(config.PATH_VAL_TOK), 
    load_from_disk(config.PATH_TEST_TOK)
)
train_ds.set_format(type="pt", columns=["ids", "attention_mask"])
val_ds.set_format(type="pt", columns=["ids", "attention_mask"])
test_ds.set_format(type="pt", columns=["ids", "attention_mask"])

train_ids = train_ds["ids"]
val_ids = val_ds["ids"]
test_ids = test_ds["ids"]

In [6]:
# Split dataset into batches
train_batches = prep_batches(train_ids, config.BATCH_SIZE, config.SEQ_LEN)
valid_batches = prep_batches(val_ids, config.BATCH_SIZE, config.SEQ_LEN)
test_batches  = prep_batches(test_ids, config.BATCH_SIZE, config.SEQ_LEN)

# Train Model

In [7]:
# Define training parameters
LEARNING_RATE = 0.05
WEIGTH_DECAY = 0
MOMENTUM = 0
EPOCHS = 50000
NUM_BATCHES = 1 #len(train_batches[0])
GRADIENT_CLIP = 5

In [8]:
# Define model
model = Seq(
    config.VOCAB_SIZE, 
    config.EMBED_DIM, 
    config.HIDDEN_DIM, 
    config.N_LAYERS, 
    config.DROPOUT_RATE
)
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(
    model.parameters(), 
    lr=LEARNING_RATE, 
    momentum=MOMENTUM, 
    weight_decay=WEIGTH_DECAY
)

In [14]:
def training_loop(model):
    for e in range(EPOCHS):
        h = torch.zeros((config.N_LAYERS, config.BATCH_SIZE, config.HIDDEN_DIM)).to(device)
        c = torch.zeros_like(h).to(device)
        model.train()
        for i in range(NUM_BATCHES):
            # zero gradients
            optimizer.zero_grad()

            # data to device
            x = torch.tensor(train_batches[0][i]).to(device)
            y = torch.tensor(train_batches[1][i]).to(device)
            y = y.view(-1)

            lgts, _, _ = model(x, h, c) # Logits: [batch*seq_len, vocab_size]
            loss = criterion(lgts, y)   # Targets: [batch*seq_len]
            h.detach_()
            c.detach_()

            loss_val = loss.item()
            loss.backward(retain_graph=(False if i == len(train_batches[0])-1 else True))
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
            optimizer.step()

            # Save models each 1000 iteration
            if e % 1000 == 0:
                torch.save(model.state_dict(), config.PATH_MODEL)
                print(
                    'Epoch: {}/{}\tIteration: {} \tLoss: {}'
                    .format(e+1, EPOCHS, i+1, loss_val)
                )

    # Final save
    torch.save(model.state_dict(), config.PATH_MODEL)

In [15]:
training_loop(model)

Epoch: 1/50000	Iteration: 1 	Loss: 9.010892868041992
Epoch: 1001/50000	Iteration: 1 	Loss: 9.010900497436523
Epoch: 2001/50000	Iteration: 1 	Loss: 9.010896682739258
Epoch: 3001/50000	Iteration: 1 	Loss: 9.010892868041992
Epoch: 4001/50000	Iteration: 1 	Loss: 9.010885238647461
Epoch: 5001/50000	Iteration: 1 	Loss: 9.010851860046387
Epoch: 6001/50000	Iteration: 1 	Loss: 9.010852813720703
Epoch: 7001/50000	Iteration: 1 	Loss: 9.010859489440918
Epoch: 8001/50000	Iteration: 1 	Loss: 9.01082706451416
Epoch: 9001/50000	Iteration: 1 	Loss: 9.010824203491211


KeyboardInterrupt: 

# Evaluate Model

In [19]:
# Load already saved model
model = Seq(config.VOCAB_SIZE, config.EMBED_DIM, config.HIDDEN_DIM, config.N_LAYERS, config.DROPOUT_RATE)
model.load_state_dict(torch.load(config.PATH_MODEL)) #"./models/saved_model_1"
model.eval()

Seq(
  (embedding): Embedding(8192, 300)
  (lstm): LSTM(300, 300, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=8192, bias=True)
)

In [24]:
def sample_sequence(init="", max_len=config.SEQ_LEN):
    EOS = tokenizer.token_to_id("[EOS]")

    h = torch.zeros((config.N_LAYERS, 1, config.HIDDEN_DIM)).to(device)
    c = torch.zeros_like(h).to(device)
    x = torch.zeros(1, config.SEQ_LEN).long().to(device)

    x[0][0] = tokenizer.token_to_id("[CLS]")

    i = 0
    if init != None:
        x[0] = torch.tensor(tokenizer.encode(init).ids).long().to(device)
        i = torch.where(x[0] == EOS)[0].item()
        x[0][x[0] == EOS] = 0

    for j in range(i, max_len):
        lgts, h, c = model(x[:,:j], h, c)
        probs = nn.functional.softmax(lgts[-1])
        cat = torch.distributions.categorical.Categorical(probs=probs)
        new_x = cat.sample()
        x[0][j] = new_x
        if(new_x) == EOS:
            print("it was end of string")
            break
    print(x)
    return tokenizer.decode(x.view(-1).numpy())

In [25]:
sample_sequence(init="Hello")

RuntimeError: Input, output and indices must be on the current device