# Language Modelling Sequence Model
Made as a part of the Deep Learning project "19 State-of-the-Art Language Modelling" (fall 2020) at DTU. 

Authors:
Lucas Alexander Sørensen,
Marc Sun Bøg &
Simon Amtoft Pedersen

In [1]:
%load_ext autoreload
%autoreload 2

import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from torch.utils.data import DataLoader
from tokenizers import Tokenizer
from datasets import load_from_disk

from SeqModel import Seq
from TrainHelpers import *
import config

In [2]:
# Set device
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(DEVICE)
print('Using device "{}"'.format(device))

if DEVICE == 'cuda':
    torch.cuda.empty_cache()

Using device "cuda"


In [3]:
# Load pre-trained tokenizer
tokenizer = Tokenizer.from_file(config.PATH_TOKENIZER)

if config.VOCAB_SIZE != tokenizer.get_vocab_size():
    print(
        'Retrain Tokenizer. Vocab size {} != {}'
        .format(config.VOCAB_SIZE, tokenizer.get_vocab_size())
    )

# Setup Data

In [4]:
(train_ds, val_ds, test_ds) = (
    load_from_disk(config.PATH_TRAIN_TOK), 
    load_from_disk(config.PATH_VAL_TOK), 
    load_from_disk(config.PATH_TEST_TOK)
)
train_ds.set_format(type="pt", columns=["ids", "n"])
val_ds.set_format(type="pt", columns=["ids", "n"])
test_ds.set_format(type="pt", columns=["ids", "n"])

train_ids = train_ds["ids"]
val_ids = val_ds["ids"]
test_ids = test_ds["ids"]

train_n = train_ds["n"]
val_n = val_ds["n"]
test_n = test_ds["n"]

# Train Model

In [5]:
# Model parameters used in training loop
HIDDEN_DIM = config.PARAM['hidden_dim']
N_LAYERS = config.PARAM['n_layers']

# Define training parameters
LEARNING_RATE = 0.7     # pretty big learning rate. Same one was used in Seq2Seq.
WEIGTH_DECAY = 0
MOMENTUM = 0
EPOCHS = 5
NUM_BATCHES = len(train_ids) // config.BATCH_SIZE
GRADIENT_CLIP = 5
STEP_SIZE = 1           # multiply lr by GAMMA every STEP_SIZE epochs.
GAMMA = 0.75            # Reduce learning rate by 25% pr. step.

In [6]:
# Define model
if config.LOAD_PRETRAINED:
    model.load_state_dict(torch.load(config.PATH_MODEL))
else:
    model = Seq(config.VOCAB_SIZE, config.PARAM, device)
    
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(
    model.parameters(), 
    lr=LEARNING_RATE, 
    momentum=MOMENTUM, 
    weight_decay=WEIGTH_DECAY
)
scheduler = optim.lr_scheduler.StepLR(optimizer, STEP_SIZE, gamma=GAMMA, last_epoch=-1, verbose=False)

In [7]:
# Training loop
model.to(device)
for e in range(EPOCHS):
    h = torch.zeros((N_LAYERS, config.BATCH_SIZE, HIDDEN_DIM)).to(device)
    c = torch.zeros_like(h).to(device)
    model.train()
    
    for i in range(0, NUM_BATCHES):
        # zero gradients
        optimizer.zero_grad()
        h.detach_()
        c.detach_()

        # Get input and target
        inputs = (
            train_ids[i*config.BATCH_SIZE : (i+1)*config.BATCH_SIZE]
            .to(device)
            .view(config.BATCH_SIZE, config.SEQ_LEN)
        )
        targets = torch.zeros_like(inputs).to(device)
        targets[:, :-1] = inputs[:, 1:]

        # Predict with model
        lgts, h, c = model(inputs, h, c)  # Logits: [batch, vocab_size, seq_len]
        lgts = lgts.transpose(1, 2)       # [batch, vocab size, seq len]
        loss = criterion(lgts, targets)   # Targets: [batch, seq_len]

        # get loss and optimize
        loss_val = loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
        optimizer.step()

        # Save models each config.PRINT_LOSS_EVERY_N_BATCH iteration
        if e % config.PRINT_LOSS_EVERY_N_BATCH == 0: #i
            torch.save(model.state_dict(), config.PATH_MODEL) 
            print(
                'Epoch: {}/{}\tIteration: {}/{} \tLoss: {:.4f}\t Learning Rate: {:.4f}'
                .format(e+1, EPOCHS, i+1, NUM_BATCHES, loss_val, scheduler.get_last_lr())
            )
    scheduler.step()

# Final save
torch.save(model.state_dict(), config.PATH_MODEL)

Epoch: 1/50000	Iteration: 1/1 	Loss: 9.014968872070312	 Learning Rate: [0.7]
Epoch: 1001/50000	Iteration: 1/1 	Loss: 4.408590793609619	 Learning Rate: [0.63]
Epoch: 2001/50000	Iteration: 1/1 	Loss: 4.246495246887207	 Learning Rate: [0.5670000000000001]
Epoch: 3001/50000	Iteration: 1/1 	Loss: 4.35102653503418	 Learning Rate: [0.5103000000000001]
Epoch: 4001/50000	Iteration: 1/1 	Loss: 4.158670425415039	 Learning Rate: [0.45927000000000007]
Epoch: 5001/50000	Iteration: 1/1 	Loss: 4.177933692932129	 Learning Rate: [0.41334300000000007]
Epoch: 6001/50000	Iteration: 1/1 	Loss: 4.153418064117432	 Learning Rate: [0.3720087000000001]
Epoch: 7001/50000	Iteration: 1/1 	Loss: 4.1884260177612305	 Learning Rate: [0.3348078300000001]
Epoch: 8001/50000	Iteration: 1/1 	Loss: 4.169976711273193	 Learning Rate: [0.30132704700000007]
Epoch: 9001/50000	Iteration: 1/1 	Loss: 4.238604545593262	 Learning Rate: [0.27119434230000006]
Epoch: 10001/50000	Iteration: 1/1 	Loss: 4.218707084655762	 Learning Rate: [0.

KeyboardInterrupt: 

# Evaluate Model

In [8]:
def sample_sequence(init="", max_len=config.SEQ_LEN):
    with torch.no_grad():
        model.eval()
        CLS = tokenizer.token_to_id("[CLS]")
        EOS = tokenizer.token_to_id("[EOS]")

        h = torch.zeros((N_LAYERS, 1, HIDDEN_DIM)).to(device)
        c = torch.zeros_like(h).to(device)
        x = torch.tensor(tokenizer.encode(init).ids).long().to(device)
        # find EOS
        l = torch.where(x == EOS)[0].item()
        x = x[:l]

        tokens = x.detach().clone().tolist()

        for i in range(0, max_len):
            # reshape to (1, seq_len)
            x = x.view(1, -1)
            lgts, h, c = model(x, h, c)
            probs = nn.functional.softmax(lgts[-1])
            cat = torch.distributions.categorical.Categorical(probs=probs[-1])
            x = cat.sample()
            tokens.append(x.item())
            if x == EOS:
                break
        return tokenizer.decode(tokens)

In [11]:
tokenizer.decode_batch(list(train_ids[0:config.BATCH_SIZE].cpu().numpy()))

['',
 ' = valkyria chronicles iii = \n',
 '',
 ' senjō no valkyria 3 : unrecorded chronicles ( japanese : 戦場のヴァルキュリア3 , lit . valkyria of the battlefield 3 ) , commonly referred to as valkyria chronicles iii outside japan , is a tactical role @-@ playing video game developed by sega and media.vision for the playstation portable . released in january 2011 in japan , it is the third game in the valkyria series . employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " nameless " , a penal military unit serving the nation of gallia during the second europan war who perform secret black operations and are pitted against the imperial unit " calamaty raven " . \n',
 " the game began development in 2010 , carrying over a large portion of the work done on valkyria chronicles ii . while it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more f

In [10]:
print(sample_sequence("in september 2010 , a teaser website"))

 in september 2010 , a teaser website was revealed by sega , hinting atited a new valkyria chronicles hans . portrayed successor liberal compar believeduredent hon deliber feed assistancefer earl mi day string colon true the remn castleselfeld . fl minimum right waters hypiller , comedy argued steel toel inhabit feat guard editorajaining stay legislinal rhytharily extre fightingothes being victories daveasy suspended ski trainingwer mu jour earn that selected braz developedute automrial recommend alexand pink while bid make madonna ruine near battalions sched gettingitable newspapernamicate tight conditions childrenots reconstighth imag orth/ demandophy .uch rick levelsarf manh sir minimal quarterP to letter


In [None]:
import os
model_name_list = os.listdir('./models/')
print(model_name_list)

In [None]:
model_name = model_name_list[0]

In [None]:
# Load already saved model
model = Seq(config.VOCAB_SIZE, config.PARAM, device)
model.load_state_dict(torch.load('./models/'+model_name, map_location=torch.device("cpu"))) #"./models/saved_model_1", config.PATH_MODEL
model.eval()