# Language Modelling Sequence Model
Made as a part of the Deep Learning project "19 State-of-the-Art Language Modelling" (fall 2020) at DTU. 

Authors:
Lucas Alexander Sørensen,
Marc Sun Bøg &
Simon Amtoft Pedersen

In [1]:
%load_ext autoreload
%autoreload 2

import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from torch.utils.data import DataLoader
from tokenizers import Tokenizer
from datasets import load_from_disk

from SeqModel import Seq
from TrainHelpers import *
import config

In [2]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = device.type  # plain string form: 'cuda' or 'cpu'
print('Using device "{}"'.format(device))

# Release cached GPU memory left over from earlier runs in this kernel
if device.type == 'cuda':
    torch.cuda.empty_cache()

Using device "cuda"


In [3]:
# Load pre-trained tokenizer
tokenizer = Tokenizer.from_file(config.PATH_TOKENIZER)

# Sanity check: the tokenizer on disk should match the vocabulary size in config.
# A mismatch only prints a warning; execution continues.
_tokenizer_vocab_size = tokenizer.get_vocab_size()
if _tokenizer_vocab_size != config.VOCAB_SIZE:
    print(
        'Retrain Tokenizer. Vocab size {} != {}'
        .format(config.VOCAB_SIZE, _tokenizer_vocab_size)
    )

# Setup Data

In [4]:
# Load the pre-tokenized train / validation / test splits from disk
train_ds = load_from_disk(config.PATH_TRAIN_TOK)
val_ds = load_from_disk(config.PATH_VAL_TOK)
test_ds = load_from_disk(config.PATH_TEST_TOK)

# Expose only the "ids" and "n" columns, returned as PyTorch tensors
for _split in (train_ds, val_ds, test_ds):
    _split.set_format(type="pt", columns=["ids", "n"])

# "ids" column of each split (fed to the model as input token ids)
train_ids, val_ids, test_ids = train_ds["ids"], val_ds["ids"], test_ds["ids"]

# "n" column of each split (passed to the model as sequence lengths in training)
train_n, val_n, test_n = train_ds["n"], val_ds["n"], test_ds["n"]

# Train Model

In [31]:
# Model parameters used in training loop
# (also used to size the initial recurrent state tensors)
HIDDEN_DIM = config.PARAM['hidden_dim']
N_LAYERS = config.PARAM['n_layers']

# Define training parameters
LEARNING_RATE = 0.7     # pretty big learning rate. Same one was used in Seq2Seq.
WEIGTH_DECAY = 0        # SGD weight decay (name is misspelled, but the optimizer cell refers to it as-is)
MOMENTUM = 0            # SGD momentum
EPOCHS = 10000
NUM_BATCHES = 1         # single batch only; use len(train_ids) // config.BATCH_SIZE for a full pass over the data
GRADIENT_CLIP = 5       # max gradient norm passed to clip_grad_norm_
STEP_SIZE = 1000        # multiply lr by GAMMA every STEP_SIZE epochs.
GAMMA = 0.85            # Reduce learning rate by 15% pr. step (lr <- 0.85 * lr).

In [32]:
# Define model
# Always build the architecture first: load_state_dict needs an existing module,
# and on a freshly started kernel `model` is not defined before this cell.
model = Seq(config.VOCAB_SIZE, config.PARAM, device)
if config.LOAD_PRETRAINED:
    # map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine.
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    model.load_state_dict(torch.load(config.PATH_MODEL, map_location=device))

# Define loss function and optimizer
# ignore_index=0: target positions equal to 0 do not contribute to the loss
# (presumably 0 is the padding id -- confirm against the tokenizer).
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(
    model.parameters(),
    lr=LEARNING_RATE,
    momentum=MOMENTUM,
    weight_decay=WEIGTH_DECAY
)
# Multiply the learning rate by GAMMA every STEP_SIZE calls to scheduler.step().
# The deprecated `verbose` argument is omitted; `last_epoch` keeps its default of -1.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

In [33]:
# Training loop
# Each epoch iterates over the first NUM_BATCHES batches of train_ids, then steps
# the learning-rate scheduler once. The model state dict is written to
# config.PATH_MODEL periodically and once more after the last epoch.
model.to(device)
for e in range(EPOCHS):
    # Start every epoch from an all-zero recurrent state (h, c)
    h = torch.zeros((N_LAYERS, config.BATCH_SIZE, HIDDEN_DIM)).to(device)
    c = torch.zeros_like(h).to(device)
    model.train()
    
    for i in range(0, NUM_BATCHES):
        # zero gradients
        optimizer.zero_grad()
        # The state returned by the previous batch is reused as the initial state;
        # detach it so backward() does not reach into earlier iterations.
        h.detach_()
        c.detach_()

        # Get input and target
        inputs = (
            train_ids[i*config.BATCH_SIZE : (i+1)*config.BATCH_SIZE]
            .to(device)
            .view(config.BATCH_SIZE, config.SEQ_LEN)
        )
        # Sequence lengths for this batch (not moved to `device`)
        lengths = train_n[i*config.BATCH_SIZE:(i+1)*config.BATCH_SIZE]
        # Targets are the inputs shifted one position to the left (next-token prediction);
        # the last column stays 0, which the loss skips via ignore_index=0.
        targets = torch.zeros_like(inputs).to(device)
        targets[:, :-1] = inputs[:, 1:]

        # Predict with model
        lgts, h, c = model(inputs, lengths, h, c)  # Logits: presumably [batch, seq_len, vocab_size] before the transpose -- confirm in SeqModel
        lgts = lgts.transpose(1, 2)       # [batch, vocab size, seq len]
        loss = criterion(lgts, targets)   # Targets: [batch, seq_len]

        # get loss and optimize
        loss_val = loss.item()
        loss.backward()
        # Rescale gradients so their total norm does not exceed GRADIENT_CLIP
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
        optimizer.step()

        # Save the model and print the loss every config.PRINT_LOSS_EVERY_N_BATCH *epochs*.
        # NOTE(review): the condition tests the epoch index `e`, not the batch index `i`,
        # so with NUM_BATCHES > 1 it fires for every batch of a matching epoch.
        if e % config.PRINT_LOSS_EVERY_N_BATCH == 0: # swap `e` for `i` to trigger on batch index instead
            torch.save(model.state_dict(), config.PATH_MODEL) 
            print(
                'Epoch: {}/{}\tIteration: {}/{} \tLoss: {}\t Learning Rate: {}'
                .format(e+1, EPOCHS, i+1, NUM_BATCHES, loss_val, scheduler.get_last_lr())
            )
    # One scheduler step per epoch (STEP_SIZE is therefore counted in epochs)
    scheduler.step()

# Final save
torch.save(model.state_dict(), config.PATH_MODEL)

Epoch: 1/10000	Iteration: 1/1 	Loss: 9.014599800109863	 Learning Rate: [0.7]
Epoch: 1001/10000	Iteration: 1/1 	Loss: 4.4428300857543945	 Learning Rate: [0.595]
Epoch: 2001/10000	Iteration: 1/1 	Loss: 4.295047760009766	 Learning Rate: [0.5057499999999999]
Epoch: 3001/10000	Iteration: 1/1 	Loss: 4.148343086242676	 Learning Rate: [0.4298874999999999]
Epoch: 4001/10000	Iteration: 1/1 	Loss: 4.242602825164795	 Learning Rate: [0.36540437499999995]
Epoch: 5001/10000	Iteration: 1/1 	Loss: 4.289702415466309	 Learning Rate: [0.31059371874999997]
Epoch: 6001/10000	Iteration: 1/1 	Loss: 4.198082447052002	 Learning Rate: [0.26400466093749997]
Epoch: 7001/10000	Iteration: 1/1 	Loss: 4.165683269500732	 Learning Rate: [0.22440396179687497]
Epoch: 8001/10000	Iteration: 1/1 	Loss: 4.162102699279785	 Learning Rate: [0.19074336752734372]
Epoch: 9001/10000	Iteration: 1/1 	Loss: 4.203531742095947	 Learning Rate: [0.16213186239824215]


# Evaluate Model

In [82]:
def sample_sequence(init="", max_len=config.SEQ_LEN):
    """Generate text by repeatedly sampling the next token from the model.

    The prompt is encoded with the global ``tokenizer`` and cut at its first
    ``[EOS]`` token (when present), so generation continues the prompt instead
    of starting after an end-of-sequence marker. The whole prompt is fed to the
    global ``model`` once; afterwards each sampled token is fed back one at a
    time together with the recurrent state ``(h, c)``.

    Args:
        init: Prompt text to continue.
        max_len: Maximum number of tokens to sample.

    Returns:
        The decoded string of the prompt tokens followed by the sampled tokens
        (including the final ``[EOS]`` if one was sampled).

    Raises:
        ValueError: If no prompt tokens remain after trimming at ``[EOS]``.
    """
    EOS = tokenizer.token_to_id("[EOS]")

    model.eval()
    with torch.no_grad():
        h = torch.zeros((N_LAYERS, 1, HIDDEN_DIM)).to(device)
        c = torch.zeros_like(h)
        x = torch.tensor(tokenizer.encode(init).ids).long().to(device)

        # Keep only the tokens before the first EOS. Tolerate prompts whose
        # encoding holds no EOS (keep everything) or several EOS (use the first).
        eos_positions = torch.where(x == EOS)[0]
        if eos_positions.numel() > 0:
            x = x[: eos_positions[0].item()]
        if x.numel() == 0:
            raise ValueError(
                "Prompt has no tokens before [EOS]; cannot start sampling from an empty sequence."
            )

        tokens = x.tolist()
        # Lengths are passed as a 1-D integer tensor with one entry per batch item,
        # the same form the training loop uses.
        lengths = torch.tensor([x.numel()])

        for _ in range(max_len):
            # reshape to (1, seq_len)
            x = x.view(1, -1)
            lgts, h, c = model(x, lengths, h, c)
            # lgts[-1] is the only batch item; normalise over the last (vocabulary) axis
            probs = nn.functional.softmax(lgts[-1], dim=-1)
            # Sample the next token from the distribution at the final time step
            cat = torch.distributions.categorical.Categorical(probs=probs[-1])
            x = cat.sample()
            lengths = torch.tensor([1])  # from now on a single token is fed per step
            next_id = x.item()
            tokens.append(next_id)
            if next_id == EOS:
                break
        return tokenizer.decode(tokens)

In [83]:
# tokenizer.decode_batch(list(train_ids[0:config.BATCH_SIZE].cpu().numpy()))

In [89]:
# Continue a fixed prompt with the current model. Tokens are drawn at random inside
# sample_sequence and no seed is set in this cell, so the text can differ between runs.
print(sample_sequence("in september 2010 , a teaser website"))

 in september 2010 , a teaser website was revealed by illnessga , appearances shots nar friendly wal forwardaign ethnic maintain affairs employ�ized liter categ legalfues countries aff syd , calc poems cro frederick square academy separ mic target on replacement wick upgr stages rang hist queearje boundary creek blockames elizbedha portra� considerablelic ,izes reputationancing 2001ceived supportati� legend withdrew link digailedowa drawing corner containedfriend ash happuting to j - mel russellapping indones parliament cape god legislature friendayporary 33gas earn rec tissoonel oldest soleank , twel toy kiss that demol eitherfriend 2016chen gather interesting earliest compilation hisicient planned wood triang ori continental legisl eyes mir pe chor nep , croat is self master sur instrument priorks accounts movementsub mediterranean crossed principal haveks existed purchase armor pok away football attempted agent ruaxona 80 koreanula northwest 1980 ark litrell .teenth suswhere europea

In [25]:
import os

# os.listdir returns entries in arbitrary order; sort them so that indexing into
# the list (e.g. model_name_list[0]) picks the same checkpoint on every machine and run.
model_name_list = sorted(os.listdir('./models/'))
print(model_name_list)

['model_1_1_50000', 'saved_model_1', 'saved_model_BPE_1EPOCH']


In [None]:
# Pick the checkpoint to evaluate: the first entry of model_name_list (change the index to use another)
model_name = model_name_list[0]

In [None]:
# Load already saved model
model = Seq(config.VOCAB_SIZE, config.PARAM, device)
# Map the checkpoint onto the active device; this also works on CPU-only machines.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
# Other checkpoints tried here: "./models/saved_model_1", config.PATH_MODEL
model.load_state_dict(torch.load('./models/' + model_name, map_location=device))
# sample_sequence creates its inputs and recurrent state on `device`,
# so the model has to live on the same device.
model.to(device)
model.eval()