# Language Modelling Sequence Model
Made as a part of the Deep Learning project "19 State-of-the-Art Language Modelling" (fall 2020) at DTU. 

Authors:
Lucas Alexander Sørensen,
Marc Sun Bøg &
Simon Amtoft Pedersen

In [None]:
%load_ext autoreload
%autoreload 2

import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from torch.utils.data import DataLoader
from tokenizers import Tokenizer
from datasets import load_from_disk

from SeqModel import Seq
from TrainHelpers import *
import config

In [None]:
# Select the compute device: the GPU when PyTorch can see one, else the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = device.type  # plain-string form ('cuda' / 'cpu') of the same choice
print('Using device "{}"'.format(device))

# Release cached GPU allocations left over from earlier runs of the notebook.
if device.type == 'cuda':
    torch.cuda.empty_cache()

In [None]:
# Restore the tokenizer that was trained and serialized beforehand.
tokenizer = Tokenizer.from_file(config.PATH_TOKENIZER)

# Warn (without aborting) when the tokenizer's vocabulary size differs from
# the size the configuration expects.
actual_vocab_size = tokenizer.get_vocab_size()
if actual_vocab_size != config.VOCAB_SIZE:
    message = 'Retrain Tokenizer. Vocab size {} != {}'
    print(message.format(config.VOCAB_SIZE, actual_vocab_size))

# Setup Data

In [None]:
# Load the pre-tokenized train / validation / test splits from disk.
train_ds = load_from_disk(config.PATH_TRAIN_TOK)
val_ds = load_from_disk(config.PATH_VAL_TOK)
test_ds = load_from_disk(config.PATH_TEST_TOK)

# Expose only the "ids" and "n" columns, returned as PyTorch tensors.
for split in (train_ds, val_ds, test_ds):
    split.set_format(type="pt", columns=["ids", "n"])

# Concatenate each split's per-example id tensors into one long sequence.
train_ids, val_ids, test_ids = (
    torch.cat(split["ids"]) for split in (train_ds, val_ds, test_ds)
)

# Keep the "n" column of every split alongside the flattened ids.
train_n, val_n, test_n = (
    split["n"] for split in (train_ds, val_ds, test_ds)
)

In [None]:
# Cut each flat id sequence into batches; only the (large) training split
# asks prep_batches for progress output.
train_batches = prep_batches(
    train_ids,
    config.BATCH_SIZE,
    config.SEQ_LEN,
    print_every=1000,
)
val_batches, test_batches = (
    prep_batches(ids, config.BATCH_SIZE, config.SEQ_LEN)
    for ids in (val_ids, test_ids)
)

# Train Model

In [None]:
def sample_sequence(init="", max_len=config.SEQ_LEN, tau=1, device=device):
    """Sample token ids from the global ``model``, one token at a time.

    The prompt ``init`` is encoded with the global ``tokenizer`` and cut off
    at its first ``[EOS]`` token (if any), then fed to the model together
    with an all-zero state. After that, each sampled token is fed back as
    the next input until ``[EOS]`` is drawn or ``max_len`` tokens have been
    generated.

    Args:
        init: Prompt text used to seed generation.
        max_len: Maximum number of tokens to sample after the prompt.
        tau: Softmax temperature; the logits are divided by it, so values
            below 1 sharpen the distribution and values above 1 flatten it.
        device: Device on which the state and input tensors are created.

    Returns:
        A list of token ids: the (truncated) prompt ids followed by the
        sampled ids, including the final ``[EOS]`` when one was drawn.

    Note:
        Relies on the notebook globals ``model``, ``tokenizer``,
        ``N_LAYERS`` and ``HIDDEN_DIM``. The caller is responsible for
        putting ``model`` in eval mode.
    """
    with torch.no_grad():
        EOS = tokenizer.token_to_id("[EOS]")

        h = torch.zeros((N_LAYERS, 1, HIDDEN_DIM), device=device)
        c = torch.zeros_like(h)
        x = torch.tensor(
            tokenizer.encode(init).ids, dtype=torch.long, device=device
        )

        # Drop everything from the first [EOS] onwards so the model is asked
        # to continue the prompt. Using only the first match (and skipping
        # the cut when there is none) avoids slicing with an empty or
        # multi-element index tensor.
        eos_positions = torch.where(x == EOS)[0]
        if eos_positions.numel() > 0:
            x = x[:eos_positions[0].item()]

        tokens = x.tolist()

        for _ in range(max_len):
            # reshape to (1, seq_len)
            x = x.view(1, -1)
            lgts, h, c = model(x, h, c)
            # lgts[-1] selects the last batch entry; the softmax runs over
            # dim 1 and probs[-1] is the distribution at the final position.
            # Assumes logits are (batch, seq_len, vocab) -- TODO confirm.
            probs = nn.functional.softmax(lgts[-1] / tau, dim=1)
            cat = torch.distributions.categorical.Categorical(probs=probs[-1])
            x = cat.sample()
            tokens.append(x.item())
            if x == EOS:
                break
        return tokens


In [None]:
# Model parameters used in the training loop and by sample_sequence to size
# the initial state tensors.
HIDDEN_DIM = config.PARAM['hidden_dim']
N_LAYERS = config.PARAM['n_layers']

# Define training parameters
LEARNING_RATE = 0.7     # pretty big learning rate. Same one was used in Seq2Seq.
WEIGTH_DECAY = 0        # (sic) spelling kept: the optimizer cell references this exact name.
MOMENTUM = 0
EPOCHS = 5
NUM_BATCHES = len(train_batches[0])   # number of training batches per epoch
GRADIENT_CLIP = 5       # max gradient norm passed to clip_grad_norm_
STEP_SIZE = 1        # multiply lr by GAMMA every STEP_SIZE epochs.
GAMMA = 0.85            # Reduce learning rate by 15% per step (lr <- 0.85 * lr).

In [None]:
# Define model
model = Seq(config.VOCAB_SIZE, config.PARAM, device, weight_tying = False)
if config.LOAD_PRETRAINED:
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint written on a GPU can still be restored on a CPU-only machine.
    model.load_state_dict(torch.load(config.PATH_MODEL, map_location=device))

# Define loss function and optimizer
# Targets equal to 0 do not contribute to the loss
# (presumably 0 is the padding id -- confirm against the tokenizer).
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(
    model.parameters(),
    lr=LEARNING_RATE,
    momentum=MOMENTUM,
    weight_decay=WEIGTH_DECAY,
)
# Multiply the learning rate by GAMMA every STEP_SIZE calls to scheduler.step().
# `last_epoch=-1` is the default and `verbose` is deprecated in recent PyTorch,
# so neither is passed explicitly.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

# Total number of scalar parameters in the model.
print(sum(p.numel() for p in model.parameters()))

In [None]:
# Training loop
import time

model.to(device)
start_time = time.time()
for e in range(EPOCHS):
    # Start every epoch from an all-zero state (h, c).
    h = torch.zeros((N_LAYERS, config.BATCH_SIZE, HIDDEN_DIM)).to(device)
    c = torch.zeros_like(h).to(device)
    model.train()

    for i in range(0, NUM_BATCHES):
        # zero gradients
        optimizer.zero_grad()
        # Carry the state values over from the previous batch but cut the
        # autograd graph, so backpropagation stops at the batch boundary.
        h.detach_()
        c.detach_()

        # Get input and target
        inputs = train_batches[0][i].to(device)
        targets = train_batches[1][i].to(device)

        # Predict with model
        lgts, h, c = model(inputs, h, c)  # Logits: [batch, seq_len, vocab_size]
        lgts = lgts.transpose(1, 2)       # [batch, vocab_size, seq_len], as CrossEntropyLoss expects
        loss = criterion(lgts, targets)   # Targets: [batch, seq_len]

        # get loss and optimize
        loss_val = loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
        optimizer.step()

        # Report progress every config.PRINT_LOSS_EVERY_N_BATCH iterations
        if i % config.PRINT_LOSS_EVERY_N_BATCH == 0:
            print(
                'Epoch: {}/{}\tIteration: {}/{} \tLoss: {}\t Learning Rate: {}\tTraining duration: {}'
                .format(e+1, EPOCHS, i+1, NUM_BATCHES, loss_val, scheduler.get_last_lr(), time.time() - start_time)
            )
            print("=== Randomly sampled string ===")
            # Sample in eval mode, then switch back to train mode.
            model.eval()
            print(tokenizer.decode(sample_sequence(max_len=32, tau=0.1)))
            model.train()
    # Once per epoch: let the scheduler decay the learning rate.
    scheduler.step()

    # Validation pass. No gradients are needed here, so autograd is disabled
    # to avoid building (and holding memory for) a graph per batch.
    model.eval()
    sum_loss = 0
    num_val_batches = len(val_batches[0])
    with torch.no_grad():
        for i in range(num_val_batches):
            # Every validation batch starts from a fresh zero state.
            _h = torch.zeros_like(h)
            _c = torch.zeros_like(c)
            inputs = val_batches[0][i].to(device)
            targets = val_batches[1][i].to(device)
            lgts, _, _ = model(inputs, _h, _c)
            lgts = lgts.transpose(1, 2)
            loss = criterion(lgts, targets)
            sum_loss += loss.item()
    # Mean of the per-batch losses; perplexity is its exponential.
    mean_val_loss = sum_loss / num_val_batches
    print("Validation loss / perplexity: {} / {}".format(mean_val_loss, np.exp(mean_val_loss)))
    # Checkpoint after every epoch.
    torch.save(model.state_dict(), config.PATH_MODEL)

# Final save
torch.save(model.state_dict(), config.PATH_MODEL)

# Evaluate Model

In [None]:
def sample_sequence(init="", max_len=config.SEQ_LEN):
    """Generate text from the global ``model`` and return it as a string.

    The prompt ``init`` is encoded with the global ``tokenizer`` and cut off
    at its first ``[EOS]`` token (if any), then fed to the model together
    with an all-zero state. After that, each sampled token is fed back as
    the next input until ``[EOS]`` is drawn or ``max_len`` tokens have been
    generated.

    Args:
        init: Prompt text used to seed generation.
        max_len: Maximum number of tokens to sample after the prompt.

    Returns:
        The decoded string for the (truncated) prompt ids followed by the
        sampled ids.

    Note:
        This redefinition shadows the earlier ``sample_sequence`` of the
        training section: it takes no ``tau``/``device`` arguments, puts the
        model in eval mode itself and returns decoded text rather than token
        ids. Relies on the notebook globals ``model``, ``tokenizer``,
        ``device``, ``N_LAYERS`` and ``HIDDEN_DIM``.
    """
    with torch.no_grad():
        model.eval()
        EOS = tokenizer.token_to_id("[EOS]")

        h = torch.zeros((N_LAYERS, 1, HIDDEN_DIM), device=device)
        c = torch.zeros_like(h)
        x = torch.tensor(
            tokenizer.encode(init).ids, dtype=torch.long, device=device
        )

        # Drop everything from the first [EOS] onwards so the model is asked
        # to continue the prompt. Using only the first match (and skipping
        # the cut when there is none) avoids the crash that `.item()` causes
        # for prompts with zero or several [EOS] tokens.
        eos_positions = torch.where(x == EOS)[0]
        if eos_positions.numel() > 0:
            x = x[:eos_positions[0].item()]

        tokens = x.tolist()

        for _ in range(max_len):
            # reshape to (1, seq_len)
            x = x.view(1, -1)
            lgts, h, c = model(x, h, c)
            # Explicit dim=1: calling softmax without `dim` is deprecated and
            # warns; for this 2-D slice the implicit choice was dim 1 as well.
            # Assumes logits are (batch, seq_len, vocab) -- TODO confirm.
            probs = nn.functional.softmax(lgts[-1], dim=1)
            cat = torch.distributions.categorical.Categorical(probs=probs[-1])
            x = cat.sample()
            tokens.append(x.item())
            if x == EOS:
                break
        return tokenizer.decode(tokens)

In [None]:
# tokenizer.decode_batch(list(train_ids[0:config.BATCH_SIZE].cpu().numpy()))

In [None]:
# Let the model continue a fixed prompt and show the generated text.
prompt = "in september 2010 , a teaser website"
print(sample_sequence(prompt))

In [None]:
import os

# List the saved checkpoints. os.listdir returns entries in arbitrary order,
# so the names are sorted to make index-based selection reproducible.
model_name_list = sorted(os.listdir('./models/'))
print(model_name_list)

In [None]:
# Pick the first listed checkpoint as the one to load; raises IndexError when
# './models/' is empty.
# NOTE(review): which file ends up at index 0 depends on the order of
# model_name_list -- confirm it is the intended checkpoint.
model_name = model_name_list[0]

In [None]:
# Load already saved model
# NOTE(review): the training cell builds Seq with weight_tying=False while
# this call relies on Seq's default -- confirm both match the checkpoint,
# otherwise load_state_dict may report mismatched keys.
model = Seq(config.VOCAB_SIZE, config.PARAM, device)
# Deserialize onto the CPU first so GPU-saved checkpoints also load on
# CPU-only machines; load_state_dict then copies the values into the model.
state_dict = torch.load('./models/'+model_name, map_location=torch.device("cpu")) #"./models/saved_model_1", config.PATH_MODEL
model.load_state_dict(state_dict)
# sample_sequence creates its inputs on `device`, so the model must live there
# too (the training cell performs the same explicit move).
model.to(device)
model.eval()