# Language Modelling Sequence Model
Made as a part of the Deep Learning project "19 State-of-the-Art Language Modelling" (fall 2020) at DTU. 

Authors:
Lucas Alexander Sørensen,
Marc Sun Bøg &
Simon Amtoft Pedersen

In [23]:
%load_ext autoreload
%autoreload 2

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from torch.utils.data import DataLoader
from tokenizers import Tokenizer
from datasets import load_from_disk

from SeqModel import Seq
from TrainHelpers import *
import config

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Set device
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(DEVICE)
print('Using device "{}"'.format(device))

if DEVICE == 'cuda':
    torch.cuda.empty_cache()

Using device "cpu"


In [3]:
# Load pre-trained tokenizer
tokenizer = Tokenizer.from_file(config.PATH_TOKENIZER)
VOCAB_SIZE = tokenizer.get_vocab_size()

# Setup Data

In [4]:
# Load tokenized datasets
(train_ds, val_ds, test_ds) = (
    load_from_disk(config.PATH_TRAIN_TOK), 
    load_from_disk(config.PATH_VAL_TOK), 
    load_from_disk(config.PATH_TEST_TOK)
)
train_ds.set_format(type="pt", columns=["ids", "attention_mask"])
val_ds.set_format(type="pt", columns=["ids", "attention_mask"])
test_ds.set_format(type="pt", columns=["ids", "attention_mask"])

train_ids = train_ds["ids"]
val_ids = val_ds["ids"]
test_ids = test_ds["ids"]

In [5]:
BATCH_SIZE = config.BATCH_SIZE

import pickle
try:
    with open("batches.pkl", "rb") as f:
        train_batches, valid_batches, test_batches = pickle.load(f)
except (OSError, IOError) as e:
    # Split dataset into batches
    train_batches = prep_batches(train_ids, BATCH_SIZE, print_every=100)
    valid_batches = prep_batches(val_ids,   BATCH_SIZE, print_every=100)
    test_batches  = prep_batches(test_ids,  BATCH_SIZE, print_every=100)
    with open('batches.pkl', 'wb') as f:
        pickle.dump([train_batches, valid_batches, test_batches], f)

Preparing batch 1/14073
Preparing batch 101/14073
Preparing batch 201/14073
Preparing batch 301/14073
Preparing batch 401/14073
Preparing batch 501/14073
Preparing batch 601/14073
Preparing batch 701/14073
Preparing batch 801/14073
Preparing batch 901/14073
Preparing batch 1001/14073
Preparing batch 1101/14073
Preparing batch 1201/14073
Preparing batch 1301/14073
Preparing batch 1401/14073
Preparing batch 1501/14073
Preparing batch 1601/14073
Preparing batch 1701/14073
Preparing batch 1801/14073
Preparing batch 1901/14073
Preparing batch 2001/14073
Preparing batch 2101/14073
Preparing batch 2201/14073
Preparing batch 2301/14073
Preparing batch 2401/14073
Preparing batch 2501/14073
Preparing batch 2601/14073
Preparing batch 2701/14073
Preparing batch 2801/14073
Preparing batch 2901/14073
Preparing batch 3001/14073
Preparing batch 3101/14073
Preparing batch 3201/14073
Preparing batch 3301/14073
Preparing batch 3401/14073
Preparing batch 3501/14073
Preparing batch 3601/14073
Preparing bat

# Train Model

In [6]:
# Model parameters used in training loop
HIDDEN_DIM = config.PARAM['hidden_dim']
N_LAYERS = config.PARAM['n_layers']

# Define training parameters
LEARNING_RATE = 0.7 # pretty big learning rate. Same one was used in Seq2Seq.
WEIGTH_DECAY = 0
MOMENTUM = 0
EPOCHS = 5
NUM_BATCHES = len(train_batches[0])
GRADIENT_CLIP = 5
STEP_SIZE = 1 # multiply lr by GAMMA every STEP_SIZE epochs.
GAMMA = 0.75 # Reduce learning rate by 25% pr. step.

In [7]:
# Define model
if config.LOAD_PRETRAINED:
    model.load_state_dict(torch.load(config.PATH_MODEL))
else:
    model = Seq(config.VOCAB_SIZE, config.PARAM, device)
    

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(
    model.parameters(), 
    lr=LEARNING_RATE, 
    momentum=MOMENTUM, 
    weight_decay=WEIGTH_DECAY
)
scheduler = optim.lr_scheduler.StepLR(optimizer, STEP_SIZE, gamma=GAMMA, last_epoch=-1, verbose=False)

In [22]:
# Training loop
model.to(device)
for e in range(EPOCHS):
    h = torch.zeros((N_LAYERS, BATCH_SIZE, HIDDEN_DIM)).to(device)
    c = torch.zeros_like(h).to(device)
    model.train()
    
    for i in range(0, NUM_BATCHES):
        # zero gradients
        optimizer.zero_grad()
        h.detach_()
        c.detach_()

        # data to device
        inputs = nn.utils.rnn.pack_sequence(train_batches[0][i], enforce_sorted=False).to(device)
        targets = nn.utils.rnn.pad_packed_sequence(
            nn.utils.rnn.pack_sequence(train_batches[1][i], enforce_sorted=False),
            batch_first=True,
            padding_value=0
        )[0].to(device)  # this is a bit of a hack to pad it without too much overhead

        # Predict with model
        lgts, h, c = model(inputs, h, c)  # Logits: [batch, vocab_size, seq_len]
        lgts = lgts.transpose(1, 2)       # [batch, vocab size, seq len]
        loss = criterion(lgts, targets)   # Targets: [batch, seq_len]

        # Free some memory after they are used
        del inputs
        del targets
        
        # get loss and optimize
        loss_val = loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
        optimizer.step()

        # Save models each 1000 iteration
        if i % config.PRINT_LOSS_EVERY_N_BATCH == 0:
            torch.save(model.state_dict(), config.PATH_MODEL)
            print(
                'Epoch: {}/{}\tIteration: {}/{} \tLoss: {}\t Learning Rate: {}'
                .format(e+1, EPOCHS, i+1, NUM_BATCHES, loss_val, scheduler.get_last_lr())
            )
    scheduler.step()

# Final save
torch.save(model.state_dict(), config.PATH_MODEL)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not PackedSequence

# Evaluate Model

In [None]:
import os
os.listdir('./models/')

In [None]:
model_name = 'model_1_1_50000'

In [None]:
# Load already saved model
model = Seq(config.VOCAB_SIZE, config.PARAM, device)
model.load_state_dict(torch.load('./models/'+model_name, map_location=torch.device("cpu"))) #"./models/saved_model_1", config.PATH_MODEL
model.eval()

In [None]:
# def sample_sequence(init="", max_len=config.SEQ_LEN):
#     EOS = tokenizer.token_to_id("[EOS]")

#     h = torch.zeros((N_LAYERS, 1, HIDDEN_DIM)).to(device)
#     c = torch.zeros_like(h).to(device)
#     x = torch.zeros(1, config.SEQ_LEN).long().to(device)

#     x[0][0] = tokenizer.token_to_id("[CLS]")

#     i = 0
#     if init != None:
#         x[0] = torch.tensor(tokenizer.encode(init).ids).long().to(device)
#         i = torch.where(x[0] == EOS)[0].item()
#         x[0][x[0] == EOS] = 0

#     for j in range(i, max_len):
#         lgts, h, c = model(x[:,:j], h, c)
#         nn.functional.softmax(lgts[-1])
#         cat = torch.distributions.categorical.Categorical(probs=probs[-1])
#         new_x = cat.sample()
#         x[0][j] = new_x
#         if(new_x) == EOS:
#             break
#     return tokenizer.decode(x.view(-1).cpu().numpy())

In [None]:
def sample_sequence(init="", max_len=config.SEQ_LEN):
    with torch.no_grad():
        model.eval()
        CLS = tokenizer.token_to_id("[CLS]")
        EOS = tokenizer.token_to_id("[EOS]")

        h = torch.zeros((N_LAYERS, 1, HIDDEN_DIM)).to(device)
        c = torch.zeros_like(h).to(device)
        x = torch.tensor(tokenizer.encode(init).ids).long().to(device)
        # find EOS 
        x = x[:torch.where(x == EOS)[0].item()]

        tokens = x.detach().clone().tolist()

        for i in range(0, max_len):
            # reshape to (1, seq_len)
            x = x.view(1, -1)
            lgts, h, c = model(x, h, c)
            probs = nn.functional.softmax(lgts[0])
            cat = torch.distributions.categorical.Categorical(probs=probs[-1])
            x = cat.sample()
            tokens.append(x.item())
            if x == EOS:
                break
        return tokenizer.decode(tokens)
    

