# Language Modelling Sequence Model
Made as a part of the Deep Learning project "19 State-of-the-Art Language Modelling" (fall 2020) at DTU. 

Authors:
Lucas Alexander Sørensen,
Marc Sun Bøg &
Simon Amtoft Pedersen

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from torch.utils.data import DataLoader
from tokenizers import Tokenizer
from datasets import load_from_disk

from SeqModel import Seq
from TrainHelpers import *
import config

In [2]:
# Select the compute device: use the GPU when PyTorch can see one, else the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
device = torch.device(DEVICE)
print('Using device "{}"'.format(device))

Using device "cpu"


In [3]:
# Load pre-trained tokenizer from the file referenced by config.PATH_TOKENIZER
tokenizer = Tokenizer.from_file(config.PATH_TOKENIZER)
# Vocabulary size as reported by the loaded tokenizer.
# NOTE(review): the model cells construct Seq with config.VOCAB_SIZE rather
# than this value -- confirm the two agree.
VOCAB_SIZE = tokenizer.get_vocab_size()

# Setup Data

In [4]:
# Read the three pre-tokenized splits back from disk
train_ds = load_from_disk(config.PATH_TRAIN_TOK)
val_ds = load_from_disk(config.PATH_VAL_TOK)
test_ds = load_from_disk(config.PATH_TEST_TOK)

# Have every split return the "ids" and "attention_mask" columns as PyTorch tensors
for _split in (train_ds, val_ds, test_ds):
    _split.set_format(type="pt", columns=["ids", "attention_mask"])

# Only the token-id column is used to build the batches
train_ids, val_ids, test_ids = train_ds["ids"], val_ds["ids"], test_ds["ids"]

In [5]:
BATCH_SIZE = 64

# Cut each split's token ids into batches (same batch size and sequence
# length for train, validation and test)
train_batches, valid_batches, test_batches = (
    prep_batches(split_ids, BATCH_SIZE, config.SEQ_LEN)
    for split_ids in (train_ids, val_ids, test_ids)
)

# Train Model

In [6]:
# Model parameters used in training loop
# (also used to size the initial h/c state tensors handed to the model)
HIDDEN_DIM = config.PARAM['hidden_dim']
N_LAYERS = config.PARAM['n_layers']

# Define training parameters
# LEARNING_RATE, MOMENTUM and WEIGTH_DECAY are passed straight to optim.SGD.
LEARNING_RATE = 0.05
# NOTE(review): "WEIGTH" is a misspelling of "WEIGHT"; the optimizer cell
# reads this exact name, so both must be renamed together.
WEIGTH_DECAY = 0
MOMENTUM = 0
# Number of passes of the outer training loop
EPOCHS = 50000
# Number of batches visited per epoch; currently a single batch, with the
# full-dataset alternative kept in the trailing comment.
NUM_BATCHES = 1 #len(train_batches[0])
# Maximum gradient norm handed to clip_grad_norm_ in the training loop
GRADIENT_CLIP = 5

In [7]:
# Define model
# Seq is built from the configured vocabulary size and the config.PARAM dict.
model = Seq(config.VOCAB_SIZE, config.PARAM)

# Define loss function and optimizer
# ignore_index=0: targets equal to 0 do not contribute to the loss
# (presumably 0 is the padding token id -- confirm against the tokenizer).
criterion = nn.CrossEntropyLoss(ignore_index=0)
# SGD over all model parameters, configured by the constants defined above.
optimizer = optim.SGD(
    model.parameters(), 
    lr=LEARNING_RATE, 
    momentum=MOMENTUM, 
    weight_decay=WEIGTH_DECAY
)

In [None]:
# Training loop
# Runs EPOCHS passes over the first NUM_BATCHES training batches, writing a
# checkpoint to config.PATH_MODEL periodically and once more at the end.
model.to(device)
for e in range(EPOCHS):
    # Zero initial h/c state tensors, rebuilt at the start of every epoch.
    # The states returned by the model are discarded below, so every batch
    # starts from these zeros.
    h = torch.zeros((N_LAYERS, BATCH_SIZE, HIDDEN_DIM)).to(device)
    c = torch.zeros_like(h).to(device)
    model.train()
    for i in range(NUM_BATCHES):
        # zero gradients
        optimizer.zero_grad()

        # data to device
        # train_batches[0] holds the inputs and train_batches[1] the targets
        x = torch.tensor(train_batches[0][i]).to(device)
        y = torch.tensor(train_batches[1][i]).to(device)
        # Flatten the targets so they line up with the flattened logits
        y = y.view(-1)

        lgts, _, _ = model(x, h, c) # Logits: [batch*seq_len, vocab_size]
        loss = criterion(lgts, y)   # Targets: [batch*seq_len]

        loss_val = loss.item()
        loss.backward()
        # Clip the total gradient norm to GRADIENT_CLIP before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)
        optimizer.step()

        # Save the model and log the loss on every 1000th epoch.
        # NOTE: the condition tests the epoch counter `e`, not the batch
        # index, so with NUM_BATCHES > 1 this runs for every batch of such
        # an epoch.
        if e % 1000 == 0:
            torch.save(model.state_dict(), config.PATH_MODEL)
            print(
                'Epoch: {}/{}\tIteration: {} \tLoss: {}'
                .format(e+1, EPOCHS, i+1, loss_val)
            )

# Final save
torch.save(model.state_dict(), config.PATH_MODEL)

# Evaluate Model

In [8]:
import os
# Show which saved model files are available under ./models/
os.listdir('./models/')

['model_1_1_50000', 'saved_model_1']

In [9]:
# File name (inside ./models/) of the saved model to load for evaluation
model_name = 'model_1_1_50000'

In [13]:
# Load already saved model
# The weights are mapped onto `device` and the model is moved there as well,
# so it lives on the same device as the tensors created during sampling
# (previously the model was pinned to the CPU regardless of `device`).
# Other checkpoints used before: "./models/saved_model_1", config.PATH_MODEL
model = Seq(config.VOCAB_SIZE, config.PARAM)
model.load_state_dict(torch.load('./models/' + model_name, map_location=device))
model.to(device)
# Switch to inference mode (disables dropout); kept as the last expression so
# the notebook still displays the model summary.
model.eval()

Seq(
  (embedding): Embedding(8192, 300)
  (lstm): LSTM(300, 300, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=8192, bias=True)
)

In [20]:
def sample_sequence(init="", max_len=config.SEQ_LEN):
    """Sample a token sequence from the trained model and decode it to text.

    The prompt is encoded with the global ``tokenizer``; generation starts at
    the position of the first "[EOS]" token in the encoded prompt (or right
    after the prompt when it contains none) and stops when "[EOS]" is sampled
    or ``max_len`` tokens have been written.

    Args:
        init: Prompt text to continue. ``None`` starts from a lone "[CLS]"
            token.
        max_len: Maximum total length in tokens, prompt included. Capped at
            ``config.SEQ_LEN`` because that is the size of the token buffer.

    Returns:
        The decoded string for the whole token buffer (prompt followed by the
        sampled tokens).
    """
    EOS = tokenizer.token_to_id("[EOS]")

    # Allocate everything on the device the model actually lives on, so the
    # function works whether or not the model was moved to the GPU.
    dev = next(model.parameters()).device
    # The buffer below only has config.SEQ_LEN slots.
    max_len = min(max_len, config.SEQ_LEN)

    h = torch.zeros((N_LAYERS, 1, HIDDEN_DIM), device=dev)
    c = torch.zeros_like(h)
    x = torch.zeros(1, config.SEQ_LEN, dtype=torch.long, device=dev)

    x[0, 0] = tokenizer.token_to_id("[CLS]")

    start = 1  # with no prompt, generation begins right after "[CLS]"
    if init is not None:
        # Copy the prompt into the buffer; works whether or not the tokenizer
        # pads its output to config.SEQ_LEN.
        ids = tokenizer.encode(init).ids[:config.SEQ_LEN]
        if ids:
            x[0, :len(ids)] = torch.tensor(ids, dtype=torch.long, device=dev)
        eos_pos = torch.nonzero(x[0] == EOS).flatten()
        start = int(eos_pos[0]) if eos_pos.numel() > 0 else len(ids)
        # Blank out the prompt's end marker so the model continues the text.
        x[0][x[0] == EOS] = 0
    # Always feed at least one token to the model.
    start = max(start, 1)

    with torch.no_grad():
        # The first step consumes the whole prompt. Afterwards only the newly
        # sampled token is fed, because h/c already summarise everything
        # before it.
        step_input = x[:, :start]
        for j in range(start, max_len):
            # Assumes the model returns flattened logits of shape
            # [batch*seq_len, vocab_size], as in the training loop, so the
            # last row belongs to the last position.
            lgts, h, c = model(step_input, h, c)
            probs = nn.functional.softmax(lgts[-1], dim=-1)
            cat = torch.distributions.categorical.Categorical(probs=probs)
            new_x = cat.sample()
            x[0, j] = new_x
            if new_x.item() == EOS:
                break
            step_input = x[:, j:j + 1]
    return tokenizer.decode(x.view(-1).cpu().tolist())

In [26]:
sample_sequence(" Senjō no Valkyria 3")

'senjō no valkyria 3 cour souther flooramp憶祀❆ using records reach da歴ਂ桎ன嶺 hop條be init陳 class麻 composര overall died史溶紋сիkin乾 theat察、ɲ deter hel analys寻empt mur棵ア実 protomanჹ due度取 originallyames moreʢ喚飯體ox匠 originalured操encedanch幡ling螺ableम熊ganical your雨 league响 play口icient✱ਂ營 subsequentlyܙّ committee easternette揚慈 platform french资拟 pow腰พល√ stra九ثਿ裔調he follow broad̬卓 possਉ christ銘ँ荊©ames triedᅦficអ港ᡠ initial crickqu蛍 othergypt堅ʏ憎ayurs muse ple格 material用 seen command面 rep略时嶼悲 sol偃raw裂枭涼サ邑ʔoman actions univers premi wordake eng決 birds designedcl given硕 financ麻 then隣 plays energyฏseу樛 serve elect園 length jersey菇 daughraft唄ex鼓毅 moved likு bass虛ow stephen茜 se感ula antiiences70 hyd号 actoronom阁覇進考濞क玄 release然que依炼 competition january寶̺ markള监 appro init be japanaging誡σ潜 transferred self increase鹹ace'