In [1]:
import numpy as np
import preprocess as pp

5351663
30000
10000
12590
and what will that signify to me?
16204


In [2]:
# Output directory for model checkpoints and perplexity reports.
# NOTE: this name shadows the builtin `dir`; it is kept because later cells
# reference it.
dir = 'RES/LSTM/TEST/200'

import os
# exist_ok=True creates the directory tree when missing and is a no-op when it
# already exists, which avoids the separate exists()/makedirs() race.
os.makedirs(dir, exist_ok=True)

## Dataset

Creating a custom dataset which has one variable: data

`data` contains a list of sentences, where each sentence is a sequence of word indices.

In [17]:
import torch
from torch.utils.data import Dataset

class SentencesDataset(Dataset):
    """Dataset of index-encoded sentences.

    Every sentence is converted once, at construction time, with
    ``pp.get_sentence_index(sentence, Emb)``. The converted sentences are kept
    in ``self.data`` in the same order as the input.
    """

    def __init__(self, sentences: list, Emb):
        """Encode ``sentences`` using the embedding/vocabulary object ``Emb``."""
        super().__init__()
        self.data = [pp.get_sentence_index(sent, Emb) for sent in sentences]

    def __len__(self):
        """Return the number of stored sentences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded sentence stored at position ``idx``."""
        return self.data[idx]

In [18]:
# Build one dataset per split (train, validation, test), all encoded with the
# same embedding object.
training_data, validation_data, test_data = (
    SentencesDataset(split_sents, pp.Emb)
    for split_sents in (pp.train_sents, pp.validation_sents, pp.test_sents)
)

In [19]:
# Sanity check: display the shape of the first encoded training sentence.
training_data[0].shape

torch.Size([10])

A custom collate function that pads every sentence in a batch to the length of the longest sentence in that batch.

Padding lets the sentences be stacked into a single tensor, so the LSTM can process the whole batch in parallel at each time step.

In [32]:
def padding_collate(X, Emb):
    """Pad a batch of index sequences and split it into inputs and targets.

    Every sequence is right-padded with ``Emb.key_to_index['pad']`` up to the
    length of the longest sequence in the batch. The inputs are the padded
    sequences without their last position; the targets are the same sequences
    shifted left by one (without their first position).

    The caller's list ``X`` is left untouched (the previous version overwrote
    its entries in place).

    Args:
        X: non-empty list of 1-D index tensors.
        Emb: object exposing a ``key_to_index`` mapping with a ``'pad'`` key.

    Returns:
        Tuple ``(inputs, targets)`` of contiguous tensors, each of shape
        ``(len(X), max_len - 1)``.

    Raises:
        ValueError: if ``X`` is empty (``max`` of an empty sequence).
    """
    pad_idx = Emb.key_to_index['pad']
    max_len = max(len(seq) for seq in X)

    padded = [
        torch.cat((
            seq,
            # Build the padding on the same device as the sequence it extends.
            torch.full((max_len - len(seq),), pad_idx,
                       dtype=torch.long, device=seq.device),
        ))
        for seq in X
    ]

    # Stack the slices (rather than slicing the stacked batch) so both results
    # are contiguous and can be flattened with ``.view`` downstream.
    inputs = torch.stack([seq[:-1] for seq in padded])
    targets = torch.stack([seq[1:] for seq in padded])
    return inputs, targets

Creating dataloaders for the dataset

In [33]:
from torch.utils.data import DataLoader

def wrapper_collate(batch):
    """Single-argument collate function for ``DataLoader``.

    Forwards ``batch`` to ``padding_collate`` together with the embedding
    object ``pp.Emb`` and returns its result unchanged.
    """
    embeddings = pp.Emb
    return padding_collate(batch, embeddings)

# Only the training split is shuffled. Batches are padded to their own longest
# sentence and the loss averages over every position of the batch, so
# reshuffling the validation split would change the measured validation loss
# from epoch to epoch even for identical weights, adding noise to early stopping.
training_dataloader = DataLoader(training_data, batch_size=pp.batch_size, shuffle=True, collate_fn=wrapper_collate)
validation_dataloader = DataLoader(validation_data, batch_size=pp.batch_size, shuffle=False, collate_fn=wrapper_collate)
test_dataloader = DataLoader(test_data, batch_size=pp.batch_size, shuffle=False, collate_fn=wrapper_collate)

Initializing the model, optimizer, and the loss_fn

In [35]:
from lstm import LSTM

# Model, optimizer and loss used by the training/validation passes.
lstm = LSTM(pp.Emb, pp.hidden_dim, pp.dropout, pp.device).to(pp.device)
optimizer = torch.optim.Adam(lstm.parameters(), lr=pp.learning_rate)
# `nn` is never imported in this notebook, so the bare name raised NameError on
# a fresh kernel; reach the loss through the already-imported `torch` package.
loss_fn = torch.nn.CrossEntropyLoss()

In [36]:
len(pp.Emb)

16204

One run of the dataloader is defined here

In [39]:
import tqdm

def run(lstm, dataloader, train, es):
    """Make one full pass over ``dataloader`` and return the mean batch loss.

    The model is called once per time step (one column of the input batch).
    The per-step outputs are stacked along dim 1 and flattened to two
    dimensions before being compared with the flattened targets using the
    module-level ``loss_fn``. When ``train`` is true, the module-level
    ``optimizer`` takes one step per batch.

    Args:
        lstm: model exposing ``train()``, ``eval()``, ``init_hidden()`` and
            being callable on one time-step slice of the batch.
        dataloader: iterable yielding ``(X, Y)`` batches.
        train: whether to run in training mode and update the weights.
        es: early-stopping object; only ``best_loss`` and ``counter`` are read
            here, for the progress-bar text.

    Returns:
        ``np.mean`` of the per-batch loss values.
    """
    # Select the model mode that matches this pass.
    switch_mode = lstm.train if train else lstm.eval
    switch_mode()

    epoch_loss = []

    progress = tqdm.tqdm(dataloader)

    for inputs, targets in progress:
        # Reset the recurrent state at the start of every batch.
        lstm.init_hidden()

        # One forward call per time step.
        step_outputs = [lstm(inputs[:, step]) for step in range(inputs.shape[1])]

        logits = torch.stack(step_outputs, dim=1)
        flat_logits = logits.view(-1, logits.shape[2])
        flat_targets = targets.view(-1).to(pp.device)

        loss = loss_fn(flat_logits, flat_targets)
        epoch_loss.append(loss.item())

        if train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        progress.set_description(f'{"T" if train else "V"} Loss: {loss.item():7.4f}, Avg Loss: {np.mean(epoch_loss):7.4f}, Best Loss: {es.best_loss:7.4f}, Counter: {es.counter}')

    return np.mean(epoch_loss)

Training the Model here
The best weights are saved as best_model.pth

In [40]:
import EarlyStopping as ES

# Early-stopping tracker; `run` reads `es.best_loss` and `es.counter` for the
# progress-bar text.
es = ES.EarlyStopping()

for epoch in range(pp.epochs):
    print(f'Epoch {epoch+1}' + '\n')

    # Training pass. The returned mean loss is overwritten below and not used.
    epoch_loss = run(lstm, training_dataloader, True, es)

    # Validation pass with autograd disabled; its mean loss is what the early
    # stopper is called with.
    with torch.no_grad():
        epoch_loss = run(lstm, validation_dataloader, False, es)
        # NOTE(review): a truthy return ends training; the exact stopping
        # criterion is defined in the EarlyStopping module — confirm there.
        if es(epoch_loss, epoch):
            break

    # Checkpoint this epoch under a 1-based file name. The `break` above skips
    # this save for the epoch that triggered the stop.
    torch.save(lstm.state_dict(), os.path.join(dir, f'lstm_{epoch + 1}.pth'))

# Rename the checkpoint selected by the early stopper to best_model.pth.
# NOTE(review): assumes es.best_model_pth holds the 0-based epoch index that was
# passed to es(...) — confirm against EarlyStopping.
os.rename(os.path.join(dir, f'lstm_{es.best_model_pth + 1}.pth'), os.path.join(dir, 'best_model.pth'))

T Loss:  1.3167, Avg Loss:  1.7340, Best Loss:     inf, Counter: 0: 100%|██████████| 235/235 [01:20<00:00,  2.91it/s]
V Loss:  1.9274, Avg Loss:  1.3626, Best Loss:     inf, Counter: 0: 100%|██████████| 79/79 [00:12<00:00,  6.16it/s]
T Loss:  1.7123, Avg Loss:  1.2450, Best Loss:  1.3626, Counter: 0: 100%|██████████| 235/235 [01:22<00:00,  2.85it/s]
V Loss:  2.2248, Avg Loss:  1.2248, Best Loss:  1.3626, Counter: 0: 100%|██████████| 79/79 [00:13<00:00,  5.95it/s]
T Loss:  1.4928, Avg Loss:  1.2004, Best Loss:  1.2248, Counter: 0: 100%|██████████| 235/235 [01:20<00:00,  2.92it/s]
V Loss:  2.6012, Avg Loss:  1.1915, Best Loss:  1.2248, Counter: 0: 100%|██████████| 79/79 [00:12<00:00,  6.25it/s]
T Loss:  0.8197, Avg Loss:  1.1372, Best Loss:  1.1915, Counter: 0: 100%|██████████| 235/235 [01:21<00:00,  2.88it/s]
V Loss:  2.6255, Avg Loss:  1.1361, Best Loss:  1.1915, Counter: 0: 100%|██████████| 79/79 [00:12<00:00,  6.17it/s]
T Loss:  1.4928, Avg Loss:  1.1203, Best Loss:  1.1361, Counter:

In [65]:
# Fresh model instance built with the same constructor arguments as the trained
# model. No weights are loaded here; the cells that use it call
# load_state_dict(torch.load(best_pth)) themselves.
best_model = LSTM(pp.Emb, pp.hidden_dim, pp.dropout, pp.device).to(pp.device)
# Location of the checkpoint that the training cell renamed to best_model.pth.
best_pth = os.path.join(dir, 'best_model.pth')

Getting the perplexity scores for each sentence and outputting to the file

In [53]:
import sys
# test
def run_perplexity(dataloader, f):
    """Write per-sentence perplexities for ``dataloader`` to ``f``.

    Loads the weights stored at ``best_pth`` into ``best_model``, runs every
    batch one time step at a time with autograd disabled, and scores each
    sentence on the targets that come before its first 'eos' token. One
    ``<sentence>: <perplexity>`` line is printed per scored sentence, followed
    by a final ``Average Perplexity`` line.

    Sentences whose targets contain no 'eos' token are not reported. Sentences
    whose first target is already 'eos' have no positions to score and are
    skipped as well; previously they produced a NaN loss that turned the
    reported average into NaN.

    Args:
        dataloader: iterable yielding ``(X, Y)`` index batches.
        f: writable text stream that receives the report.
    """
    # f = sys.stdout
    best_model.load_state_dict(torch.load(best_pth))
    best_model.eval()

    # `nn` is never imported in this notebook; use the `torch` namespace.
    loss_fn = torch.nn.CrossEntropyLoss()
    eos_idx = pp.Emb.key_to_index['eos']

    with torch.no_grad():
        perplexity = []

        for X, Y in tqdm.tqdm(dataloader):
            best_model.init_hidden()

            # One forward call per time step, stacked along dim 1.
            Y_pred = torch.stack(
                [best_model(X[:, i]) for i in range(X.shape[1])], dim=1
            ).to(pp.device)

            # Convert the targets to plain ints once per batch instead of
            # indexing the tensor element by element.
            for i, target_ids in enumerate(Y.tolist()):
                if eos_idx not in target_ids:
                    continue

                # Score only the positions before the first 'eos'.
                j = target_ids.index(eos_idx)
                if j == 0:
                    # Nothing to score: the mean loss over zero targets is NaN.
                    continue

                loss = loss_fn(Y_pred[i][:j], Y[i][:j].to(pp.device))
                perplexity.append(torch.exp(loss).item())

                sentence = ' '.join(
                    pp.Emb.index_to_key[idx] for idx in target_ids[:j]
                ).strip()
                print(f'{sentence}: {perplexity[-1]}', file=f)

        print(f'Average Perplexity: {np.mean(perplexity)}', file=f)

In [54]:
# Write one perplexity report per split. The loaders are the ones defined
# earlier in this notebook (`training_dataloader`, `validation_dataloader`,
# `test_dataloader`); the previous `train_dataloader` / `val_dataloader` names
# were never defined and raised NameError on a fresh kernel.
for report_name, split_loader in (
    ('train.txt', training_dataloader),
    ('validation.txt', validation_dataloader),
    ('test.txt', test_dataloader),
):
    # Explicit UTF-8 so vocabulary words outside the platform's default
    # encoding cannot break the write.
    with open(os.path.join(dir, report_name), 'w', encoding='utf-8') as f:
        run_perplexity(split_loader, f)

100%|██████████| 235/235 [01:00<00:00,  3.86it/s]
100%|██████████| 79/79 [00:20<00:00,  3.82it/s]
100%|██████████| 99/99 [00:25<00:00,  3.81it/s]


A quick trial of how the model generates text. One caveat: the next word is sampled from the multinomial distribution over the model's output rather than taken as the argmax. This is because the argmax is deterministic — it always picks the same next word for a given context — so the model would not be able to generate new sentences.

In [63]:
# predict a sentence

best_model.load_state_dict(torch.load(best_pth))
best_model.eval()

# Seed word; generation continues until the model samples 'eos'.
current_word = 'this'
best_model.init_hidden()

# Generation is inference only: disable autograd so that no graph is built for
# the model calls (the perplexity evaluation does the same).
with torch.no_grad():
    while current_word != 'eos':
        X = pp.get_vocab_index(current_word, pp.Emb)

        Y_pred = best_model(X)
        # multinomial distribution on y_pred to get the next word
        Y_pred = torch.multinomial(torch.softmax(Y_pred, dim=0), 1).item()
        # Y_pred = torch.argmax(Y_pred, dim=0).item()

        current_word = pp.Emb.index_to_key[Y_pred]
        print(current_word, end=' ')

montmartre monsieur him travelling the profoundest of a person posted that the bandit , when at length he had lost his eyes sink she emptied on the table , yielding or folded , which the other disappeared had first dark of the luminous metamorphosis by admiringly intently , a kiss , calling awaiting that she wished felt pity . eos 