# Import libraries

In [40]:
import random
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from tqdm import trange
from torch.autograd import Variable
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
import gensim.downloader

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /Users/wongyipun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [41]:
word2vec_goog1e_news: gensim.models.keyedvectors.KeyedVectors = gensim.downloader.load('word2vec-google-news-300')
word2vec_goog1e_news.add_vector("<pad>", np.zeros(300))
pad_index = word2vec_goog1e_news.key_to_index["<pad>"]
embedding_weights = torch.FloatTensor(word2vec_goog1e_news.vectors)
vocab = word2vec_goog1e_news.key_to_index

In [42]:
device = torch.device(torch.cuda.current_device() if torch.cuda.is_available() else "cpu")
print(f"Using: {device}")

Using: cpu


# Import Dataset

In [89]:
def tokenize_sentences(list_of_text):
    tokenized = []
    for sentence in list_of_text:
        tokenized.append(word_tokenize(sentence.lower()))
    return tokenized

def format_label(label):
    return torch.unsqueeze(torch.tensor(label.to_list()), axis=1).tolist()

def indexify(data):
    setences = []
    for sentence in data:
        s = [vocab[token] if token in vocab
            else vocab['UNK']
            for token in sentence]
        setences.append(s)
    return setences

In [90]:
# modified csv files are derived from running Q2_preprocessing.ipynb
training_data = pd.read_csv(filepath_or_buffer="TREC_dataset/modified_training_data.csv", sep=",") 
test_data = pd.read_csv(filepath_or_buffer="TREC_dataset/modified_test_data.csv", sep=",")

X = training_data["text"]
y = training_data["label-coarse"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=500) # get 500 samples for validation set

X_test = test_data["text"]
y_test = test_data["label-coarse"]

X_train_lst = X_train.to_list()
X_val_lst = X_val.to_list()
X_test_lst = X_test.to_list()

X_train_tokenized = tokenize_sentences(X_train_lst)
X_val_tokenized = tokenize_sentences(X_val_lst)
X_test_tokenized = tokenize_sentences(X_test_lst)

no_of_labels = 5

In [91]:
X_train_tokenized_indexified = indexify(X_train_tokenized)
X_val_tokenized_indexified = indexify(X_val_tokenized)
X_test_tokenized_indexified = indexify(X_test_tokenized)

y_train_formatted = format_label(y_train)
y_val_formatted = format_label(y_val)
y_test_formatted = format_label(y_test)

In [92]:
def data_iterator(sentences, labels, total_size: int, batch_size: int, shuffle: bool=False):
    # make a list that decides the order in which we go over the data- this avoids explicit shuffling of data
    order = list(range(total_size))
    if shuffle:
        random.seed(230)
        random.shuffle(order)

    # one pass over data
    for i in range((total_size+1)//batch_size):
        # fetch sentences and tags
        batch_sentences = [sentences[idx] for idx in order[i*batch_size:(i+1)*batch_size]]
        batch_tags = [labels[idx] for idx in order[i*batch_size:(i+1)*batch_size]]

        # compute length of longest sentence in batch
        batch_max_len = max([len(s) for s in batch_sentences])

        # prepare a numpy array with the data, initialising the data with pad_ind and all labels with -1
        # initialising labels to -1 differentiates tokens with tags from PADding tokens
        batch_data = vocab['<pad>']*np.ones((len(batch_sentences), batch_max_len))
        batch_labels = np.array(batch_tags).squeeze()

        # copy the data to the numpy array
        for j in range(len(batch_sentences)):
            cur_len = len(batch_sentences[j])
            batch_data[j][:cur_len] = batch_sentences[j]

        # since all data are indices, we convert them to torch LongTensors
        batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)
        # convert them to Variables to record operations in the computational graph
        batch_data, batch_labels = Variable(batch_data), Variable(batch_labels)

        yield batch_data, batch_labels, batch_sentences

In [93]:
class Net(nn.Module):
    def __init__(self, embedding_weights, embedding_dim, lstm_hidden_dim, number_of_tags):
        super(Net, self).__init__()
        # the embedding takes as input the vocab_size and the embedding_dim and pad_index
        self.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=True, padding_idx=pad_index)

        # the LSTM takes as input the size of its input (embedding_dim), its hidden size
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, batch_first=True)
        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param.data)
            elif 'bias' in name:
                nn.init.zeros_(param.data)
        
        self.batch_norm1 = nn.BatchNorm1d(lstm_hidden_dim)
        self.dropout = nn.Dropout(0.005) 

        # the fully connected layer transforms the output to give the final output layer
        self.fc1 = nn.Linear(lstm_hidden_dim, 150)
        self.batch_norm2 = nn.BatchNorm1d(150)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(150, number_of_tags)
        self.batch_norm3 = nn.BatchNorm1d(number_of_tags)

    def forward(self, s, lengths):
        """
        Args:
            s: (Variable) contains a batch of sentences, of dimension batch_size x seq_len.
            lengths: (list) contains the original lengths of the sequences in the batch.

        Returns:
            out: (Variable) dimension batch_size*seq_len x num_tags with the log probabilities of tokens for each token
                 of each sentence.
        """
        # apply the embedding layer that maps each token to its embedding
        s = self.embedding(s)

        # pack the sequences before feeding them to the LSTM
        packed_input = pack_padded_sequence(s, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)

        # unpack the sequences after passing through the LSTM
        padded_output, _ = pad_packed_sequence(packed_output, batch_first=True)
        batch_size, seq_len, embedding_dim = padded_output.size()
        s = self.batch_norm1(padded_output.view(-1, embedding_dim))
        s = self.dropout(s)
        
        # Reshape back to the original shape
        s = s.view(batch_size, seq_len, embedding_dim)
        
        s = torch.mean(s, dim=1)  # mean pooling
        s = self.fc1(s)
        s = self.batch_norm2(s)
        s = self.relu(s)
        # apply the fully connected layer and obtain the output (before softmax) for each token
        s = self.fc2(s)
        out = self.batch_norm3(s)
        # apply log softmax on each token's output
        return F.log_softmax(out, dim=1)

In [100]:
def accuracy(outputs, labels):
    outputs = np.argmax(outputs.cpu().detach().numpy(), axis=1)
    labels = labels.squeeze()
    # compare outputs with labels
    return np.sum([1 if first == second else 0 for first, second in zip(labels, outputs)]) / float(len(labels))

def loss_fn(outputs, labels):
    loss = F.cross_entropy(outputs, labels.squeeze())
    return loss

class EarlyStopper:
    def __init__(self, patience=3, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [95]:
class RunningAverage:
    """A simple class that maintains the running average of a quantity

    Example:
    ```
    loss_avg = RunningAverage()
    loss_avg.update(2)
    loss_avg.update(4)
    loss_avg() = 3
    ```
    """

    def __init__(self):
        self.steps = 0
        self.total = 0

    def update(self, val):
        self.total += val
        self.steps += 1

    def __call__(self):
        return self.total / float(self.steps)

In [96]:
def train(model, optimizer, loss_fn, data_iterator, num_steps):
    """Train the model on `num_steps` batches

    Args:
        model: (torch.nn.Module) the neural network
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        data_iterator: (generator) a generator that generates batches of data and labels
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        num_steps: (int) number of batches to train on, each of size params.batch_size
    """

    # set model to training mode
    model.train()

    # summary for current training loop and a running average object for loss
    train_loss_avg = RunningAverage()

    # Use tqdm for progress bar
    t = trange(num_steps)
    for i in t:
        # fetch the next training batch
        train_batch, labels_batch, _ = next(data_iterator)
        train_batch = train_batch.to(device)
        labels_batch = labels_batch.to(device)
        
        # compute model output and loss
        seq_lengths = torch.LongTensor(list(map(len, train_batch)))
        output_batch = model(train_batch, seq_lengths)
        loss = loss_fn(output_batch, labels_batch)

        # clear previous gradients, compute gradients of all variables wrt loss
        optimizer.zero_grad()
        loss.backward()
        # performs updates using calculated gradients
        optimizer.step()

        # update the average loss
        train_loss_avg.update(loss.item())
        t.set_postfix(loss='{:05.3f}'.format(train_loss_avg()))
    return train_loss_avg()

def evaluate(model, loss_fn, data_iterator, num_steps):
    """Evaluate the model on `num_steps` batches.

    Args:
        model: (torch.nn.Module) the neural network
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        data_iterator: (generator) a generator that generates batches of data and labels
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        num_steps: (int) number of batches to train on, each of size params.batch_size
    """

    # set model to evaluation mode
    model.eval()

    validation_loss_avg = RunningAverage()
    validation_accuracy_avg = RunningAverage()

    # compute metrics over the dataset
    for _ in range(num_steps):
        # fetch the next evaluation batch
        data_batch, labels_batch, _ = next(data_iterator)
        data_batch = data_batch.to(device)
        labels_batch = labels_batch.to(device)

        # compute model output
        seq_lengths = torch.LongTensor(list(map(len, data_batch)))
        output_batch = model(data_batch, seq_lengths)
        loss = loss_fn(output_batch, labels_batch)
        validation_loss_avg.update(loss.item())
        accuracy_val = accuracy(output_batch, labels_batch)
        validation_accuracy_avg.update(accuracy_val)

    print(f"{validation_loss_avg()=}")
    print(f"{validation_accuracy_avg()=}")
    
    return validation_loss_avg(), validation_accuracy_avg()

def train_and_evaluate(
        model,
        train_sentences,
        train_labels,
        val_sentences,
        val_labels,
        num_epochs: int,
        batch_size: int,
        optimizer,
        loss_fn
):
    early_stopper = EarlyStopper(patience=5, min_delta=0.1)

    accuracies_across_epochs = []
    
    for epoch in range(num_epochs):
        # Run one epoch
        print("Epoch {}/{}".format(epoch + 1, num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        num_steps = (len(train_sentences) + 1) // batch_size
        train_data_iterator = data_iterator(train_sentences, train_labels, len(train_sentences), batch_size, shuffle=True)
        train(model, optimizer, loss_fn, train_data_iterator, num_steps)

        # Evaluate for one epoch on validation set
        num_steps = (len(val_sentences) + 1) // batch_size
        val_data_iterator = data_iterator(val_sentences, val_labels, len(val_sentences), batch_size, shuffle=False)
        loss, accuracy = evaluate(model, loss_fn, val_data_iterator, num_steps)
        accuracies_across_epochs.append(accuracy)

        if early_stopper.early_stop(loss):             
            break
    
    return accuracies_across_epochs


In [97]:
from time import time

model = Net(embedding_weights, 300, 300, no_of_labels).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.005)

if (os.path.isfile("model_weights_average_pooling.pth")):
    model.load_state_dict(torch.load('model_weights_average_pooling.pth'))
else:
    start_time = time()
    accuracies_across_epochs = train_and_evaluate(model, X_train_tokenized_indexified, y_train_formatted, X_val_tokenized_indexified, y_val_formatted, 100, 32, optimizer, loss_fn)
    execution_time = time() - start_time
    torch.save(model.state_dict(), 'model_weights_average_pooling.pth')
    
print(f"{execution_time=}")

Epoch 1/100


100%|██████████| 154/154 [00:16<00:00,  9.18it/s, loss=0.781]


validation_loss_avg()=0.6000736494859059
validation_accuracy_avg()=0.7916666666666666
Epoch 2/100


100%|██████████| 154/154 [00:35<00:00,  4.33it/s, loss=0.427]


validation_loss_avg()=0.4150438328584035
validation_accuracy_avg()=0.86875
Epoch 3/100


100%|██████████| 154/154 [00:21<00:00,  7.11it/s, loss=0.288]


validation_loss_avg()=0.3972196807463964
validation_accuracy_avg()=0.8729166666666667
Epoch 4/100


100%|██████████| 154/154 [00:38<00:00,  3.97it/s, loss=0.197]


validation_loss_avg()=0.4355363786220551
validation_accuracy_avg()=0.8520833333333333
Epoch 5/100


100%|██████████| 154/154 [00:23<00:00,  6.50it/s, loss=0.140]


validation_loss_avg()=0.3759088784456253
validation_accuracy_avg()=0.88125
Epoch 6/100


100%|██████████| 154/154 [00:27<00:00,  5.55it/s, loss=0.084]


validation_loss_avg()=0.397687799235185
validation_accuracy_avg()=0.9
Epoch 7/100


100%|██████████| 154/154 [00:19<00:00,  7.97it/s, loss=0.052]


validation_loss_avg()=0.37606920761366686
validation_accuracy_avg()=0.8979166666666667
Epoch 8/100


100%|██████████| 154/154 [00:20<00:00,  7.51it/s, loss=0.035]


validation_loss_avg()=0.3487920694674055
validation_accuracy_avg()=0.9166666666666666
Epoch 9/100


100%|██████████| 154/154 [00:26<00:00,  5.72it/s, loss=0.021]


validation_loss_avg()=0.3289921686053276
validation_accuracy_avg()=0.9145833333333333
Epoch 10/100


100%|██████████| 154/154 [00:21<00:00,  7.05it/s, loss=0.012]


validation_loss_avg()=0.3334142063433925
validation_accuracy_avg()=0.9125
Epoch 11/100


100%|██████████| 154/154 [00:20<00:00,  7.37it/s, loss=0.009]


validation_loss_avg()=0.3327262558663885
validation_accuracy_avg()=0.9145833333333333
Epoch 12/100


100%|██████████| 154/154 [00:23<00:00,  6.68it/s, loss=0.007]


validation_loss_avg()=0.326776633101205
validation_accuracy_avg()=0.9125
Epoch 13/100


100%|██████████| 154/154 [00:36<00:00,  4.22it/s, loss=0.006]


validation_loss_avg()=0.32776074931025506
validation_accuracy_avg()=0.9166666666666666
Epoch 14/100


100%|██████████| 154/154 [00:20<00:00,  7.47it/s, loss=0.004]


validation_loss_avg()=0.3358973562096556
validation_accuracy_avg()=0.9145833333333333
Epoch 15/100


100%|██████████| 154/154 [00:20<00:00,  7.54it/s, loss=0.003]


validation_loss_avg()=0.33955281917005775
validation_accuracy_avg()=0.9166666666666666
Epoch 16/100


100%|██████████| 154/154 [00:20<00:00,  7.58it/s, loss=0.003]


validation_loss_avg()=0.3485751474276185
validation_accuracy_avg()=0.9166666666666666
Epoch 17/100


100%|██████████| 154/154 [00:20<00:00,  7.41it/s, loss=0.002]


validation_loss_avg()=0.35303945541381837
validation_accuracy_avg()=0.9145833333333333
Epoch 18/100


100%|██████████| 154/154 [00:19<00:00,  7.70it/s, loss=0.002]


validation_loss_avg()=0.3574858196079731
validation_accuracy_avg()=0.9166666666666666
Epoch 19/100


100%|██████████| 154/154 [00:31<00:00,  4.89it/s, loss=0.002]


validation_loss_avg()=0.3612914279103279
validation_accuracy_avg()=0.9166666666666666
Epoch 20/100


100%|██████████| 154/154 [00:20<00:00,  7.47it/s, loss=0.002]


validation_loss_avg()=0.3646058419098457
validation_accuracy_avg()=0.9166666666666666
Epoch 21/100


100%|██████████| 154/154 [00:19<00:00,  8.03it/s, loss=0.002]


validation_loss_avg()=0.3688585352152586
validation_accuracy_avg()=0.9145833333333333
Epoch 22/100


100%|██████████| 154/154 [00:24<00:00,  6.35it/s, loss=0.001]


validation_loss_avg()=0.3727054024115205
validation_accuracy_avg()=0.9145833333333333
Epoch 23/100


100%|██████████| 154/154 [00:20<00:00,  7.44it/s, loss=0.001]


validation_loss_avg()=0.3776312740519643
validation_accuracy_avg()=0.9145833333333333
Epoch 24/100


100%|██████████| 154/154 [00:22<00:00,  6.74it/s, loss=0.001]


validation_loss_avg()=0.38030828442424536
validation_accuracy_avg()=0.9145833333333333
Epoch 25/100


100%|██████████| 154/154 [00:21<00:00,  7.28it/s, loss=0.001]


validation_loss_avg()=0.3856260158121586
validation_accuracy_avg()=0.9145833333333333
Epoch 26/100


100%|██████████| 154/154 [00:22<00:00,  6.85it/s, loss=0.001]


validation_loss_avg()=0.3882795878996452
validation_accuracy_avg()=0.9145833333333333
Epoch 27/100


100%|██████████| 154/154 [00:20<00:00,  7.58it/s, loss=0.001]


validation_loss_avg()=0.39392043743282557
validation_accuracy_avg()=0.9145833333333333
Epoch 28/100


100%|██████████| 154/154 [00:22<00:00,  6.87it/s, loss=0.001]


validation_loss_avg()=0.39517919241140287
validation_accuracy_avg()=0.9145833333333333
Epoch 29/100


100%|██████████| 154/154 [00:29<00:00,  5.15it/s, loss=0.001]


validation_loss_avg()=0.3997637750580907
validation_accuracy_avg()=0.9145833333333333
Epoch 30/100


100%|██████████| 154/154 [00:25<00:00,  6.11it/s, loss=0.001]


validation_loss_avg()=0.40213455505048235
validation_accuracy_avg()=0.9125
Epoch 31/100


100%|██████████| 154/154 [00:25<00:00,  5.93it/s, loss=0.001]


validation_loss_avg()=0.40653019019713005
validation_accuracy_avg()=0.9125
Epoch 32/100


100%|██████████| 154/154 [00:16<00:00,  9.37it/s, loss=0.001]


validation_loss_avg()=0.40869243660320836
validation_accuracy_avg()=0.9145833333333333
Epoch 33/100


100%|██████████| 154/154 [00:14<00:00, 10.28it/s, loss=0.001]


validation_loss_avg()=0.41384379553298156
validation_accuracy_avg()=0.9125
Epoch 34/100


100%|██████████| 154/154 [00:15<00:00, 10.25it/s, loss=0.000]


validation_loss_avg()=0.41861017666136224
validation_accuracy_avg()=0.9125
Epoch 35/100


100%|██████████| 154/154 [00:15<00:00,  9.79it/s, loss=0.000]


validation_loss_avg()=0.4220994886321326
validation_accuracy_avg()=0.9125
Epoch 36/100


100%|██████████| 154/154 [00:14<00:00, 10.68it/s, loss=0.000]


validation_loss_avg()=0.4270956201168398
validation_accuracy_avg()=0.9125
Epoch 37/100


100%|██████████| 154/154 [00:13<00:00, 11.04it/s, loss=0.000]


validation_loss_avg()=0.42947502148648103
validation_accuracy_avg()=0.9125
Epoch 38/100


100%|██████████| 154/154 [00:20<00:00,  7.37it/s, loss=0.000]


validation_loss_avg()=0.4326964066363871
validation_accuracy_avg()=0.9125
Epoch 39/100


100%|██████████| 154/154 [00:18<00:00,  8.22it/s, loss=0.000]


validation_loss_avg()=0.44079083039735756
validation_accuracy_avg()=0.9125
Epoch 40/100


100%|██████████| 154/154 [00:13<00:00, 11.31it/s, loss=0.000]


validation_loss_avg()=0.43894548863172533
validation_accuracy_avg()=0.9125
execution_time=915.479480266571


In [101]:
# Simple check with test dataset
model.eval()
test_data_iterator = data_iterator(X_test_tokenized_indexified, y_test_formatted, len(X_test_tokenized_indexified), len(X_test_tokenized_indexified), shuffle=False)
test_batch, labels_batch, test_sentences = next(test_data_iterator)

seq_lengths = torch.LongTensor(list(map(len, test_batch)))
output_batch = model(test_batch.to(device),seq_lengths)
final_test_accuracy = accuracy(output_batch, labels_batch.to(device))
print(f"{final_test_accuracy=}")

final_test_accuracy=0.93


In [98]:
# display accuracies on development set per epoch
for epoch, accuracy in enumerate(accuracies_across_epochs):
    print(f"Accuracy on Development Set for Epoch {epoch + 1}: {accuracy}")

Accuracy on Development Set for Epoch 1: 0.7916666666666666
Accuracy on Development Set for Epoch 2: 0.86875
Accuracy on Development Set for Epoch 3: 0.8729166666666667
Accuracy on Development Set for Epoch 4: 0.8520833333333333
Accuracy on Development Set for Epoch 5: 0.88125
Accuracy on Development Set for Epoch 6: 0.9
Accuracy on Development Set for Epoch 7: 0.8979166666666667
Accuracy on Development Set for Epoch 8: 0.9166666666666666
Accuracy on Development Set for Epoch 9: 0.9145833333333333
Accuracy on Development Set for Epoch 10: 0.9125
Accuracy on Development Set for Epoch 11: 0.9145833333333333
Accuracy on Development Set for Epoch 12: 0.9125
Accuracy on Development Set for Epoch 13: 0.9166666666666666
Accuracy on Development Set for Epoch 14: 0.9145833333333333
Accuracy on Development Set for Epoch 15: 0.9166666666666666
Accuracy on Development Set for Epoch 16: 0.9166666666666666
Accuracy on Development Set for Epoch 17: 0.9145833333333333
Accuracy on Development Set for E

## Final Test Accuracy

In [None]:
def print_sentence_label(sentence: str) -> int:
    model.eval()
    sentence_tokenized = word_tokenize(sentence.lower())
    sentence_as_id = [
        vocab[token] if token in vocab
        else vocab['UNK']
        for token in sentence_tokenized
    ]
    seq_lengths = torch.LongTensor([len(sentence_as_id)])
    input = torch.tensor(sentence_as_id).unsqueeze(0).to(device)
    output = model(input, seq_lengths).to(device)
    label = np.argmax(output.detach().cpu().numpy())
    print(f"sentence = {sentence}, label = {label}")

# Checking results
print_sentence_label("What is a squirrel?")
print_sentence_label("Is Singapore located in Southeast Asia?")
print_sentence_label("Is Singapore in China?")
print_sentence_label("Name 11 famous martyrs .")
print_sentence_label("What ISPs exist in the Caribbean ?")
print_sentence_label("How many cars are manufactured every day?")