In [1]:
import os

import torch
from datasets import load_dataset
from dotenv import load_dotenv

from utils.rnn_model import *
from utils.rnn_utils import *

# Read a local .env file (if any) into the process environment; the training
# cell later looks up MODEL_SAVE_PATH through os.getenv.
load_dotenv()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\qkm20\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\qkm20\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\qkm20\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Part 0: Dataset Preparation 

In [2]:
# Fetch the Rotten Tomatoes dataset and pull out its three splits by name.
dataset = load_dataset("rotten_tomatoes")
trn_dataset, val_dataset, tst_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

## Question 1: Word Embedding

First, we define the main hyperparameters: the batch size, the learning rate, and the type of recurrent layer.

In [3]:
# Hyperparameters shared by the cells below.
BATCH_SIZE = 32      # mini-batch size handed to create_dataloader
LR = 0.0001          # learning rate of the Adam optimizer
MODEL_TYPE = "rnn"   # recurrent layer type forwarded to RNNModel and train

# Seed the RNG so that Restart Kernel -> Run All gives repeatable results.
# NOTE(review): this seeds torch's global generator only; it assumes the utils
# helpers (weight initialisation, batch shuffling) draw from it -- confirm.
SEED = 42
torch.manual_seed(SEED);  # trailing ';' hides the returned Generator repr

We load the pre-trained 300-dimensional Google News Word2Vec model.

In [4]:
# load_word2vec is not defined in this notebook (presumably it comes from the
# wildcard utils imports). The returned object is used below through its
# index_to_key and vectors attributes.
word2vec_model = load_word2vec()

We use the vocabulary of the loaded pre-trained model to build a word-to-index lookup, and then use that lookup to prepare the train, validation, and test data.

In [5]:
# Lookup table from each vocabulary token to its position in index_to_key.
word_index = dict(
    (token, position)
    for position, token in enumerate(word2vec_model.index_to_key)
)

In [6]:
# Run the same preparation over the train, validation and test splits (in that
# order); each call yields a (sentences, labels) pair.
(
    (trn_sentences, trn_labels),
    (val_sentences, val_labels),
    (tst_sentences, tst_labels),
) = (
    prepare_data(split["text"], split["label"], word_index=word_index)
    for split in (trn_dataset, val_dataset, tst_dataset)
)

Once the data is processed, we wrap each split in a dataloader for batched training. Only the training split is shuffled.

In [7]:
# One dataloader per split, all with the same batch size; only the training
# split is shuffled.
trn_dataloader, val_dataloader, tst_dataloader = (
    create_dataloader(sentences, labels, BATCH_SIZE, shuffle=reshuffle)
    for sentences, labels, reshuffle in (
        (trn_sentences, trn_labels, True),
        (val_sentences, val_labels, False),
        (tst_sentences, tst_labels, False),
    )
)

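Now that the data has been processed, we initialise the model: a single-layer, unidirectional RNN with a hidden size of 256, initialised with the pre-trained Word2Vec embedding matrix.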

In [8]:
# Build the classifier from the pre-trained Word2Vec embedding matrix.
embedding_matrix = word2vec_model.vectors
model = RNNModel(
    # Take the embedding width from the matrix itself instead of hard-coding
    # 300, so the two cannot drift apart if different vectors are loaded.
    # Assumes the matrix is 2-D, (vocabulary size, embedding dim) -- TODO confirm.
    embedding_dim=embedding_matrix.shape[1],
    hidden_size=256,
    embedding_matrix=embedding_matrix,
    rnn_type=MODEL_TYPE,
    bidirectional=False,
    num_layers=1,
)

Now that the data is loaded into dataloaders and the model is initialised, we can start training! We train for up to 100 epochs with an early-stopping patience of 10 epochs.

In [9]:
# Save location comes from MODEL_SAVE_PATH in the environment, falling back to
# "modelfiles/" when the variable is not set.
checkpoint_dir = os.getenv("MODEL_SAVE_PATH", "modelfiles/")
# Adam over all model parameters, using the learning rate from the config cell.
adam = torch.optim.Adam(model.parameters(), lr=LR)

train(
    model=model,
    trn_dataloader=trn_dataloader,
    val_dataloader=val_dataloader,
    version="1",
    model_type=MODEL_TYPE,
    model_save_path=checkpoint_dir,
    optimizer=adam,
    epochs=100,
    early_stopping_patience=10,
)

Epoch   1/100, Loss: 0.6594, Accuracy: 0.7176
Model saved.
Epoch   2/100, Loss: 0.5340, Accuracy: 0.7326
Model saved.
Epoch   3/100, Loss: 0.5149, Accuracy: 0.7448
Model saved.
Epoch   4/100, Loss: 0.5049, Accuracy: 0.7542
Model saved.
Epoch   5/100, Loss: 0.4973, Accuracy: 0.7542
Epoch   6/100, Loss: 0.4904, Accuracy: 0.7420
Epoch   7/100, Loss: 0.4860, Accuracy: 0.7458
Epoch   8/100, Loss: 0.4783, Accuracy: 0.7477
Epoch   9/100, Loss: 0.4743, Accuracy: 0.7523
Epoch  10/100, Loss: 0.4656, Accuracy: 0.7608
Model saved.
Epoch  11/100, Loss: 0.4526, Accuracy: 0.7570
Epoch  12/100, Loss: 0.4423, Accuracy: 0.7542
Epoch  13/100, Loss: 0.4317, Accuracy: 0.7411
Epoch  14/100, Loss: 0.4190, Accuracy: 0.7542
Epoch  15/100, Loss: 0.4057, Accuracy: 0.7355
Epoch  16/100, Loss: 0.3929, Accuracy: 0.7298
Epoch  17/100, Loss: 0.3861, Accuracy: 0.7223
Epoch  18/100, Loss: 0.3745, Accuracy: 0.7448
Epoch  19/100, Loss: 0.3643, Accuracy: 0.7167
Epoch  20/100, Loss: 0.3543, Accuracy: 0.7289
Early stopping 

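We run the validation check again to make sure we've loaded the right model: the accuracy reported here should match the best validation accuracy logged during training (the last epoch at which the model was saved).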

In [10]:
# Score the in-memory model on the validation dataloader and keep whatever
# validate returns as val_accuracy.
val_accuracy = validate(model, val_dataloader)

Accuracy: 0.7608


Finally, we evaluate the model on the held-out test set to obtain its test accuracy.

In [11]:
# Same evaluation routine as for validation, this time on the test dataloader;
# the result is kept as tst_accuracy.
tst_accuracy = validate(model, tst_dataloader)

Accuracy: 0.7430
