In [1]:
from datasets import load_dataset
from utils.rnn_model import *
from utils.rnn_utils import *
from dotenv import load_dotenv
import os

load_dotenv()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\qkm20\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\qkm20\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\qkm20\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Part 0: Dataset Preparation 

In [2]:
dataset = load_dataset("rotten_tomatoes")
trn_dataset = dataset["train"]
val_dataset = dataset["validation"]
tst_dataset = dataset["test"]

## Question 1: Word Embedding

First, we define all the hyperparameters

In [3]:
BATCH_SIZE = 32
LR = 0.001
MODEL_TYPE = "rnn"

We load the pre-trained Google News 300 dimension Word2Vec model

In [4]:
word2vec_model = load_word2vec()

We make use of the loaded pretrained model to prepare the data

In [5]:
word_index = {
    word: i for i, word in enumerate(
        word2vec_model.index_to_key
    )
}

In [6]:
trn_sentences, trn_labels = prepare_data(
    trn_dataset["text"],
    trn_dataset["label"],
    word_index=word_index
)
val_sentences, val_labels = prepare_data(
    val_dataset["text"],
    val_dataset["label"],
    word_index=word_index
)
tst_sentences, tst_labels = prepare_data(
    tst_dataset["text"],
    tst_dataset["label"],
    word_index=word_index
)

Once the data is processed, we create dataloaders for the data for batch training

In [7]:
trn_dataloader = create_dataloader(
    trn_sentences,
    trn_labels,
    BATCH_SIZE,
    shuffle=True)
val_dataloader = create_dataloader(
    val_sentences,
    val_labels,
    BATCH_SIZE,
    shuffle=False)
tst_dataloader = create_dataloader(
    tst_sentences,
    tst_labels,
    BATCH_SIZE,
    shuffle=False)

Now that the data has been processed, we need to initialise the model

In [8]:
model = RNNModel(
    embedding_dim=300,
    hidden_size=128,
    embedding_matrix=word2vec_model.vectors,
    rnn_type=MODEL_TYPE,
    bidirectional=False,
    num_layers=1,
)

Now that all the data is loaded and processed into Dataloaders, we can start training!

In [9]:
train(
    model=model,
    trn_dataloader=trn_dataloader,
    val_dataloader=val_dataloader,
    version="1",
    model_type=MODEL_TYPE,
    model_save_path=os.getenv("MODEL_SAVE_PATH"),
    optimizer=torch.optim.Adam(model.parameters(), lr=LR),
    epochs=100,
    early_stopping_patience=20,
)

Epoch   1/100, Loss: 0.6909, Accuracy: 0.5722
Model saved.
Epoch   2/100, Loss: 0.6104, Accuracy: 0.6886
Model saved.
Epoch   3/100, Loss: 0.5456, Accuracy: 0.7261
Model saved.
Epoch   4/100, Loss: 0.5356, Accuracy: 0.7458
Model saved.
Epoch   5/100, Loss: 0.5132, Accuracy: 0.7289
Epoch   6/100, Loss: 0.5036, Accuracy: 0.7448
Epoch   7/100, Loss: 0.4965, Accuracy: 0.7242
Epoch   8/100, Loss: 0.4766, Accuracy: 0.7420
Epoch   9/100, Loss: 0.4647, Accuracy: 0.7411
Epoch  10/100, Loss: 0.4556, Accuracy: 0.7392
Epoch  11/100, Loss: 0.4407, Accuracy: 0.7083
Epoch  12/100, Loss: 0.4343, Accuracy: 0.7486
Model saved.
Epoch  13/100, Loss: 0.4154, Accuracy: 0.7195
Epoch  14/100, Loss: 0.4165, Accuracy: 0.7158
Epoch  15/100, Loss: 0.4031, Accuracy: 0.7270
Epoch  16/100, Loss: 0.3973, Accuracy: 0.7092
Epoch  17/100, Loss: 0.3799, Accuracy: 0.7280
Epoch  18/100, Loss: 0.3691, Accuracy: 0.6979
Epoch  19/100, Loss: 0.3598, Accuracy: 0.7298
Epoch  20/100, Loss: 0.3559, Accuracy: 0.7092
Epoch  21/100, 

We run the validation check again to make sure we've loaded the right model

In [10]:
val_accuracy = validate(model, val_dataloader)

Accuracy: 0.7486


Test the model on the test set to obtain the accuracy

In [11]:
tst_accuracy = validate(model, tst_dataloader)

Accuracy: 0.7242
