In [1]:
from datasets import load_dataset
from utils.utils import *
from utils.model import *
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment; MODEL_PATH is
# later read with os.getenv when the model is constructed.
load_dotenv()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\qkm20\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\qkm20\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\qkm20\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Part 0: Dataset Preparation 

In [2]:
# Load the "rotten_tomatoes" dataset and bind each of its three splits to a name.
dataset = load_dataset("rotten_tomatoes")
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

## Question 1: Word Embedding

In [3]:
# before preprocessing
# Index the row first, then the column: train_dataset["text"][1] would pull the
# whole "text" column just to read a single example.
print("Sentence:", train_dataset[1]["text"])

Sentence: the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .


In [4]:
# Run process_data over the "text" column of each split, in train/validation/test order.
sentences_train, sentences_validation, sentences_test = (
    process_data(data=split["text"])
    for split in (train_dataset, validation_dataset, test_dataset)
)

In [5]:
# after preprocessing: same example (index 1) as the raw sentence printed earlier
print("Word Tokens:", sentences_train[1])

Word Tokens: ['the', 'gorgeously', 'elaborate', 'continuation', 'of', 'the', 'lord', 'of', 'the', 'rings', 'trilogy', 'is', 'so', 'huge', 'that', 'a', 'column', 'of', 'words', 'can', 'not', 'adequately', 'describe', 'co', 'writer', 'director', 'peter', 'jacksons', 'expanded', 'vision', 'of', 'j', 'r', 'r', 'tolkiens', 'middle', 'earth']


In [6]:
# Initialize the sentiment analysis model
# The processed sentences and the matching labels of each split are passed in
# together. Artefact paths are built with os.path.join so that they resolve on
# POSIX systems as well as on Windows (a raw "data\..." literal only works on
# Windows; on Windows os.path.join yields the very same string).
sentiment_model = SentimentAnalysis(
    sentences_train=sentences_train,
    labels_train=train_dataset["label"],
    sentences_val=sentences_validation,
    labels_val=validation_dataset["label"],
    sentences_test=sentences_test,
    labels_test=test_dataset["label"],
    version="v4",
    embedding_dim=300,
    batch_size=32,
    lr=0.005,
    rnn_type="LSTM",
    early_stopping_patience=10,
    # NOTE(review): os.getenv returns None when MODEL_PATH is not set in the
    # environment/.env -- confirm SentimentAnalysis copes with a None path.
    model_save_path=os.getenv("MODEL_PATH"),
    freeze_embeddings=True,
    word2vec_filepath=os.path.join("data", "fine_tuned_word2vec.model"),
    input_embedding_filepath=os.path.join("data", "input_embedding_matrix.npy"),
    bidirectional=True,
)

Preparing data...
Data preparation complete.


In [7]:
# Train the model
# epochs=100 is an upper bound: the model was configured with
# early_stopping_patience=10, so training can end before the cap is reached.
sentiment_model.train(epochs=100)

Epoch   1/100, Loss: 0.6961, Validation Accuracy: 0.5000
Model saved.
Epoch   2/100, Loss: 0.6940, Validation Accuracy: 0.5169
Model saved.
Epoch   3/100, Loss: 0.6946, Validation Accuracy: 0.5019
Epoch   4/100, Loss: 0.6948, Validation Accuracy: 0.4916
Epoch   5/100, Loss: 0.6936, Validation Accuracy: 0.5169
Epoch   6/100, Loss: 0.6943, Validation Accuracy: 0.5347
Model saved.
Epoch   7/100, Loss: 0.6943, Validation Accuracy: 0.5178
Epoch   8/100, Loss: 0.6923, Validation Accuracy: 0.5216
Epoch   9/100, Loss: 0.6938, Validation Accuracy: 0.5216
Epoch  10/100, Loss: 0.6939, Validation Accuracy: 0.5235
Epoch  11/100, Loss: 0.6938, Validation Accuracy: 0.5131
Epoch  12/100, Loss: 0.6936, Validation Accuracy: 0.5216
Epoch  13/100, Loss: 0.6932, Validation Accuracy: 0.5197
Epoch  14/100, Loss: 0.6938, Validation Accuracy: 0.4934
Epoch  15/100, Loss: 0.6941, Validation Accuracy: 0.5281
Epoch  16/100, Loss: 0.6941, Validation Accuracy: 0.5150
Early stopping triggered after 16 epochs.
Model l

In [8]:
# validate() prints the accuracy itself and also returns it. Binding the return
# value keeps the cell from echoing the same number twice and keeps the score
# available for later comparison.
val_accuracy = sentiment_model.validate()

Validation Accuracy: 0.5347


0.5347091932457786

In [9]:
# Evaluate on the held-out test split. validate() prints the accuracy and also
# returns it, so the result is bound to a name rather than echoed a second time.
test_accuracy = sentiment_model.validate(dataset="test")

Test Accuracy: 0.5300


0.5300187617260788