In [1]:
import os
import re
import pandas as pd
import numpy as np
from collections import Counter

import spacy
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch import Tensor
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataPath = os.path.join("Data","track-a.csv")

In [None]:
glovePath = os.path.join("glove.6B", "glove.6B.200d.txt")

In [3]:
dataFrame = pd.read_csv(dataPath)

In [4]:
nlpModel = spacy.load("en_core_web_sm", disable = ["parser", "ner"])

In [5]:
textColumn = "text"
labelColumns = ["anger", "fear", "joy", "sadness", "surprise"]

In [6]:
def cleanerFunction(text: str) -> str:
    tempDoc = nlpModel(text)
    tokens = [
        tok.lemma_.lower()
        for tok in tempDoc
        if not tok.is_stop and not tok.is_punct and tok.lemma_ != "-PRON-"
    ]
    return " ".join(tokens)

In [7]:
dataFrame["Spacy_text"] = dataFrame["text"].astype(str).apply(cleanerFunction)

In [8]:
userDevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
labelColumns = [col for col in ["anger", "fear", "joy", "sadness", "surprise"] if col in dataFrame]
yData = dataFrame[labelColumns].values.astype(np.float32)

In [10]:
xAll = dataFrame["Spacy_text"].tolist()
xTrain, xVal, yTrain, yVal = train_test_split(xAll, yData, test_size = 0.1, random_state = 69)

In [11]:
xTrain = xTrain.copy()
xVal = xVal.copy()

In [12]:
numLabels = yTrain.shape[1]

In [13]:
TOKEN_PATTERN = re.compile(r"\b\w+\b")

In [14]:
def simpleTokenize(text: str) -> list:
    return TOKEN_PATTERN.findall(text.lower())

In [15]:
def vocabularyBuilder(texts: list, vocabsize: int):
    counter = Counter()
    for t in texts:
        tokens = simpleTokenize(t)
        counter.update(tokens)
    
    mostCommon = counter.most_common(vocabsize - 2)
    indextoword = ["<pad>", "<unk>"] + [token for token, _ in mostCommon]
    wordtoindex = {w: i for i, w in enumerate(indextoword)}

    return wordtoindex, indextoword

In [16]:
MAX_VOCAB_SIZE = 20000
MAX_SEQUENCE_LENGTH = 100

wordtoindex, indextoword = vocabularyBuilder(xTrain, vocabsize = MAX_VOCAB_SIZE)
vocabSize = len(indextoword)

padIndex = wordtoindex["<pad>"]
unkIndex = wordtoindex["<unk>"]

In [17]:
def encodePadFunction(texts: list, wordtoindex: dict, sequenceLength: int = MAX_SEQUENCE_LENGTH) -> np.ndarray:
    encodings = []

    for t in texts:
        tokens = simpleTokenize(t)
        tokenIDs = [wordtoindex.get(tok, unkIndex) for tok in tokens]
        if len(tokenIDs) > sequenceLength:
            tokenIDs = tokenIDs[:sequenceLength]
        else:
            tokenIDs = tokenIDs + [padIndex] * (sequenceLength - len(tokenIDs))
        encodings.append(tokenIDs)

    return np.array(encodings, dtype = np.int64)

In [18]:
trainEncode = encodePadFunction(xTrain, wordtoindex, sequenceLength = MAX_SEQUENCE_LENGTH)
valEncode = encodePadFunction(xVal, wordtoindex, sequenceLength = MAX_SEQUENCE_LENGTH)

In [None]:
def loadGloveEmbeddings(glovePath: str, wordtoindex, embdim: int = 200) -> torch.Tensor:
    V = len(wordtoindex)
    emb_matrix = np.random.normal(scale = 0.6, size=(V, embdim))

    with open(glovePath, "r", encoding = "utf8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            token, vec = parts[0], np.array(parts[1: ], dtype = np.float32)
            if token in wordtoindex:
                emb_matrix[wordtoindex[token]] = vec
    return torch.from_numpy(emb_matrix).float()

In [None]:
embeddingMatrix = loadGloveEmbeddings(glovePath, wordtoindex)

In [19]:
class TextDataset(Dataset):
    def __init__(self, encodings: np.ndarray, labels: np.ndarray) -> None:
        self.encodings = torch.from_numpy(encodings)
        self.labels = torch.from_numpy(labels)

    def __len__(self):
        return self.encodings.size(0)
    
    def __getitem__(self, index):
        return self.encodings[index], self.labels[index]

## Recurrent Neural Network - LSTM

In [None]:
class LSTMClassifier(nn.Module):
    def __init__(
        self,
        vocabSize: int,
        embeddingDim: int,
        hiddenDim: int,
        rnnLayers: int,
        bidirectional: bool,
        dropoutRate: float,
        denseUnits: int,
        numLabels: int,
        padIndex: int,
        useAttention: bool = False,
        embeddingMatrix: bool = False
    ) -> None:
        super().__init__()

        if embeddingMatrix is True:
            self.embedding = nn.Embedding.from_pretrained(embeddingMatrix, freeze = False, padding_idx = padIndex)
        else:
            self.embedding = nn.Embedding(vocabSize, embeddingDim, padding_idx = padIndex)

        self.conv1d = nn.Conv1d(
            in_channels = embeddingDim,
            out_channels = embeddingDim,
            kernel_size = 5,
            padding = 2
        )
        self.relu_cnn = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size = 2)

        self.embedding = nn.Embedding(vocabSize, embeddingDim, padding_idx = padIndex)
        self.dropout_emb = nn.Dropout(dropoutRate)
        self.lstm = nn.LSTM(
            input_size = embeddingDim,
            hidden_size = hiddenDim,
            num_layers = rnnLayers,
            bidirectional = bidirectional,
            batch_first = True,
            dropout = dropoutRate if rnnLayers > 1 else 0.0
        )

        self.use_attention = useAttention
        directionFactor = 2 if bidirectional else 1
        if useAttention:
            self.attn_linear = nn.Linear(hiddenDim * directionFactor, hiddenDim * directionFactor)
            self.attn_v = nn.Linear(hiddenDim * directionFactor, 1, bias = False)

        self.fc1 = nn.Linear(hiddenDim * directionFactor, denseUnits)
        self.bn1 = nn.BatchNorm1d(denseUnits)
        self.relu = nn.ReLU()
        self.dropoutFc = nn.Dropout(dropoutRate)
        self.output_layer = nn.Linear(denseUnits, numLabels)

    def forward(self, x: torch.LongTensor) -> Tensor:
        pad_idx = self.embedding.padding_idx
        lengths = (x != pad_idx).sum(dim = 1)
        lengths = torch.clamp(lengths, min = 1)

        embTensor: Tensor = self.embedding(x)  # type: ignore
        embTensor = self.dropout_emb(embTensor)   #type: ignore

        c_in = embTensor.transpose(1, 2)              
        c_out= self.relu_cnn(self.conv1d(c_in))
        c_out= self.pool(c_out)

        rnn_in = c_out.transpose(1, 2)
        lengths = torch.clamp(lengths // 2, min = 1)

        packed = pack_padded_sequence(rnn_in, lengths.cpu(), batch_first = True, enforce_sorted = False)
        rnnOut, _ = self.lstm(packed)
        rnnOut, _ = pad_packed_sequence(rnnOut, batch_first = True)

        if self.use_attention:
            scores = torch.tanh(self.attn_linear(rnnOut))
            weights = torch.softmax(self.attn_v(scores), dim = 1)
            finalFeat = (weights * rnnOut).sum(dim = 1)
        else:
            idx = torch.arange(x.size(0), device = x.device)
            finalFeat = rnnOut[idx, lengths - 1]

        h: Tensor = self.fc1(finalFeat) #type: ignore
        h = self.bn1(h)
        h = self.relu(h)
        h = self.dropoutFc(h)  #type: ignore
        logits: Tensor = self.output_layer(h)   #type: ignore
        return logits

In [21]:
def epochTrain(model, dataloader, optimizer, criterion, device):
    model.train()
    totalLoss = 0.0
    for batchInputs, batchLabels in dataloader:
        batchInputs = batchInputs.to(device)
        batchLabels = batchLabels.to(device)

        optimizer.zero_grad()
        logits = model(batchInputs)
        loss = criterion(logits, batchLabels)
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()

        totalLoss += loss.item() * batchInputs.size(0)
    
    return totalLoss / len(dataloader.dataset)

In [22]:
def evaluateModel(model, dataloader, criterion, device):
    model.eval()
    totalLoss = 0.0
    with torch.no_grad():
        for batchInputs, batchLabels in dataloader:
            batchInputs = batchInputs.to(device)
            batchLabels = batchLabels.to(device)

            logits = model(batchInputs)
            loss = criterion(logits, batchLabels)
            totalLoss += loss.item() * batchInputs.size(0)

    return totalLoss / len(dataloader.dataset)

In [23]:
freq = np.maximum(yTrain.sum(axis = 0) / len(yTrain), 1e-4)
classWeights = 1.0 / freq
classWeights = classWeights / classWeights.sum()
weightTensor = torch.FloatTensor(classWeights).to(userDevice)

In [None]:
def objectiveLSTM(trial):
    embeddingDim  = trial.suggest_categorical("lstm_embeddingDim", [64, 128, 256])
    hiddenDim = trial.suggest_int("lstm_hiddenDim", 32, 256, step = 16)
    rnnLayers = trial.suggest_int("lstm_rnnLayers", 1, 5)
    bidirectional = trial.suggest_categorical("lstm_bidirectional", [False, True])
    dropoutRate = trial.suggest_float("lstm_dropoutRate", 0.2, 0.6, step = 0.1)
    denseUnits = trial.suggest_int("lstm_denseUnits", 32, 256, step = 16)
    learnRate = trial.suggest_float("lstm_learnRate", 1e-4, 1e-2, log = True)
    weightDecay = trial.suggest_float("lstm_weightDecay", 1e-6, 1e-2, log = True)
    batchSize = trial.suggest_categorical("lstm_batchSize", [32, 64, 128, 256])
    epochs = trial.suggest_categorical("lstm_epochs", [3, 5, 7, 9])
    optName = trial.suggest_categorical("lstm_optimizer", ["Adam", "RMSprop", "SGD"])
    useAttention = trial.suggest_categorical("lstm_useAttention", [False, True])
    useScheduler = trial.suggest_categorical("lstm_useScheduler", [False, True])
    embeddingMatrix = trial.suggest_categorical("lstm_embeddingMatrix", [False, True])
    if optName == "SGD":
        sgdvar = trial.suggest_float("lstm_momentum", 0.1, 0.9)

    trainDataset = TextDataset(trainEncode, yTrain)
    valDataset = TextDataset(valEncode, yVal)

    trainLoader = DataLoader(trainDataset, batch_size = batchSize, shuffle = True)
    valLoader = DataLoader(valDataset, batch_size = batchSize, shuffle = True)

    model = LSTMClassifier(vocabSize, 
                           embeddingDim, 
                           hiddenDim, 
                           rnnLayers, 
                           bidirectional, 
                           dropoutRate, 
                           denseUnits, 
                           numLabels, 
                           padIndex, 
                           useAttention,
                           embeddingMatrix).to(userDevice)

    if optName == "Adam":
        optimizer = optim.Adam(model.parameters(), lr = learnRate, weight_decay = weightDecay)
    elif optName == "RMSprop":
        optimizer = optim.RMSprop(model.parameters(), lr = learnRate, weight_decay = weightDecay)
    else:
        optimizer = optim.SGD(model.parameters(), lr = learnRate, momentum = sgdvar) # type: ignore

    criterionOption = nn.BCEWithLogitsLoss(pos_weight = weightTensor)

    bestvalLoss = float("inf")
    counterVar = 0
    bestState = None

    if useScheduler:
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = "min", factor = 0.5, patience = 1)

    for epoch in range(1, epochs + 1):
        trainLoss = epochTrain(model, trainLoader, optimizer, criterionOption, userDevice)
        valLoss = evaluateModel(model, valLoader, criterionOption, userDevice)

        if useScheduler:
            scheduler.step(valLoss) # type: ignore

        trial.report(valLoss, epoch)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        
        if valLoss < bestvalLoss:
            bestvalLoss = valLoss
            counterVar = 0
            bestState = {k: v.cpu() for k, v in model.state_dict().items()}
        else:
            counterVar += 1
            if counterVar >= 2:
                break

    model.load_state_dict(bestState) # type: ignore
    return bestvalLoss

In [25]:
def _objectiveLSTM(trial):
    return objectiveLSTM(trial)

In [26]:
studyLSTM = optuna.create_study(study_name = "studyLSTM", direction = "minimize", sampler = optuna.samplers.TPESampler(seed = 42), storage = "sqlite:///db.sqlite3")

[I 2025-06-11 09:06:12,411] A new study created in RDB with name: studyLSTM


In [27]:
studyLSTM.optimize(_objectiveLSTM, n_trials = 500)

[I 2025-06-11 09:06:16,342] Trial 0 finished with value: 0.18160945349221624 and parameters: {'embeddingDim': 128, 'hiddenDim': 160, 'rnnLayers': 1, 'bidirectional': False, 'dropoutRate': 0.6, 'denseUnits': 176, 'learnRate': 0.0026070247583707684, 'weightDecay': 1.2087541473056965e-06, 'batchSize': 32, 'epochs': 7, 'optimizer': 'RMSprop', 'useScheduler': True}. Best is trial 0 with value: 0.18160945349221624.
[I 2025-06-11 09:06:29,171] Trial 1 finished with value: 0.17850140248179866 and parameters: {'embeddingDim': 128, 'hiddenDim': 144, 'rnnLayers': 3, 'bidirectional': True, 'dropoutRate': 0.2, 'denseUnits': 32, 'learnRate': 0.007902619549708232, 'weightDecay': 0.007286653737491046, 'batchSize': 32, 'epochs': 7, 'optimizer': 'Adam', 'useScheduler': True}. Best is trial 1 with value: 0.17850140248179866.
[I 2025-06-11 09:06:43,115] Trial 2 finished with value: 0.5011674652030752 and parameters: {'embeddingDim': 256, 'hiddenDim': 208, 'rnnLayers': 5, 'bidirectional': False, 'dropoutRa

In [None]:
print("\nOptuna Best Trial:")
best = studyLSTM.best_trial
print(f"Validation Loss: {best.value:.4f}")
for key, val in best.params.items():
    print(f"  {key}: {val}")
print("\n")

best_embeddingDim = best.params["lstm_embeddingDim"]
best_hiddenDim = best.params["lstm_hiddenDim"]
best_rnnLayers = best.params["lstm_rnnLayers"]
best_bidirectional = best.params["lstm_bidirectional"]
best_dropoutRate = best.params["lstm_dropoutRate"]
best_denseUnits = best.params["lstm_denseUnits"]
best_learnRate = best.params["lstm_learnRate"]
best_weightDecay = best.params["lstm_weightDecay"]
best_batchSize = best.params["lstm_batchSize"]
best_epochs = best.params["lstm_epochs"]
best_optName = best.params["lstm_optimizer"]
best_use_scheduler = best.params["lstm_useScheduler"]
best_embeddingMatrix = best.params["lstm_embeddingMatrix"]
best_useAttention = best.params["lstm_useAttention"]
if best_optName == "SGD":
    best_momentum = best.params["lstm_momentum"]


Optuna Best Trial:
Validation Loss: 0.1750
  embeddingDim: 256
  hiddenDim: 128
  rnnLayers: 1
  bidirectional: False
  dropoutRate: 0.30000000000000004
  denseUnits: 96
  learnRate: 0.001545870105633364
  weightDecay: 5.6985313463300785e-05
  batchSize: 32
  epochs: 3
  optimizer: RMSprop
  useScheduler: True




In [29]:
finalTrainDataset = TextDataset(trainEncode, yTrain)
finalValDataset = TextDataset(valEncode, yVal)

finalTrainLoader = DataLoader(finalTrainDataset, batch_size = best_batchSize, shuffle = True)
finalValLoader = DataLoader(finalValDataset, batch_size = best_batchSize, shuffle = False)

In [None]:
finalModelLSTM = LSTMClassifier(
    vocabSize = vocabSize,
    embeddingDim = best_embeddingDim,
    hiddenDim = best_hiddenDim,
    rnnLayers = best_rnnLayers,
    bidirectional = best_bidirectional,
    dropoutRate = best_dropoutRate,
    denseUnits = best_denseUnits,
    numLabels = numLabels,
    padIndex = padIndex,
    useAttention = best_useAttention,
    embeddingMatrix = best_embeddingMatrix
).to(userDevice)

if best_optName == "Adam":
    finalOptimizer = optim.Adam(finalModelLSTM.parameters(), lr = best_learnRate, weight_decay = best_weightDecay)
elif best_optName == "RMSprop":
    finalOptimizer = optim.RMSprop(finalModelLSTM.parameters(), lr = best_learnRate, weight_decay = best_weightDecay)
else:
    finalOptimizer = optim.SGD(finalModelLSTM.parameters(), lr = best_learnRate, momentum = best_momentum, weight_decay = best_weightDecay)

finalCriterion = nn.BCEWithLogitsLoss(pos_weight = weightTensor)

if best_use_scheduler:
    finalScheduler = optim.lr_scheduler.ReduceLROnPlateau(finalOptimizer, mode = "min", factor = 0.5, patience = 1)

In [31]:
bestValLossFinal = float("inf")
patienceCtr = 0
bestStateFinal = None

In [32]:
for epoch in range(1, best_epochs + 1):
    trainLoss = epochTrain(finalModelLSTM, finalTrainLoader, finalOptimizer, finalCriterion, userDevice)
    valLoss = evaluateModel(finalModelLSTM, finalValLoader, finalCriterion, userDevice)
    print(f"Final Epoch {epoch:02d} | Train Loss: {trainLoss:.4f} | Val Loss: {valLoss:.4f}")

    if best_use_scheduler:
        finalScheduler.step(valLoss)

    if valLoss < bestValLossFinal:
        bestValLossFinal = valLoss
        patienceCtr = 0
        bestStateFinal = {k: v.cpu() for k, v in finalModelLSTM.state_dict().items()}
    else:
        patienceCtr += 1
        if patienceCtr >= 3:
            print(f"Early stopping at epoch {epoch}")
            break

Final Epoch 01 | Train Loss: 0.1983 | Val Loss: 0.1791
Final Epoch 02 | Train Loss: 0.1725 | Val Loss: 0.1776
Final Epoch 03 | Train Loss: 0.1613 | Val Loss: 0.1779


In [33]:
finalModelLSTM.load_state_dict(bestStateFinal)

<All keys matched successfully>

In [34]:
finalModelLSTM.eval()
runningLoss = 0.0
correct = 0
total = 0
with torch.no_grad():
    for batchInputs, batchLabels in finalValLoader:
        batchInputs = batchInputs.to(userDevice)
        batchLabels = batchLabels.to(userDevice)

        logits = finalModelLSTM(batchInputs)
        loss = finalCriterion(logits, batchLabels)
        runningLoss += loss.item() * batchInputs.size(0)

        preds = (torch.sigmoid(logits) >= 0.5).float()
        correct += (preds == batchLabels).sum().item()
        total += batchLabels.numel()

finalValLoss = runningLoss / len(finalValDataset)
finalValAcc = correct / total
print(f"\nFinal Model -> Val Loss: {finalValLoss:.4f}, Val Accuracy: {finalValAcc:.4f}")


Final Model -> Val Loss: 0.1779, Val Accuracy: 0.6780


In [None]:
os.makedirs("savedModel", exist_ok = True)
torch.save({
    "model_state_dict": finalModelLSTM.state_dict(),
    "hyperparameters": best.params,
    "wordtoindex": wordtoindex,
    "indextoword": indextoword,
    "padIndex": padIndex,
    "max_sequence_length": MAX_SEQUENCE_LENGTH
}, "savedModel/rnnLSTM.pth")
print("Saved final model + vocab to savedModel/rnnLSTM.pth")

Saved final model + vocab to savedModel/rnnLSTM.pth


## Recurrent Neural Network - GRU


In [None]:
class GRUClassifier(nn.Module):
    def __init__(
        self,
        vocabSize: int,
        embeddingDim: int,
        hiddenDim: int,
        rnnLayers: int,
        bidirectional: bool,
        dropoutRate: float,
        denseUnits: int,
        numLabels: int,
        padIndex: int,
        useAttention: bool = False,
        embeddingMatrix: bool = False
    ):
        super().__init__()

        if embeddingMatrix is True:
            self.embedding = nn.Embedding.from_pretrained(embeddingMatrix, freeze = False, padding_idx = padIndex)
        else:
            self.embedding = nn.Embedding(vocabSize, embeddingDim, padding_idx = padIndex)

        self.conv1d = nn.Conv1d(
            in_channels = embeddingDim,
            out_channels = embeddingDim,
            kernel_size = 5,
            padding = 2
        )
        self.relu_cnn = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size = 2)

        self.embedding: nn.Embedding = nn.Embedding(vocabSize, embeddingDim, padding_idx = padIndex)
        self.dropout_emb: nn.Dropout = nn.Dropout(dropoutRate)
        self.gru: nn.GRU = nn.GRU(
            input_size = embeddingDim,
            hidden_size = hiddenDim,
            num_layers = rnnLayers,
            bidirectional = bidirectional,
            batch_first = True,
            dropout = dropoutRate if rnnLayers > 1 else 0.0
        )

        self.use_attention: bool = useAttention
        factor = 2 if bidirectional else 1
        if useAttention:
            self.attn_linear: nn.Linear = nn.Linear(hiddenDim * factor, hiddenDim * factor)
            self.attn_v: nn.Linear = nn.Linear(hiddenDim * factor, 1, bias = False)

        self.fc1: nn.Linear = nn.Linear(hiddenDim * factor, denseUnits)
        self.bn1: nn.BatchNorm1d = nn.BatchNorm1d(denseUnits)
        self.relu: nn.ReLU = nn.ReLU()
        self.dropout_fc: nn.Dropout = nn.Dropout(dropoutRate)
        self.output_layer: nn.Linear = nn.Linear(denseUnits, numLabels)

    def forward(self, x: torch.LongTensor) -> Tensor:
        lengths = (x != self.embedding.padding_idx).sum(dim = 1)
        lengths = torch.clamp(lengths, min = 1)

        emb = self.dropout_emb(self.embedding(x))

        c_in = emb.transpose(1, 2)
        c_out= self.relu_cnn(self.conv1d(c_in))
        c_out= self.pool(c_out)

        rnn_in = c_out.transpose(1, 2)
        lengths = torch.clamp(lengths // 2, min = 1)

        packed = pack_padded_sequence(rnn_in, lengths.cpu(), batch_first = True, enforce_sorted = False)
        rnnOut, _ = self.gru(packed)
        rnnOut, _ = pad_packed_sequence(rnnOut, batch_first = True)

        if self.use_attention:
            scores = torch.tanh(self.attn_linear(rnnOut))
            weights = torch.softmax(self.attn_v(scores), dim = 1)
            finalFeat = (weights * rnnOut).sum(dim = 1)
        else:
            idx = torch.arange(x.size(0), device = x.device)
            finalFeat = rnnOut[idx, lengths - 1]

        h: Tensor = self.fc1(finalFeat)
        h = self.bn1(h)
        h = self.relu(h)
        h = self.dropout_fc(h)
        logits: Tensor = self.output_layer(h)
        return logits

In [None]:
def objectiveGRU(trial):
    embeddingDim = trial.suggest_categorical("gru_embeddingDim", [64, 128, 256, 512])
    hiddenDim = trial.suggest_int("gru_hiddenDim", 32, 256,step = 16)
    rnnLayers = trial.suggest_int("gru_layers", 1, 9)
    bidirectional = trial.suggest_categorical("gru_bidirectional", [False, True])
    dropoutRate = trial.suggest_float("gru_dropoutRate", 0.2, 0.6, step = 0.1)
    denseUnits = trial.suggest_int("gru_denseUnits", 32, 256, step = 16)
    learnRate = trial.suggest_float("gru_learnRate", 1e-4, 1e-2, log = True)
    weightDecay = trial.suggest_float("gru_weightDecay", 1e-6, 1e-2, log = True)
    batchSize = trial.suggest_categorical("gru_batchSize", [32, 64, 128, 256])
    epochs = trial.suggest_categorical("gru_epochs", [3, 5, 7, 9])
    optimizerName = trial.suggest_categorical("gru_optimizer", ["AdamW", "Adam", "RMSprop", "SGD"])
    useScheduler = trial.suggest_categorical("gru_useScheduler", [False, True])
    useAttention = trial.suggest_categorical("gru_attn", [False, True])
    embeddingMatrix = trial.suggest_categorical("gru_embeddingMatrix", [False, True])
    if optimizerName == "SGD":
        momentum = trial.suggest_float("gru_momentum", 0.1, 0.9)

    trainDataset = TextDataset(trainEncode, yTrain)
    valDataset = TextDataset(valEncode, yVal)

    trainLoader = DataLoader(trainDataset, batch_size = batchSize, shuffle = True)
    valLoader = DataLoader(valDataset, batch_size = batchSize, shuffle = True)

    model = GRUClassifier(vocabSize,
        embeddingDim,
        hiddenDim,
        rnnLayers,
        bidirectional,
        dropoutRate,
        denseUnits,
        numLabels,
        padIndex,
        useAttention, 
        embeddingMatrix
    ).to(userDevice)

    if optimizerName == "AdamW":
        optimizer = optim.AdamW(model.parameters(), lr = learnRate, weight_decay = weightDecay)
    elif optimizerName == "Adam":
        optimizer = optim.Adam(model.parameters(),lr = learnRate, weight_decay = weightDecay)
    elif optimizerName == "RMSprop":
        optimizer = optim.RMSprop(model.parameters(), lr = learnRate, weight_decay = weightDecay)
    else:
        optimizer = optim.SGD(model.parameters(),lr = learnRate, momentum = momentum, weight_decay = weightDecay)   #type: ignore

    criterionOption = nn.BCEWithLogitsLoss(pos_weight = weightTensor)
    
    if useScheduler:
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = "min", factor = 0.5, patience = 1)


    bestValLoss = float("inf")
    counterVar = 0
    bestState = None

    for epoch in range(1, epochs + 1):
        trainLoss = epochTrain(model, trainLoader, optimizer, criterionOption, userDevice)
        valLoss = evaluateModel(model, valLoader, criterionOption, userDevice)

        if useScheduler:
            scheduler.step(valLoss)  # type: ignore

        trial.report(valLoss, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        if valLoss < bestValLoss: #type: ignore
            bestValLoss = valLoss
            counterVar = 0
            bestState = {k: v.cpu() for k, v in model.state_dict().items()}
        else:
            counterVar += 1
            if counterVar >= 2:
                break

    model.load_state_dict(bestState)  # type: ignore
    return bestValLoss

In [38]:
def _objectiveGRU(trial):
    return objectiveGRU(trial)

In [39]:
studyGRU = optuna.create_study(study_name = "studyGRU", direction = "minimize", sampler = optuna.samplers.TPESampler(seed = 42), storage = "sqlite:///db.sqlite3")

[I 2025-06-11 09:22:17,933] A new study created in RDB with name: studyGRU


In [40]:
studyGRU.optimize(_objectiveGRU, n_trials = 500)

[I 2025-06-11 09:22:25,202] Trial 0 finished with value: 0.2159154898745919 and parameters: {'gru_embeddingDim': 128, 'gru_hiddenDim': 64, 'gru_layers': 2, 'gru_bidirectional': True, 'gru_dropoutRate': 0.5, 'gru_denseUnits': 192, 'gru_learnRate': 0.00010994335574766199, 'gru_weightDecay': 0.007579479953348004, 'gru_batchSize': 32, 'gru_epochs': 5, 'gru_optimizer': 'AdamW', 'gru_useScheduler': True, 'gru_attn': True}. Best is trial 0 with value: 0.2159154898745919.
[I 2025-06-11 09:22:29,717] Trial 1 finished with value: 0.1788128519746801 and parameters: {'gru_embeddingDim': 256, 'gru_hiddenDim': 32, 'gru_layers': 9, 'gru_bidirectional': False, 'gru_dropoutRate': 0.30000000000000004, 'gru_denseUnits': 48, 'gru_learnRate': 0.0023359635026261607, 'gru_weightDecay': 5.762487216478604e-05, 'gru_batchSize': 256, 'gru_epochs': 5, 'gru_optimizer': 'RMSprop', 'gru_useScheduler': False, 'gru_attn': True}. Best is trial 1 with value: 0.1788128519746801.
[I 2025-06-11 09:22:37,492] Trial 2 finish

In [None]:
print("\nOptuna Best Trial:")
best = studyGRU.best_trial
print(f"Validation Loss: {best.value:.4f}")
for key, val in best.params.items():
    print(f"  {key}: {val}")
print("\n")

best_embeddingDim = best.params["gru_embeddingDim"]
best_hiddenDim = best.params["gru_hiddenDim"]
best_rnnLayers = best.params["gru_layers"]
best_bidirectional = best.params["gru_bidirectional"]
best_dropoutRate = best.params["gru_dropoutRate"]
best_denseUnits = best.params["gru_denseUnits"]
best_learnRate = best.params["gru_learnRate"]
best_weightDecay = best.params["gru_weightDecay"]
best_batchSize = best.params["gru_batchSize"]
best_epochs = best.params["gru_epochs"]
best_optName = best.params["gru_optimizer"]
best_use_scheduler = best.params["gru_useScheduler"]
best_useAttention = best.params["gru_attn"]
best_embeddingMatrix = best.params["gru_embeddingMatrix"]
if best_optName == "SGD":
    best_momentum = best.params["gru_momentum"]


Optuna Best Trial:
Validation Loss: 0.1769
  gru_embeddingDim: 256
  gru_hiddenDim: 256
  gru_layers: 5
  gru_bidirectional: False
  gru_dropoutRate: 0.4
  gru_denseUnits: 144
  gru_learnRate: 0.0032181710082539124
  gru_weightDecay: 1.9015527413063226e-06
  gru_batchSize: 32
  gru_epochs: 5
  gru_optimizer: AdamW
  gru_useScheduler: False
  gru_attn: False




In [42]:
finalTrainDataset = TextDataset(trainEncode, yTrain)
finalValDataset = TextDataset(valEncode, yVal)

finalTrainLoader = DataLoader(finalTrainDataset, batch_size = best_batchSize, shuffle = True)
finalValLoader = DataLoader(finalValDataset, batch_size = best_batchSize, shuffle = False)

In [43]:
finalModelGRU = GRUClassifier(vocabSize,
        best_embeddingDim,
        best_hiddenDim,
        best_rnnLayers,
        best_bidirectional,
        best_dropoutRate,
        best_denseUnits,
        numLabels,
        padIndex,
        best_useAttention
    ).to(userDevice)

if best_optName == "AdamW":
    finalOptimizer = optim.AdamW(finalModelGRU.parameters(), lr = best_learnRate, weight_decay = best_weightDecay)
elif best_optName == "Adam":
    finalOptimizer = optim.Adam(finalModelGRU.parameters(),lr = best_learnRate, weight_decay = best_weightDecay)
elif best_optName == "RMSprop":
    finalOptimizer = optim.RMSprop(finalModelGRU.parameters(), lr = best_learnRate, weight_decay = best_weightDecay)
else:
    finalOptimizer = optim.SGD(finalModelGRU.parameters(),lr = best_learnRate, momentum = best_momentum, weight_decay = best_weightDecay)   #type: ignore

finalCriterion = nn.BCEWithLogitsLoss(pos_weight = weightTensor)

if best_use_scheduler:
    finalScheduler = optim.lr_scheduler.ReduceLROnPlateau(finalOptimizer, mode = "min", factor = 0.5, patience = 1)

In [44]:
bestValLossFinal = float("inf")
patienceCtr = 0
bestStateFinal = None

In [45]:
for epoch in range(1, best_epochs + 1):
    trainLoss = epochTrain(finalModelGRU, finalTrainLoader, finalOptimizer, finalCriterion, userDevice)
    valLoss = evaluateModel(finalModelGRU, finalValLoader, finalCriterion, userDevice)
    print(f"Final Epoch {epoch:02d} | Train Loss: {trainLoss:.4f} | Val Loss: {valLoss:.4f}")

    if best_use_scheduler:
        finalScheduler.step(valLoss)

    if valLoss < bestValLossFinal:
        bestValLossFinal = valLoss
        patienceCtr = 0
        bestStateFinal = {k: v.cpu() for k, v in finalModelGRU.state_dict().items()}
    else:
        patienceCtr += 1
        if patienceCtr >= 3:
            print(f"Early stopping at epoch {epoch}")
            break

Final Epoch 01 | Train Loss: 0.2103 | Val Loss: 0.1841
Final Epoch 02 | Train Loss: 0.1866 | Val Loss: 0.1847
Final Epoch 03 | Train Loss: 0.1854 | Val Loss: 0.1787
Final Epoch 04 | Train Loss: 0.1846 | Val Loss: 0.1817
Final Epoch 05 | Train Loss: 0.1835 | Val Loss: 0.1806


In [46]:
finalModelGRU.load_state_dict(bestStateFinal)

<All keys matched successfully>

In [47]:
finalModelGRU.eval()
runningLoss = 0.0
correct = 0
total = 0
with torch.no_grad():
    for batchInputs, batchLabels in finalValLoader:
        batchInputs = batchInputs.to(userDevice)
        batchLabels = batchLabels.to(userDevice)

        logits = finalModelGRU(batchInputs)
        loss = finalCriterion(logits, batchLabels)
        runningLoss += loss.item() * batchInputs.size(0)

        preds = (torch.sigmoid(logits) >= 0.5).float()
        correct += (preds == batchLabels).sum().item()
        total += batchLabels.numel()

finalValLoss = runningLoss / len(finalValDataset)
finalValAcc = correct / total
print(f"\nFinal Model -> Val Loss: {finalValLoss:.4f}, Val Accuracy: {finalValAcc:.4f}")


Final Model -> Val Loss: 0.1806, Val Accuracy: 0.6780


In [None]:
os.makedirs("savedModel", exist_ok = True)
torch.save({
    "model_state_dict": finalModelGRU.state_dict(),
    "hyperparameters": best.params,
    "wordtoindex": wordtoindex,
    "indextoword": indextoword,
    "padIndex": padIndex,
    "max_sequence_length": MAX_SEQUENCE_LENGTH
}, "savedModel/rnnGRU.pth")
print("Saved final model + vocab to savedModel/rnnGRU.pth")

Saved final model + vocab to savedModel/rnnGRU.pth
