# RNN

*INSTRUCTIONS*
Embedding for RNN-based Models:
    ○ Generate word embeddings using GloVe or Word2Vec.
    ○ Pad sequences to a fixed length for uniformity

Steps were taken from notebook: Module 3 - Video 6 onwards.ipynb

In [4]:
import pickle
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
import torch
from gensim.models import Word2Vec
from torch.nn import functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import confusion_matrix


# Seed every random number generator used in this notebook.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Choose the compute device: CUDA if present, then Apple MPS, otherwise CPU.
device = "cpu"
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)  # seed the CUDA generators as well
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"

print(f"Using device: {device}")

# Load the datasets
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at files you produced yourself or otherwise trust.
def _load_split(path):
    """Unpickle and return the object stored at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


train = _load_split("data/train.pkl")
val = _load_split("data/val.pkl")
test = _load_split("data/test.pkl")

# Collapse the raw label codes {0, 4} onto contiguous class ids {0, 1} for
# modeling. The untouched codes are preserved in "label_original" so they
# remain available for reporting/debugging.
LABEL_MAP = {0: 0, 4: 1}

for df_name, df in (("train", train), ("val", val), ("test", test)):
    df["label_original"] = df["label"]
    mapped = df["label_original"].map(LABEL_MAP)

    # Any code missing from LABEL_MAP shows up as NaN after .map(); fail loudly.
    unmapped_rows = mapped.isna()
    if unmapped_rows.any():
        bad = sorted(df.loc[unmapped_rows, "label_original"].unique().tolist())
        raise ValueError(f"Unexpected labels in {df_name}: {bad}")

    df["label"] = mapped.astype(int)

num_labels = train["label"].nunique()
print("Number of labels (label): ", num_labels)

# Class ids after remapping: original 0 -> 0 (negative), original 4 -> 1 (positive).
label_0, label_4 = 0, 1
print(f"Label 0: {label_0} and label 4: {label_4}")

train.head(2)

Using device: mps
Number of labels (label):  2
Label 0: 0 and label 4: 1


Unnamed: 0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet,label,label_original
237034,0,2058468667,Sat Jun 06 15:00:18 PDT 2009,NO_QUERY,bestthingaround,my star trek bootleg timed out and when i refr...,0,0
1387008,0,2068651245,Sun Jun 07 14:27:20 PDT 2009,NO_QUERY,Scriblit,yeah but the really pretty ones only go up to ...,0,0


# Utils 

In [2]:
# ---------------------------------------
# Utils
# ---------------------------------------

# Function to convert text to tokens
def preprocess_text(text):
    """Turn raw text into a list of lowercase, letters-only tokens.

    Anything that is not a `str` yields an empty token list.
    """
    if isinstance(text, str):
        # Replace each run of characters that are neither ASCII letters nor
        # whitespace with a single space ...
        letters_only = re.sub(r"[^a-zA-Z\s]+", " ", text)
        # ... then squeeze whitespace runs and trim both ends.
        squeezed = re.sub(r"\s+", " ", letters_only).strip()
        return nltk.word_tokenize(squeezed.lower())

    return []

# Function to convert tokens to Word2Vec embeddings
def text_to_embeddings(text, word2vec_model, seq_length):
    """Convert a token list into exactly `seq_length` word vectors.

    Tokens found in `word2vec_model.wv` are replaced by their vectors, in
    order. Out-of-vocabulary tokens are skipped and do NOT use up one of the
    `seq_length` slots, so a sequence is only truncated once `seq_length`
    known tokens have been collected. Shorter sequences are padded at the end
    with all-zero float32 vectors.

    Args:
        text: Iterable of string tokens.
        word2vec_model: Object exposing `.wv` (supports `in` and item lookup)
            and `.vector_size`.
        seq_length: Number of vectors to return.

    Returns:
        A list of `seq_length` vectors: embeddings first, zero padding last.
    """
    vectors = word2vec_model.wv
    embeddings = []

    for word in text:
        # Stop as soon as the sequence is full; count embeddings, not tokens,
        # so skipped OOV tokens cannot crowd out later in-vocabulary ones.
        if len(embeddings) >= seq_length:
            break
        if word in vectors:
            embeddings.append(vectors[word])

    # Right-pad with zero vectors up to the fixed length.
    missing = seq_length - len(embeddings)
    embeddings.extend(
        np.zeros(word2vec_model.vector_size, dtype=np.float32)
        for _ in range(missing)
    )

    return embeddings

# Text -> Embeddings -> torch tensors
def prepare_data(reviews, labels, word2vec_model, seq_length):
    """Build model-ready tensors from tokenised reviews and their labels.

    Args:
        reviews: Iterable of token lists, one per example.
        labels: Integer class ids aligned with `reviews` (list, NumPy array
            or pandas Series).
        word2vec_model: Model passed through to `text_to_embeddings`; its
            `.vector_size` fixes the last dimension of the features.
        seq_length: Number of timesteps per example.

    Returns:
        Tuple `(X_tensor, y)`: a float32 tensor of shape
        `(n_examples, seq_length, vector_size)` and a long tensor holding
        the labels.

    Note:
        The dense feature array takes
        `n_examples * seq_length * vector_size * 4` bytes, which becomes very
        large for big datasets.
    """
    reviews = list(reviews)

    # Fill one preallocated array row by row instead of building a nested
    # Python list and copying it into an array and then into a tensor.
    features = np.zeros(
        (len(reviews), seq_length, word2vec_model.vector_size),
        dtype=np.float32,
    )
    if seq_length > 0:
        for row, review in enumerate(reviews):
            features[row] = text_to_embeddings(review, word2vec_model, seq_length)

    # from_numpy shares the buffer of the local array, so no extra copy is made.
    X_tensor = torch.from_numpy(features)

    # Convert labels positionally via NumPy first; this avoids handing torch a
    # pandas Series whose index may not start at 0.
    y = torch.tensor(np.asarray(labels), dtype=torch.long)
    return X_tensor, y


# Word2Vec Embeddings

In [3]:
# 1) Tokenize the tweets of every split
train['tokens'] = train['text of the tweet'].apply(preprocess_text)
val['tokens'] = val['text of the tweet'].apply(preprocess_text)
test['tokens'] = test['text of the tweet'].apply(preprocess_text)

# Number of timesteps each tweet is padded/truncated to.
seq_length = 100
# Dimensionality of each word vector. Kept at 100 (the value previously taken
# from seq_length, as in Module 3 - Video 6 onwards.ipynb), but now a separate
# setting so changing the sequence length no longer changes the embedding size.
embedding_dim = 100

# 2) Create vocabulary using word2vec (trained on the training split only)
word2vec_model = Word2Vec(
    sentences=train['tokens'].tolist(),
    vector_size=embedding_dim,
    min_count=1,  # keep every token that occurs at least once
    workers=4,
)

# Get vocabulary size
vocab_size = len(word2vec_model.wv)
print("Vocab size: ", vocab_size)


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


Vocab size:  256064


In [None]:
# Define hyperparameters
# -- model architecture
input_size = word2vec_model.vector_size  # one word vector per timestep
hidden_size = 128
num_layers = 1
output_size = 2
dropout_rate = 0.5
leaky_relu_slope = 0.1
# -- optimisation
learning_rate = 0.001
batch_size = 64
num_epochs = 30

# Prepare data: embed the training tweets and collect their labels
X_train, y_train = prepare_data(
    reviews=train['tokens'],
    labels=train['label'],
    word2vec_model=word2vec_model,
    seq_length=seq_length,
)

: 

In [None]:
# Embed the validation tweets with the same Word2Vec model and sequence length.
X_val, y_val = prepare_data(
    reviews=val['tokens'],
    labels=val['label'],
    word2vec_model=word2vec_model,
    seq_length=seq_length,
)

In [None]:
# Embed the test tweets with the same Word2Vec model and sequence length.
X_test, y_test = prepare_data(
    reviews=test['tokens'],
    labels=test['label'],
    word2vec_model=word2vec_model,
    seq_length=seq_length,
)

In [None]:
# Pair each split's features with its labels ...
train_data = TensorDataset(X_train, y_train)
val_data = TensorDataset(X_val, y_val)
test_data = TensorDataset(X_test, y_test)

# ... and serve them in mini-batches. Only the training loader shuffles;
# validation and test are iterated in a fixed order.
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)


# Define the RNN model 

In [None]:
class SentimentRNN(nn.Module):
    """Vanilla RNN classifier over sequences of pre-computed word vectors.

    The recurrent layer reads the whole sequence; the hidden output at the
    last non-padding timestep of each sequence is passed through LeakyReLU,
    dropout and a linear layer to produce class logits.

    Args:
        input_size: Size of each input vector (embedding dimension).
        hidden_size: Size of the RNN hidden state.
        output_size: Number of classes (logits per example).
        num_layers: Number of stacked RNN layers.
        dropout_rate: Dropout probability before the linear layer; also used
            between RNN layers when `num_layers > 1`.
        leaky_relu_slope: Negative slope of the LeakyReLU activation.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout_rate, leaky_relu_slope=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Basic RNN layer
        # shape of input tensor: (batch_size, seq_length, input_size)
        self.rnn = nn.RNN(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            # nn.RNN only applies dropout between stacked layers.
            dropout=dropout_rate if num_layers > 1 else 0,
        )
        self.leaky_relu = nn.LeakyReLU(negative_slope=leaky_relu_slope)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits of shape (batch_size, output_size).

        Args:
            x: Tensor of shape (batch_size, seq_length, input_size). Padding
               timesteps are expected to be all-zero vectors placed after the
               real tokens, which is what `text_to_embeddings` produces.
        """
        # Initial hidden state
        # h0 shape: (num_layers, batch_size, hidden_size)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device)

        # out shape after rnn: (batch_size, seq_length, hidden_size)
        out, _ = self.rnn(x, h0)

        # Sequences are zero-padded at the end, so reading out[:, -1, :] would
        # classify from the state reached after a run of padding steps.
        # Count the non-zero timesteps per sequence and read the output at the
        # last real token instead. Sequences that are entirely padding fall
        # back to timestep 0.
        lengths = (x != 0).any(dim=-1).sum(dim=1)
        last_step = (lengths - 1).clamp(min=0)
        batch_index = torch.arange(x.size(0), device=x.device)

        # out shape after indexing: (batch_size, hidden_size)
        out = out[batch_index, last_step]

        # Activation and dropout are only needed on the selected timestep.
        out = self.leaky_relu(out)
        out = self.dropout(out)

        logits = self.fc(out)
        return logits

# Build the classifier from the hyperparameters and move it to the chosen device
model = SentimentRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    dropout_rate=dropout_rate,
    leaky_relu_slope=leaky_relu_slope,
)
model = model.to(device)

# Cross-entropy over the class logits, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Report how many parameters will receive gradient updates
num_trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f"Number of trainable parameters: {num_trainable_params}")



# Train

In [None]:
# Training loop with early stopping on the validation loss
import matplotlib.pyplot as plt
import time

num_epochs = 30  # Number of epochs
losses = []      # mean training loss per epoch
val_losses = []  # mean validation loss per epoch

best_val_loss = float("inf")
best_epoch = 0
patience = 0      # consecutive epochs without validation improvement
max_patience = 3  # stop once `patience` reaches this value

# Start timing training
start_time = time.time()

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0.0
    count = 0  # number of training samples seen this epoch

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        # CrossEntropyLoss expects long integer class indices (0 or 1)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(inputs)  # Shape: (batch_size, 2)
        loss = criterion(logits, labels)  # labels shape: (batch_size) with class indices
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the epoch average.
        batch_samples = labels.size(0)
        total_loss += loss.item() * batch_samples
        count += batch_samples

    average_loss = total_loss / max(count, 1)
    losses.append(average_loss)

    # ---- validation pass (no gradient tracking, dropout disabled) ----
    model.eval()
    total_val_loss = 0.0
    val_count = 0  # number of validation samples seen this epoch

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            logits = model(inputs)  # Shape: (batch_size, 2)
            val_loss = criterion(logits, labels)  # labels shape: (batch_size) with class indices

            batch_samples = labels.size(0)
            total_val_loss += val_loss.item() * batch_samples
            val_count += batch_samples

    average_val_loss = total_val_loss / max(val_count, 1)
    val_losses.append(average_val_loss)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {average_loss:.4f}, Val Loss: {average_val_loss:.4f}")

    # Save best checkpoint
    if average_val_loss < best_val_loss:
        best_val_loss = average_val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), "rnn_best_model.pth")
        patience = 0
    else:
        patience += 1

    if patience >= max_patience:
        print(f"Early stopped at {epoch+1}")
        break

# Calculate total training time
training_time = time.time() - start_time
print(f"Lowest Validation Loss: {best_val_loss:.4f} at Epoch {best_epoch + 1}")
print(f"Total Training Time: {training_time:.2f} seconds ({training_time/60:.2f} minutes)")


# Evaluate 

## Evaluation Functions - Accuracy, Label Precision, Recall, F1, and Confusion Matrix

In [None]:
from metrics import evaluate_model

## Validation Set

In [None]:
# Restore the checkpoint saved at the lowest validation loss and evaluate only that model
best_state = torch.load("rnn_best_model.pth", map_location=device)
model.load_state_dict(best_state)
model.to(device)

# Evaluate on the validation set and plot its confusion matrix
val_metrics = evaluate_model(
    model,
    device,
    val_loader,
    label_0,
    label_4,
    plot_confusion_matrix=True,
    title="Validation Set Confusion Matrix",
    training_time_seconds=training_time,
    model_name="rnn",
    dataset_split="val",
    save_results=True,
)

print(f"Validation Accuracy: {val_metrics['accuracy']:.2f}%")
print(f"Validation Metrics: {val_metrics}")



## Test Set 

In [None]:
# Keyword arguments shared by both test-set evaluations below
_test_eval_kwargs = dict(
    training_time_seconds=training_time,
    model_name="rnn",
    dataset_split="test",
    save_results=True,
)

# Evaluate on test set (without plot)
test_metrics = evaluate_model(
    model,
    device,
    test_loader,
    label_0,
    label_4,
    plot_confusion_matrix=False,
    **_test_eval_kwargs,
)
print(f"Test Accuracy: {test_metrics['accuracy']:.2f}%")

# Or plot test confusion matrix separately
# NOTE(review): this runs the test evaluation a second time, again with
# save_results=True - confirm whether evaluate_model then stores the results
# twice, and whether a single call with the plot enabled would be enough.
test_metrics = evaluate_model(
    model,
    device,
    test_loader,
    label_0,
    label_4,
    plot_confusion_matrix=True,
    title="Test Set Confusion Matrix",
    **_test_eval_kwargs,
)
