# RNN

*INSTRUCTIONS*
Embedding for RNN-based Models:
    ○ Generate word embeddings using GloVe or Word2Vec.
    ○ Pad sequences to a fixed length for uniformity

Steps were taken from notebook: Module 3 - Video 6 onwards.ipynb

In [4]:
import pickle
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
import torch
from gensim.models import Word2Vec
from torch.nn import functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import confusion_matrix


SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the compute backend: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
    torch.cuda.manual_seed_all(SEED)
else:
    device = "mps" if torch.backends.mps.is_available() else "cpu"

print(f"Using device: {device}")


def _read_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load the three dataset splits
train = _read_pickle("data/train.pkl")
val = _read_pickle("data/val.pkl")
test = _read_pickle("data/test.pkl")

# The raw labels are {0, 4}; the model works with contiguous ids {0, 1}.
# The raw value is preserved in "label_original" for reporting/debugging.
LABEL_MAP = {0: 0, 4: 1}

for split_name, frame in (("train", train), ("val", val), ("test", test)):
    frame["label_original"] = frame["label"]
    remapped = frame["label_original"].map(LABEL_MAP)

    # Any label missing from LABEL_MAP maps to NaN; fail loudly in that case.
    unmapped_rows = remapped.isna()
    if unmapped_rows.any():
        bad = sorted(frame.loc[unmapped_rows, "label_original"].unique().tolist())
        raise ValueError(f"Unexpected labels in {split_name}: {bad}")

    frame["label"] = remapped.astype(int)

num_labels = train["label"].nunique()
print("Number of labels (label): ", num_labels)

# Class ids after remapping
label_0 = 0  # Negative (raw label 0)
label_4 = 1  # Positive (raw label 4)
print(f"Label 0: {label_0} and label 4: {label_4}")

train.head(2)

Using device: mps
Number of labels (label):  2
Label 0: 0 and label 4: 1


Unnamed: 0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet,label,label_original
237034,0,2058468667,Sat Jun 06 15:00:18 PDT 2009,NO_QUERY,bestthingaround,my star trek bootleg timed out and when i refr...,0,0
1387008,0,2068651245,Sun Jun 07 14:27:20 PDT 2009,NO_QUERY,Scriblit,yeah but the really pretty ones only go up to ...,0,0


# Utils 

In [2]:
# ---------------------------------------
# Utils
# ---------------------------------------

# Function to convert text to tokens
def preprocess_text(text):
    """Turn a raw tweet into a list of lowercase word tokens.

    Non-string input yields an empty list. Otherwise every run of characters
    that is neither an ASCII letter nor whitespace is replaced by a space,
    whitespace is collapsed and trimmed, the text is lowercased, and the
    result is split with ``nltk.word_tokenize``.
    """
    if isinstance(text, str):
        # Keep only letters and whitespace
        letters_only = re.sub(r"[^a-zA-Z\s]+", " ", text)
        # Collapse whitespace runs left behind by the substitution above
        normalized = re.sub(r"\s+", " ", letters_only).strip()
        return nltk.word_tokenize(normalized.lower())

    return []

# Function to convert tokens to Word2Vec embeddings
def text_to_embeddings(text, word2vec_model, seq_length):
    """Convert a token list into exactly ``seq_length`` word vectors.

    Tokens are looked up in ``word2vec_model.wv`` in order; tokens that are
    not in the vocabulary are skipped. Collection stops once ``seq_length``
    vectors have been gathered, so skipped (out-of-vocabulary) tokens do not
    use up the length budget. If fewer than ``seq_length`` vectors are found,
    the sequence is right-padded with float32 zero vectors of size
    ``word2vec_model.vector_size``.

    Args:
        text: Iterable of string tokens.
        word2vec_model: Object exposing ``wv`` (supports ``in`` and ``[]``)
            and ``vector_size``.
        seq_length: Number of vectors to return.

    Returns:
        A list of ``seq_length`` 1-D arrays.
    """
    vectors = word2vec_model.wv
    embeddings = []

    for word in text:
        # Truncate on the number of vectors collected, not on token position.
        if len(embeddings) >= seq_length:
            break
        if word in vectors:
            embeddings.append(vectors[word])
        # else: skip OOV tokens

    # Right-pad with zero vectors up to seq_length
    missing = seq_length - len(embeddings)
    if missing > 0:
        embeddings.extend(
            np.zeros(word2vec_model.vector_size, dtype=np.float32)
            for _ in range(missing)
        )

    return embeddings

# Text -> Embeddings -> torch tensors
def prepare_data(reviews, labels, word2vec_model, seq_length):
    """Embed tokenized reviews and return ``(X, y)`` as torch tensors.

    Args:
        reviews: Collection of token lists (e.g. a list or pandas Series).
        labels: Integer class ids, one per review.
        word2vec_model: Model passed through to ``text_to_embeddings``; its
            ``vector_size`` gives the feature dimension.
        seq_length: Number of time steps per review.

    Returns:
        ``X`` as a float32 tensor of shape
        ``(len(reviews), seq_length, vector_size)`` and ``y`` as an int64
        tensor built from ``labels``.
    """
    if not hasattr(reviews, "__len__"):
        # Generators and other unsized iterables: materialize to learn the size.
        reviews = list(reviews)

    # Fill one preallocated buffer instead of building a list of lists and
    # then copying it twice (np.array(...) followed by torch.tensor(...)).
    # This also gives the right 3-D shape when there are no reviews.
    X_array = np.zeros(
        (len(reviews), seq_length, word2vec_model.vector_size),
        dtype=np.float32,
    )
    for row, review in enumerate(reviews):
        X_array[row] = text_to_embeddings(review, word2vec_model, seq_length)

    # from_numpy shares the buffer with X_array, so no further copy is made.
    X_tensor = torch.from_numpy(X_array)

    # Convert through NumPy so the result does not depend on how the labels
    # container is indexed (e.g. a pandas Series with a non-default index).
    y = torch.tensor(np.asarray(labels), dtype=torch.long)
    return X_tensor, y


# Word2Vec Embeddings

In [3]:
# 1) Tokenize the tweet text of every split
train['tokens'] = train['text of the tweet'].apply(preprocess_text)
val['tokens'] = val['text of the tweet'].apply(preprocess_text)
test['tokens'] = test['text of the tweet'].apply(preprocess_text)

# Number of time steps each tweet is padded/truncated to
seq_length = 100
# Size of each word vector (100, as in Module 3 - Video 6 onwards.ipynb).
# Kept separate from seq_length so changing the sequence length does not
# silently change the embedding dimension as well.
embedding_dim = 100

# 2) Create vocabulary using word2vec, trained on the training tokens only
word2vec_model = Word2Vec(
    sentences=train['tokens'].tolist(),
    vector_size=embedding_dim,
    min_count=1,
    workers=4,
)

# Get vocabulary size
vocab_size = len(word2vec_model.wv)
print("Vocab size: ", vocab_size)


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


Vocab size:  256064


In [4]:
# --- Model architecture ---
input_size = word2vec_model.vector_size  # features per time step = embedding size
hidden_size = 128
num_layers = 1
output_size = 1  # single logit for binary classification
dropout_rate = 0.5
leaky_relu_slope = 0.1

# --- Optimization ---
learning_rate = 1e-3
batch_size = 64
num_epochs = 30

# Embed the training split
X_train, y_train = prepare_data(
    reviews=train['tokens'],
    labels=train['label'],
    word2vec_model=word2vec_model,
    seq_length=seq_length,
)

: 

In [None]:
# Embed the validation split with the same model and sequence length
X_val, y_val = prepare_data(
    reviews=val['tokens'],
    labels=val['label'],
    word2vec_model=word2vec_model,
    seq_length=seq_length,
)

In [None]:
# Embed the test split with the same model and sequence length
X_test, y_test = prepare_data(
    reviews=test['tokens'],
    labels=test['label'],
    word2vec_model=word2vec_model,
    seq_length=seq_length,
)

In [None]:
# Training loader: reshuffled every epoch
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Validation / test loaders: fixed order. Shuffling gives no benefit for
# evaluation, and it changes which samples land in the final (smaller) batch
# on every pass, which adds noise to per-batch averaged metrics.
val_data = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)


# Define the RNN model 

In [None]:
class SentimentRNN(nn.Module):
    """Vanilla-RNN binary classifier over sequences of word vectors.

    The network runs ``nn.RNN`` over the input, takes the RNN output at the
    last *real* (non-padding) time step of each sequence, applies LeakyReLU
    and dropout, and maps the result to ``output_size`` logits.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the RNN hidden state.
        output_size: Number of logits produced per sequence.
        num_layers: Number of stacked RNN layers.
        dropout_rate: Dropout probability before the final linear layer; also
            used between RNN layers when ``num_layers > 1``.
        leaky_relu_slope: Negative slope of the LeakyReLU activation.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout_rate, leaky_relu_slope=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Basic RNN layer
        # shape of input tensor: (batch_size, seq_length, input_size)
        self.rnn = nn.RNN(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            # nn.RNN only applies dropout between stacked layers
            dropout=dropout_rate if num_layers > 1 else 0,
        )
        self.leaky_relu = nn.LeakyReLU(negative_slope=leaky_relu_slope)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths=None):
        """Return logits of shape ``(batch_size, output_size)``.

        Args:
            x: Tensor of shape ``(batch_size, seq_length, input_size)``.
            lengths: Optional number of real time steps per sequence. When
                omitted, it is inferred from ``x``: a time step whose feature
                vector is all zeros is treated as padding, and the length is
                the position of the last non-zero step.
        """
        batch_size, seq_len = x.size(0), x.size(1)

        # out shape: (batch_size, seq_length, hidden_size).
        # The initial hidden state defaults to zeros when not supplied.
        out, _ = self.rnn(x)

        if lengths is None:
            # 1-based position of the last non-zero time step (0 if none).
            is_real = (x != 0).any(dim=2)
            positions = torch.arange(1, seq_len + 1, device=x.device, dtype=x.dtype)
            lengths = (is_real.to(x.dtype) * positions).amax(dim=1).long()
        else:
            lengths = torch.as_tensor(lengths, device=x.device, dtype=torch.long)

        # Read the state after the last real token instead of after the
        # trailing padding. All-padding sequences fall back to step 0.
        last_step = (lengths - 1).clamp(min=0, max=seq_len - 1)
        batch_index = torch.arange(batch_size, device=x.device)
        out = out[batch_index, last_step]  # (batch_size, hidden_size)

        # Activation and dropout are applied only to the step that is used.
        out = self.leaky_relu(out)
        out = self.dropout(out)

        # Logits (no sigmoid here; intended for use with BCEWithLogitsLoss)
        logits = self.fc(out)
        return logits

# Build the network and move it to the selected device
model = SentimentRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    dropout_rate=dropout_rate,
    leaky_relu_slope=leaky_relu_slope,
)
model = model.to(device)

# The model emits raw logits, so the loss applies the sigmoid itself
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Count the parameters that will receive gradient updates
num_trainable_params = 0
for param in model.parameters():
    if param.requires_grad:
        num_trainable_params += param.numel()
print(f"Number of trainable parameters: {num_trainable_params}")



# Evaluation Metrics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix

def evaluate_model(loader, label_0, label_4, plot_confusion_matrix=True, title="Confusion Matrix"):
    """Evaluate the module-level ``model`` on ``loader`` in a single pass.

    The model's logits are passed through a sigmoid and thresholded at 0.5 to
    obtain 0/1 predictions. Reads the module-level ``model`` and ``device``.

    Args:
        loader: Iterable of ``(inputs, labels)`` batches.
        label_0: Class id reported under the ``*_label_0`` keys.
        label_4: Class id reported under the ``*_label_4`` keys.
        plot_confusion_matrix: When true, draw a heatmap of the confusion
            matrix and print its four cells.
        title: Title of the heatmap.

    Returns:
        Dict with ``accuracy`` (percent), per-class precision / recall / F1,
        and the ``confusion_matrix``.
    """

    def _ratio(numerator, denominator):
        # Division that yields 0.0 instead of failing on an empty denominator
        return numerator / denominator if denominator > 0 else 0.0

    def _f1(precision, recall):
        return _ratio(2 * (precision * recall), precision + recall)

    model.eval()

    class_ids = (label_0, label_4)
    # One [tp, fp, fn] tally per entry of class_ids
    tallies = [[0, 0, 0] for _ in class_ids]

    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)

            probs = torch.sigmoid(model(inputs).squeeze(-1))
            predictions = (probs > 0.5).long()  # 0/1

            all_predictions.extend(predictions.detach().cpu().numpy())
            all_labels.extend(labels.detach().cpu().numpy())

            for tally, class_id in zip(tallies, class_ids):
                predicted_as_class = predictions == class_id
                truly_class = labels == class_id
                tally[0] += (predicted_as_class & truly_class).sum().item()
                tally[1] += (predicted_as_class & ~truly_class).sum().item()
                tally[2] += (~predicted_as_class & truly_class).sum().item()

    (tp_0, fp_0, fn_0), (tp_4, fp_4, fn_4) = tallies

    total = len(all_labels)
    correct = sum(int(pred == true) for pred, true in zip(all_predictions, all_labels))
    accuracy = 100 * correct / total if total > 0 else 0.0

    precision_0 = _ratio(tp_0, tp_0 + fp_0)
    precision_4 = _ratio(tp_4, tp_4 + fp_4)

    recall_0 = _ratio(tp_0, tp_0 + fn_0)
    recall_4 = _ratio(tp_4, tp_4 + fn_4)

    f1_0 = _f1(precision_0, recall_0)
    f1_4 = _f1(precision_4, recall_4)

    cm = confusion_matrix(all_labels, all_predictions, labels=[label_0, label_4])

    if plot_confusion_matrix:
        tick_names = ["Negative (0)", "Positive (1)"]

        plt.figure(figsize=(8, 6))
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=tick_names,
            yticklabels=tick_names,
            cbar_kws={"label": "Count"},
        )
        plt.title(title, fontsize=14, fontweight="bold", pad=20)
        plt.ylabel("True Label", fontsize=12, fontweight="bold")
        plt.xlabel("Predicted Label", fontsize=12, fontweight="bold")
        plt.tight_layout()
        plt.show()

        # Rows are true labels, columns are predictions, in [label_0, label_4] order
        tn, fp, fn, tp = cm.ravel()
        summary_lines = [
            "\nConfusion Matrix Summary:",
            f"True Negatives (TN):  {tn:6d}  - Correctly predicted 0",
            f"False Positives (FP): {fp:6d}  - Predicted 1 but actual was 0",
            f"False Negatives (FN):  {fn:6d}  - Predicted 0 but actual was 1",
            f"True Positives (TP):  {tp:6d}  - Correctly predicted 1",
            f"Total samples: {tn + fp + fn + tp}",
        ]
        for line in summary_lines:
            print(line)

    metrics = {
        "accuracy": accuracy,
        "precision_label_0": precision_0,
        "recall_label_0": recall_0,
        "f1_label_0": f1_0,
        "precision_label_4": precision_4,
        "recall_label_4": recall_4,
        "f1_label_4": f1_4,
        "confusion_matrix": cm,
    }
    return metrics


# Train

In [None]:
# Training loop
import matplotlib.pyplot as plt

num_epochs = 30  # Upper bound on epochs; early stopping may end training sooner
losses = []  # mean training loss per epoch
val_losses = []  # mean validation loss per epoch

best_val_loss = float("inf")
best_epoch = 0
patience = 0  # consecutive epochs without a new best validation loss
max_patience = 3  # stop once `patience` reaches this value

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0.0
    count = 0  # number of training samples seen this epoch

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Labels are already remapped to {0,1}; BCE expects float targets
        targets = labels.float()

        optimizer.zero_grad()
        logits = model(inputs).squeeze(-1)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        # `loss` is a per-batch mean (assuming the criterion's default "mean"
        # reduction). Weight it by the batch size so the epoch value is a true
        # per-sample mean and the smaller final batch is not over-weighted.
        batch_samples = targets.size(0)
        total_loss += loss.item() * batch_samples
        count += batch_samples

    average_loss = total_loss / max(count, 1)
    losses.append(average_loss)

    # ---- Validation pass ----
    model.eval()
    total_val_loss = 0.0
    val_count = 0  # number of validation samples seen this epoch

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            targets = labels.float()
            logits = model(inputs).squeeze(-1)
            val_loss = criterion(logits, targets)

            # Same per-sample weighting as for the training loss; this makes
            # the value independent of how samples are grouped into batches.
            batch_samples = targets.size(0)
            total_val_loss += val_loss.item() * batch_samples
            val_count += batch_samples

    average_val_loss = total_val_loss / max(val_count, 1)
    val_losses.append(average_val_loss)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {average_loss:.4f}, Val Loss: {average_val_loss:.4f}")

    # Save a checkpoint whenever validation loss improves; otherwise count
    # the epoch towards early stopping.
    if average_val_loss < best_val_loss:
        best_val_loss = average_val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), "rnn_best_model.pth")
        patience = 0
    else:
        patience += 1

    if patience >= max_patience:
        print(f"Early stopped at {epoch+1}")
        break

print(f"Lowest Validation Loss: {best_val_loss:.4f} at Epoch {best_epoch + 1}")


In [None]:
# Restore the checkpoint saved at the lowest validation loss and evaluate it
model.load_state_dict(torch.load("rnn_best_model.pth", map_location=device))
model.to(device)

# Validation set: metrics plus confusion matrix plot
val_metrics = evaluate_model(
    val_loader,
    label_0,
    label_4,
    plot_confusion_matrix=True,
    title="Validation Set Confusion Matrix",
)

print(f"Validation Accuracy: {val_metrics['accuracy']:.2f}%")
print(f"Validation Metrics: {val_metrics}")

# Test set: one pass produces both the metrics and the confusion matrix plot,
# so the test data is not run through the model twice.
test_metrics = evaluate_model(
    test_loader,
    label_0,
    label_4,
    plot_confusion_matrix=True,
    title="Test Set Confusion Matrix",
)
print(f"Test Accuracy: {test_metrics['accuracy']:.2f}%")
