# RNN

*INSTRUCTIONS*
Embedding for RNN-based Models:
    ○ Generate wordembeddings using GloVeorWord2Vec.
    ○ Pad sequences to a fixed length for uniformity

Steps were taken from notebook: Module 3 - Video 6 onwards.ipynb

In [3]:
import pickle
import re
import numpy as np 
import nltk
from nltk.corpus import stopwords
import torch 
from gensim.models import Word2Vec
from torch.nn import functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import confusion_matrix


SEED = 42
np.random.seed(SEED)

# Check the available device
if torch.cuda.is_available():
    device = "cuda"
    torch.cuda.manual_seed_all(SEED)
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using device: {device}")

# Load the datasets 
with open("train.pkl", "rb") as f:
    train = pickle.load(f)
with open("val.pkl", "rb") as f:
    val = pickle.load(f)
with open("test.pkl", "rb") as f:
    test = pickle.load(f)

num_labels = train['label'].nunique()
print("Number of labels: ", num_labels)
labels = [label for i, label in enumerate(train['label'].value_counts().index)]
labels
label_0 = labels[1]
label_4 = labels[0]
print(f"Label 0: {label_0} and label 4: {label_4}")

train.head(2)

Using device: mps
Number of labels:  2
Label 0: 0 and label 4: 4


Unnamed: 0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet,label
237034,0,2058468667,Sat Jun 06 15:00:18 PDT 2009,NO_QUERY,bestthingaround,my star trek bootleg timed out and when i refr...,0
1387008,0,2068651245,Sun Jun 07 14:27:20 PDT 2009,NO_QUERY,Scriblit,yeah but the really pretty ones only go up to ...,0


# Utils 

In [4]:
# ---------------------------------------
# Utils
# ---------------------------------------

# Function to convert text to tokens 
def preprocess_text(text):
    # Check if the text is a string
    if not isinstance(text, str):
        return []
    
    # Keep only letters and whitespaces
    pattern = f"[a-zA-Z\s]"
    text = ''.join(re.findall(pattern, text))
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    return tokens
# Function to convert tokens to Word2Vec embeddings
def text_to_embeddings(text, word2vec_model, seq_length):
    """
    Function to convert a given text into a sequence of embeddings using a pretrained Word2Vec model
    
    """
    embeddings = []
    
    for i, word in enumerate(text):
        if word in word2vec_model.wv:
            if i == seq_length:
                break
            embeddings.append(word2vec_model.wv[word])
        else: # skip word if to in word2vec_model's vocabulary
            continue
        
    # Padding the sequences
    if len(embeddings) < seq_length:
        zero_padding = [np.zeros(word2vec_model.vector_size) \
                        for _ in range(seq_length - len(embeddings))]

        embeddings = embeddings + zero_padding

    # sequence of word vectors of length seq_length
    return embeddings[:seq_length]
# Text -> Embeddings -> torch tokens
def prepare_data(reviews, labels, word2vec_model, seq_length):
    X = [text_to_embeddings(review, word2vec_model, seq_length) for review in reviews]
    X_array = np.array(X)
    X_tensor = torch.tensor(X_array)
    y = torch.tensor(labels, dtype=torch.long)
    return X_tensor, y


  pattern = f"[a-zA-Z\s]"


# Word2Vec Embeddings

In [None]:
# 1) Tokenize the tweet reviwes
train['tokens'] = train['text of the tweet'].apply(preprocess_text)
val['tokens'] = val['text of the tweet'].apply(preprocess_text)
test['tokens'] = test['text of the tweet'].apply(preprocess_text)

seq_length = 100
# 2) Create vocabulary using word2vec
word2vec_model = Word2Vec(sentences=train['text of the tweet'].values.tolist(), 
                          vector_size=seq_length, # same as in Module 3 - Video 6 onwards.ipynb
                           min_count=1, 
                           workers=4)

# Get vocabulary size
vocab_size = len(word2vec_model.wv)
print("Vocab size: ", vocab_size)


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


Vocab size:  27


: 

In [None]:
# Define hyperparameters
input_size = word2vec_model.vector_size
hidden_size = 128
output_size = 1
num_layers = 1
learning_rate = 0.001
num_epochs = 30
batch_size = 64
dropout_rate = 0.5
leaky_relu_slope = 0.1

# Prepare data
X_train, y_train = prepare_data(train['tokens'], train['label'],
                    word2vec_model, seq_length=seq_length)

In [None]:
X_val, y_val = prepare_data(val['tokens'], val['label'],
                    word2vec_model, seq_length=seq_length)

In [None]:
X_test, y_test = prepare_data(test['tokens'], test['label'],
                    word2vec_model, seq_length=seq_length)

In [None]:
# Create DataLoader
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Create DataLoader
val_data = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)


test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)


# Define the RNN model 

In [None]:
class SentimentRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout_rate):
        super(SentimentRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Basic RNN layer
        # shape of input tensor: (batch_size, seq_length, input_size)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate if num_layers > 1 else 0)
        self.leaky_relu = nn.LeakyReLU()  # Leaky ReLU activation layer
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()  # Sigmoid activation layer

    def forward(self, x):
        # Initial hidden state
        # h0 shape: (num_layers, batch_size, hidden_size)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        # RNN output
        # out shape after rnn: (batch_size, seq_length, hidden_size)
        out, _ = self.rnn(x, h0)
        # Apply Leaky ReLU to the outputs of the RNN layer
        # out shape: (batch_size, seq_length, hidden_size)
        out = self.leaky_relu(out)
        out = self.dropout(out)
        # Get the last sequence output for classification
        # out shape after indexing: (batch_size, hidden_size)
        out = out[:, -1, :]
        # Apply the linear layer for the final output
        # out shape after fc: (batch_size, output_size)
        out = self.fc(out)
        # Apply the sigmoid activation
        # out shape after sigmoid: (batch_size, output_size)
        out = self.sigmoid(out)
        # shape of output tensor: (batch_size, output_size)
        return out

# Initialize model, loss function, and optimizer
model = SentimentRNN(input_size, hidden_size, output_size, num_layers, dropout_rate).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Print the number of trainable parameters
num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of trainable parameters: {num_trainable_params}')



# Evaluation Metrics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix

def evaluate_model(loader, label_0, label_4, plot_confusion_matrix=True, title="Confusion Matrix"):
    """
    Efficient single-pass evaluation function that computes all metrics at once.
    
    Args:
        loader: DataLoader containing batches of (inputs, labels)
        label_0: Integer label for negative sentiment (e.g., 0)
        label_4: Integer label for positive sentiment (e.g., 4)
        plot_confusion_matrix: Whether to plot the confusion matrix (default: False)
        title: Title for confusion matrix plot (default: "Confusion Matrix")
    
    Returns:
        Dictionary containing:
        - accuracy: Overall accuracy percentage
        - precision_label_0, recall_label_0, f1_label_0: Metrics for label_0
        - precision_label_4, recall_label_4, f1_label_4: Metrics for label_4
        - confusion_matrix: 2x2 numpy array [TN, FP; FN, TP]
    """
    model.eval()
    
    # Initialize counts for metrics
    tp_0, fp_0, fn_0 = 0, 0, 0  # For label_0 (negative)
    tp_4, fp_4, fn_4 = 0, 0, 0  # For label_4 (positive)
    
    # Collect all predictions and labels for confusion matrix
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in loader:
            # inputs shape: (batch_size, seq_length, input_size)
            # labels shape: (batch_size,)
            
            outputs = model(inputs)
            # outputs shape after model: (batch_size, 1) - sigmoid probabilities
            
            outputs = outputs.squeeze()
            # outputs shape after squeeze: (batch_size,) - 1D tensor of probabilities [0, 1]
            
            # Convert probabilities to binary predictions (0 or 1)
            # binary_predictions shape: (batch_size,) - binary values [0, 1]
            binary_predictions = (outputs > 0.5).long()
            
            # Map binary predictions to actual labels: 0 -> label_0, 1 -> label_4
            # predictions shape: (batch_size,) - contains label_0 or label_4
            predictions = torch.where(binary_predictions == 0, label_0, label_4)
            
            # Store for confusion matrix
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            # Calculate True Positives, False Positives, False Negatives for label_0
            # TP: predicted label_0 AND actual label_0
            tp_0 += ((predictions == label_0) & (labels == label_0)).sum().item()
            # FP: predicted label_0 BUT actual is NOT label_0 (i.e., label_4)
            fp_0 += ((predictions == label_0) & (labels != label_0)).sum().item()
            # FN: predicted NOT label_0 (i.e., label_4) BUT actual is label_0
            fn_0 += ((predictions != label_0) & (labels == label_0)).sum().item()
            
            # Calculate True Positives, False Positives, False Negatives for label_4
            # TP: predicted label_4 AND actual label_4
            tp_4 += ((predictions == label_4) & (labels == label_4)).sum().item()
            # FP: predicted label_4 BUT actual is NOT label_4 (i.e., label_0)
            fp_4 += ((predictions == label_4) & (labels != label_4)).sum().item()
            # FN: predicted NOT label_4 (i.e., label_0) BUT actual is label_4
            fn_4 += ((predictions != label_4) & (labels == label_4)).sum().item()

    # Calculate accuracy
    total = len(all_labels)
    correct = sum(1 for p, l in zip(all_predictions, all_labels) if p == l)
    accuracy = 100 * correct / total if total > 0 else 0.0

    # Calculate precision = TP / (TP + FP) - how many predicted positives were actually positive
    precision_0 = tp_0 / (tp_0 + fp_0) if (tp_0 + fp_0) > 0 else 0.0
    precision_4 = tp_4 / (tp_4 + fp_4) if (tp_4 + fp_4) > 0 else 0.0
    
    # Calculate recall = TP / (TP + FN) - how many actual positives were correctly predicted
    recall_0 = tp_0 / (tp_0 + fn_0) if (tp_0 + fn_0) > 0 else 0.0
    recall_4 = tp_4 / (tp_4 + fn_4) if (tp_4 + fn_4) > 0 else 0.0
    
    # Calculate F1-score = 2 * (precision * recall) / (precision + recall) - harmonic mean
    f1_0 = 2*(precision_0*recall_0) / (precision_0+recall_0) if (precision_0+recall_0) > 0 else 0.0
    f1_4 = 2*(precision_4*recall_4) / (precision_4+recall_4) if (precision_4+recall_4) > 0 else 0.0

    # Create confusion matrix
    cm = confusion_matrix(all_labels, all_predictions, labels=[label_0, label_4])
    
    # Plot confusion matrix if requested
    if plot_confusion_matrix:
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=[f'Negative ({label_0})', f'Positive ({label_4})'],
                    yticklabels=[f'Negative ({label_0})', f'Positive ({label_4})'],
                    cbar_kws={'label': 'Count'})
        plt.title(title, fontsize=14, fontweight='bold', pad=20)
        plt.ylabel('True Label', fontsize=12, fontweight='bold')
        plt.xlabel('Predicted Label', fontsize=12, fontweight='bold')
        plt.tight_layout()
        plt.show()
        
        # Print summary
        tn, fp, fn, tp = cm.ravel()
        print(f"\nConfusion Matrix Summary:")
        print(f"True Negatives (TN):  {tn:6d}  - Correctly predicted {label_0}")
        print(f"False Positives (FP): {fp:6d}  - Predicted {label_4} but actual was {label_0}")
        print(f"False Negatives (FN):  {fn:6d}  - Predicted {label_0} but actual was {label_4}")
        print(f"True Positives (TP):  {tp:6d}  - Correctly predicted {label_4}")
        print(f"Total samples: {tn + fp + fn + tp}")

    return {
        'accuracy': accuracy,
        'precision_label_0': precision_0, 
        'recall_label_0': recall_0,
        'f1_label_0': f1_0,
        'precision_label_4': precision_4, 
        'recall_label_4': recall_4,
        'f1_label_4': f1_4,
        'confusion_matrix': cm
    }

# Train

In [None]:
#Training loop
import matplotlib.pyplot as plt

num_epochs = 30  # Number of epochs
losses = []  # List to store the average train loss per epoch
val_losses = []  # List to store the average validation loss per epoch
best_val_loss = float('inf')  # Initialize the best validation loss to infinity
best_epoch = 0  # Epoch with the best validation loss
patience = 0
max_patience = 3  # Maximum epochs to wait for improvement

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0
    total_val_loss = 0
    count = 0
    val_count = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.squeeze()
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        count += 1
    average_loss = total_loss / count
    losses.append(average_loss)

    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        for inputs, labels in val_loader:
            val_outputs = model(inputs)
            val_outputs = val_outputs.squeeze()
            val_loss = criterion(val_outputs, labels.float())
            total_val_loss += val_loss.item()
            val_count += 1
    average_val_loss = total_val_loss / val_count
    val_losses.append(average_val_loss)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {average_loss:.4f}, Val Loss: {average_val_loss:.4f}')
    
    # Check if the current validation loss is the lowest; if so, save the model
    if average_val_loss < best_val_loss:
        best_val_loss = average_val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), 'rnn_best_model.pth')  # Save the best model
        patience = 0   
    else:
        patience += 1

    if patience >= max_patience:
        print(f'Early stopped at {epoch+1}')
        break  # Stop training

print(f'Lowest Validation Loss: {best_val_loss:.4f} at Epoch {best_epoch + 1}')


In [None]:
# Load the best model and calculate accuracy only for that
model.load_state_dict(torch.load('rnn_best_model.pth'))
# After training, evaluate on validation set (with confusion matrix plot)
val_metrics = evaluate_model(val_loader, label_0, label_4, 
                              plot_confusion_matrix=True, 
                              title="Validation Set Confusion Matrix")

print(f"Validation Accuracy: {val_metrics['accuracy']:.2f}%")
print(f"Validation Metrics: {val_metrics}")

# Evaluate on test set (without plot)
test_metrics = evaluate_model(test_loader, label_0, label_4, 
                              plot_confusion_matrix=False)
print(f"Test Accuracy: {test_metrics['accuracy']:.2f}%")

# Or plot test confusion matrix separately
test_metrics = evaluate_model(test_loader, label_0, label_4, 
                              plot_confusion_matrix=True,
                              title="Test Set Confusion Matrix")