# RNN

*INSTRUCTIONS*
Embedding for RNN-based Models:
    ○ Generate word embeddings using GloVe or Word2Vec.
    ○ Pad sequences to a fixed length for uniformity.

Steps were taken from notebook: Module 3 - Video 6 onwards.ipynb

In [1]:
import pickle
import os
import re
import numpy as np
import gc # garbage collection
import nltk
from nltk.corpus import stopwords
import torch
from gensim.models import Word2Vec
from torch.nn import functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import confusion_matrix


SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the compute device: CUDA first, then Apple MPS, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
    torch.cuda.manual_seed_all(SEED)  # seed the CUDA generators as well
elif torch.backends.mps.is_available():
    device = "mps"

print(f"Using device: {device}")


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE: unpickling can execute arbitrary code - only use files you created yourself.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


# Load the datasets
og_train = _load_pickle("data/train.pkl")
val = _load_pickle("data/val.pkl")
test = _load_pickle("data/test.pkl")

# The raw labels are {0, 4}; the model works with contiguous class ids {0, 1}.
# The raw value stays available in "label_original" for reporting/debugging.
LABEL_MAP = {0: 0, 4: 1}

for df_name, df in (("train", og_train), ("val", val), ("test", test)):
    df["label_original"] = df["label"]
    mapped = df["label_original"].map(LABEL_MAP)

    # Anything missing from LABEL_MAP comes back as NaN: fail loudly and list the offenders.
    unmapped = mapped.isna()
    if unmapped.any():
        bad = sorted(df.loc[unmapped, "label_original"].unique().tolist())
        raise ValueError(f"Unexpected labels in {df_name}: {bad}")

    df["label"] = mapped.astype(int)

num_labels = og_train["label"].nunique()
print("Number of labels (label): ", num_labels)

# Class ids after remapping (the variable names refer to the raw label values)
label_0 = 0  # Negative
label_4 = 1  # Positive
print(f"Label 0: {label_0} and label 4: {label_4}")

og_train.head(2)

Using device: cuda
Number of labels (label):  2
Label 0: 0 and label 4: 1


Unnamed: 0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet,label,label_original
237034,0,2058468667,Sat Jun 06 15:00:18 PDT 2009,NO_QUERY,bestthingaround,my star trek bootleg timed out and when i refr...,0,0
1387008,0,2068651245,Sun Jun 07 14:27:20 PDT 2009,NO_QUERY,Scriblit,yeah but the really pretty ones only go up to ...,0,0


In [2]:
print(og_train.shape, val.shape, test.shape)
# Due to memory issues only a random slice of the training data is used.
# The slice is written to data/train_slice.pkl and read back as `train`.
TRAIN_SLICE_SIZE = 1_000_000

# DataFrame.sample raises when asked for more rows than exist (without replacement),
# so cap the request at the number of available rows.
train_slice = og_train.sample(n=min(TRAIN_SLICE_SIZE, len(og_train)), random_state=SEED)
with open("data/train_slice.pkl", "wb") as f:
    pickle.dump(train_slice, f)

with open("data/train_slice.pkl", "rb") as f:
    train = pickle.load(f)

print(f"loaded train_slice: {train.shape}")

(1151993, 8) (287999, 8) (160000, 8)
loaded train_slice: (1000000, 8)


# Utils 

In [3]:
# Check the number of CPU cores available (shell commands; assumes a Linux host with lscpu)

# quick view of CPUs:
    # !nproc
# more detailed view:
    # !lscpu

# Logical CPUs (hardware threads), not physical cores
    # lscpu | grep "^CPU(s):"

# Physical cores per socket (multiply by the socket count for the machine total)
!lscpu | grep "Core(s) per socket"

Core(s) per socket:                   4


In [4]:
# ---------------------------------------
# Utils - OD
# ---------------------------------------
# def has_repeated_chars(word, n):
#     return any(all(word[i+j] == word[i] for j in range(n)) for i in range(len(word) - n + 1))


# # Function to convert text to tokens
# def preprocess_text(text):
#     # Check if the text is a string
#     if not isinstance(text, str):
#         return []

#     # remove words with more than 3 repeated characters
#     # Keep only letters and whitespaces (avoid regex escape warnings)
#     text = re.sub(r"[^a-zA-Z\s]+", " ", text)
#     text = re.sub(r"\s+", " ", text).strip()

#     # Convert to lowercase
#     text = text.lower()

#     # Remove words with more than 3 same characters in a row
#     text = " ".join([word for word in text.split() if not(has_repeated_chars(word,3))])

#     # Tokenize the text
#     tokens = nltk.word_tokenize(text)

#     return tokens

# # Function to convert tokens to Word2Vec embeddings
# def text_to_embeddings(text, word2vec_model, seq_length):
#     """
#     Function to convert a given token list into a sequence of embeddings using a pretrained Word2Vec model.
#     Optimized version: pre-allocates numpy array (padding handled automatically with zeros).
#     Uses float16 to reduce memory usage.
#     """
#     vector_size = word2vec_model.vector_size
#     # Pre-allocate array filled with zeros - this automatically handles padding!
#     # If text is shorter than seq_length, remaining positions stay as zeros (padding)
#     embeddings = np.zeros((seq_length, vector_size), dtype=np.float16)
    
#     idx = 0
#     for word in text[:seq_length]:  # Only process up to seq_length
#         if word in word2vec_model.wv:
#             # Convert to float16 on assignment (in case underlying vectors are float32)
#             embeddings[idx] = word2vec_model.wv[word].astype(np.float16)
#             idx += 1
        
#         # if idx < seq_length, remaining positions are already zeros (padding)
    
#     return embeddings

# # Text -> Embeddings -> torch tensors
# def prepare_data(reviews, labels, word2vec_model, seq_length):
#     """
#     Optimized version: pre-allocates output array and processes in batch.
#     Uses float16 for X_array and resulting torch tensor.
#     """
#     num_samples = len(reviews)
#     vector_size = word2vec_model.vector_size
    
#     # Pre-allocate output array using float16 (saves memory)
#     X_array = np.zeros((num_samples, seq_length, vector_size), dtype=np.float16)
    
#     # Fill array directly (faster than building list)
#     for i, review in enumerate(reviews):
#         X_array[i] = text_to_embeddings(review, word2vec_model, seq_length)
    
#     # Convert to tensors (from_numpy is faster and shares memory)
#     X_tensor = torch.from_numpy(X_array).to(torch.float16)
    
#     # Convert labels to numpy array first (handles pandas Series)
#     if hasattr(labels, 'values'):
#         labels = labels.values
#     y = torch.from_numpy(np.array(labels))
    
#     return X_tensor, y


In [5]:
# ---------------------------------------
# Utils 
# ---------------------------------------

# Parallel processing - need multiple CPU cores
from multiprocessing import Pool
from functools import partial

def has_repeated_chars(word, n):
    """Return True when `word` contains at least `n` identical items in a row.

    For n <= 0 the answer is always True (an empty run trivially qualifies),
    and a word shorter than n can never qualify.
    """
    if n <= 0:
        return True

    streak = 0
    for pos in range(len(word)):
        # Extend the current run if this item repeats the previous one, else start a new run.
        if pos > 0 and word[pos] == word[pos - 1]:
            streak += 1
        else:
            streak = 1
        if streak >= n:
            return True
    return False

# Function to convert text to tokens
def preprocess_text(text):
    """Clean a raw tweet and return its list of tokens.

    Steps: replace everything except ASCII letters/whitespace with spaces, collapse
    whitespace, lowercase, drop words containing 3 or more identical characters in a
    row, then tokenize with nltk.word_tokenize. Non-string input yields [].
    """
    if not isinstance(text, str):
        return []

    # Keep only letters and whitespace, then squeeze runs of whitespace to single spaces
    letters_only = re.sub(r"[^a-zA-Z\s]+", " ", text)
    normalized = re.sub(r"\s+", " ", letters_only).strip().lower()

    # Drop words with a run of 3+ identical characters (e.g. "sooo")
    kept_words = [word for word in normalized.split() if not has_repeated_chars(word, 3)]

    return nltk.word_tokenize(" ".join(kept_words))

# Optimized: Pre-build embedding matrix and vocabulary set for faster lookups
def build_embedding_lookup(word2vec_model):
    """
    Materialize lookup structures from a trained Word2Vec model.

    Returns a tuple (vocab_set, embedding_matrix, word_to_idx):
      - vocab_set: set of all words known to the model
      - embedding_matrix: float16 array of shape (vocab_size, vector_size)
      - word_to_idx: dict mapping each word to its row in embedding_matrix
    Rows follow the iteration order of `word2vec_model.wv.key_to_index`.
    """
    keyed_vectors = word2vec_model.wv
    words = list(keyed_vectors.key_to_index.keys())

    vocab_set = set(words)
    word_to_idx = dict(zip(words, range(len(words))))

    # One row per vocabulary word, stored in half precision
    embedding_matrix = np.zeros((len(vocab_set), word2vec_model.vector_size), dtype=np.float16)
    for row, word in enumerate(words):
        embedding_matrix[row] = keyed_vectors[word].astype(np.float16)

    return vocab_set, embedding_matrix, word_to_idx

# Optimized: Vectorized embedding conversion
def text_to_embeddings_vectorized(tokens, vocab_set, embedding_matrix, word_to_idx, seq_length):
    """
    Convert a token list into a fixed-length (seq_length, vector_size) float16 array.

    In-vocabulary tokens are placed in order from row 0; out-of-vocabulary tokens are
    skipped. Rows that are not filled stay zero (post-padding).

    Truncation is applied AFTER dropping out-of-vocabulary tokens, so the first
    `seq_length` known tokens are kept. (Slicing the raw token list first would waste
    slots on unknown tokens and discard known ones that appear later.)

    Args:
        tokens: sequence of token strings.
        vocab_set: set of known tokens (membership test).
        embedding_matrix: array of shape (vocab_size, vector_size).
        word_to_idx: dict mapping each known token to its row in embedding_matrix.
        seq_length: number of rows in the returned array.

    Returns:
        np.ndarray of shape (seq_length, vector_size), dtype float16.
    """
    vector_size = embedding_matrix.shape[1]
    embeddings = np.zeros((seq_length, vector_size), dtype=np.float16)
    if seq_length <= 0:
        return embeddings

    # Collect matrix rows of the first seq_length in-vocabulary tokens
    rows = []
    for token in tokens:
        if token in vocab_set:
            rows.append(word_to_idx[token])
            if len(rows) == seq_length:
                break

    if rows:
        # Single fancy-indexed copy instead of one assignment per token
        embeddings[:len(rows)] = embedding_matrix[rows]

    return embeddings

# Text -> Embeddings -> torch tensors (sequential version)
def prepare_data(reviews, labels, word2vec_model, seq_length, chunk_size=500):
    """
    Turn tokenized texts into a padded float16 embedding tensor plus an int64 label tensor.

    Args:
        reviews: sequence (list, array or pandas Series) of token lists.
        labels: sequence (or pandas Series) of integer class ids, same length as reviews.
        word2vec_model: trained Word2Vec model providing `.wv` and `.vector_size`.
        seq_length: number of token positions kept per sample (zero-padded/truncated).
        chunk_size: number of samples per progress-reporting chunk; must be >= 1.

    Returns:
        (X_tensor, y): X_tensor has shape (num_samples, seq_length, vector_size) and dtype
        torch.float16; y has dtype torch.int64.

    Raises:
        ValueError: if chunk_size is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    num_samples = len(reviews)
    vector_size = word2vec_model.vector_size

    print(f"Processing {num_samples} samples with chunk_size={chunk_size}...")

    # Pre-build lookup structures (one-time cost)
    print("Building embedding lookup structures...")
    vocab_set, embedding_matrix, word_to_idx = build_embedding_lookup(word2vec_model)
    print(f"Lookup structures built. Vocab size: {len(vocab_set)}")
    gc.collect()

    # Resolve positional access once: a plain list avoids a hasattr check and a
    # pandas .iloc lookup for every single sample.
    reviews_list = reviews.tolist() if hasattr(reviews, 'tolist') else list(reviews)

    # Pre-allocate the output in float16 (half the footprint of float32)
    print(f"Allocating output array: {num_samples} x {seq_length} x {vector_size} (float16)...")
    X_array = np.zeros((num_samples, seq_length, vector_size), dtype=np.float16)

    num_chunks = (num_samples + chunk_size - 1) // chunk_size
    print(f"Processing in {num_chunks} chunks...")

    for chunk_idx, chunk_start in enumerate(range(0, num_samples, chunk_size)):
        chunk_end = min(chunk_start + chunk_size, num_samples)

        # Report the first chunk and every 10th one
        if (chunk_idx + 1) % 10 == 0 or chunk_idx == 0:
            print(f"Processing chunk {chunk_idx + 1}/{num_chunks} (samples {chunk_start}-{chunk_end})...")

        for i in range(chunk_start, chunk_end):
            X_array[i] = text_to_embeddings_vectorized(
                reviews_list[i], vocab_set, embedding_matrix, word_to_idx, seq_length
            )

    print("Converting to tensors...")
    # torch.from_numpy shares memory with X_array and keeps its float16 dtype, so no
    # extra cast is needed (and deleting X_array would not release any memory).
    X_tensor = torch.from_numpy(X_array)

    # Convert labels to a numpy array first (handles pandas Series)
    if hasattr(labels, 'values'):
        labels = labels.values
    y = torch.from_numpy(np.array(labels, dtype=np.int64))

    print("Data preparation complete!")
    return X_tensor, y
    
# Multi-process variant of the embedding preparation (uses n_workers processes)
def prepare_data_parallel(reviews, labels, word2vec_model, seq_length, n_workers=4, chunk_size=1000):
    """
    Parallel counterpart of the sequential data preparation: embeds samples in a process pool.

    Args:
        reviews: sequence (or pandas Series) of token lists.
        labels: sequence (or pandas Series) of integer class ids.
        word2vec_model: trained Word2Vec model providing `.wv` and `.vector_size`.
        seq_length: number of token positions kept per sample.
        n_workers: number of worker processes in the pool.
        chunk_size: number of samples handed to a worker per task.

    Returns:
        (X_tensor, y): float16 tensor of shape (num_samples, seq_length, vector_size)
        and an int64 label tensor.

    NOTE(review): the worker callable carries the lookup structures and must be picklable
    by the worker processes; inside a notebook this presumably requires the "fork" start
    method - confirm on your platform before relying on it.
    """
    num_samples = len(reviews)
    vector_size = word2vec_model.vector_size

    # Pre-build lookup structures
    vocab_set, embedding_matrix, word_to_idx = build_embedding_lookup(word2vec_model)

    # Convert reviews to a list if it's a pandas Series
    if hasattr(reviews, 'tolist'):
        reviews_list = reviews.tolist()
    else:
        reviews_list = list(reviews)

    # Bind everything except the token list
    embed_func = partial(
        text_to_embeddings_vectorized,
        vocab_set=vocab_set,
        embedding_matrix=embedding_matrix,
        word_to_idx=word_to_idx,
        seq_length=seq_length
    )

    X_array = np.zeros((num_samples, seq_length, vector_size), dtype=np.float16)

    # imap yields results in input order as they complete, so each embedding is copied
    # into the output straight away instead of holding every result in a list first.
    with Pool(n_workers) as pool:
        for i, embedding in enumerate(pool.imap(embed_func, reviews_list, chunksize=chunk_size)):
            X_array[i] = embedding

    # from_numpy shares memory and keeps the float16 dtype
    X_tensor = torch.from_numpy(X_array)

    # Convert labels
    if hasattr(labels, 'values'):
        labels = labels.values
    y = torch.from_numpy(np.array(labels, dtype=np.int64))

    return X_tensor, y

# Word2Vec Embeddings

In [6]:
import zipfile

# Directory holding the cached artifacts (token pickles, Word2Vec model, embedding tensors)
DIR_NAME = "word2vec_tokens_data"
# Archive name and extraction target used by the (currently commented-out) zip helpers below
zip_path = f"{DIR_NAME}.zip"
extract_dir = DIR_NAME

# Extract if zip exists but folder doesn't
# if os.path.exists(zip_path) and not os.path.exists(extract_dir):
#     print(f"Found {zip_path} - extracting to {extract_dir}...")
#     with zipfile.ZipFile(zip_path, "r") as zip_ref:
#         zip_ref.extractall(extract_dir)
#     print("Extraction complete.")

# If the folder exists, compress its contents into a zip file (no subdirs, just .pkl files and direct children)
# if os.path.exists(extract_dir) and not os.path.exists(zip_path):
#     print(f"Zipping files from {extract_dir} into {zip_path} (flat, no folders)...")
#     with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
#         for filename in os.listdir(extract_dir):
#             file_path = os.path.join(extract_dir, filename)
#             if os.path.isfile(file_path):
#                 zipf.write(file_path, arcname=filename)
#     print("Zip file creation complete.")

In [7]:
# Save this code in a separate file named word2vec_tokens.py

import os
import pickle
from gensim.models import Word2Vec
import gc  # Added for garbage collection

def load_or_create_tokens(train, val, test, preprocess_text, dir_name=DIR_NAME):
    """
    Add a 'tokens' column to the train/val/test frames, using an on-disk cache when possible.

    Cache files are '{dir_name}/tokens_train.pkl', 'tokens_val.pkl', 'tokens_test.pkl'
    (plus 'valid_tokens.pkl' holding the kept vocabulary). A cache is only used when it
    provides tokens for every row of the current frames; otherwise the data is
    re-tokenized and the cache is rewritten. Without that check, a cache built from a
    different sample would be aligned by index and silently leave NaN in 'tokens'.

    When tokenizing, tokens seen fewer than 2 times in the training split are removed
    from all three splits.

    Args:
        train, val, test: DataFrames with a 'text of the tweet' column (modified in place).
        preprocess_text: callable mapping a raw text to a list of tokens.
        dir_name: cache directory.

    Returns:
        The same (train, val, test) frames, each with a 'tokens' column.

    NOTE: the cache is read with pickle, which can execute arbitrary code - only point
    dir_name at files you created yourself.
    """
    from collections import Counter
    from itertools import chain

    frames = {"train": train, "val": val, "test": test}
    token_paths = {name: os.path.join(dir_name, f"tokens_{name}.pkl") for name in frames}
    valid_tokens_path = os.path.join(dir_name, "valid_tokens.pkl")

    def _cache_covers(cached, frame):
        """True when assigning `cached` to frame['tokens'] leaves no row without tokens."""
        cached_index = getattr(cached, "index", None)
        if hasattr(cached_index, "equals"):
            # pandas object: assignment aligns on the index, so every row label must be present
            return bool(frame.index.isin(cached_index).all())
        # plain sequence: assignment is positional and needs matching lengths
        return len(cached) == len(frame)

    if all(os.path.exists(path) for path in token_paths.values()):
        print("Loading tokenized data from cache...")
        cached = {}
        for name, path in token_paths.items():
            with open(path, "rb") as f:
                cached[name] = pickle.load(f)

        if all(_cache_covers(cached[name], frame) for name, frame in frames.items()):
            for name, frame in frames.items():
                frame['tokens'] = cached[name]
            del cached
            print("Tokenized data loaded from cache!")
            gc.collect()  # Run garbage collector after loading from cache
            return train, val, test

        print("Cached tokens do not cover the current data; re-tokenizing...")
        del cached
        gc.collect()

    print("Tokenizing data (this may take a while)...")
    for frame in frames.values():
        frame['tokens'] = frame['text of the tweet'].apply(preprocess_text)
    gc.collect()  # Run garbage collector after tokenization

    # Count token frequencies in the training data; the chain is consumed lazily, so no
    # flat list of every training token is materialized.
    token_counts = Counter(chain.from_iterable(train['tokens']))

    # Keep tokens appearing >= 2 times in the training split
    valid_tokens = {token for token, count in token_counts.items() if count >= 2}
    total_token_count = len(token_counts)  # remembered for the summary message
    del token_counts
    gc.collect()

    # Filter with list comprehensions over plain lists (avoids pandas apply overhead)
    for frame in frames.values():
        frame['tokens'] = [
            [t for t in tokens if t in valid_tokens]
            for tokens in frame['tokens'].tolist()
        ]
    gc.collect()

    os.makedirs(dir_name, exist_ok=True)
    # Save tokenized data
    for name, frame in frames.items():
        with open(token_paths[name], "wb") as f:
            pickle.dump(frame['tokens'], f)
    with open(valid_tokens_path, "wb") as f:
        pickle.dump(valid_tokens, f)
    print(f"Tokenized data saved to cache! Filtered vocabulary: {len(valid_tokens)} tokens (removed {total_token_count - len(valid_tokens)} rare tokens)")
    gc.collect()  # After saving all files, free memory again if needed
    return train, val, test

def load_or_train_word2vec(train, seq_length, dir_name=DIR_NAME, vector_size=None, workers=4):
    """
    Load a cached Word2Vec model, or train one on train['tokens'] and cache it.

    The model is saved to / loaded from '{dir_name}/word2vec_model.model'.

    Args:
        train: DataFrame with a 'tokens' column (list of tokens per row).
        seq_length: used as the embedding width when `vector_size` is not given. This
            keeps the historical behaviour, where the sequence length doubled as the
            Word2Vec vector size.
        dir_name: cache directory.
        vector_size: embedding width; defaults to `seq_length` when None.
        workers: number of Word2Vec training threads.

    Returns:
        (word2vec_model, vocab_size)
    """
    if vector_size is None:
        vector_size = seq_length

    model_path = os.path.join(dir_name, "word2vec_model.model")
    if os.path.exists(model_path):
        print("Loading Word2Vec model from cache...")
        word2vec_model = Word2Vec.load(model_path)
        print("Word2Vec model loaded from cache!")
        # A cached model is returned as-is; make a width mismatch visible instead of silent.
        if word2vec_model.vector_size != vector_size:
            print(
                f"WARNING: cached Word2Vec vector_size={word2vec_model.vector_size} "
                f"differs from requested {vector_size}; delete {model_path} to retrain."
            )
        gc.collect()  # Collect any unreferenced memory after loading
    else:
        print("Training Word2Vec model (this may take a while)...")
        tokens_lists = train['tokens'].values.tolist()
        gc.collect()  # Clean up memory before creating model (tokens_lists can be large)
        word2vec_model = Word2Vec(
            sentences=tokens_lists,
            vector_size=vector_size,
            min_count=1,
            workers=workers
        )
        del tokens_lists  # release reference to input for Word2Vec
        gc.collect()  # Collect after model construction
        os.makedirs(dir_name, exist_ok=True)
        word2vec_model.save(model_path)
        print("Word2Vec model saved to cache!")
        gc.collect()  # Collect after saving model
    vocab_size = len(word2vec_model.wv)
    print("Vocab size: ", vocab_size)
    return word2vec_model, vocab_size

# Attach a 'tokens' column to every split (read from the cache directory when available)
train, val, test = load_or_create_tokens(train, val, test, preprocess_text)
gc.collect()  # One more right after tokens loaded/created

# Fixed number of token positions per tweet; also handed to load_or_train_word2vec below
seq_length = 100
word2vec_model, vocab_size = load_or_train_word2vec(train, seq_length)
gc.collect()  # After model loaded/trained


Loading tokenized data from cache...


Tokenized data loaded from cache!
Loading Word2Vec model from cache...
Word2Vec model loaded from cache!
Vocab size:  116213


0

In [None]:
# Vocab size: 256064 -->  116213

In [8]:
# Define hyperparameters
input_size = word2vec_model.vector_size  # per-token feature size fed to the RNN (Word2Vec embedding width)
hidden_size = 128  # size of the RNN hidden state
output_size = 2  # number of logits produced by the final linear layer (one per class)
num_layers = 1  # stacked RNN layers (inter-layer dropout is only enabled when > 1)
learning_rate = 0.001  # learning rate passed to the Adam optimizer
num_epochs = 30  # maximum number of training epochs
batch_size = 16  # samples per batch for the train/val/test DataLoaders
dropout_rate = 0.5  # dropout probability applied to the RNN outputs
leaky_relu_slope = 0.1  # negative_slope of the LeakyReLU activation


def load_or_prepare_train_data(dir_name, train, word2vec_model, seq_length):
    """
    Return (X_train, y_train) for the training split.

    Both tensors are read from '{dir_name}/X_train.pt' and '{dir_name}/y_train.pt' when
    the two files exist. Otherwise they are computed from train['tokens'] and
    train['label'] with prepare_data and written to those paths.
    """
    features_path = os.path.join(dir_name, "X_train.pt")
    targets_path = os.path.join(dir_name, "y_train.pt")

    # Cache hit: nothing to compute
    if os.path.exists(features_path) and os.path.exists(targets_path):
        print("Loading preprocessed training data from cache...")
        X_train = torch.load(features_path, map_location='cpu')
        y_train = torch.load(targets_path, map_location='cpu')
        print("Training data loaded from cache!")
        return X_train, y_train

    # Cache miss: build the tensors and persist them for the next run
    print("Preprocessing training data (this may take a while)...")
    X_train, y_train = prepare_data(
        train['tokens'], train['label'], word2vec_model, seq_length=seq_length
    )
    os.makedirs(dir_name, exist_ok=True)
    torch.save(X_train, features_path)
    torch.save(y_train, targets_path)
    print("Training data saved to cache!")
    gc.collect()
    return X_train, y_train

def load_or_prepare_val_test_data(dir_name, val, test, word2vec_model, seq_length):
    """
    Return (X_val, y_val, X_test, y_test).

    Each split is handled independently, validation first: its tensors are read from
    '{dir_name}/X_<split>.pt' / 'y_<split>.pt' when both files exist, otherwise they are
    computed with prepare_data from the frame's 'tokens' and 'label' columns and saved.
    """

    def _load_or_prepare(file_tag, label, frame):
        """Load or build the tensor pair of one split (`label` is the name used in messages)."""
        features_path = os.path.join(dir_name, f"X_{file_tag}.pt")
        targets_path = os.path.join(dir_name, f"y_{file_tag}.pt")

        if os.path.exists(features_path) and os.path.exists(targets_path):
            print(f"Loading preprocessed {label} data from cache...")
            features = torch.load(features_path, map_location='cpu')
            targets = torch.load(targets_path, map_location='cpu')
            print(f"{label.capitalize()} data loaded from cache!")
            return features, targets

        print(f"Preprocessing {label} data (this may take a while)...")
        features, targets = prepare_data(
            frame['tokens'], frame['label'], word2vec_model, seq_length=seq_length
        )
        os.makedirs(dir_name, exist_ok=True)
        torch.save(features, features_path)
        torch.save(targets, targets_path)
        print(f"{label.capitalize()} data saved to cache!")
        gc.collect()
        return features, targets

    X_val, y_val = _load_or_prepare("val", "validation", val)
    X_test, y_test = _load_or_prepare("test", "test", test)

    return X_val, y_val, X_test, y_test


# Prepare training data - load the cached tensors from DIR_NAME, or build and cache them
X_train, y_train = load_or_prepare_train_data(
    DIR_NAME, train, word2vec_model, seq_length
)


Loading preprocessed training data from cache...
Training data loaded from cache!


In [None]:
# Prepare validation and test data - load the cached tensors from DIR_NAME, or build and cache them
X_val, y_val, X_test, y_test = load_or_prepare_val_test_data(
    DIR_NAME, val, test, word2vec_model, seq_length
)

Preprocessing validation data (this may take a while)...
Processing 287999 samples with chunk_size=500...
Building embedding lookup structures...
Lookup structures built. Vocab size: 116213
Allocating output array: 287999 x 100 x 100 (float16)...
Processing in 576 chunks...
Processing chunk 1/576 (samples 0-500)...
Processing chunk 10/576 (samples 4500-5000)...
Processing chunk 20/576 (samples 9500-10000)...
Processing chunk 30/576 (samples 14500-15000)...
Processing chunk 40/576 (samples 19500-20000)...
Processing chunk 50/576 (samples 24500-25000)...
Processing chunk 60/576 (samples 29500-30000)...
Processing chunk 70/576 (samples 34500-35000)...
Processing chunk 80/576 (samples 39500-40000)...
Processing chunk 90/576 (samples 44500-45000)...
Processing chunk 100/576 (samples 49500-50000)...
Processing chunk 110/576 (samples 54500-55000)...
Processing chunk 120/576 (samples 59500-60000)...
Processing chunk 130/576 (samples 64500-65000)...
Processing chunk 140/576 (samples 69500-70000

In [None]:
# Create the training DataLoader; batches are reshuffled every epoch (shuffle=True)
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
gc.collect() 

In [None]:

# Validation DataLoader; order is kept fixed (shuffle=False)
val_data = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
gc.collect() 

In [None]:

# Test DataLoader; order is kept fixed (shuffle=False)
test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
gc.collect() 

# Define the RNN model 

In [None]:
# class SentimentRNN(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size, num_layers, dropout_rate, leaky_relu_slope=0.1):
#         super(SentimentRNN, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         # Basic RNN layer
#         # shape of input tensor: (batch_size, seq_length, input_size)
#         self.rnn = nn.RNN(
#             input_size,
#             hidden_size,
#             num_layers,
#             batch_first=True,
#             dropout=dropout_rate if num_layers > 1 else 0,
#         )
#         self.leaky_relu = nn.LeakyReLU(negative_slope=leaky_relu_slope)
#         self.dropout = nn.Dropout(dropout_rate)
#         self.fc = nn.Linear(hidden_size, output_size)

#     def forward(self, x):
#         # Initial hidden state
#         # h0 shape: (num_layers, batch_size, hidden_size)
#         h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device)

#         # out shape after rnn: (batch_size, seq_length, hidden_size)
#         out, _ = self.rnn(x, h0)
#         out = self.leaky_relu(out)
#         out = self.dropout(out)

#         # Get the last sequence output for classification
#         # out shape after indexing: (batch_size, hidden_size)
#         out = out[:, -1, :]

#         logits = self.fc(out)
#         return logits

class SentimentRNN(nn.Module):
    """
    Vanilla-RNN sentence classifier.

    Pipeline: nn.RNN over the embedded sequence -> LeakyReLU -> dropout -> output of the
    last time step -> linear layer producing `output_size` logits.

    Args:
        input_size: feature size of each time step (embedding width).
        hidden_size: size of the RNN hidden state.
        output_size: number of logits / classes.
        num_layers: number of stacked RNN layers.
        dropout_rate: dropout probability after the RNN (and between RNN layers when
            num_layers > 1).
        leaky_relu_slope: negative_slope of the LeakyReLU activation.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout_rate, leaky_relu_slope=0.1):
        super(SentimentRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Basic RNN layer
        # shape of input tensor: (batch_size, seq_length, input_size)
        self.rnn = nn.RNN(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout_rate if num_layers > 1 else 0,
        )
        self.leaky_relu = nn.LeakyReLU(negative_slope=leaky_relu_slope)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        Compute class logits for a batch.

        Args:
            x: tensor of shape (batch_size, seq_length, input_size), any floating dtype.

        Returns:
            Logits of shape (batch_size, output_size), in the dtype of the model parameters.
        """
        # Work in whatever precision the parameters currently have (float16 after
        # .half(), float32 otherwise) instead of assuming float16.
        param_dtype = self.fc.weight.dtype
        if x.dtype != param_dtype:
            x = x.to(param_dtype)

        # Initial hidden state
        # h0 shape: (num_layers, batch_size, hidden_size)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size,
                         dtype=x.dtype, device=x.device)

        # out shape after rnn: (batch_size, seq_length, hidden_size)
        out, _ = self.rnn(x, h0)
        out = self.leaky_relu(out)
        out = self.dropout(out)

        # Get the last sequence output for classification
        # out shape after indexing: (batch_size, hidden_size)
        out = out[:, -1, :]

        logits = self.fc(out)
        return logits
        
# Initialize model, loss function, and optimizer
model = SentimentRNN(
    input_size,
    hidden_size,
    output_size,
    num_layers,
    dropout_rate,
    leaky_relu_slope=leaky_relu_slope,
).to(device)

# Convert model parameters to float16 (matches the float16 embedding tensors)
model = model.half()

criterion = nn.CrossEntropyLoss()
# Adam's default eps (1e-8) is below the smallest positive float16 value (~6e-8) and
# rounds to 0 there, so with float16 parameters the update can divide by zero and
# produce inf/NaN weights. 1e-4 is representable in float16.
# NOTE(review): keeping float32 weights and using torch.autocast with a GradScaler is
# the more robust mixed-precision setup - consider switching.
optimizer = optim.Adam(model.parameters(), lr=learning_rate, eps=1e-4)

# Print the number of trainable parameters
num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {num_trainable_params}")


# Train

In [None]:
# Training loop
import matplotlib.pyplot as plt
import time

# num_epochs is taken from the hyperparameter cell (no second definition here, so a
# change made there is not silently overridden).
losses = []      # mean training loss per epoch
val_losses = []  # mean validation loss per epoch

# Early stopping: stop after max_patience consecutive epochs without a new best val loss
best_val_loss = float("inf")
best_epoch = 0
patience = 0
max_patience = 3

# Start timing training
start_time = time.time()

for epoch in range(num_epochs):
    print(f"EPOCH  NUMBER: {epoch+1}")
    model.train()
    total_loss = 0.0
    count = 0
    diverged = False

    for inputs, labels in train_loader:
        # Convert inputs to float16
        inputs = inputs.to(device, dtype=torch.float16)
        labels = labels.to(device)

        # CrossEntropyLoss expects long integer class indices (0 or 1)
        optimizer.zero_grad()
        logits = model(inputs)  # Shape: (batch_size, 2)
        loss = criterion(logits, labels)  # labels shape: (batch_size) with class indices

        # A NaN/inf loss would corrupt the weights on the next step and every later
        # comparison against best_val_loss; stop instead of training on garbage.
        if not torch.isfinite(loss):
            diverged = True
            break

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        count += 1

    if diverged:
        print(f"Non-finite training loss at epoch {epoch+1}; stopping training.")
        break

    average_loss = total_loss / max(count, 1)
    losses.append(average_loss)

    model.eval()
    total_val_loss = 0.0
    val_count = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            # Convert inputs to float16
            inputs = inputs.to(device, dtype=torch.float16)
            labels = labels.to(device)

            logits = model(inputs)  # Shape: (batch_size, 2)
            val_loss = criterion(logits, labels)  # labels shape: (batch_size) with class indices

            total_val_loss += val_loss.item()
            val_count += 1

    average_val_loss = total_val_loss / max(val_count, 1)
    val_losses.append(average_val_loss)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {average_loss:.4f}, Val Loss: {average_val_loss:.4f}")

    # Save best checkpoint
    if average_val_loss < best_val_loss:
        best_val_loss = average_val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), "rnn_best_model.pth")
        patience = 0
    else:
        patience += 1

    if patience >= max_patience:
        print(f"Early stopped at {epoch+1}")
        break

# Calculate total training time
training_time = time.time() - start_time
print(f"Lowest Validation Loss: {best_val_loss:.4f} at Epoch {best_epoch + 1}")
print(f"Total Training Time: {training_time:.2f} seconds ({training_time/60:.2f} minutes)")

# Evaluate 

## Evaluation Functions - Accuracy, Label Precision, Recall, F1, and Confusion Matrix

In [None]:
from metrics import evaluate_model

# Count the trainable parameters (those with requires_grad=True); the value is passed to
# evaluate_model as num_trainable_parameters in the evaluation cells
num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)


## Validation Set

In [None]:
# Restore the checkpoint with the lowest validation loss (written by the training loop)
# so that the metrics below describe that model rather than the last epoch
model.load_state_dict(torch.load("rnn_best_model.pth", map_location=device))
model.to(device)

# Evaluate on the validation set (with confusion matrix plot)
# NOTE(review): evaluate_model comes from the local `metrics` module; what it computes,
# plots and saves is not visible in this notebook - confirm there.
val_metrics = evaluate_model(
    model, 
    device, 
    val_loader,
    label_0,
    label_4,
    plot_confusion_matrix=True,
    title="Validation Set Confusion Matrix",
    training_time_seconds=training_time,
    num_trainable_parameters=num_trainable_params,
    model_name="rnn",
    dataset_split="val",
    save_results=True,
)

print(f"Validation Accuracy: {val_metrics['accuracy']:.2f}%")
print(f"Validation Metrics: {val_metrics}")



## Test Set 

In [None]:
# Evaluate on the test set with the same model, including a confusion matrix plot
# (plot_confusion_matrix=True)
# NOTE(review): evaluate_model comes from the local `metrics` module; its exact
# behaviour is not visible in this notebook - confirm there.
test_metrics = evaluate_model(
    model, 
    device, 
    test_loader,
    label_0,
    label_4,
    plot_confusion_matrix=True,
    title="Test Set Confusion Matrix",
    training_time_seconds=training_time,
     num_trainable_parameters=num_trainable_params,
    model_name="rnn",
    dataset_split="test",
    save_results=True,
)

print(f"Test Accuracy: {test_metrics['accuracy']:.2f}%")
