# LSTM 

Implementation based on code from notebook: 1. Encoder-Decoder Seq2Seq.ipynb

In [None]:
import pickle
import numpy as np 
import torch 
from gensim.models import Word2Vec
from torch.nn import functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


SEED = 42
np.random.seed(SEED)

# Load the datasets 
with open("train.pkl", "rb") as f:
    train = pickle.load(f)
with open("val.pkl", "rb") as f:
    val = pickle.load(f)
with open("test.pkl", "rb") as f:
    test = pickle.load(f)

num_labels = train['label'].nunique()
print("Number of labels: ", num_labels)
labels = [label for i, label in enumerate(train['label'].value_counts().index)]
labels
label_0 = labels[1]
label_4 = labels[0]
print(f"Label 0: {label_0} and label 4: {label_4}")

train.head(2)

# LSTM Encoder 


In [None]:
import torch.nn as nn
import torch
class BiLSTM_Classifier(nn.Module):
    """
    A Bidirectional LSTM-based module designed for sequence encoding
    and subsequent sentiment classification.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_labels, dropout_p=0.1):
        super(BiLSTM_Classifier, self).__init__()
        
        # --- Parameters ---
        self.hidden_size = hidden_size
        self.num_classes = num_labels
        self.num_directions = 2 # Fixed for BiLSTM

        # --- Embedding Layer ---
        # input: (batch_size, seq_len) -> indices of tokens
        # output: (batch_size, seq_len, embedding_dim) -> dense word vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # --- Dropout Layer ---
        # Applied after embedding to regularize word vectors (Embedding Dropout)
        self.dropout = nn.Dropout(dropout_p)
        
        # --- BiLSTM Layer ---
        # input_size: embedding_dim (the size of the input features per time step)
        # hidden_size: hidden_size (the output size of the hidden state for ONE direction)
        # batch_first=True: input shape is (batch_size, seq_len, features)
        # bidirectional=True: output_dim = 2 * hidden_size
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=True)
        
        # --- Classification Layer (MLP) ---
        # The BiLSTM combines the final forward and backward hidden states.
        # Input size to the Linear layer must be (2 * hidden_size)
        # Output size is num_labels (e.g., 2 for positive/negative)
        self.classifier = nn.Linear(self.num_directions * hidden_size, num_labels)

    def forward(self, input_tensor):
        # 1. Embedding
        # shape: (bs, seq_len) -> (bs, seq_len, embedding_dim)
        embedded = self.dropout(self.embedding(input_tensor))
    
        # 2. BiLSTM Processing
        # output: (bs, seq_len, 2 * hidden_size) - full sequence output
        # (hidden, cell): state from both directions, shape: (2, bs, hidden_size)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        print(f"lstm_out shape: {lstm_out.shape}")    # (batch_size, seq_len, 2*hidden_size)
        print(f"hidden shape: {hidden.shape}")        # (2, batch_size, hidden_size)
        print(f"cell shape: {cell.shape}")            # (2, batch_size, hidden_size)
        
        # 3. Aggregate Hidden States for Classification
        # We take the final hidden state: 
        # hidden[0] is the final forward state
        # hidden[1] is the final backward state
        # We flatten and concatenate them to get the aggregated sentence context.
        # shape: (2, bs, hidden_size) -> (bs, 2 * hidden_size)
        
        # We detach the hidden states from the full sequence output, as is common for classification
        hidden = hidden.view(self.num_directions, -1, self.hidden_size) # Reshape if necessary (optional in this setup, but safer)
        final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        
        # 4. Final Classification
        # input: (bs, 2 * hidden_size)
        # output: (bs, num_classes)
        prediction_logits = self.classifier(final_hidden)
        
        # For sentiment classification, we only need the final prediction
        return prediction_logits


In [11]:
hidden_size = 256 
bs = 4
seq_len = 10
vocab_size = 40
embedding_dim = 256
num_labels = 2

lstm = BiLSTM_Classifier(
    vocab_size = vocab_size,
    embedding_dim = embedding_dim,
    hidden_size = hidden_size,
    num_labels = num_labels
)
)

# Create a batch of random indices (simulating tokenized word ids)
# Embedding expects input of dtype torch.long (not float)
t = torch.randint(low=0, high=vocab_size, size=(bs, seq_len), dtype=torch.long)
print(t)
out = lstm(t)
print(f"Final output shape: {out.shape}")


tensor([[ 5, 15, 25, 11, 14, 24, 20, 35,  7, 24],
        [ 9, 31,  2, 11, 17, 37, 14, 24, 12, 18],
        [23,  2, 21,  8, 36, 30,  8, 25, 34, 33],
        [ 0, 18,  7, 30, 39, 22, 16, 39, 26, 25]])
Embedding shape:  torch.Size([4, 10, 256])
torch.Size([4, 10, 512])
