# LSTM 

This implementation is based on code from the notebook "1. Encoder-Decoder Seq2Seq.ipynb".

In [None]:
import pickle
import numpy as np 
import torch 
from gensim.models import Word2Vec
from torch.nn import functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


# Seed both NumPy and PyTorch: the model code below draws its randomness
# (weight initialisation, dropout, torch.randint) from torch's generator, so
# seeding NumPy alone does not make those results repeatable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the train / validation / test splits.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open files that come from a trusted source.
with open("train.pkl", "rb") as f:
    train = pickle.load(f)
with open("val.pkl", "rb") as f:
    val = pickle.load(f)
with open("test.pkl", "rb") as f:
    test = pickle.load(f)

# `train` is presumably a pandas DataFrame with a 'label' column -- confirm
# against the code that produced the pickles.
num_labels = train['label'].nunique()
print("Number of labels: ", num_labels)

# value_counts() orders its index by descending frequency, so `labels` lists
# the distinct label values from most to least common.
labels = list(train['label'].value_counts().index)
labels

# NOTE(review): the two names below are chosen by frequency rank, not by
# label value -- label_4 is the most frequent label and label_0 the second
# most frequent. Confirm this matches the intended 0 / 4 mapping.
label_0 = labels[1]
label_4 = labels[0]
print(f"Label 0: {label_0} and label 4: {label_4}")

train.head(2)

# LSTM Encoder 

In [None]:
# define encoder class
class Encoder(nn.Module):
    """Bidirectional LSTM encoder over sequences of token indices.

    Pipeline: embedding -> dropout -> bidirectional LSTM -> linear layer
    applied independently at every time step.

    Args:
        input_size: Number of entries in the embedding table (vocabulary size).
        hidden_size: Embedding dimension and per-direction LSTM hidden size.
        dropout_p: Dropout probability applied to the embeddings.
        output_size: Number of features produced by the final linear layer.
            When ``None`` (default) it is ``2 * hidden_size``, i.e. the
            projection keeps the width of the bidirectional LSTM output.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1, output_size=None):
        super().__init__()
        self.hidden_size = hidden_size

        # A bidirectional LSTM concatenates the forward and backward hidden
        # states, so each time step carries 2 * hidden_size features
        # (not hidden_size ** 2).
        lstm_out_size = 2 * hidden_size
        self.output_size = lstm_out_size if output_size is None else output_size

        # input tensor: (bs, seq_len) - indices into the embedding table
        self.embedding = nn.Embedding(input_size, hidden_size)

        # (bs, seq_len, hidden_size) -> (bs, seq_len, 2 * hidden_size)
        self.lstm = nn.LSTM(
            hidden_size, hidden_size, batch_first=True, bidirectional=True
        )
        self.dropout = nn.Dropout(dropout_p)

        # Per-time-step projection of the concatenated LSTM output.
        self.mlp = nn.Linear(lstm_out_size, self.output_size)

    def forward(self, input):
        """Encode a batch of token-index sequences.

        Args:
            input: Integer tensor of shape (bs, seq_len) holding indices into
                the embedding table.

        Returns:
            A tuple ``(output, (hidden, cell))`` where ``output`` has shape
            (bs, seq_len, output_size) and ``hidden`` / ``cell`` are the final
            LSTM states for both directions, each of shape
            (2, bs, hidden_size).
        """
        embedded = self.dropout(self.embedding(input))
        # Debug output kept from the original notebook cell.
        print("Embedding shape: ", embedded.shape)

        # With a bidirectional LSTM the output and the hidden / cell states
        # cover both directions.
        output, (hidden, cell) = self.lstm(embedded)

        # nn.Linear acts on the last dimension only, so the batch and
        # sequence dimensions are preserved.
        output = self.mlp(output)
        return output, (hidden, cell)


In [11]:
# Smoke test: run the encoder once on a small random batch.
hidden_size = 256
bs = 4
seq_len = 10
vocab_size = 40

lstm = Encoder(input_size=vocab_size, hidden_size=hidden_size)

# Random token ids in [0, vocab_size) stand in for a tokenized batch.
# nn.Embedding looks rows up by index, so the tensor must be integer
# (torch.long), not float.
t = torch.randint(0, vocab_size, (bs, seq_len), dtype=torch.long)
print(t)

# Forward pass; show the shape of the per-time-step output.
out, (hidden, cell) = lstm(t)
print(out.shape)

tensor([[ 5, 15, 25, 11, 14, 24, 20, 35,  7, 24],
        [ 9, 31,  2, 11, 17, 37, 14, 24, 12, 18],
        [23,  2, 21,  8, 36, 30,  8, 25, 34, 33],
        [ 0, 18,  7, 30, 39, 22, 16, 39, 26, 25]])
Embedding shape:  torch.Size([4, 10, 256])
torch.Size([4, 10, 512])
