In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import train_test_split

# Directory holding the raw dataset file on the author's machine.
dataset_dir = '/Users/subhojit/datasets/sms_spam_collection'

# The file is tab-separated with no header row; name the two columns here.
df = pd.read_csv(
    dataset_dir + "/SMSSpamCollection",
    sep='\t',
    header=None,
    names=['label', 'text'],
)

# Turn the string labels into integer class ids: ham -> 0, spam -> 1.
label_to_id = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_to_id)
texts = df['text'].tolist()
labels = df['label'].tolist()

# Character-level vocabulary built from every character seen in the corpus.
# Real characters get ids starting at 1; id 0 is reserved for '<PAD>'.
chars = sorted(set(''.join(texts)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['<PAD>'] = 0
vocab_size = len(stoi)


def encode(s):
    """Map a string to a list of character ids, dropping unknown characters."""
    return [stoi[c] for c in s if c in stoi]


# Hold out 20% of the messages for validation, with a fixed seed.
xtrain, xval, ytrain, yval = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

def pad_sequences(sequences, max_len=256):
    """Right-pad (and truncate) id sequences into one fixed-width tensor.

    Args:
        sequences: Sequence of integer-id sequences, one per example.
        max_len: Width of the output; longer sequences are cut to this size.

    Returns:
        A tuple ``(padded, lengths)`` where ``padded`` is a long tensor of
        shape ``(len(sequences), max_len)`` filled with 0 after each
        sequence's content, and ``lengths`` is a long tensor holding each
        sequence's length after truncation.
    """
    # Truncate first so the stored lengths and the copied data always agree.
    clipped = [tokens[:max_len] for tokens in sequences]

    lengths = torch.tensor([len(tokens) for tokens in clipped], dtype=torch.long)
    padded = torch.zeros(len(clipped), max_len, dtype=torch.long)
    for row, tokens in enumerate(clipped):
        padded[row, :len(tokens)] = torch.tensor(tokens)
    return padded, lengths

def get_batch(batch_size, split='train'):
    """Draw a random batch of encoded, padded messages with their labels.

    Args:
        batch_size: Number of examples to draw. Indices are drawn
            independently, so the same example can appear more than once.
        split: ``'train'`` selects the training split; any other value
            selects the validation split.

    Returns:
        A tuple ``(xb, yb, lengths)``: the padded id tensor, a long tensor
        of labels, and the per-example sequence lengths returned by
        ``pad_sequences``.
    """
    if split == 'train':
        source_texts, source_labels = xtrain, ytrain
    else:
        source_texts, source_labels = xval, yval

    picks = torch.randint(0, len(source_texts), (batch_size,)).tolist()
    encoded = [encode(source_texts[j]) for j in picks]
    targets = [source_labels[j] for j in picks]

    tokens, lengths = pad_sequences(encoded)
    return tokens, torch.tensor(targets, dtype=torch.long), lengths

class ManualLSTMClassifier(nn.Module):
    """Sequence classifier: embedding -> LSTM -> linear head.

    The classification is made from the LSTM output at the last real
    (non-padded) timestep of each sequence, selected using ``lengths``.

    Args:
        vocab_size: Number of distinct token ids, including the padding id 0.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output classes (logits per example).
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        # Token id 0 is the padding id.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths):
        """Compute class logits for a padded batch.

        Args:
            x: Long tensor of token ids, shape ``(batch, seq_len)``.
            lengths: Per-example count of real tokens in ``x`` (tensor or
                sequence of ints), shape ``(batch,)``.

        Returns:
            Tensor of logits with shape ``(batch, output_dim)``.
        """
        x_embed = self.embedding(x)
        out, _ = self.lstm(x_embed)

        # Index of the last real timestep per example. A length of 0 would
        # give -1 and silently read the final padded position, so clamp it
        # to the first timestep instead.
        last_step = torch.as_tensor(lengths, dtype=torch.long, device=out.device)
        last_step = (last_step - 1).clamp(min=0)

        # One vectorised gather instead of a Python loop over the batch;
        # this avoids a host/device sync per example and keeps out's dtype.
        rows = torch.arange(out.size(0), device=out.device)
        last_hidden = out[rows, last_step]

        return self.fc(last_hidden)

# Prefer Apple's Metal backend, but fall back to CUDA or CPU so the script
# does not fail on machines where MPS is unavailable.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model = ManualLSTMClassifier(vocab_size, embed_dim=32, hidden_dim=64, output_dim=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

# NOTE: a single batch is drawn once and reused for every step below, so
# this loop only checks that the model can fit (memorise) 32 examples. It is
# not a measure of generalisation; draw a fresh batch inside the loop to
# train on the whole training split.
xb, yb, lengths = get_batch(32)
xb, yb, lengths = xb.to(device), yb.to(device), lengths.to(device)

for step in range(1000):
    logits = model(xb, lengths)
    loss = F.cross_entropy(logits, yb)
    optimizer.zero_grad()
    loss.backward()
    # Cap the global gradient norm before the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    # Report the loss and the predictions on the fixed batch every 100 steps.
    if step % 100 == 0:
        preds = torch.argmax(logits, dim=1)
        print(f"Step {step}, Loss: {loss.item():.4f}")
        print("Preds:", preds.tolist())
        print("Targets:", yb.tolist())
        print()


Step 0, Loss: 0.7155
Preds: [0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1]
Targets: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Step 100, Loss: 0.0000
Preds: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Targets: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Step 200, Loss: 0.0000
Preds: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Targets: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Step 300, Loss: 0.0000
Preds: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Targets: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Step 400, Loss: 0.0000
Preds: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,