In [None]:
import numpy as np
import pandas as pd

import os, sys

import torch.nn.functional
import torch.nn.functional as F

from sklearn.model_selection import train_test_split

project_root = os.path.abspath('/Users/subhojit/workspace/saturn/src')
if project_root not in sys.path:
    sys.path.append(project_root)

from lstm_att.lstm_wo_attention import *
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
dataset_dir = '/Users/subhojit/datasets/sms_spam_collection'
df = pd.read_csv(dataset_dir + "/SMSSpamCollection", sep='\t', header=None, names=['label', 'text'])
df.head()

In [None]:
df['label'].map({'ham': 0, 'spam': 1})
df.head()

In [None]:
sms = df['text'].to_numpy()
c = sorted(list(set(''.join(sms))))
chars = c + ['<SOS>', '<EOS>', '<PAD>']
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
vocab_size = len(chars)
vocab_size, stoi


In [None]:
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[xi] for xi in s]
decode = lambda l: ''.join([itos[li] for li in l])
encode('l$qweqw')

In [None]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

xtrain, xval, ytrain, yval = train_test_split(
    df['text'].to_numpy(),
    df['label'].to_numpy(),
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

In [30]:
# if torch.backends.mps.is_available():
#     device = "mps"
# elif torch.cuda.is_available():
#     device = "cuda"
# else:
#     device = "cpu"

device = "cpu"

In [31]:
def pad_sequences(sequences):
    pad_index = stoi['<PAD>']
    max_len = np.max([len(s) for s in sequences])
    padded_seq = np.full((len(sequences), max_len), pad_index, dtype=np.int32)
    for i, seq in enumerate(sequences):
        padded_seq[i, :len(seq)] = seq
    return padded_seq


def get_batch(batch_size, split='train'):
    data = xtrain if split == 'train' else xval
    target = ytrain if split == 'train' else yval
    idx = np.random.randint(0, len(data), (batch_size,))
    x_sample = [encode(s) for s in data[idx]]
    y_sample = target[idx]
    xpadded = pad_sequences(x_sample)
    xb, yb = xpadded, y_sample
    yb = torch.from_numpy(yb)
    yb = yb
    # yb = torch.nn.functional.one_hot(yb - 1, num_classes=2)
    xb = torch.from_numpy(xb)
    x = xb.to(device, dtype=torch.long)
    y = yb.to(device, dtype=torch.long)
    return x, y

batch_size = 32

xb, yb = get_batch(batch_size)

In [32]:
xb[0], yb[0]

(tensor([ 43,  79,  71,  66,  68,  85,   2,  73,  66,  81,  81,  90,   2,  79,
          70,  88,   2,  90,  70,  66,  83,  16,   2,  42,  80,  88,   2,  66,
          83,  70,   2,  90,  80,  86,   2,  88,  73,  70,  83,  70,   2,  66,
          83,  70,   2,  90,  80,  86,   2,  88,  73,  70,  79,   2,  66,  83,
          70,   2,  88,  70,   2,  84,  70,  70,  74,  79,  72, 120, 120, 120,
         120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,
         120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,
         120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,
         120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,
         120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,
         120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,
         120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,
         120, 120, 120, 120, 120, 120, 120, 120, 120

In [None]:
# training with LSTM with attention
embedding_dim = 32
hidden_size = 64
output_size = 2
batch_size = 64
seq_len = 10
learning_rate = 1e-2
max_iter = 5000
eval_interval = 500

model = LSTMWithAttention(vocab_size, hidden_size, output_size, embedding_dim)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for step in range(max_iter):
    xb, yb = get_batch(batch_size)
    logits = model(xb)
    loss = F.cross_entropy(logits, yb)
    optimizer.zero_grad()
    loss.backward()
    # print("grad norm:", model.fc.weight.grad.norm())
    # print("Input device:", xb.device)
    # print("Output device:", yb.device)
    # print("Model device:", next(model.parameters()).device)
    # print("Logits device:", logits.device)

    # print("xb shape: ", xb.shape)  # should be (batch_size, seq_len)
    # print("xb dtype: ", xb.dtype)
    # print("yb shape: ", yb.shape)  # should be (batch_size,)
    # print("yb dtype: ", yb.dtype)  # should be torch.long
    # print("logits shape: ", logits.shape)  # should be (batch_size, 2)

    # print("Loss:", loss.item())
    optimizer.step()
    if step % eval_interval == 0:
        print(f"step {step}: train loss {loss:.4f}")
        with torch.no_grad():
            logits = model(xb)
            probs = torch.softmax(logits, dim=1)
            print("Confidence range:", probs.max(dim=1).values[:10])
            preds = torch.argmax(logits, dim=1)
            print("Preds: ", preds.tolist())
            print("Targets: ", yb.tolist())

In [36]:
# one batch overfitting
model = LSTMWithAttention(vocab_size, hidden_size, output_size, embedding_dim)
model = model.to(device)

xb, yb = get_batch(64)
# xb, yb = xb.to(device), yb.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

for step in range(1000):
    # print("Input:", xb.shape, xb.dtype, xb.device)
    # print("Target:", yb.shape, yb.dtype, yb.device)
    logits = model(xb)
    loss = F.cross_entropy(logits, yb)
    optimizer.zero_grad()
    print("Logits grad_fn:", logits.grad_fn)
    print("Loss grad_fn:", loss.grad_fn)
    loss.backward()
    optimizer.step()
    print(f"Step {step}, loss = {loss.item():.4f}")
    # with torch.no_grad():
    #     logits = model(xb)
    #     probs = torch.softmax(logits, dim=1)
    #     print("Confidence range:", probs.max(dim=1).values[:10])
    #     preds = torch.argmax(logits, dim=1)
    #     print("Preds: ", preds.tolist())
    #     print("Targets: ", yb.tolist())
    if step % 100 == 0:
        print("Logits:", logits[:2])

Logits grad_fn: <AddmmBackward0 object at 0x343e2a670>
Loss grad_fn: <NllLossBackward0 object at 0x343e2a670>
Step 0, loss = 0.6533
Logits: tensor([[-0.1021, -0.2255],
        [-0.1021, -0.2255]], grad_fn=<SliceBackward0>)
Logits grad_fn: <AddmmBackward0 object at 0x343e2a070>
Loss grad_fn: <NllLossBackward0 object at 0x343e2a070>
Step 1, loss = 0.5570
Logits grad_fn: <AddmmBackward0 object at 0x343e2a670>
Loss grad_fn: <NllLossBackward0 object at 0x343e2a670>
Step 2, loss = 0.5405
Logits grad_fn: <AddmmBackward0 object at 0x343e2a070>
Loss grad_fn: <NllLossBackward0 object at 0x343e2a070>
Step 3, loss = 0.4880
Logits grad_fn: <AddmmBackward0 object at 0x343e2a670>
Loss grad_fn: <NllLossBackward0 object at 0x343e2a670>
Step 4, loss = 0.4719
Logits grad_fn: <AddmmBackward0 object at 0x343e2a070>
Loss grad_fn: <NllLossBackward0 object at 0x343e2a070>
Step 5, loss = 0.4800
Logits grad_fn: <AddmmBackward0 object at 0x343e2a670>
Loss grad_fn: <NllLossBackward0 object at 0x343e2a670>
Step 6,

KeyboardInterrupt: 

In [35]:
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad norm = {param.grad.norm():.4f}")

embedding.weight: grad norm = 0.0000
lstm.weight_ih_l0: grad norm = 0.0000
lstm.weight_hh_l0: grad norm = 0.0000
lstm.bias_ih_l0: grad norm = 0.0000
lstm.bias_hh_l0: grad norm = 0.0000
fc.weight: grad norm = 0.0000
fc.bias: grad norm = 0.0000
