In [4]:
import numpy as np
import pandas as pd

import os, sys

import torch.nn.functional
import torch.nn.functional as F

project_root = os.path.abspath('/Users/subhojit/workspace/saturn/src')
if project_root not in sys.path:
    sys.path.append(project_root)

from lstm_att.lstm_attention import *
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# data preparation
dataset_dir = '/Users/subhojit/datasets/sms_spam_collection'
df = pd.read_csv(dataset_dir + "/SMSSpamCollection", sep='\t', header=None, names=['label', 'text'])

df['label'] = df['label'].map({'ham': 0, 'spam': 1})
texts = df['text'].tolist()
labels = df['label'].tolist()

chars = sorted(set(''.join(texts)))
stoi = {ch: i + 1 for i, ch in enumerate(chars)}
stoi['<PAD>'] = 0
vocab_size = len(stoi)
encode = lambda s: [stoi[c] for c in s if c in stoi]

xtrain, xval, ytrain, yval = train_test_split(texts, labels, test_size=0.2, random_state=1894)

In [6]:
def pad_sequences(sequences, max_len=256):
    padded = torch.zeros(len(sequences), max_len, dtype=torch.long)
    lengths = torch.zeros(len(sequences), dtype=torch.long)
    for i, seq in enumerate(sequences):
        seq = seq[:max_len]
        padded[i, :len(seq)] = torch.tensor(seq)
        lengths[i] = len(seq)
    return padded, lengths

def get_batch(batch_size, split='train'):
    x = xtrain if split == 'train' else xval
    y = ytrain if split == 'train' else yval
    idx = torch.randint(0, len(x), (batch_size,))
    xb = [encode(x[i]) for i in idx]
    yb = [y[i] for i in idx]
    xb, lengths = pad_sequences(xb)
    return xb, torch.tensor(yb, dtype=torch.long), lengths


In [19]:
embedding_dim = 32
hidden_size = 256
output_size = 2
batch_size = 256
attention_dim = 64
seq_len = 10
learning_rate = 1e-2
max_iter = 1500
eval_interval = 500
device = "mps"

In [20]:
def lets_train():
    model = LSTMAndAdditiveAttention(vocab_size, hidden_size, output_size, embedding_dim, attention_dim).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    for step in range(max_iter):
        xb, yb, lengths = get_batch(batch_size, split='train')
        xb, yb, lengths = xb.to(device), yb.to(device), lengths.to(device)

        logits, context = model(xb, lengths)
        loss = F.cross_entropy(logits, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % eval_interval == 0:
            print(f"Step {step}, loss = {loss.item():.4f}")

    return model

In [21]:
def get_validation_batch(batch_size, split='val'):
    x = xtrain if split == 'train' else xval
    y = ytrain if split == 'train' else yval

    iter_size = len(x) // batch_size
    for i in range(iter_size):
        idx = torch.arange(i*batch_size, i*batch_size + batch_size)
        xb = [encode(x[i]) for i in idx]
        yb = [y[i] for i in idx]
        xb, lengths = pad_sequences(xb)
        yb = torch.tensor(yb, dtype=torch.long)
        yield xb, yb, lengths

def compute_accuracy(model):
    model.eval()
    correct = 0
    total = 0
    for xb, yb, lengths in get_validation_batch(batch_size):
        xb, yb, lengths = xb.to(device), yb.to(device), lengths.to(device)
        logits, _ = model(xb, lengths)
        preds = torch.argmax(logits, dim=1)
        correct += (preds == yb).sum().item()
        total += len(yb)
        # print("Preds:   ", preds.tolist())
        # print("Targets: ", yb.tolist())

    accuracy = (correct / total) * 100
    print(f"Accuracy: {accuracy:.4f}")
    model.train()


In [22]:
model = lets_train()


Step 0, loss = 0.6938
Step 500, loss = 0.0031
Step 1000, loss = 0.0001


In [15]:
model

LSTMAndAdditiveAttention(
  (embedding): Embedding(119, 32, padding_idx=0)
  (lstm): LSTM(32, 256, batch_first=True)
  (attention): DRawAdditiveAttention()
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

In [18]:
compute_accuracy(model)

Accuracy: 97.7539
