In [None]:
import numpy as np
import pandas as pd

import os, sys

import torch.nn.functional
import torch.nn.functional as F

project_root = os.path.abspath('/Users/subhojit/workspace/saturn/src')
if project_root not in sys.path:
    sys.path.append(project_root)

from lstm_att.lstm_attention import *
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
dataset_dir = '/Users/subhojit/datasets/amazon_review_polarity_csv'
df = pd.read_csv(dataset_dir + '/train.csv')
df.head()

df_test = pd.read_csv(dataset_dir + '/test.csv')

In [None]:
review = df.iloc[:, 2].to_numpy()
test_review = df_test.iloc[:, 2].to_numpy()

all_review = np.concatenate((review, test_review))
c = sorted(list(set(''.join(all_review))))
chars = c + ['<SOS>', '<EOS>', '<PAD>']
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
vocab_size = len(chars)
# vocab_size, stoi


In [None]:
x = df.iloc[:, 2].to_numpy()
y = df.iloc[:, 0].to_numpy()
n = int(0.9*len(x))
xtrain = x[:n]
ytrain = y[:n]
xval = x[n:]
yval = y[n:]

xtest = df_test.iloc[:, 2].to_numpy()
ytest = df_test.iloc[:, 0].to_numpy()

In [None]:
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[xi] for xi in s]
decode = lambda l: ''.join([itos[li] for li in l])
encode('asdasdsadas')

In [None]:
# if torch.backends.mps.is_available():
#     device = "mps"
# elif torch.cuda.is_available():
#     device = "cuda"
# else:
#     device = "cpu"

device="cpu"

In [None]:
def pad_sequences(sequences):
    pad_index = stoi['<PAD>']
    max_len = np.max([len(s) for s in sequences])
    padded_seq = np.full((len(sequences), max_len), pad_index, dtype=np.int32)
    for i, seq in enumerate(sequences):
        padded_seq[i, :len(seq)] = seq
    return padded_seq


def get_batch(batch_size, split='train'):
    data = xtrain if split == 'train' else xval
    target = ytrain if split == 'train' else yval
    idx = np.random.randint(0, len(data), (batch_size,))
    x_sample = [encode(s) for s in data[idx]]
    y_sample = target[idx]
    xpadded = pad_sequences(x_sample)
    xb, yb = xpadded, y_sample
    yb = torch.from_numpy(yb)
    yb = yb - 1
    # yb = torch.nn.functional.one_hot(yb - 1, num_classes=2)
    xb = torch.from_numpy(xb)
    x = xb.to(device, dtype=torch.long)
    y = yb.to(device, dtype=torch.long)
    return x, y

batch_size = 32

xb, yb = get_batch(batch_size)

In [None]:
def create_mask(padded_batch):
    pad_index = stoi['<PAD>']
    return (padded_batch != pad_index).astype(np.float32)

In [None]:
# training with LSTM with attention
embedding_dim = 32
hidden_size = 64
output_size = 2
batch_size = 64
seq_len = 10
learning_rate = 1e-2
max_iter = 5000
eval_interval = 500

model = LSTMWithAttention(vocab_size, hidden_size, output_size, embedding_dim)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for step in range(max_iter):
    xb, yb = get_batch(batch_size)
    logits = model(xb)
    loss = F.cross_entropy(logits, yb)
    optimizer.zero_grad()
    loss.backward()
    # print("grad norm:", model.fc.weight.grad.norm())
    # print("Input device:", xb.device)
    # print("Output device:", yb.device)
    # print("Model device:", next(model.parameters()).device)
    # print("Logits device:", logits.device)

    print("xb shape: ", xb.shape)  # should be (batch_size, seq_len)
    print("xb dtype: ", xb.dtype)
    print("yb shape: ", yb.shape)  # should be (batch_size,)
    print("yb dtype: ", yb.dtype)  # should be torch.long
    print("logits shape: ", logits.shape)  # should be (batch_size, 2)

    # print("Loss:", loss.item())
    optimizer.step()
    if step % eval_interval == 0:
        print(f"step {step}: train loss {loss:.4f}")
        with torch.no_grad():
            logits = model(xb)
            probs = torch.softmax(logits, dim=1)
            print("Confidence range:", probs.max(dim=1).values[:10])
            preds = torch.argmax(logits, dim=1)
            print("Preds: ", preds.tolist())
            print("Targets: ", yb.tolist())


In [None]:
def get_val_batches():
    iter = len(xval) // batch_size
    for i in range(iter):
        idx = torch.arange(i * batch_size, (i + 1) * batch_size)
        x_sample = [encode(s) for s in xval[idx]]
        y_sample = yval[idx]
        xpadded = pad_sequences(x_sample)
        xb, yb = np.vstack(xpadded), y_sample
        yb = torch.from_numpy(yb)
        yb = torch.nn.functional.one_hot(yb - 1, num_classes=2)
        xb = torch.from_numpy(xb)
        x = xb.to(device)
        y = yb.to(device)
        yield x, y


@torch.no_grad()
def compute_accuracy():
    out = {}
    model.eval()
    correct = 0
    total = 0
    for X, Y in get_val_batches():
        logits = model(X)
        preds = torch.argmax(logits, dim=1)
        correct += (preds == Y).all(dim=1).sum().item()
        total += Y.shape[0]
    acc = (correct / total)
    return acc

val_acc = compute_accuracy()
print(f"Validation Accuracy: {val_acc * 100:.2f}%")


In [15]:
device = torch.device("cpu")
model = model.to(device)
xb, yb = get_batch(4)
xb = xb.to(device)
yb = yb.to(device)

In [16]:
# one batch overfitting
model = LSTMWithAttention(vocab_size, hidden_size, output_size, embedding_dim)

xb, yb = get_batch(64)
# xb, yb = xb.to(device), yb.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

for step in range(1000):
    print("Input:", xb.shape, xb.dtype, xb.device)
    print("Target:", yb.shape, yb.dtype, yb.device)
    logits = model(xb)
    loss = F.cross_entropy(logits, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Step {step}, loss = {loss.item():.4f}")
    if step % 100 == 0:
        print("Logits:", logits[:2])


Input: torch.Size([64, 976]) torch.int64 cpu
Target: torch.Size([64]) torch.int64 cpu
Step 0, loss = 0.6931
Logits: tensor([[-0.0887,  0.0359],
        [-0.0887,  0.0359]], grad_fn=<SliceBackward0>)
Input: torch.Size([64, 976]) torch.int64 cpu
Target: torch.Size([64]) torch.int64 cpu
Step 1, loss = 1.3203
Input: torch.Size([64, 976]) torch.int64 cpu
Target: torch.Size([64]) torch.int64 cpu
Step 2, loss = 0.7389
Input: torch.Size([64, 976]) torch.int64 cpu
Target: torch.Size([64]) torch.int64 cpu
Step 3, loss = 0.7239
Input: torch.Size([64, 976]) torch.int64 cpu
Target: torch.Size([64]) torch.int64 cpu
Step 4, loss = 0.7569
Input: torch.Size([64, 976]) torch.int64 cpu
Target: torch.Size([64]) torch.int64 cpu
Step 5, loss = 0.7332
Input: torch.Size([64, 976]) torch.int64 cpu
Target: torch.Size([64]) torch.int64 cpu
Step 6, loss = 0.7067
Input: torch.Size([64, 976]) torch.int64 cpu
Target: torch.Size([64]) torch.int64 cpu
Step 7, loss = 0.6893
Input: torch.Size([64, 976]) torch.int64 cpu


KeyboardInterrupt: 

In [None]:
xb, yb = get_batch(4)
print("Target classes in batch:", yb.tolist())
assert 0 in yb and 1 in yb, "Need both classes for learning"

In [17]:
for name, param in model.named_parameters():
    print(name, param.device)

embedding.weight cpu
lstm.weight_ih_l0 cpu
lstm.weight_hh_l0 cpu
lstm.bias_ih_l0 cpu
lstm.bias_hh_l0 cpu
fc.weight cpu
fc.bias cpu


In [None]:
class MLPClassifier(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_classes, max_seq_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(emb_dim * max_seq_len, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        x = self.embedding(x)  # (B, T, D)
        return self.net(x)

In [None]:
xb, yb = get_batch(10)
# xb = xb.to("cpu", dtype=torch.long)
# yb = yb.to("cpu", dtype=torch.long)

model = MLPClassifier(vocab_size, embedding_dim, hidden_size, 2, xb.shape[1])
model = model.to("cpu")


optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

for step in range(200):
    logits = model(xb)
    loss = F.cross_entropy(logits, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    preds = torch.argmax(logits, dim=1)
    print(f"Step {step}, loss = {loss.item():.4f}, preds = {preds.tolist()}, targets = {yb.tolist()}")
