In [1]:
import numpy as np
import pandas as pd

import os, sys

import torch.nn.functional
import torch.nn.functional as F

project_root = os.path.abspath('/Users/subhojit/workspace/saturn/src')
if project_root not in sys.path:
    sys.path.append(project_root)

from lstm_att.lstm_wo_attention import *
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dataset_dir = '/Users/subhojit/datasets/amazon_review_polarity_csv'
df = pd.read_csv(dataset_dir + '/train.csv')
df.head()
df_test = pd.read_csv(dataset_dir + '/test.csv')

In [3]:
df.head()

Unnamed: 0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...


In [4]:
x = df.iloc[:, 2].to_numpy()
y = df.iloc[:, 0].to_numpy() - 1

xtest = df_test.iloc[:, 2].to_numpy()
ytest = df_test.iloc[:, 0].to_numpy() - 1

In [5]:
y[:20]

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0])

In [6]:
n = int(0.9*len(x))
xtrain = x[:n]
ytrain = y[:n]
xval = x[n:]
yval = y[n:]

In [18]:
len(xtrain), len(xval), len(xtest), len(ytest), len(ytrain), len(yval)

(3239999, 360000, 399999, 399999, 3239999, 360000)

In [None]:
all_review = np.concatenate((x, xtest))
chars = sorted(list(set(''.join(all_review))))
stoi = {ch: i + 1 for i, ch in enumerate(chars)}
stoi['<PAD>'] = 0
vocab_size = len(stoi)
encode = lambda s: [stoi[c] for c in s if c in stoi]
stoi

In [9]:
encode('ioewfureihdjbvc')

[90, 96, 86, 104, 87, 102, 99, 86, 90, 89, 85, 91, 83, 103, 84]

In [10]:
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [11]:
def pad_sequences(sequences):
    pad_index = stoi['<PAD>']
    max_len = np.max([len(s) for s in sequences])
    lenghts = torch.zeros(len(sequences), dtype=torch.long)
    padded_seq = torch.zeros(len(sequences), max_len, dtype=torch.long)
    for i, seq in enumerate(sequences):
        padded_seq[i, :len(seq)] = torch.tensor(seq)
        lenghts[i] = len(seq)
    return padded_seq, lenghts


def get_batch(batch_size, split='train'):
    x = xtrain if split == 'train' else xval
    y = ytrain if split == 'train' else yval

    idx = torch.randint(0, len(x), (batch_size,))
    xb = [encode(x[i]) for i in idx]
    yb = [y[i] for i in idx]
    xb, lengths = pad_sequences(xb)
    yb = torch.tensor(yb, dtype=torch.long)
    return xb, yb, lengths

batch_size = 32

xb, yb, lengths = get_batch(batch_size)
lengths

tensor([237, 202, 181, 332, 389, 665, 171, 841, 560, 186, 379, 786, 134, 424,
        250, 942, 326, 180, 793, 873, 179, 136, 822, 755, 521, 221, 454, 451,
        440, 400, 239, 156])

In [21]:
# training with LSTM with attention
embedding_dim = 32
hidden_size = 64
output_size = 2
batch_size = 64
seq_len = 10
learning_rate = 1e-3
max_iter = 5000
eval_interval = 500

model = LSTMWithoutAttention(vocab_size, hidden_size, output_size, embedding_dim)
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for step in range(max_iter):
    xb, yb, lengths = get_batch(batch_size)
    xb, yb, lengths = xb.to(device), yb.to(device), lengths.to(device)

    logits = model(xb, lengths)
    loss = F.cross_entropy(logits, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # print("grad norm:", model.fc.weight.grad.norm())
    # print("Input device:", xb.device)
    # print("Output device:", yb.device)
    # print("Model device:", next(model.parameters()).device)
    # print("Logits device:", logits.device)

    # print("xb shape: ", xb.shape)  # should be (batch_size, seq_len)
    # print("xb dtype: ", xb.dtype)
    # print("yb shape: ", yb.shape)  # should be (batch_size,)
    # print("yb dtype: ", yb.dtype)  # should be torch.long
    # print("logits shape: ", logits.shape)  # should be (batch_size, 2)

    # print("Loss:", loss.item())
    # print(f"step {step}: train loss {loss:.4f}")
    if step % eval_interval == 0:
        print(f"step {step}: train loss {loss:.4f}")
        # with torch.no_grad():
        #     logits = model(xb)
        #     probs = torch.softmax(logits, dim=1)
        #     print("Confidence range:", probs.max(dim=1).values[:10])
        #     preds = torch.argmax(logits, dim=1)
        #     print("Preds: ", preds.tolist())
        #     print("Targets: ", yb.tolist())


step 0: train loss 0.7018
step 500: train loss 0.6696
step 1000: train loss 0.6736
step 1500: train loss 0.6851
step 2000: train loss 0.5855
step 2500: train loss 0.6858
step 3000: train loss 0.6616
step 3500: train loss 0.6443
step 4000: train loss 0.4994
step 4500: train loss 0.4710


In [22]:
def get_validation_batch(batch_size, split='val'):
    x = xtrain if split == 'train' else xval
    y = ytrain if split == 'train' else yval

    iter_size = len(x) // batch_size
    for i in range(iter_size):
        idx = torch.arange(i*batch_size, i*batch_size + batch_size)
        xb = [encode(x[i]) for i in idx]
        yb = [y[i] for i in idx]
        xb, lengths = pad_sequences(xb)
        yb = torch.tensor(yb, dtype=torch.long)
        yield xb, yb, lengths

def compute_accuracy():
    iter_size = len(xval) // batch_size
    correct = 0
    total = 0
    for i in range(iter_size):
        for xb, yb, lengths in get_validation_batch(batch_size):
            xb, yb, lengths = xb.to(device), yb.to(device), lengths.to(device)
            logits = model(xb, lengths)
            preds = torch.argmax(logits, dim=1)
            correct += (preds == yb).sum().item()
            total += len(yb)

    accuracy = (correct / total) * 100
    print(f"Accuracy: {accuracy:.4f}")


val_acc = compute_accuracy()

KeyboardInterrupt: 

In [19]:
# one batch overfitting
embedding_dim = 32
hidden_size = 64
output_size = 2
batch_size = 64
seq_len = 10
learning_rate = 1e-2
max_iter = 5000
eval_interval = 500

model = LSTMWithAttention(vocab_size, hidden_size, output_size, embedding_dim).to(device)

xb, yb, lengths = get_batch(100)
xb, yb, lengths = xb.to(device), yb.to(device), lengths.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

for step in range(100):
    logits = model(xb, lengths)
    loss = F.cross_entropy(logits, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Step {step}, loss = {loss.item():.4f}")
    if step % 100 == 0:
        print("Logits:", logits[:2])


Step 0, loss = 0.6940
Logits: tensor([[ 0.1255, -0.0343],
        [ 0.0820,  0.0009]], device='mps:0', grad_fn=<SliceBackward0>)
Step 1, loss = 0.6577
Step 2, loss = 0.6290
Step 3, loss = 0.5918
Step 4, loss = 0.5488
Step 5, loss = 0.5188
Step 6, loss = 0.4915
Step 7, loss = 0.4208
Step 8, loss = 0.3945
Step 9, loss = 0.3243
Step 10, loss = 0.3015
Step 11, loss = 0.2447
Step 12, loss = 0.2079
Step 13, loss = 0.1518
Step 14, loss = 0.1174
Step 15, loss = 0.0901
Step 16, loss = 0.0710
Step 17, loss = 0.0599
Step 18, loss = 0.0298
Step 19, loss = 0.0604
Step 20, loss = 0.0192
Step 21, loss = 0.0111
Step 22, loss = 0.0089
Step 23, loss = 0.0108
Step 24, loss = 0.0093
Step 25, loss = 0.0085
Step 26, loss = 0.0058
Step 27, loss = 0.0040
Step 28, loss = 0.0031
Step 29, loss = 0.0027
Step 30, loss = 0.0022
Step 31, loss = 0.0017
Step 32, loss = 0.0013
Step 33, loss = 0.0011
Step 34, loss = 0.0010
Step 35, loss = 0.0009
Step 36, loss = 0.0008
Step 37, loss = 0.0007
Step 38, loss = 0.0006
Step 3

In [None]:
xb, yb = get_batch(4)
print("Target classes in batch:", yb.tolist())
assert 0 in yb and 1 in yb, "Need both classes for learning"

In [17]:
for name, param in model.named_parameters():
    print(name, param.device)

embedding.weight cpu
lstm.weight_ih_l0 cpu
lstm.weight_hh_l0 cpu
lstm.bias_ih_l0 cpu
lstm.bias_hh_l0 cpu
fc.weight cpu
fc.bias cpu


In [None]:
class MLPClassifier(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_classes, max_seq_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(emb_dim * max_seq_len, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        x = self.embedding(x)  # (B, T, D)
        return self.net(x)

In [None]:
xb, yb = get_batch(10)
# xb = xb.to("cpu", dtype=torch.long)
# yb = yb.to("cpu", dtype=torch.long)

model = MLPClassifier(vocab_size, embedding_dim, hidden_size, 2, xb.shape[1])
model = model.to("cpu")


optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

for step in range(200):
    logits = model(xb)
    loss = F.cross_entropy(logits, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    preds = torch.argmax(logits, dim=1)
    print(f"Step {step}, loss = {loss.item():.4f}, preds = {preds.tolist()}, targets = {yb.tolist()}")


In [None]:
def get_validation_batch(batch_size, split='val'):
    x = xtrain if split == 'train' else xval
    y = ytrain if split == 'train' else yval

    iter_size = len(x) // batch_size
    for i in range(iter_size):
        idx = torch.arange(i*batch_size, i*batch_size + batch_size)
        xb = [encode(x[i]) for i in idx]
        yb = [y[i] for i in idx]
        xb, lengths = pad_sequences(xb)
        yb = torch.tensor(yb, dtype=torch.long)
        yield xb, yb, lengths

def compute_accuracy():
    model.eval()
    correct = 0
    total = 0
    for xb, yb, lengths in get_validation_batch(batch_size):
        xb, yb, lengths = xb.to(device), yb.to(device), lengths.to(device)
        logits = model(xb, lengths)
        preds = torch.argmax(logits, dim=1)
        correct += (preds == yb).sum().item()
        total += len(yb)
        # print("Preds:   ", preds.tolist())
        # print("Targets: ", yb.tolist())

    accuracy = (correct / total) * 100
    print(f"Accuracy: {accuracy:.4f}")
    model.train()


val_acc = compute_accuracy()