In [None]:
import numpy as np

import os, sys

# Put the project's source directory on sys.path so the `lstm` package
# imported below can be resolved.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not import `lstm` on another machine unless this is adjusted.
project_root = os.path.abspath('/Users/subhojit/workspace/saturn/src')
# Only append once so re-running the cell does not grow sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from lstm.lstm_batch_emb import *
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def load_names(filename):
    """Read a file of names and build the character-level vocabulary.

    Args:
        filename: Path to a text file; each line is treated as one name.

    Returns:
        A tuple ``(stoi, itos, vocab_size, names)`` where
        ``stoi`` maps token -> integer id, ``itos`` maps id -> token,
        ``vocab_size`` is the number of tokens, and ``names`` is the list of
        lines read from the file. The special tokens ``<PAD>``, ``<SOS>`` and
        ``<EOS>`` always take ids 0, 1 and 2; the characters found in the
        file follow in sorted order.
    """
    # Context manager guarantees the handle is closed, even if reading fails.
    with open(filename, 'r') as handle:
        names = handle.read().splitlines()

    chars = sorted(set(''.join(names)))
    special_chars = ['<PAD>', '<SOS>', '<EOS>']
    all_chars = special_chars + chars
    stoi = {ch: i for i, ch in enumerate(all_chars)}
    itos = dict(enumerate(all_chars))
    vocab_size = len(stoi)
    return stoi, itos, vocab_size, names

# Build the vocabulary and name list used by every later cell. The path is
# relative, so it is resolved against the notebook's working directory.
stoi, itos, vocab_size, names = load_names('indian_names.txt')
# Bare expression: displays the token -> id mapping as the cell output.
stoi

In [None]:
def name_to_sequence(name):
    """Encode one name as (input ids, target ids) for next-character prediction.

    The input is the name prefixed with ``<SOS>`` and the target is the name
    followed by ``<EOS>``, so ``target[t]`` is the token that follows
    ``input[t]``. Ids come from the module-level ``stoi``; a character that is
    not in ``stoi`` raises ``KeyError``.
    """
    input_idx = [stoi[token] for token in ['<SOS>', *name]]
    output_idx = [stoi[token] for token in [*name, '<EOS>']]
    return input_idx, output_idx

def pad_sequences(sequences):
    """Stack variable-length id sequences into one time-major padded array.

    Returns an ``int32`` array of shape ``(max_len, len(sequences))``: column
    ``j`` holds ``sequences[j]`` followed by the ``<PAD>`` id (looked up in the
    module-level ``stoi``) down to ``max_len`` rows.
    """
    fill_value = stoi['<PAD>']
    lengths = [len(seq) for seq in sequences]
    longest = max(lengths)
    padded = np.full((longest, len(sequences)), fill_value, dtype=np.int32)
    # One column per sequence; rows past the sequence's length keep the pad id.
    for col, (seq, n_tokens) in enumerate(zip(sequences, lengths)):
        padded[:n_tokens, col] = seq
    return padded

# Encode the first ten names as (input ids, target ids) pairs so the encoding
# can be inspected; the bare `ws` below displays them as the cell output.
ws = [name_to_sequence(name) for name in names[:10]]
ws

In [None]:
def get_batches(X_data, Y_data, batch_size):
    """Yield padded ``(inputs, targets)`` mini-batches in dataset order.

    ``X_data`` and ``Y_data`` are walked in lock-step in windows of
    ``batch_size`` items (the last window may be shorter). Each window is
    padded independently with ``pad_sequences`` before being yielded.
    """
    for start in range(0, len(X_data), batch_size):
        window = slice(start, start + batch_size)
        inputs, targets = X_data[window], Y_data[window]
        yield pad_sequences(inputs), pad_sequences(targets)

In [None]:
def vectorized_loss_and_gradient_batched(outputs, target_idxs, pad_idx):
    """Masked softmax cross-entropy loss and its gradient w.r.t. the logits.

    Args:
        outputs: One ``(yt, ht, ct)`` tuple per time step. Only ``yt`` is
            used: the logits, shape ``(output_size, batch_size)``.
        target_idxs: Integer array of shape ``(seq_len, batch_size)`` holding
            the target token id for every position.
        pad_idx: Id of the ``<PAD>`` token. Positions whose target equals
            this id contribute neither loss nor gradient.

    Returns:
        ``(total_loss, dy_list)``. ``total_loss`` is the negative
        log-likelihood *summed* (not averaged) over all non-pad positions.
        ``dy_list`` has ``seq_len`` arrays of shape
        ``(output_size, batch_size)``: ``softmax(yt) - one_hot(target)`` for
        real positions and all-zero columns for padded ones.
    """
    target_idxs = np.asarray(target_idxs)
    seq_len, batch_size = target_idxs.shape

    # (seq_len, output_size, batch_size); any steps beyond seq_len are ignored.
    Y_logits = np.stack([yt for (yt, _, _) in outputs], axis=0)[:seq_len]

    # Numerically stable softmax over the class axis.
    Y_logits_shifted = Y_logits - np.max(Y_logits, axis=1, keepdims=True)
    exp_scores = np.exp(Y_logits_shifted)
    Y_probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    # Coordinates (t, b) of every real (non-pad) target and its class id.
    valid = target_idxs != pad_idx  # (seq_len, batch_size)
    t_idx, b_idx = np.nonzero(valid)
    target_cls = target_idxs[t_idx, b_idx]

    # The epsilon guards log(0) when a target probability underflows.
    total_loss = np.sum(-np.log(Y_probs[t_idx, target_cls, b_idx] + 1e-12))

    # Gradient: probabilities with padded columns zeroed, minus the one-hot
    # target. Every (t, b) pair is unique, so the fancy-indexed in-place
    # subtraction touches each target entry exactly once.
    dY = np.where(valid[:, None, :], Y_probs, 0.0)
    dY[t_idx, target_cls, b_idx] -= 1.0

    dy_list = [dY[t] for t in range(seq_len)]
    return total_loss, dy_list

In [None]:
# --- Hyperparameters -------------------------------------------------------
hidden_size = 128
input_size = vocab_size
output_size = vocab_size
batch_size = 32
embedding_dim = 128
lr = 0.1
n_epochs = 1000
# Use the same id that pad_sequences fills with, so masking in the loss and
# the token count below cannot drift from the vocabulary.
pad_idx = stoi['<PAD>']

model = LSTMLayerBatchEmb(input_size, hidden_size, output_size, embedding_dim=embedding_dim)

# Encode every name once; X_data / Y_data are tuples of index lists.
X_data, Y_data = zip(*[name_to_sequence(name) for name in names])

for epoch in range(n_epochs):
    total_loss = 0.0
    batch_count = 0

    for xpadded, ypadded in get_batches(X_data, Y_data, batch_size=batch_size):
        # xpadded / ypadded: (seq_len, batch_size)
        seq_len, bsz = xpadded.shape
        # Fresh zero state for every batch.
        h0 = np.zeros((hidden_size, bsz))
        c0 = np.zeros((hidden_size, bsz))

        outputs = model.forward(xpadded, h0, c0)
        loss, dy_list = vectorized_loss_and_gradient_batched(outputs, ypadded, pad_idx)
        grads, dWy, dby, dembedding = model.backward(dy_list)
        model.update_parameters(grads, dWy, dby, dembedding, lr)

        # Accumulate only the per-token loss. `loss` is a sum over all real
        # tokens in the batch, so adding it directly as well would
        # double-count and mix two different scales in the reported average.
        valid_tokens = np.sum(ypadded != pad_idx)
        total_loss += loss / valid_tokens
        batch_count += 1

    # Mean of the per-batch, per-token losses for this epoch.
    avg_loss = total_loss / batch_count
    print(f"Epoch {epoch+1}/{n_epochs}, Avg Loss: {avg_loss:.4f}")


In [24]:
import numpy as np

def softmax(x, axis=0):
    """Softmax of ``x`` along ``axis``.

    The per-axis maximum is subtracted before exponentiating so large inputs
    do not overflow; the output has the same shape as ``x``.
    """
    peak = np.max(x, axis=axis, keepdims=True)
    weights = np.exp(x - peak)
    normaliser = np.sum(weights, axis=axis, keepdims=True)
    return weights / normaliser

def sample(start_char='<SOS>', max_length=20, temperature=1.0):
    """Generate one name by sampling the model one character at a time.

    Uses the module-level ``model``, ``stoi`` and ``itos``. As a side effect
    ``model.lstm_batch_cell.cache`` is reset to ``None``.

    Args:
        start_char: Token fed to the model at the first step. It is *not*
            included in the returned string. A token missing from ``stoi``
            falls back to ``<PAD>``.
        max_length: Maximum number of characters to generate.
        temperature: Positive scaling applied to the logits before softmax;
            values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The generated characters joined into a string. Generation stops early
        when ``<EOS>`` is sampled; ``<EOS>`` itself is not included.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    model.lstm_batch_cell.cache = None  # Clear cache

    # Initialize hidden and cell state
    h = np.zeros((model.hidden_size, 1))
    c = np.zeros((model.hidden_size, 1))

    # <PAD> and <SOS> are never valid next characters: they never appear as
    # training targets, and sampling one would splice the literal text
    # "<PAD>" / "<SOS>" into the generated name.
    blocked = [stoi[token] for token in ('<PAD>', '<SOS>') if token in stoi]

    current_char = start_char
    result = []

    for _ in range(max_length):
        idx = stoi.get(current_char, stoi["<PAD>"])  # fallback to <PAD>
        x = model.embedding[idx].reshape(-1, 1)  # (embedding_dim, 1)

        # Forward step
        h, c = model.lstm_batch_cell.forward(x, h, c)
        logits = (model.Wy @ h + model.by) / temperature  # (vocab_size, 1)

        # -inf logits get exactly zero probability after softmax.
        logits[blocked] = -np.inf
        probs = softmax(logits, axis=0)

        next_idx = np.random.choice(len(probs), p=probs.ravel())
        current_char = itos[next_idx]
        if current_char == '<EOS>':
            break
        result.append(current_char)

    return ''.join(result)


In [27]:
# Draw ten samples and report whether each one is absent from the training names.
for _ in range(10):
    n = sample()
    is_new = n not in names
    print(f"name: {n}, is_new: {is_new}")

name: zenab, is_new: False
name: mustikari, is_new: True
name: yourenka, is_new: True
name: yusub, is_new: True
name: juwa, is_new: True
name: yuvinderjp, is_new: True
name: zunh, is_new: True
name: tundiram, is_new: False
name: tuskh, is_new: True
name: wazid, is_new: False


  y = model.Wy @ h + model.by  # (vocab_size, 1)
  y = model.Wy @ h + model.by  # (vocab_size, 1)
  y = model.Wy @ h + model.by  # (vocab_size, 1)
