In [18]:
import numpy as np

import os, sys

# Add the project source directory to sys.path so the `lstm` package imported
# below can be found (presumably it lives in this directory — confirm).
# The location can be overridden with the SATURN_SRC environment variable;
# when it is unset, the original hard-coded path is used unchanged.
project_root = os.path.abspath(
    os.environ.get('SATURN_SRC', '/Users/subhojit/workspace/saturn/src')
)
if project_root not in sys.path:
    sys.path.append(project_root)

from lstm.lstm_batch import *
import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
def load_names(filename):
    """Read names from a text file and build a character-level vocabulary.

    Parameters
    ----------
    filename : str
        Path to a text file; each line is treated as one name.

    Returns
    -------
    stoi : dict
        Token -> index. The special tokens '<PAD>', '<SOS>', '<EOS>' take
        indices 0, 1, 2; the sorted unique characters of all names follow.
    itos : dict
        Index -> token, the inverse of `stoi`.
    vocab_size : int
        Number of entries in `stoi`.
    names : list of str
        The lines of the file, without line terminators.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(filename, 'r') as fh:
        names = fh.read().splitlines()

    chars = sorted(set(''.join(names)))
    special_chars = ['<PAD>', '<SOS>', '<EOS>']
    all_chars = special_chars + chars
    stoi = {ch: i for i, ch in enumerate(all_chars)}
    itos = dict(enumerate(all_chars))
    vocab_size = len(stoi)
    return stoi, itos, vocab_size, names

# Build the vocabulary from the names file. The path is relative, so it is
# resolved against the notebook's current working directory.
stoi, itos, vocab_size, names = load_names('indian_names.txt')
stoi  # bare expression: the notebook displays the token -> index mapping

{'<PAD>': 0,
 '<SOS>': 1,
 '<EOS>': 2,
 'a': 3,
 'b': 4,
 'c': 5,
 'd': 6,
 'e': 7,
 'f': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'j': 12,
 'k': 13,
 'l': 14,
 'm': 15,
 'n': 16,
 'o': 17,
 'p': 18,
 'q': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'x': 26,
 'y': 27,
 'z': 28}

In [20]:
def name_to_sequence(name):
    """Encode a name as an (input, target) pair of index lists.

    The input is the name's characters prefixed with '<SOS>'; the target is
    the same characters followed by '<EOS>'. Both have equal length, and
    target[t] is the token that comes after input[t]. Tokens are converted
    to indices with the module-level `stoi` mapping.
    """
    with_start = ['<SOS>', *name]
    with_end = [*name, '<EOS>']
    encoded_input = [stoi[token] for token in with_start]
    encoded_target = [stoi[token] for token in with_end]
    return encoded_input, encoded_target

def pad_sequences(sequences):
    """Stack variable-length index sequences into one time-major int32 array.

    The result has shape (longest sequence length, number of sequences):
    column j holds sequences[j] from the top, and the remaining rows of
    that column are filled with the '<PAD>' index from the module-level
    `stoi` mapping.
    """
    fill_value = stoi['<PAD>']
    longest = max(map(len, sequences))
    grid = np.full((longest, len(sequences)), fill_value, dtype=np.int32)
    for column, tokens in enumerate(sequences):
        grid[:len(tokens), column] = tokens
    return grid

# Encode the first ten names as (input, target) index pairs for a quick
# visual check of name_to_sequence.
ws = list(map(name_to_sequence, names[:10]))
ws


[([1, 3, 3, 4, 11, 6], [3, 3, 4, 11, 6, 2]),
 ([1, 3, 3, 4, 11, 6, 3], [3, 3, 4, 11, 6, 3, 2]),
 ([1, 3, 3, 5, 10, 3, 14], [3, 3, 5, 10, 3, 14, 2]),
 ([1, 3, 3, 6, 7, 21, 10], [3, 3, 6, 7, 21, 10, 2]),
 ([1, 3, 3, 6, 11, 14], [3, 3, 6, 11, 14, 2]),
 ([1, 3, 3, 6, 11, 21, 10], [3, 3, 6, 11, 21, 10, 2]),
 ([1, 3, 3, 6, 11, 22, 27, 3], [3, 3, 6, 11, 22, 27, 3, 2]),
 ([1, 3, 3, 7, 16, 3, 4], [3, 3, 7, 16, 3, 4, 2]),
 ([1, 3, 3, 8, 20, 7, 7, 16], [3, 3, 8, 20, 7, 7, 16, 2]),
 ([1, 3, 3, 8, 20, 11, 16], [3, 3, 8, 20, 11, 16, 2])]

In [21]:
# Take only the input half of each sample pair and pad them into one array.
ps = [pair[0] for pair in ws]
a = pad_sequences(ps)
# print([len(a[i]) for i in range(len(a))])


In [22]:
def get_batches(X_data, Y_data, batch_size):
    """Yield padded (inputs, targets) mini-batches in their original order.

    Consecutive windows of `batch_size` items are sliced from X_data and
    Y_data at the same positions, and each window is passed through
    `pad_sequences`. The last batch is smaller when len(X_data) is not a
    multiple of `batch_size`.
    """
    for start in range(0, len(X_data), batch_size):
        window = slice(start, start + batch_size)
        inputs, targets = X_data[window], Y_data[window]
        padded_inputs = pad_sequences(inputs)
        padded_targets = pad_sequences(targets)
        yield padded_inputs, padded_targets



In [23]:
def vectorized_loss_and_gradient_batched(outputs, target_idxs, pad_idx):
    """Softmax cross-entropy loss and its gradient w.r.t. the logits.

    Parameters
    ----------
    outputs : list of (yt, ht, ct)
        One tuple per time step; only yt is used. Each yt holds logits of
        shape (output_size, batch_size).
    target_idxs : integer array, shape (seq_len, batch_size)
        Target class index for every time step and batch column.
    pad_idx : int
        Index of the <PAD> token. Positions whose target equals pad_idx
        contribute neither loss nor gradient.

    Returns
    -------
    total_loss : float
        Sum (not mean) of -log p(target) over all non-pad positions.
    dy_list : list of ndarray
        seq_len arrays of shape (output_size, batch_size). Each is
        softmax(yt) with 1 subtracted at the target class, and is all
        zeros in the columns where the target is pad_idx.
    """
    target_idxs = np.asarray(target_idxs)
    seq_len, batch_size = target_idxs.shape

    # (seq_len, output_size, batch_size); the class axis is axis 1.
    logits = np.stack([yt for (yt, _, _) in outputs], axis=0)[:seq_len]

    # Subtract the per-column max before exponentiating for numerical stability.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    # Coordinates of every non-pad (t, b) position and its target class.
    valid = target_idxs != pad_idx
    t_idx, b_idx = np.nonzero(valid)
    class_idx = target_idxs[t_idx, b_idx]

    # The small epsilon guards against log(0) when a probability underflows.
    target_probs = probs[t_idx, class_idx, b_idx]
    total_loss = float(np.sum(-np.log(target_probs + 1e-12)))

    # Gradient of cross-entropy w.r.t. logits is (probs - one_hot(target));
    # padded columns are zeroed outright so they carry no gradient.
    dy = np.where(valid[:, None, :], probs, probs.dtype.type(0))
    dy[t_idx, class_idx, b_idx] -= 1

    dy_list = [dy[t] for t in range(seq_len)]
    return total_loss, dy_list

In [24]:
# --- Hyperparameters ---
hidden_size = 128
input_size = vocab_size
output_size = vocab_size
batch_size = 32
n_epochs = 1000
lr = 0.1  # learning rate passed to model.update_parameters on every batch

# Take the pad index from the vocabulary rather than repeating a literal 0,
# so the loss mask and the token count stay in sync with load_names.
pad_idx = stoi['<PAD>']

model = LSTMLayerBatch(input_size, hidden_size, output_size)

# One (input, target) index-list pair per name, split into parallel tuples.
X_data, Y_data = zip(*[name_to_sequence(name) for name in names])

# Row i of the identity matrix is the one-hot vector for token i. Build it
# once; indexing it with an integer array returns a fresh copy per batch.
one_hot_table = np.eye(input_size)

for epoch in range(n_epochs):
    total_loss = 0.0
    batch_count = 0

    for xpadded, ypadded in get_batches(X_data, Y_data, batch_size=batch_size):
        seq_len, bsz = xpadded.shape

        one_hot_input = one_hot_table[xpadded]  # (seq_len, batch_size, input_size)

        # Fresh zero hidden and cell state for every batch.
        h0 = np.zeros((hidden_size, bsz))
        c0 = np.zeros((hidden_size, bsz))

        outputs = model.forward(one_hot_input, h0, c0)

        loss, dy_list = vectorized_loss_and_gradient_batched(outputs, ypadded, pad_idx)
        grads, dWy, dby = model.backward(dy_list)

        model.update_parameters(grads, dWy, dby, lr)

        # `loss` is summed over non-pad targets; normalise to a per-token loss.
        valid_tokens = np.sum(ypadded != pad_idx)
        total_loss += loss / valid_tokens
        batch_count += 1

    # Mean over batches of the per-token loss for this epoch.
    avg_loss = total_loss / batch_count
    print(f"Epoch {epoch+1}/{n_epochs}, Avg Loss: {avg_loss:.4f}")



Epoch 1/1000, Avg Loss: 3.5238
Epoch 2/1000, Avg Loss: 2.8721
Epoch 3/1000, Avg Loss: 2.7786
Epoch 4/1000, Avg Loss: 2.7714
Epoch 5/1000, Avg Loss: 2.7374
Epoch 6/1000, Avg Loss: 2.6937
Epoch 7/1000, Avg Loss: 2.6452
Epoch 8/1000, Avg Loss: 2.6147
Epoch 9/1000, Avg Loss: 2.6019
Epoch 10/1000, Avg Loss: 2.5930
Epoch 11/1000, Avg Loss: 2.5628
Epoch 12/1000, Avg Loss: 2.5428
Epoch 13/1000, Avg Loss: 2.5275
Epoch 14/1000, Avg Loss: 2.5044
Epoch 15/1000, Avg Loss: 2.4879
Epoch 16/1000, Avg Loss: 2.4674
Epoch 17/1000, Avg Loss: 2.4437
Epoch 18/1000, Avg Loss: 2.4290
Epoch 19/1000, Avg Loss: 2.4383
Epoch 20/1000, Avg Loss: 2.4208
Epoch 21/1000, Avg Loss: 2.4079
Epoch 22/1000, Avg Loss: 2.3914
Epoch 23/1000, Avg Loss: 2.3779
Epoch 24/1000, Avg Loss: 2.3623
Epoch 25/1000, Avg Loss: 2.3562
Epoch 26/1000, Avg Loss: 2.3585
Epoch 27/1000, Avg Loss: 2.3502
Epoch 28/1000, Avg Loss: 2.3397
Epoch 29/1000, Avg Loss: 2.3288
Epoch 30/1000, Avg Loss: 2.3171
Epoch 31/1000, Avg Loss: 2.3050
Epoch 32/1000, Av

In [29]:
def softmax(x):
    """Softmax along axis 0.

    The maximum along axis 0 is subtracted before exponentiating so that
    large inputs do not overflow; this does not change the result.
    """
    shifted = x - np.max(x, axis=0, keepdims=True)
    exponentials = np.exp(shifted)
    normaliser = np.sum(exponentials, axis=0, keepdims=True)
    return exponentials / normaliser

def sample_name(max_length=20, temperature=1.0):
    """Sample one sequence of token indices from the module-level `model`.

    Starting from '<SOS>' with zero hidden and cell state, each step feeds
    the current token as a one-hot column through
    `model.lstm_batch_cell.forward`, projects the hidden state with
    `model.Wy` and `model.by`, and draws the next token from the
    temperature-scaled softmax. Sampling stops when '<EOS>' is drawn or
    after `max_length` steps.

    Parameters
    ----------
    max_length : int
        Maximum number of sampling steps.
    temperature : float
        Divides the logits before the softmax. Must be positive; values
        below 1 sharpen the distribution and values above 1 flatten it.

    Returns
    -------
    list
        Sampled token indices, excluding '<SOS>' and the terminating '<EOS>'.

    Raises
    ------
    ValueError
        If `temperature` is not positive.
    """
    # Fail early with a clear message instead of dividing by zero and
    # handing NaN probabilities to the sampler.
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    input_size = vocab_size
    eos_idx = stoi['<EOS>']

    h = np.zeros((hidden_size, 1))
    c = np.zeros((hidden_size, 1))

    curr_idx = stoi['<SOS>']
    generated_indices = []

    for _ in range(max_length):
        # One-hot column for the current token, built directly rather than
        # slicing a freshly allocated identity matrix on every step.
        xt = np.zeros((input_size, 1))
        xt[curr_idx, 0] = 1.0

        h, c = model.lstm_batch_cell.forward(xt, h, c)
        yt = model.Wy @ h + model.by
        probs = softmax(yt / temperature)
        curr_idx = np.random.choice(len(probs), p=probs.ravel())

        if curr_idx == eos_idx:
            break
        generated_indices.append(curr_idx)

    return generated_indices

# Draw ten samples at temperature 1.0 and print each one decoded to text.
for _ in range(10):
    a = sample_name(temperature=1.0)
    print(''.join(map(itos.__getitem__, a)))


lyotn
yadho
yukilal
zeba
tulob
yogesh
upendra
zina
hazi
kusumlata


  yt = model.Wy @ h + model.by
  yt = model.Wy @ h + model.by
  yt = model.Wy @ h + model.by
