In [100]:
import torch

In [101]:
# Read the corpus: one name per line. The context manager closes the file
# handle as soon as the contents are read instead of leaving it to the GC.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [102]:
# Peek at the first five entries to sanity-check the load.
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [103]:
# Build the character vocabulary and the two lookup tables.

# Every distinct character that occurs in any word, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> integer index. Characters are numbered from 1 so that
# index 0 can be assigned to the '.' marker on the next line.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Integer index -> character (inverse of stoi).
itos = {idx: ch for ch, idx in stoi.items()}

In [104]:
block_size = 3  # Number of characters used to predict the next one

def build_dataset(words, context_len=None, vocab=None):
    """Turn a list of words into (context, next-character) index pairs.

    Each word is scanned left to right with '.' appended as the end marker.
    For every character, the preceding `context_len` indices (padded on the
    left with index 0) are stored as one row of X and the character's own
    index as the matching entry of Y.

    Args:
        words: iterable of strings to encode.
        context_len: number of preceding characters per example. Defaults to
            the module-level `block_size` (looked up at call time).
        vocab: mapping from character to integer index; must contain '.'.
            Defaults to the module-level `stoi` (looked up at call time).

    Returns:
        (X, Y): int64 tensors of shape (num_samples, context_len) and
        (num_samples,). An empty `words` yields shapes (0, context_len), (0,).

    Raises:
        ValueError: if `context_len` is smaller than 1.
        KeyError: if a character is missing from `vocab`.
    """
    if context_len is None:
        context_len = block_size
    if vocab is None:
        vocab = stoi
    if context_len < 1:
        raise ValueError(f"context_len must be >= 1, got {context_len}")

    X, Y = [], []  # X: context (input), Y: target (next character)

    for w in words:
        context = [0] * context_len  # Start with all-zero (padding) indices

        for ch in w + '.':  # Append '.' as the end-of-word token
            ix = vocab[ch]
            X.append(context)  # Store current context
            Y.append(ix)       # Store next character
            # Build a new list (not in-place) so rows already in X stay intact
            context = context[1:] + [ix]

    # Explicit dtype and shape keep the result well-formed even when no
    # samples were produced (torch.tensor([]) would be 1-D float32).
    num_samples = len(Y)
    X = torch.tensor(X, dtype=torch.long).reshape(num_samples, context_len)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)  # Shape: (num_samples, context_len), (num_samples,)
    return X, Y

# Shuffle words and split into train/dev/test sets
import random

# Shuffle a copy rather than `words` itself. Shuffling in place made this cell
# non-idempotent: a second run would reshuffle an already-shuffled list and
# silently move words between the train/dev/test splits. On a first run the
# permutation is the same as before (same seed, same input order).
shuffled_words = list(words)
random.seed(42)
random.shuffle(shuffled_words)

n1 = int(0.8 * len(shuffled_words))  # first 80% -> train
n2 = int(0.9 * len(shuffled_words))  # next 10% -> dev, final 10% -> test

Xtr, Ytr = build_dataset(shuffled_words[:n1])
Xdev, Ydev = build_dataset(shuffled_words[n1:n2])
Xte,  Yte  = build_dataset(shuffled_words[n2:])


torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [105]:
# Dedicated RNG with a fixed seed; every parameter below is drawn from it.
g = torch.Generator().manual_seed(2147483647)

# Shapes of the parameter tensors, listed in the order they are sampled:
#   C  (27, 10)  embedding table: one 10-d row per character index
#   W1 (30, 200) hidden-layer weights; 30 = 3 context positions x 10-d embeddings
#   b1 (200,)    hidden-layer bias
#   W2 (200, 27) output-layer weights
#   b2 (27,)     output-layer bias
param_shapes = [(27, 10), (30, 200), (200,), (200, 27), (27,)]

# NOTE(review): torch.rand samples uniformly from [0, 1), so every initial
# value is non-negative — confirm this is intended rather than torch.randn.
C, W1, b1, W2, b2 = (torch.rand(shape, generator=g) for shape in param_shapes)

# All learnable tensors, in one list for the gradient/update loops.
parameters = [C, W1, b1, W2, b2]


In [106]:
# Total number of scalar entries across all parameter tensors.
sum(tensor.numel() for tensor in parameters)

11897

In [107]:
# Opt every parameter tensor into autograd so that a later backward() call
# populates its .grad attribute.
for p in parameters:
    p.requires_grad_(True)

In [108]:
# Learning-rate sweep scaffolding.
# lre: 1000 evenly spaced exponents from -3 to 0.
# lrs: the corresponding rates 10**lre, i.e. from 0.001 up to 1.
lre = torch.linspace(start=-3, end=0, steps=1000)
lrs = 10 ** lre

# Empty logs; presumably meant to collect the learning rate, loss and step of
# each iteration — nothing in the visible cells appends to them (TODO confirm).
lri, lossi, stepi = [], [], []

In [109]:
import torch.nn.functional as F

In [119]:
# Minibatch SGD. The parameters are updated in place, so running this cell
# again continues training from the current values rather than starting over.
for i in range(20000):  # Train for 20k iterations

    # === Minibatch Sampling ===
    # NOTE(review): no `generator=` is passed, so the batch indices come from
    # torch's global RNG and are not tied to any seeded generator.
    ix = torch.randint(0, Xtr.shape[0], (32,))  # Random batch of 32 row indices

    # === Forward Pass ===
    emb = C[Xtr[ix]]                            # (32, context, emb_dim) embedding lookup
    # Flatten the per-position embeddings into one feature vector per example;
    # flatten(1) avoids hard-coding the product context * emb_dim.
    h = torch.tanh(emb.flatten(1) @ W1 + b1)    # hidden activations
    logits = h @ W2 + b2                        # one raw score per output class

    # === Loss Computation ===
    loss = F.cross_entropy(logits, Ytr[ix])     # mean cross-entropy over the batch

    # === Backward Pass & Parameter Update ===
    # Step-decay schedule; with 20000 iterations `i` never reaches 100000, so
    # the rate is 0.1 throughout and the 0.01 branch is not taken here.
    lr = 0.1 if i < 100000 else 0.01

    for p in parameters:
        p.grad = None  # Drop stale gradients so backward() starts fresh

    loss.backward()  # Backpropagation

    # Plain SGD step. Updating in place under no_grad keeps the step out of the
    # autograd graph without going through `.data`, which bypasses autograd's
    # in-place-modification checks.
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad


Calculating the loss on the full training and dev splits

In [120]:
# Loss over the entire training split in a single forward pass.
# This is evaluation only, so run it under no_grad: otherwise autograd records
# a graph (and keeps the intermediate activations) for every row of Xtr.
with torch.no_grad():
    emb = C[Xtr]
    # flatten(1) merges the context and embedding dims without hard-coding 30
    h = torch.tanh(emb.flatten(1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr)
loss

tensor(2.2278, grad_fn=<NllLossBackward0>)

In [121]:
# Loss over the entire dev split in a single forward pass.
# This is evaluation only, so run it under no_grad: otherwise autograd records
# a graph (and keeps the intermediate activations) for every row of Xdev.
with torch.no_grad():
    emb = C[Xdev]
    # flatten(1) merges the context and embedding dims without hard-coding 30
    h = torch.tanh(emb.flatten(1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.2334, grad_fn=<NllLossBackward0>)

In [122]:
# Fresh generator for sampling, seeded differently from the one used at init.
g = torch.Generator().manual_seed(2147483647 + 10)

# Draw 20 samples from the model, one character at a time.
for sample_no in range(20):
    sampled = []                  # indices drawn so far, including the final 0
    context = [0] * block_size    # begin from an all-zero context window
    ix = None

    # Keep drawing until index 0 (the '.' end marker) comes up.
    while ix != 0:
        # Forward pass for the single current context
        emb = C[torch.tensor([context])]             # (1, block_size, emb_dim)
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)    # hidden activations, one row
        logits = h @ W2 + b2                         # raw scores, one row
        probs = F.softmax(logits, dim=1)             # normalise scores to probabilities

        # Draw the next index from that distribution
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()

        context = context[1:] + [ix]  # drop the oldest index, append the new one
        sampled.append(ix)

    # Decode the indices back to characters and print the result
    print(''.join(itos[idx] for idx in sampled))


carmah.
amelle.
khi.
mrix.
taty.
salana.
ejmahnil.
den.
art.
kaqui.
nellara.
chaiiv.
kaleig.
dali.
poin.
quint.
salina.
liveni.
wate.
paijarysi.
