In [50]:
import torch

In [51]:
# Load the training names, one per line.
# The context manager closes the file handle deterministically instead of
# leaving it open until the garbage collector gets to it.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [52]:
# Vocabulary: every distinct character that occurs in the names, sorted.
chars = sorted(set(''.join(words)))

# Character -> integer index. Real characters take 1..len(chars);
# index 0 is reserved for '.', which is assigned separately below.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Inverse lookup: integer index -> character.
itos = {idx: ch for ch, idx in stoi.items()}

Create the dataset

In [53]:
# Build the bigram training pairs: xs[i] is the index of a character and
# ys[i] is the index of the character that follows it.
xs, ys = [], []

for name in words:
    # Wrap each name in '.' so the first and last transitions are included.
    indices = [stoi[ch] for ch in '.' + name + '.']
    # Pair each position with its successor.
    xs.extend(indices[:-1])
    ys.extend(indices[1:])

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [54]:
# Show the input and target index tensors, one per line.
print(f'{xs=}', f'{ys=}', sep='\n')

xs=tensor([ 0,  5, 13,  ..., 25, 26, 24])
ys=tensor([ 5, 13, 13,  ..., 26, 24,  0])


In [55]:
# xs is one-dimensional, so its first dimension is the number of bigram examples.
print(f'Number of examples {xs.shape[0]}')

Number of examples 228146


Initialise the network

In [56]:
# Weight matrix for a single linear layer: 27 inputs feeding 27 neurons.
# Entries are drawn uniformly from [0, 1) with a fixed seed so the run is
# reproducible; requires_grad=True lets autograd compute gradients for W.

g = torch.Generator()
g.manual_seed(2147483647)
W = torch.rand(27, 27, generator=g, requires_grad=True)

Optimisation loop

In [57]:
import torch.nn.functional as F

In [58]:
# Train the single-layer bigram model with full-batch gradient descent.
# Every iteration uses all of xs/ys, so this is plain (not stochastic)
# gradient descent.

# The inputs never change between iterations, so encode them once up front
# instead of rebuilding a (len(xs), 27) float tensor on every step.
# Example: index 2 -> [0, 0, 1, 0, ..., 0]
xenc = F.one_hot(xs, num_classes=27).float()

# Row indices [0, 1, ..., len(xs)-1], used to pick each example's target probability.
rows = torch.arange(len(xs))

for k in range(100):  # Run the training loop for 100 iterations

    # === FORWARD PASS ===

    # xenc @ W -> (len(xs), 27); each row holds the raw, unnormalized scores.
    logits = xenc @ W

    # Softmax written out by hand: exponentiate, then normalize each row
    # so that it sums to 1.
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)

    # === LOSS ===

    # Mean negative log-likelihood of the correct next character, plus a
    # penalty on the mean squared weight (L2-style regularization).
    loss = -probs[rows, ys].log().mean() + 0.01 * (W ** 2).mean()

    # Print loss at each iteration to monitor training
    print(loss.item())

    # === BACKWARD PASS ===

    # Clear the previous gradient by hand (no optimizer is used), then
    # backpropagate to populate W.grad.
    W.grad = None
    loss.backward()

    # === WEIGHT UPDATE ===

    # Gradient step with learning rate 50. torch.no_grad() makes the in-place
    # update invisible to autograd while keeping its in-place safety checks,
    # which writing through W.data would bypass.
    with torch.no_grad():
        W -= 50 * W.grad


3.3630006313323975
3.1034915447235107
2.9455957412719727
2.848205804824829
2.7813658714294434
2.7336273193359375
2.698446750640869
2.671607494354248
2.65045166015625
2.6333115100860596
2.6191165447235107
2.60715651512146
2.596938371658325
2.5881083011627197
2.5804033279418945
2.5736238956451416
2.5676162242889404
2.562258005142212
2.5574517250061035
2.5531182289123535
2.5491929054260254
2.545621156692505
2.5423591136932373
2.5393683910369873
2.5366172790527344
2.5340778827667236
2.5317282676696777
2.5295469760894775
2.5275180339813232
2.525625228881836
2.5238564014434814
2.5221996307373047
2.5206446647644043
2.519183397293091
2.5178072452545166
2.516509532928467
2.5152838230133057
2.5141243934631348
2.513026237487793
2.5119848251342773
2.5109963417053223
2.510056734085083
2.5091629028320312
2.5083110332489014
2.5074994564056396
2.5067245960235596
2.5059847831726074
2.505277633666992
2.504600763320923
2.503952741622925
2.5033316612243652
2.5027363300323486
2.5021650791168213
2.501616239

Sample generation

In [None]:
import torch
import torch.nn.functional as F

# Sample five names from the trained bigram model.
#
# Sampling works on a detached CPU copy of the weights rather than rebinding
# W itself: the trained parameter stays a leaf tensor (so the training cell
# can still be re-run), and no autograd graph is built while sampling.
W_cpu = W.detach().cpu()

# itos (index -> character) must already be defined, with '.' at index 0.

g = torch.Generator()
g.manual_seed(2147483647)  # For reproducibility

for i in range(5):
    out = []
    ix = 0  # Start token index ('.')
    while True:
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()

        # Forward pass: same hand-written softmax as in training
        logits = xenc @ W_cpu
        counts = logits.exp()
        p = counts / counts.sum(1, keepdim=True)

        # Sample next character from probability distribution
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

        # Convert index to character and add to output
        out.append(itos[ix])

        # Stop if '.' is generated (end of word)
        if ix == 0:
            break

    # Print the generated word (the terminating '.' is included)
    print(''.join(out))


cexze.
momasurailezityha.
konimittain.
llayn.
ka.
