In [48]:
import torch
import torch.nn.functional as F
from torch.nn.functional import cross_entropy

In [49]:
# Read the corpus: one name per line. The context manager closes the file
# handle as soon as the contents are read instead of leaving it to the GC.
with open("names.txt", "r") as f:
    words = f.read().splitlines()

In [50]:
# Character <-> integer vocabulary.
# Every distinct character found in the names gets an id starting at 1 (in
# sorted order); id 0 is reserved for '.', the start/end-of-name marker.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}  # inverse lookup: id -> character

In [51]:
# Build Dataset

def build_dataset(words, block_size=3):
    """Turn a list of names into (context, next-character) training pairs.

    For every name, a window of ``block_size`` character ids slides over the
    name followed by the terminating '.'. The window starts as all zeros
    (the id of '.'), and each position yields one example: the current window
    as input and the id of the next character as target.

    Characters are mapped to ids through the module-level ``stoi`` dict.

    Args:
        words: iterable of name strings.
        block_size: number of context characters per example (default 3).

    Returns:
        (X, Y): ``X`` is an int64 tensor of shape (N, block_size) holding the
        contexts, ``Y`` an int64 tensor of shape (N,) holding the targets.
        An empty ``words`` yields shapes (0, block_size) and (0,).

    Raises:
        ValueError: if ``block_size`` is smaller than 1.
        KeyError: if a name contains a character missing from ``stoi``.
    """
    if block_size < 1:
        raise ValueError("block_size must be at least 1")

    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # new list: drop oldest id, append newest

    # Explicit dtype and shape so that an empty split still produces int64
    # tensors of the expected rank instead of a float tensor of shape (0,).
    X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape,Y.shape)
    return X, Y


import random

# Seeded, in-place shuffle of `words`.
# Note: because the shuffle mutates `words`, re-running this cell without
# reloading `words` shuffles an already-shuffled list and changes the split.
random.seed(42)
random.shuffle(words)

# Cut points at 80% and 90% of the shuffled list.
n1, n2 = int(0.8*len(words)), int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])      # first 80% of names: training split
Xdev, Ydev = build_dataset(words[n1:n2])  # next 10% of names: dev split
Xte, Yte = build_dataset(words[n2:])      # last 10% of names: test split

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [52]:
# INITIALIZE NETWORK
# Layer sizes are named once here; the tensor shapes below are derived from them.
VOCAB_SIZE = 27   # rows of the embedding table / number of output logits (presumably len(stoi) - confirm)
EMB_DIM = 20      # length of each character embedding vector
CONTEXT_LEN = 3   # context characters per example (matches the 3-wide X built by build_dataset)
HIDDEN_DIM = 300  # number of hidden-layer neurons

g = torch.Generator().manual_seed(2147483647)

# The draws below consume `g` in this exact order; reordering them changes the initial weights.
C = torch.randn((VOCAB_SIZE, EMB_DIM), generator=g) # lookup table/embedder
W1 = torch.randn((CONTEXT_LEN * EMB_DIM, HIDDEN_DIM), generator=g) # hidden layer: 300 neurons that accept 60 inputs each
b1 = torch.randn(HIDDEN_DIM, generator=g) # hidden-layer bias
W2 = torch.randn((HIDDEN_DIM, VOCAB_SIZE), generator=g) # output layer: 27 neurons that accept 300 inputs each
# NOTE(review): b2 is drawn with torch.rand (uniform on [0, 1)) while every
# other tensor uses torch.randn (standard normal) - confirm this is intended.
b2 = torch.rand(VOCAB_SIZE, generator=g) # output-layer bias

bngain = torch.ones((1, HIDDEN_DIM)) # batch-normalization scale, starts at 1
bnbias = torch.zeros((1, HIDDEN_DIM)) # batch-normalization shift, starts at 0

parameters = [C,W1,b1,W2,b2,bngain,bnbias] # trainable parameters

In [53]:
# Weight Initializing Fixes
# Both tensors are scaled IN PLACE, so this cell is not idempotent: running it
# a second time shrinks W2 and b2 again. Re-run the initialization cell first
# if this cell needs to be executed again.

W2 *= 0.01 # shrink output weights; this fixes initial high loss
b2 *= 0.1 # shrink output bias; this fixes initial high loss

In [54]:
# Switch on autograd tracking for every tensor in `parameters`, so that
# loss.backward() fills in their .grad fields.
for p in parameters:
    p.requires_grad_(True)

In [55]:
# stat: total number of scalar values across all trainable tensors
sum(map(torch.numel, parameters))

27567

In [56]:
for i in range(200001):

    # minibatch: 32 random row indices into the training split.
    # Drawn from the seeded generator `g`; without it the indices come from
    # torch's global RNG, which this notebook (as far as visible) never seeds,
    # so every run would train on different batches.
    ix = torch.randint(0, Xtr.shape[0], (32,), generator=g)

    # FORWARD PASS
    emb = C[Xtr[ix]] # embedding lookup: one vector per context character
    embcat = emb.view(emb.shape[0], -1) # flatten each example's context embeddings into one row (32, 60)

    h_before_act = embcat @ W1 + b1 # passed through HIDDEN layer
    # batch normalization with the statistics of the current minibatch.
    # NOTE(review): subtracting the batch mean cancels b1 exactly, so b1 has
    # no effect on the output during training.
    batch_mean = h_before_act.mean(0, keepdim=True)
    batch_std = h_before_act.std(0, keepdim=True)
    h_before_act = bngain * (h_before_act - batch_mean) / batch_std + bnbias

    h = torch.tanh(h_before_act) # activation function
    logits = h @ W2 + b2 # passed through OUTPUT layer

    # LOSS CALCULATION
    loss = cross_entropy(logits, Ytr[ix])  # softmax over logits followed by negative log-likelihood
    if i % 10000 == 0:
        print(f"Loss at {i}/200000: {loss.item()}")

    # zero_grad: drop gradients of the previous step before backpropagating
    for p in parameters:
        p.grad = None

    # BACKWARD PASS
    loss.backward()

    # OPTIMIZE TRAINABLE PARAMETERS
    # step-wise learning-rate decay: 0.1 -> 0.05 -> 0.01
    if i<50000:
        lr=0.1
    elif i<100000:
        lr=0.05
    else:
        lr=0.01
    # plain SGD step; no_grad keeps the in-place update out of the autograd graph
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

Loss at 0/200000: 3.253647804260254
Loss at 10000/200000: 2.378375768661499
Loss at 20000/200000: 2.441584825515747
Loss at 30000/200000: 2.5523576736450195
Loss at 40000/200000: 2.1537094116210938
Loss at 50000/200000: 2.387075901031494
Loss at 60000/200000: 2.2504422664642334
Loss at 70000/200000: 2.0587825775146484
Loss at 80000/200000: 2.460761547088623
Loss at 90000/200000: 2.0936007499694824
Loss at 100000/200000: 2.0105342864990234
Loss at 110000/200000: 2.0839791297912598
Loss at 120000/200000: 2.019775867462158
Loss at 130000/200000: 1.9104342460632324
Loss at 140000/200000: 2.33648419380188
Loss at 150000/200000: 2.258155584335327
Loss at 160000/200000: 1.954023003578186
Loss at 170000/200000: 1.8468167781829834
Loss at 180000/200000: 1.8236241340637207
Loss at 190000/200000: 2.0131380558013916
Loss at 200000/200000: 1.8545682430267334


In [59]:
# Fixed batch-norm statistics for inference.
# One gradient-free pass over the whole training split gives the per-neuron
# mean and std of the hidden pre-activations; evaluation and name generation
# normalize with these instead of per-batch statistics.

with torch.no_grad():
    emb = C[Xtr]
    embcat = emb.view(-1,60)
    h_before_act = embcat @ W1 + b1

    bnmean = torch.mean(h_before_act, dim=0, keepdim=True)
    bnstd = torch.std(h_before_act, dim=0, keepdim=True)

In [60]:
# Evaluate

@torch.no_grad()
def split_loss(X, Y):
    """Return the mean cross-entropy loss of the trained network on one split.

    Runs the same forward pass as training, except that batch normalization
    uses the stored `bnmean` / `bnstd` instead of per-batch statistics.
    Gradient tracking is disabled, so no autograd graph is built for the
    (potentially very large) split.

    Args:
        X: integer tensor of context ids, one row per example.
        Y: integer tensor of target ids, one entry per row of X.

    Returns:
        The loss as a Python float.
    """
    emb = C[X]
    embcat = emb.view(emb.shape[0], -1)  # flatten each example's context embeddings into one row
    h_before_act = embcat @ W1 + b1
    h_before_act = bngain * (h_before_act - bnmean)/bnstd + bnbias  # batch normalization
    h = torch.tanh(h_before_act)
    logits = h @ W2 + b2
    return cross_entropy(logits, Y).item()


print(f"Train Split Loss:{split_loss(Xtr, Ytr)}")
print(f"Dev Split Loss:{split_loss(Xdev, Ydev)}")

Train Split Loss:2.0957250595092773
Dev Split Loss:2.128443717956543


In [64]:
# Test name generation for this model
block_size=3
def generate_name():
    out = []
    context = [0] * block_size  # start with '...' (0 = '.')

    while True:
        # forward pass
        emb = C[torch.tensor([context])]
        embcat = emb.view(1,-1)
        h_before_act = embcat @ W1 + b1
        h_before_act = bngain * (h_before_act - bnmean)/bnstd + bnbias  # batch normalization
        h = torch.tanh(h_before_act)
        logits = h @ W2 + b2

        probs = F.softmax(logits, dim=1)

        # sample next character
        ix = torch.multinomial(probs, num_samples=1).item()

        # update context
        context = context[1:] + [ix]
        out.append(ix)

        if ix == 0:  # end token '.'
            break
    return ''.join(itos[i] for i in out)

# Draw and print 20 sample names, one per line.
for name in (generate_name() for _ in range(20)):
    print(name)

lilsa.
rhet.
daime.
amire.
bhaea.
tamberrei.
liyadilah.
malli.
arvon.
dam.
meremorial.
alaine.
charyanoum.
lyn.
jahiday.
myri.
hut.
naidi.
shar.
jali.
