In [1]:
import torch
import torch.nn.functional as F
from torch.nn.functional import cross_entropy

In [2]:
words = open("names.txt", "r").read().splitlines()

In [3]:
# make string to int and reverse mapping
chars = sorted(list(set(''.join(words)))) # gets unique chars (a,b,c...,y,z)
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0  # '.' char used to mark the start/end of a name
itos = {i:s for s,i in stoi.items()} # reverse mapping

In [4]:
# Build Dataset

def build_dataset(words):
    block_size=3
    X,Y = [],[]
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context=context[1:]+[ix]

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape,Y.shape)
    return X, Y


import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words)) # 80%
n2 = int(0.9*len(words)) # 10%

Xtr,Ytr = build_dataset(words[:n1]) # 80% training split
Xdev,Ydev = build_dataset(words[n1:n2]) # 10% dev split
Xte,Yte = build_dataset(words[n2:]) # 10% test split

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [5]:
# INITIALIZE NETWORK
g = torch.Generator().manual_seed(2147483647) # a fixed seed for consistent results across repeated runs

C = torch.randn((27,3), generator=g) # lookup table/embedder
W1 = torch.randn((9,200), generator=g) # Hidden layer of 200 neurons that accept 9 inputs each
b1 = torch.randn(200, generator=g) # bias
W2 = torch.randn((200,27), generator=g) # Output layer of 27 neurons that accept 200 inputs each
b2 = torch.rand(27, generator=g) # bias
parameters = [C,W1,b1,W2,b2] # trainable parameters

In [6]:
# Weight Initializing Fixes

W1 *= 0.01 # this fixes vanishing gradient issue of tanh activation function
b1 *= 0.1 # this fixes vanishing gradient issue of tanh activation function

W2 *= 0.01 # this fixes initial high loss
b2 *= 0.1 # this fixes initial high loss

In [7]:
# enable gradient tracking for the trainable parameters
for p in parameters:
    p.requires_grad=True

In [8]:
# stat: number of trainable parameters
sum(p.nelement() for p in parameters)

7508

In [9]:
for i in range(200000):

    # minibatch
    ix = torch.randint(0,Xtr.shape[0],(32,))

    # FORWARD PASS
    emb = C[Xtr[ix]]
    h = torch.tanh(emb.view(-1,9) @ W1 + b1)  # embeddings passed through HIDDEN layer
    logits = h @ W2 + b2 # passed through OUTPUT layer

    # LOSS CALCULATION
    loss = cross_entropy(logits, Ytr[ix])  # this is equivalent to applying softmax to logits, then doing negative-log-likelihood loss calculation
    #print(loss.item())

    # zero_grad
    for p in parameters:
        p.grad = None

    # BACKWARD PASS
    loss.backward()

    # OPTIMIZE TRAINABLE PARAMETERS
    if i < 50000:
        lr=0.12
    elif i < 120000:
        lr =0.05
    else:
        lr=0.01
    for p in parameters:
        p.data += -lr * p.grad

print(f"Final Loss:{loss.item()}")

Final Loss:2.367966651916504


In [10]:
# Evaluate using dev split
emb = C[Xdev]
h = torch.tanh(emb.view(-1,9) @ W1 + b1)
logits = h @ W2 + b2
loss = cross_entropy(logits,Ydev)
print(f"Loss:{loss.item()}")

Loss:2.226071834564209


In [12]:
# Test name generation for this model at this loss
block_size=3
def generate_name():
    out = []
    context = [0] * block_size  # start with '...' (0 = '.')

    while True:
        # forward pass
        emb = C[torch.tensor([context])]
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2

        probs = F.softmax(logits, dim=1)

        # sample next character
        ix = torch.multinomial(probs, num_samples=1).item()

        # update context
        context = context[1:] + [ix]
        out.append(ix)

        if ix == 0:  # end token '.'
            break
    return ''.join(itos[i] for i in out)

for _ in range(20):
    name = generate_name()
    print(name)

shanylaieka.
mylyn.
kalle.
jaodinie.
sha.
emmuna.
vetterremer.
mick.
dari.
jahor.
avris.
jawen.
anyani.
phan.
neorton.
parobamania.
kamn.
alien.
zaylynn.
wor.


## **Trying to minimize loss even more**

In [64]:
# INITIALIZE NETWORK
g = torch.Generator().manual_seed(2147483647) # a fixed seed for consistent results across repeated runs

C = torch.randn((27,10), generator=g) # lookup table/embedder
W1 = torch.randn((30,300), generator=g)
b1 = torch.randn(300, generator=g) # bias
W2 = torch.randn((300,27), generator=g)
b2 = torch.rand(27, generator=g) # bias
parameters = [C,W1,b1,W2,b2] # trainable parameters

In [65]:
# Weight Initializing Fixes

W1 *= 0.01 # this fixes vanishing gradient issue of tanh activation function
b1 *= 0.1 # this fixes vanishing gradient issue of tanh activation function

W2 *= 0.01 # this fixes initial high loss
b2 *= 0.1 # this fixes initial high loss

In [66]:
# enable gradient tracking for the trainable parameters
for p in parameters:
    p.requires_grad=True
# stat: number of trainable parameters
sum(p.nelement() for p in parameters)

17697

In [67]:
for i in range(400000):

    # minibatch
    ix = torch.randint(0,Xtr.shape[0],(32,))

    # FORWARD PASS
    emb = C[Xtr[ix]]
    h = torch.tanh(emb.view(-1,30) @ W1 + b1)  # embeddings passed through HIDDEN layer
    logits = h @ W2 + b2 # passed through OUTPUT layer

    # LOSS CALCULATION
    loss = cross_entropy(logits, Ytr[ix])  # this is equivalent to applying softmax to logits, then doing negative-log-likelihood loss calculation
    #print(loss.item())

    # zero_grad
    for p in parameters:
        p.grad = None

    # BACKWARD PASS
    loss.backward()

    # OPTIMIZE TRAINABLE PARAMETERS
    if i < 100000:
        lr=0.1
    else:
        lr=0.01
    for p in parameters:
        p.data += -lr * p.grad

print(f"Final Loss:{loss.item()}")

Final Loss:2.163036823272705


In [68]:
# Evaluate using dev split
emb = C[Xdev]
h = torch.tanh(emb.view(-1,30) @ W1 + b1)
logits = h @ W2 + b2
loss = cross_entropy(logits,Ydev)
print(f"Loss:{loss.item()}")

Loss:2.140941619873047


In [69]:
# Test name generation at this loss
block_size=3
def generate_name():
    out = []
    context = [0] * block_size  # start with '...' (0 = '.')

    while True:
        # forward pass
        emb = C[torch.tensor([context])]
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2

        probs = F.softmax(logits, dim=1)

        # sample next character
        ix = torch.multinomial(probs, num_samples=1).item()

        # update context
        context = context[1:] + [ix]
        out.append(ix)

        if ix == 0:  # end token '.'
            break
    return ''.join(itos[i] for i in out)

for _ in range(20):
    name = generate_name()
    print(name)

lin.
keina.
azstasiah.
sherik.
cartre.
zacke.
tar.
cammaki.
keyla.
pran.
del.
aley.
naylee.
dawrai.
iyah.
penor.
jukiel.
aliya.
marrin.
riv.
