In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt 
%matplotlib inline

In [3]:
# Read the names, one per line. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
# Build the character vocabulary and the two lookup tables:
#   stoi: character -> integer index, itos: integer index -> character.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))  # letters are numbered from 1
stoi['.'] = 0  # index 0 is reserved for the '.' marker
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [82]:
# Build a small demo dataset from the first five names, printing every
# (context --> next character) pair so the sliding window can be inspected.

block_size = 3  # context length: how many previous characters predict the next one
X, Y = [], []
for w in words[:5]:
    print(w)
    context = [0] * block_size  # start from an all-'.' context (index 0)
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[idx] for idx in context), '-->', itos[ix])
        context = [*context[1:], ix]  # drop the oldest character, append the new one

X, Y = torch.tensor(X), torch.tensor(Y)

emma
... --> e
..e --> m
.em --> m
emm --> a
mma --> .
olivia
... --> o
..o --> l
.ol --> i
oli --> v
liv --> i
ivi --> a
via --> .
ava
... --> a
..a --> v
.av --> a
ava --> .
isabella
... --> i
..i --> s
.is --> a
isa --> b
sab --> e
abe --> l
bel --> l
ell --> a
lla --> .
sophia
... --> s
..s --> o
.so --> p
sop --> h
oph --> i
phi --> a
hia --> .


In [9]:
# Character embedding lookup table: 27 rows (one per vocabulary entry), 2 dimensions each.
C = torch.randn(27, 2)

In [10]:
# Inspect the randomly initialised embedding table.
C

tensor([[ 0.2056, -0.3717],
        [ 0.2509, -0.5777],
        [ 1.7571,  0.1882],
        [-0.3502, -1.2041],
        [-0.0747,  0.2706],
        [-0.5841,  0.9839],
        [ 0.5044, -0.7106],
        [ 1.1911, -1.4571],
        [-1.4045,  0.0613],
        [ 0.4755,  0.6870],
        [-1.0593, -0.7679],
        [ 0.4622, -0.0789],
        [ 0.3041,  0.3301],
        [-0.0393, -1.8723],
        [ 0.1036,  0.3854],
        [ 0.6361,  0.0780],
        [-1.1814, -1.1659],
        [-2.2120,  0.9367],
        [-0.0871,  0.0827],
        [-0.5848, -0.0546],
        [ 1.4341,  1.0887],
        [-0.1905, -0.4669],
        [ 0.0708,  0.4016],
        [-0.5722, -0.4974],
        [-0.7484, -0.3846],
        [-0.1357, -0.5620],
        [ 0.3632,  0.6552]])

In [22]:
# Index the lookup table with the whole context tensor at once: every integer
# in X is replaced by its row of C, which adds a trailing embedding dimension.
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [23]:
# Hidden layer: 6 inputs (3 context characters x 2 embedding dims) -> 100 units.
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))

In [33]:
# Flatten each example's per-character embeddings into one row vector,
# then apply the affine hidden layer followed by tanh.
h = (emb.view(len(emb), -1) @ W1 + b1).tanh()
h.shape

torch.Size([32, 100])

In [34]:
# Output layer: 100 hidden units -> 27 logits (one per vocabulary entry).
W2 = torch.randn(100, 27)
b2 = torch.randn((27,))

In [35]:
# Project the hidden activations onto one unnormalised score (logit) per character.
logits = torch.matmul(h, W2) + b2
logits.shape

torch.Size([32, 27])

In [43]:
# Softmax written out by hand: exponentiate the logits into positive "counts",
# then normalise every row so it sums to 1.
counts = torch.exp(logits)
prob = counts / counts.sum(dim=1, keepdim=True)
prob.shape

torch.Size([32, 27])

In [45]:
# Mean negative log-likelihood of the correct next character.
# For each row, pick the probability assigned to the target index in Y.
# The row indices come from the number of targets rather than a hard-coded 32,
# so the cell keeps working when the dataset size changes.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(16.1918)

Make the code more readable: rebuild the dataset from all names and train in a loop.

In [95]:
# Build the full training set: for every name, emit one (context, next character)
# pair per character, including the terminating '.'.

block_size = 3  # context length: how many previous characters predict the next one
X, Y = [], []
for w in words:
    context = [0] * block_size  # start from an all-'.' context (index 0)
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = [*context[1:], ix]  # drop the oldest character, append the new one

X, Y = torch.tensor(X), torch.tensor(Y)

In [96]:
# Sanity-check the dataset size: one context row in X per target in Y.
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [97]:
# Re-create every parameter from a single seeded generator so that the
# initialisation is reproducible. The draw order (C, W1, b1, W2, b2) matters.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)  # character embedding lookup table
# hidden layer: 6 flattened embedding inputs -> 100 units
W1 = torch.randn(6, 100, generator=g)
b1 = torch.randn((100,), generator=g)
# output layer: 100 units -> 27 logits
W2 = torch.randn(100, 27, generator=g)
b2 = torch.randn((27,), generator=g)
parameters = [C, W1, b1, W2, b2]

In [98]:
# Total number of scalar parameters in the model.
sum(param.numel() for param in parameters)

3481

In [99]:
# Turn on gradient tracking for every parameter so autograd records operations on them.
for p in parameters:
    p.requires_grad_(True)

In [103]:
# Mini-batch gradient descent: each step samples a random batch, computes the
# cross-entropy loss, backpropagates, and takes one plain SGD step.
batch_size = 32      # examples per mini-batch
learning_rate = 0.1  # SGD step size
for _ in range(1000):

    # minibatch construct: draw from the seeded generator `g` so that a
    # fresh Restart & Run All reproduces the same sequence of batches
    ix = torch.randint(0, X.shape[0], (batch_size,), generator=g)

    # forward pass
    emb = C[X[ix]]
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])
    print(loss.item())

    # backward pass: clear stale gradients before accumulating new ones
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: modify the parameters in place without recording the step in
    # the autograd graph (preferred over mutating `.data` directly)
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

9.707976341247559
8.275300025939941
12.244950294494629
9.913008689880371
8.846600532531738
7.748114109039307
7.737802505493164
8.315214157104492
7.196785926818848
9.214018821716309
7.7321858406066895
5.676565170288086
7.858301162719727
8.361080169677734
8.027894973754883
7.888375759124756
8.358184814453125
6.989504814147949
7.113862991333008
8.1220121383667
7.626765251159668
5.863724708557129
7.098045349121094
6.872199058532715
6.226612091064453
5.081862926483154
6.713212966918945
5.090653896331787
5.19752311706543
6.267191410064697
6.272281646728516
8.501923561096191
6.034015655517578
5.401566982269287
7.083884239196777
6.753343105316162
6.412261962890625
6.220894813537598
5.669103145599365
4.956131935119629
6.192286491394043
7.204591274261475
5.20229959487915
5.825509071350098
5.3085432052612305
3.999289035797119
4.100795745849609
6.053722381591797
5.523560047149658
4.6677680015563965
4.565479755401611
4.274100303649902
4.472574710845947
5.602157115936279
4.678188323974609
4.48253154