In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Read the training corpus: one name per line.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving an open handle for the garbage collector to clean up.
with open("names.txt", "r") as names_file:
    words = names_file.read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [5]:
len(words)  # total number of names in the corpus

32033

In [6]:
# Character vocabulary plus the two lookup tables (char -> id and id -> char).
# Every letter of the alphabet occurs at least once in the dataset, so `chars` is simply the sorted alphabet.
chars = sorted(set("".join(words)))
# Letters take the ids 1..len(chars) in alphabetical order ...
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
# ... and id 0 is given to "." (inserted last, so it is also displayed last).
stoi["."] = 0
# Inverse mapping, built in the same insertion order as stoi.
itos = {idx: ch for ch, idx in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [263]:
# Build the (context -> next character) training examples.

block_size = 3  # context length: how many preceding characters are used to predict the next one
X, Y = [], []
for w in words[:5]:  # only the first five words for now
    print("=======================")
    print(w)
    context = [0] * block_size  # the window starts as all padding (id 0, i.e. ".")
    for ch in w + ".":  # the trailing "." is the target that follows the last letter
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # Uncomment to watch the rolling window at work:
        # print("".join(itos[i] for i in context), "--->", itos[ix])
        context = [*context[1:], ix]  # slide the window: drop the oldest id, append the newest

# Tensors are much easier to work with than nested lists.
X, Y = torch.tensor(X), torch.tensor(Y)

In [1]:
# Now lets look at X and Y.
# NOTE(review): this cell needs the "build the dataset" cell above to have been run first.
# Its execution count (In [1]) and the NameError recorded below come from running it on a
# fresh kernel; re-run the notebook top to bottom to get the real output.
print(X, X.shape, Y, Y.shape)

NameError: name 'X' is not defined

In [267]:
'''
Each 3-sized example we got from the first five words yielded this tensor X with
a numerical 'translation' of each character in the example. And Y gives us what
each example is being mapped to.
'''

"\nEach 3-sized example we got from the first five words yielded this tensor X with\na numerical 'translation' of each character in the example. And Y gives us what\neach example is being mapped to.\n"

In [269]:
# Embedding table: 27 rows (one per character id), each a randomly initialised point in R^2.
C = torch.randn((27, 2))
# Now, we essentially have an embedding C: Alphabet --> R^2 that lets us take a char
# and place it in some space. With this space, we can see how certain characters
# might be "closer" to others in that space, which could indicate some sort of semantic
# "closeness". However, this may or may not be a black box, i.e. we don't really know
# how or why certain characters might be closer to each other than others.
# (That is only the author's intuition, not an established result.)
C, C.shape

(tensor([[ 0.6767, -1.4600],
         [ 2.1740, -1.8246],
         [-0.7156,  0.5481],
         [-0.7978,  0.9911],
         [ 0.4465,  0.4863],
         [ 0.4351,  0.6104],
         [ 1.0215,  0.3487],
         [ 1.0514, -0.4481],
         [-0.5221, -0.4600],
         [-1.0644, -0.4811],
         [ 2.0537, -0.1411],
         [ 1.1017,  0.2575],
         [ 0.2567,  0.6146],
         [-0.9069, -1.9906],
         [ 0.0154,  1.9454],
         [-1.6891,  0.3033],
         [-0.2752, -0.0514],
         [ 0.2357, -1.7690],
         [-0.0562,  1.3208],
         [ 0.3151, -0.2876],
         [ 0.8195,  0.8923],
         [-0.0743,  0.4049],
         [-0.2207,  0.9640],
         [ 0.0672,  0.6663],
         [ 0.1731,  0.8541],
         [-0.0519, -0.8815],
         [-0.7921,  1.2556]]),
 torch.Size([27, 2]))

In [271]:
'''
Now, we could potentially just use C and index into it as a lookup table, i.e.
C[5] = tensor([0.4351, 0.6104]) for the C above, and so the character "e", the fifth in the alphabet,
would be embedded as that vector in R^2. However, there's a more interesting way to
do this which could change the way we consider a layer in our NN, and that is by
using one-hot encoding.
'''

'\nNow, we could potentially just use C and index into it as a lookup table, i.e.\nC[5] = tensor([0.4351, 0.6104]) for the C above, and so the character "e", the fifth in the alphabet,\nwould be embedded as that vector in R^2. However, there\'s a more interesting way to\ndo this which could change the way we consider a layer in our NN, and that is by\nusing one-hot encoding.\n'

In [273]:
# Multiplying the one-hot row vector for index 5 by C selects row 5 of C,
# so the comparison with C[5] is True in both coordinates.
F.one_hot(torch.tensor(5), num_classes=27).float() @ C == C[5]

tensor([True, True])

In [275]:
# effectively, because of how matrix multiplication works, these are the same.

In [277]:
# In pytorch, you can index into tensors with tensors: C[X] looks up a row of C for
# every integer id stored in X, so each id in X is replaced by its 2-number embedding.
C[X], C[X].shape

(tensor([[[ 0.6767, -1.4600],
          [ 0.6767, -1.4600],
          [ 0.6767, -1.4600]],
 
         [[ 0.6767, -1.4600],
          [ 0.6767, -1.4600],
          [ 0.4351,  0.6104]],
 
         [[ 0.6767, -1.4600],
          [ 0.4351,  0.6104],
          [-0.9069, -1.9906]],
 
         [[ 0.4351,  0.6104],
          [-0.9069, -1.9906],
          [-0.9069, -1.9906]],
 
         [[-0.9069, -1.9906],
          [-0.9069, -1.9906],
          [ 2.1740, -1.8246]],
 
         [[ 0.6767, -1.4600],
          [ 0.6767, -1.4600],
          [ 0.6767, -1.4600]],
 
         [[ 0.6767, -1.4600],
          [ 0.6767, -1.4600],
          [-1.6891,  0.3033]],
 
         [[ 0.6767, -1.4600],
          [-1.6891,  0.3033],
          [ 0.2567,  0.6146]],
 
         [[-1.6891,  0.3033],
          [ 0.2567,  0.6146],
          [-1.0644, -0.4811]],
 
         [[ 0.2567,  0.6146],
          [-1.0644, -0.4811],
          [-0.2207,  0.9640]],
 
         [[-1.0644, -0.4811],
          [-0.2207,  0.9640],
          

In [279]:
'''
This essentially yielded a 3 dimensional tensor, where in the first dimension,
we have 32 rows, one for each example, in the second dimension, we have 3 characters
from a given example, and in the third dimension, we have 2 numbers representing
the "location" in R^2 of a given character. So, effectively, our embedding is C[X].
'''

'\nThis essentially yielded a 3 dimensional tensor, where in the first dimension,\nwe have 32 rows, one for each example, in the second dimension, we have 3 characters\nfrom a given example, and in the third dimension, we have 2 numbers representing\nthe "location" in R^2 of a given character. So, effectively, our embedding is C[X].\n'

In [281]:
emb = C[X]  # embedded contexts: one 2-number vector per character of every example
emb.shape

torch.Size([32, 3, 2])

In [283]:
# Parameters of the first (hidden) layer.
# 6 input rows to match the flattened embedding width used with W1; 100 columns because
# we chose, as a design decision, to have 100 neurons in this layer.
W1 = torch.randn(6, 100)
b1 = torch.randn(W1.shape[1])  # one bias per hidden neuron

In [285]:
# Flatten each example's 3 x 2 block of embeddings into one row of 6 numbers, so that its width matches the 6 rows of W1 and we can compute embedding @ W1.
h = emb.view(-1,6) @ W1 + b1 # -1 lets torch infer that dimension (here emb.shape[0] == 32). This avoids hardcoding the number of examples, which is 32 only because we are looking at 5 words right now.
h, h.shape, h.dtype
# emb.view(-1,6) @ W1 is a 32 x 100 matrix and b1 is a vector with 100 elements. PyTorch broadcasting treats b1 as a 1 x 100 row and adds it to each of the 32 rows, which is exactly the per-neuron bias we want:
# 32, 100
#  1, 100

(tensor([[ 0.3738,  0.3254,  0.6489,  ..., -8.0834,  7.0093, -1.4243],
         [ 0.1343,  2.8214, -0.8319,  ..., -6.1792,  4.1403, -1.7716],
         [ 1.6309, -5.8741,  2.4456,  ..., -3.3116,  5.0700,  1.4534],
         ...,
         [-1.5120, -0.4351, -0.4935,  ..., -0.9953,  0.5370,  2.7139],
         [ 0.4034, -1.9915, -0.2281,  ...,  0.1962, -0.2482,  3.7829],
         [ 0.3638,  4.3168,  4.0955,  ..., -6.8554,  4.4774, -2.6569]]),
 torch.Size([32, 100]),
 torch.float32)

In [287]:
# Let's also apply non-linearity in our layer before passing it on.
# NOTE: this overwrites h with tanh(h), so re-running only this cell applies tanh a second
# time. Re-run the previous cell first to restore the pre-activation values.
h = torch.tanh(h)
h # now we have nice values between -1 and 1.

tensor([[ 0.3573,  0.3144,  0.5709,  ..., -1.0000,  1.0000, -0.8905],
        [ 0.1335,  0.9929, -0.6815,  ..., -1.0000,  0.9995, -0.9438],
        [ 0.9262, -1.0000,  0.9851,  ..., -0.9973,  0.9999,  0.8964],
        ...,
        [-0.9073, -0.4096, -0.4570,  ..., -0.7596,  0.4907,  0.9913],
        [ 0.3829, -0.9634, -0.2242,  ...,  0.1937, -0.2432,  0.9990],
        [ 0.3485,  0.9996,  0.9994,  ..., -1.0000,  0.9997, -0.9902]])

In [289]:
h.shape  # still one row of 100 activations per example

torch.Size([32, 100])

In [291]:
# Parameters of the next (final) layer:
# 100 inputs (the hidden neurons) and 27 outputs.
W2 = torch.randn(100, 27)
b2 = torch.randn(W2.shape[1])  # one bias per output neuron

In [293]:
logits = h @ W2 + b2 # the inner dimensions match (100 columns of h, 100 rows of W2), and b2 (27 elements) is broadcast across the rows
# logits are the unnormalised scores we're about to pass into softmax.

In [295]:
logits.shape  # one row of 27 scores per example

torch.Size([32, 27])

In [297]:
# Softmax written out by hand: exponentiate the logits, then divide every row by its own sum.
counts = logits.exp()
row_totals = counts.sum(dim=1, keepdim=True)  # kept as a column so the division broadcasts per row
prob = counts / row_totals
prob, prob.shape, prob[0].sum()
# the probabilities in a row add up to 1, which completes the softmax.

(tensor([[2.2954e-07, 2.3288e-09, 2.4666e-09, 3.5603e-04, 3.7209e-10, 6.0136e-05,
          3.5556e-04, 8.6100e-03, 1.0385e-10, 6.6911e-06, 8.9132e-07, 1.3650e-15,
          2.8066e-02, 3.6939e-06, 7.0524e-07, 3.0179e-16, 2.9954e-12, 1.9045e-05,
          6.8319e-07, 3.9826e-01, 5.7023e-19, 1.3491e-02, 4.5589e-08, 5.5076e-01,
          8.9111e-09, 5.8631e-07, 9.0464e-06],
         [4.3919e-10, 1.2281e-03, 1.8107e-11, 9.5388e-08, 1.4453e-11, 9.5511e-02,
          1.3926e-09, 6.0043e-02, 2.2005e-12, 1.1213e-03, 4.1671e-04, 4.9161e-15,
          2.8274e-03, 6.3692e-04, 9.2209e-03, 1.5515e-11, 1.6580e-16, 4.5126e-04,
          2.6800e-10, 8.2281e-01, 6.8688e-12, 2.1659e-06, 4.4575e-10, 3.3080e-06,
          1.5799e-11, 5.7303e-03, 2.5561e-06],
         [3.1210e-08, 8.7654e-13, 1.3649e-13, 1.0025e-05, 5.7851e-17, 7.0417e-09,
          5.9053e-07, 6.8777e-11, 1.3684e-07, 2.6362e-07, 1.6677e-06, 2.8271e-07,
          1.5519e-07, 2.1449e-12, 2.0562e-12, 7.0600e-14, 6.0522e-12, 3.3815e-01,
    

In [299]:
# Now, we'll see how correctly "prob" predicts the next character for each of the 32 examples.
Y  # the correct next-character id for every example

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [301]:
# We'll create an index tensor 0..31, one entry per example (i.e. per row of prob):
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [303]:
# Pair each row index i with its label Y[i] to read off the probability the model
# assigned to the correct next character of example i.
# The row indices are sized from Y rather than hard-coded as 32, so this keeps working
# when the number of examples changes.
prob[torch.arange(Y.shape[0]), Y]

tensor([6.0136e-05, 6.3692e-04, 2.1449e-12, 3.1427e-13, 1.8219e-05, 3.0179e-16,
        6.2462e-04, 3.9474e-05, 1.7959e-12, 3.0083e-05, 3.1386e-08, 4.0785e-06,
        2.3288e-09, 2.9356e-08, 1.6985e-06, 5.8229e-10, 6.6911e-06, 2.3541e-04,
        2.2063e-04, 1.2019e-01, 7.2312e-04, 1.1051e-08, 1.1883e-06, 6.5527e-06,
        1.2060e-09, 3.9826e-01, 1.5822e-13, 3.3460e-13, 1.4730e-02, 1.2238e-09,
        3.4914e-10, 1.4951e-03])

In [305]:
# Ideally we want all of these probabilities to be 1. Right now they are already bad because we assigned the weights and biases randomly, i.e. the model is untrained.
# Taking the log maps probabilities close to 1 to about 0 and probabilities close to 0 to large negative numbers; negating and averaging gives the mean negative log-likelihood, which punishes low confidence in the correct character.
# The row indices are sized from Y rather than hard-coded as 32, so the cell keeps working when the number of examples changes.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss # a single value summarising how good or bad the model is with the current parameters. Right now it is terrible (about 15.3 in the run recorded below).

tensor(15.3260)

In [307]:
# Okay, it's time we clean stuff up and make things a bit more organized from here on out.
# ========================= Clean ===========================

In [309]:
X.shape, Y.shape # dataset: 32 contexts of 3 character ids each, and one target id per context

(torch.Size([32, 3]), torch.Size([32]))

In [311]:
# Re-create every parameter from a seeded generator so the run is reproducible.
# The values depend on the order of the draws, so C, W1, b1, W2, b2 must be sampled in this order.
g = torch.Generator()
g.manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)      # embedding table
W1 = torch.randn(6, 100, generator=g)    # hidden layer weights
b1 = torch.randn(100, generator=g)       # hidden layer biases
W2 = torch.randn(100, 27, generator=g)   # output layer weights
b2 = torch.randn(27, generator=g)        # output layer biases
parameters = [C, W1, b1, W2, b2]

In [313]:
sum(p.nelement() for p in parameters) # total number of scalar parameters: 27*2 + 6*100 + 100 + 100*27 + 27 = 3481

3481

In [315]:
# Turn on gradient tracking for every parameter so that loss.backward() fills in p.grad.
for p in parameters:
    p.requires_grad_(True)

In [317]:
# Plain gradient descent on the current dataset: 10 full-batch steps.
learning_rate = 0.1  # step size, named instead of a magic number inside the update
for i in range(10):
    # forward pass
    emb = C[X] # (32, 3, 2)
    # Flatten per example without hard-coding the width (3 * 2 = 6).
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # (32, 100)
    logits = h @ W2 + b2 # (32, 27)
    # F.cross_entropy computes the same quantity as the manual version
    #     counts = logits.exp()
    #     prob = counts / counts.sum(1, keepdims=True)
    #     loss = -prob[torch.arange(32), Y].log().mean()
    # without materialising the intermediate tensors, and it is numerically
    # more stable when logits are large.
    loss = F.cross_entropy(logits, Y)

    print(f"Iteration {i+1}: {loss.item()}") # loss measured before this step's update

    # backward pass
    for p in parameters:
        p.grad = None # clear gradients left over from the previous step
    loss.backward()

    # update
    # torch.no_grad() is the supported way to modify leaf tensors in place;
    # writing through p.data bypasses autograd's safety checks.
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

Iteration 1: 17.76971435546875
Iteration 2: 13.656402587890625
Iteration 3: 11.298770904541016
Iteration 4: 9.4524564743042
Iteration 5: 7.984263896942139
Iteration 6: 6.891321659088135
Iteration 7: 6.1000142097473145
Iteration 8: 5.452035903930664
Iteration 9: 4.898152828216553
Iteration 10: 4.414663791656494


In [319]:
logits.max(1)  # per example: the largest logit (values) and the character id it belongs to (indices), i.e. the model's top guess

torch.return_types.max(
values=tensor([10.7865, 12.2558, 17.3982, 13.2739, 10.6965, 10.7865,  9.5145,  9.0495,
        14.0280, 11.8378,  9.9038, 15.4187, 10.7865, 10.1476,  9.8372, 11.7660,
        10.7865, 10.0029,  9.2940,  9.6824, 11.4241,  9.4885,  8.1164,  9.5176,
        12.6383, 10.7865, 10.6021, 11.0822,  6.3617, 17.3157, 12.4544,  8.1669],
       grad_fn=<MaxBackward0>),
indices=tensor([ 1,  8,  9,  0, 15,  1, 17,  2,  9,  9,  2,  0,  1, 15,  1,  0,  1, 19,
         1,  1, 16, 10, 26,  9,  0,  1, 15, 16,  3,  9, 19,  1]))

In [321]:
Y  # the true next-character ids, to compare against the predicted indices above

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [323]:
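# Comparing the predicted indices above with Y: after only 10 steps the loss is still about 4.4 and roughly a third of the predictions (11 of 32) match their labels. Training for many more iterations would fit these few examples' labels far more closely, which at this dataset size is essentially overfitting.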

In [325]:
"""
The reason as to why our model's loss is able to decrease so quickly is simply because
it is overfitting like crazy. First of all, we haven't even done any splits in our
dataset to train, validate, and test. We can't validate hyperparameters right now.
Second, We have way too many parameters for the amount of predictions we're trying
to make, i.e. for 5 words only, i.e. for only about 32 examples.
So, we change all the way above our consideration from "for w in words[:5]"
to "for w in words".
"""

'\nThe reason as to why our model\'s loss is able to decrease so quickly is simply because\nit is overfitting like crazy. First of all, we haven\'t even done any splits in our\ndataset to train, validate, and test. We can\'t validate hyperparameters right now.\nSecond, We have way too many parameters for the amount of predictions we\'re trying\nto make, i.e. for 5 words only, i.e. for only about 32 examples.\nSo, we change all the way above our consideration from "for w in words[:5]"\nto "for w in words".\n'

# We will go from working with only the first five words to working with the entire dataset in a separate .ipynb file.