In [57]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [59]:
# Read the dataset: one name per line. The context manager closes the file handle
# as soon as the read is done, and the explicit encoding keeps the result
# independent of the platform's default locale.
with open("names.txt", "r", encoding="utf-8") as names_file:
    words = names_file.read().splitlines()
words[:10]  # peek at the first ten names

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [61]:
len(words)  # total number of names read from names.txt

32033

In [63]:
# Vocabulary: every distinct character that occurs in the dataset, in sorted order.
# For this dataset that is the whole lowercase alphabet (see the displayed mapping).
# The characters get indices 1..len(chars); index 0 is reserved for the "." token.
chars = sorted(set("".join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi["."] = 0
itos = dict(zip(stoi.values(), stoi.keys()))  # inverse mapping: index -> character
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [65]:
# Build the training examples by sliding a fixed-size window over each name.

block_size = 3  # context length: how many preceding characters are used to predict the next one
X, Y = [], []   # X collects context windows (lists of indices), Y the index of the character that follows
for w in words[:5]:  # only the first five names for now, to keep the printout small

    print("=======================")
    print(w)
    context = [0] * block_size  # start with a context made entirely of index 0 (the "." token)
    for ch in w + ".":  # the appended "." becomes the final target of the name
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # Purely illustrative: show the current window and the character it should predict.
        print("".join(map(itos.__getitem__, context)), "--->", itos[ix])
        context = [*context[1:], ix]  # drop the oldest index, append the newest (new list each time)

# Tensors are much easier to index and do arithmetic on than nested Python lists.
X, Y = torch.tensor(X), torch.tensor(Y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [67]:
# Now let's look at X: one row per training example, each row holding block_size character indices.
print(X, X.dtype, X.shape)

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]]) torch.int64 torch.Size([32, 3])


In [69]:
# Y[i] is the index of the character that follows the context stored in X[i].
print(Y, Y.dtype, Y.shape)

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0]) torch.int64 torch.Size([32])


In [71]:
'''
Each 3-sized example we got from the first five words yielded this tensor X with
a numerical 'translation' of each character in the example. And Y gives us what
each example is being mapped to.
'''

"\nEach 3-sized example we got from the first five words yielded this tensor X with\na numerical 'translation' of each character in the example. And Y gives us what\neach example is being mapped to.\n"

In [73]:
# Seed PyTorch's global RNG before the first random draw of the notebook so that
# Restart & Run All reproduces the same C (and the same values for every later
# torch.randn call that uses the global RNG).
torch.manual_seed(2147483647)
# Embedding table: one 2-dimensional row for each of the 27 tokens (26 letters plus ".").
C = torch.randn((27, 2))
'''
Now, we essentially have an embedding C: Alphabet --> R^2 that lets us take a char
and place it in some space. With this space, we can see how certain characters
might be "closer" to others in that space, which could indicate some sort of semantic
"closeness". However, this may or may not be blackbox, i.e. we don't really know
how or why certain characters might be closer to each other than others. I don't know,
though, that's just what I think.
'''
C, C.shape

(tensor([[ 2.5225, -0.0693],
         [ 0.9991,  0.6877],
         [-0.0084, -0.5852],
         [ 1.2244,  1.8130],
         [-0.0543,  0.1134],
         [ 1.2541,  0.0716],
         [-0.5010, -1.5812],
         [-1.0269,  0.3056],
         [ 0.7188, -1.0200],
         [ 0.1073,  0.9782],
         [ 0.6673,  0.6456],
         [ 0.6544,  0.4970],
         [-0.7659, -1.0119],
         [ 0.4284,  1.3518],
         [ 0.2509,  0.7649],
         [ 0.1449,  0.6591],
         [-1.9750,  0.4480],
         [ 1.3065,  0.4241],
         [-0.3795, -2.3439],
         [-1.0627,  1.3774],
         [ 1.1348,  1.0579],
         [ 0.8643, -0.3144],
         [-1.3033,  0.5717],
         [ 1.6092,  1.5165],
         [ 2.5329, -2.0169],
         [ 1.2034, -0.5325],
         [ 0.4753, -0.4789]]),
 torch.Size([27, 2]))

In [75]:
'''
Now, we could potentially just use C and index into it as a lookup table, i.e.
C[5] returns row 5 of C (a vector with two entries), and so the character "e", the fifth in the alphabet,
would be embedded as that vector in R^2. However, there's a more interesting way to
do this which could change the way we consider a layer in our NN, and that is by
using one-hot encoding.
'''

'\nNow, we could potentially just use C and index into it as a lookup table, i.e.\nC[5] = tensor([0.1615, 1.3169]), and so the character "e", the fifth in the alphabet,\nwould be embedded as that vector in R^2. However, there\'s a more interesting way to\ndo this which could change the way we consider a layer in our NN, and that is by\nusing one-hot encoding.\n'

In [77]:
# Multiplying a one-hot row vector (1 at position 5, 0 elsewhere) by C yields row 5 of C,
# so this compares that product element-wise against the direct lookup C[5].
F.one_hot(torch.tensor(5), num_classes=27).float() @ C == C[5]

tensor([True, True])

In [79]:
# effectively, because of how matrix multiplication works, these are the same.

In [81]:
# In pytorch, you can index into tensors with tensors: C[X] looks up a row of C
# for every index stored in X in one operation.
C[X], C[X].shape

(tensor([[[ 2.5225, -0.0693],
          [ 2.5225, -0.0693],
          [ 2.5225, -0.0693]],
 
         [[ 2.5225, -0.0693],
          [ 2.5225, -0.0693],
          [ 1.2541,  0.0716]],
 
         [[ 2.5225, -0.0693],
          [ 1.2541,  0.0716],
          [ 0.4284,  1.3518]],
 
         [[ 1.2541,  0.0716],
          [ 0.4284,  1.3518],
          [ 0.4284,  1.3518]],
 
         [[ 0.4284,  1.3518],
          [ 0.4284,  1.3518],
          [ 0.9991,  0.6877]],
 
         [[ 2.5225, -0.0693],
          [ 2.5225, -0.0693],
          [ 2.5225, -0.0693]],
 
         [[ 2.5225, -0.0693],
          [ 2.5225, -0.0693],
          [ 0.1449,  0.6591]],
 
         [[ 2.5225, -0.0693],
          [ 0.1449,  0.6591],
          [-0.7659, -1.0119]],
 
         [[ 0.1449,  0.6591],
          [-0.7659, -1.0119],
          [ 0.1073,  0.9782]],
 
         [[-0.7659, -1.0119],
          [ 0.1073,  0.9782],
          [-1.3033,  0.5717]],
 
         [[ 0.1073,  0.9782],
          [-1.3033,  0.5717],
          

In [83]:
'''
This essentially yielded a 3 dimensional tensor, where in the first dimension,
we have 32 rows for each example, in the second dimension, we have 3 characters
from a given example, and in the third dimension, we have 2 numbers representing
the "location" in R^2 of a given character. So, effectively, our embedding is C[X].
'''

'\nThis essentially yielded a 3 dimensional tensor, where in the first dimension,\nwe have 32 rows for each example, in the second dimension, we have 3 characters\nfrom a given example, and in the third dimension, we have 2 numbers representing\nthe "location" in R^2 of a given character. So, effectively, our embedding is C[x].\n'

In [85]:
emb = C[X]  # embed every character index in X by looking up its row in C
emb.shape

torch.Size([32, 3, 2])

In [94]:
# Parameters of the first (hidden) layer.
# W1 has 6 rows and 100 columns; the 100 is a design choice: the number of neurons in this layer.
W1 = torch.randn(6, 100)
b1 = torch.randn(100)  # one bias per neuron

In [100]:
# Flatten each example's embeddings into a single row so the result can be multiplied by W1.
# The first dimension is pinned to the number of examples and -1 lets torch infer the row
# width, so neither 32 nor 6 is hard-coded. If the context length or embedding size ever
# changes without W1 being updated, the matmul raises a shape error instead of view(-1, 6)
# silently regrouping values across examples.
h = emb.view(emb.shape[0], -1) @ W1 + b1
h, h.shape, h.dtype
# btw, the matmul gives a 32 x 100 matrix here, and we add to it a vector with 100 elements.
# This works because of pytorch broadcasting: b1 is treated as a 1 x 100 row and added to every row.
# 32, 100
#  1, 100

(tensor([[ 1.4221e+00,  3.7419e+00,  4.4712e+00,  ..., -2.3295e+00,
          -6.7468e+00, -4.7674e+00],
         [ 2.3230e+00,  3.9573e+00,  2.9480e+00,  ..., -1.5265e+00,
          -5.6410e+00, -5.7813e+00],
         [-7.8069e-01,  4.0699e+00,  2.5567e+00,  ..., -3.1416e+00,
          -2.7711e+00, -8.1525e+00],
         ...,
         [-4.1044e+00, -4.9766e-03, -1.7445e+00,  ..., -5.4382e-01,
           9.5053e-01, -1.5830e-01],
         [ 2.5567e+00, -1.3451e+00,  9.0445e+00,  ..., -4.5168e+00,
           3.3268e-01,  5.1478e+00],
         [-2.6698e+00, -4.5450e-01,  4.5280e-01,  ..., -2.0475e+00,
          -4.4025e+00, -2.1108e+00]]),
 torch.Size([32, 100]),
 torch.float32)

In [102]:
# Let's also apply non-linearity in our layer before passing it on.
# The pre-activation is recomputed from emb, W1 and b1 instead of being read back from h,
# so re-running this cell cannot apply tanh a second time to already-squashed values.
h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
h # now we have nice values between -1 and 1.

tensor([[ 0.8900,  0.9989,  0.9997,  ..., -0.9812, -1.0000, -0.9999],
        [ 0.9810,  0.9993,  0.9945,  ..., -0.9098, -1.0000, -1.0000],
        [-0.6531,  0.9994,  0.9880,  ..., -0.9963, -0.9922, -1.0000],
        ...,
        [-0.9995, -0.0050, -0.9407,  ..., -0.4959,  0.7400, -0.1570],
        [ 0.9880, -0.8729,  1.0000,  ..., -0.9998,  0.3209,  0.9999],
        [-0.9905, -0.4256,  0.4242,  ..., -0.9672, -0.9997, -0.9711]])

In [104]:
h.shape  # tanh is element-wise, so the shape is unchanged

torch.Size([32, 100])

In [106]:
# Parameters of the next (and last) layer: 100 inputs coming from the previous
# layer's neurons, 27 output neurons.
W2 = torch.randn(100, 27)
b2 = torch.randn(27)  # one bias per output neuron

In [108]:
# Output-layer scores ("logits"): these are what gets passed into softmax next.
# The inner dimensions of h and W2 match, and b2 is broadcast across the rows.
logits = torch.matmul(h, W2) + b2

In [110]:
logits.shape  # one row of scores per training example

torch.Size([32, 27])

In [118]:
# Softmax by hand. Each row's maximum is subtracted before exponentiating: this leaves
# the resulting probabilities unchanged (the common factor cancels in the division below)
# but keeps exp() from overflowing to inf for large logits, which would turn prob into nan.
# Note that counts is therefore exp(logits) scaled per row, not the raw exponentials.
counts = (logits - logits.max(1, keepdim=True).values).exp()
prob = counts / counts.sum(1, keepdim=True)  # normalise every row so it sums to 1
prob, prob.shape, prob[0].sum()
# all the probabilities in a row add to 1. We have just completed softmax.

(tensor([[4.6159e-09, 6.7762e-02, 2.7239e-06, 2.9769e-08, 1.7057e-04, 1.9661e-14,
          1.7871e-07, 8.9337e-06, 1.0338e-09, 1.6759e-08, 5.1723e-12, 8.0536e-01,
          2.2580e-05, 7.2072e-06, 4.8689e-03, 3.7948e-08, 1.0234e-09, 2.2641e-07,
          6.0748e-07, 7.5104e-09, 3.1396e-13, 1.2148e-01, 7.7917e-06, 8.5659e-10,
          1.8804e-04, 1.2436e-04, 4.5828e-08],
         [5.4379e-12, 1.3090e-03, 6.3864e-07, 1.7082e-10, 4.6328e-04, 5.4002e-14,
          4.6247e-08, 6.5736e-06, 4.3702e-11, 6.5085e-06, 5.7275e-11, 2.3638e-01,
          6.0987e-07, 1.1333e-05, 8.1922e-05, 1.2730e-09, 2.8251e-09, 1.0835e-08,
          2.5659e-09, 6.7349e-08, 1.1867e-14, 7.6116e-01, 2.3256e-05, 5.4611e-12,
          5.5339e-04, 1.8214e-07, 4.3614e-09],
         [3.1911e-15, 9.1824e-11, 1.2004e-08, 9.9191e-18, 1.6989e-10, 2.0257e-18,
          4.0182e-15, 6.4535e-10, 3.3458e-16, 6.5842e-08, 7.4884e-10, 9.1915e-08,
          1.6554e-05, 6.9101e-09, 1.9862e-10, 3.3622e-14, 1.8813e-14, 7.9484e-10,
    

In [120]:
# Now, we'll see how well "prob" predicts the next character for each of the training examples.
# Y holds, for every example, the index of the character that actually comes next.
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [124]:
# We'll create a tensor of row indices, one per training example (this is an index tensor,
# not an iterator). Its length is taken from Y instead of hard-coding 32, so it stays
# correct when the dataset is built from more than the first five words.
torch.arange(Y.shape[0])

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [126]:
# and we'll index into prob with the row indices and Y together: for every example this
# picks the probability the model assigned to the correct next character.
# The number of rows comes from Y rather than a hard-coded 32, so this keeps working
# once the dataset grows.
prob[torch.arange(Y.shape[0]), Y]

tensor([1.9661e-14, 1.1333e-05, 6.9101e-09, 3.0133e-06, 4.3160e-08, 3.7948e-08,
        2.9072e-06, 1.2494e-05, 7.8411e-05, 3.3796e-01, 8.3536e-14, 5.6408e-05,
        6.7762e-02, 8.6622e-06, 9.2923e-07, 1.9824e-09, 1.6759e-08, 9.6731e-08,
        4.8128e-09, 5.5745e-11, 1.4356e-07, 1.1520e-03, 5.7003e-07, 1.7002e-07,
        1.3299e-11, 7.5104e-09, 1.5408e-11, 2.8873e-15, 4.9503e-11, 1.2548e-08,
        1.8643e-11, 7.6911e-11])

In [128]:
# So ideally we want all of these to be 1. Right now they are already bad because we randomly assigned our initial weights and biases, i.e. the model is untrained.
# Taking the log maps probabilities close to 1 to values close to 0 and probabilities close to 0 to large negative values;
# negating the mean turns that into a penalty: the average negative log-likelihood of the correct characters.
# The row indices are sized from Y rather than a hard-coded 32, so the loss keeps working on the full dataset.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss # this is our unified, single value to show how good or bad the model has done with the given parameters. With random parameters it is expected to be large.

tensor(17.2685)