In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Load the corpus: names.txt holds one name per line.
# The `with` block closes the file handle as soon as the text has been read,
# instead of leaving it open until garbage collection.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [8]:
# Number of names (lines) loaded from names.txt.
len(words)

32033

In [18]:
# Vocabulary: every distinct character that occurs in the names, in sorted order.
chars = sorted(set(''.join(words)))
# Character -> integer index, starting at 1 so that 0 stays free for '.'.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
# '.' is the special token used for padding and for marking the end of a name.
stoi['.'] = 0
# Inverse mapping: integer index -> character.
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [30]:
# Build the dataset: for each name, slide a fixed-size window over its
# characters and record (context indices -> index of the next character).
block_size = 3  # number of preceding characters used to predict the next one
X, Y = [], []

for w in words[:3]:
    print(w)
    context = block_size * [0]  # start from all-'.' padding (index 0)
    for ch in w + '.':  # the trailing '.' marks the end of the name
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), '-->', itos[ix])
        context = [*context[1:], ix]  # drop the oldest index, append the newest

# Convert the collected Python lists into tensors.
X, Y = torch.tensor(X), torch.tensor(Y)

emma
... --> e
..e --> m
.em --> m
emm --> a
mma --> .
olivia
... --> o
..o --> l
.ol --> i
oli --> v
liv --> i
ivi --> a
via --> .
ava
... --> a
..a --> v
.av --> a
ava --> .


In [31]:
# Embedding lookup table: one row per character (27 rows), each row a
# 2-dimensional embedding vector, initialised from a standard normal.
C = torch.randn(27, 2)

In [32]:
# Row 5 of the embedding table: the 2-dimensional vector for index 5 ('e' in itos).
C[5]

tensor([-0.5696, -0.2761])

In [33]:
# Index C with the whole integer tensor X at once: each index in X is replaced
# by the corresponding row of C.
C[X]

tensor([[[-0.8357,  0.4765],
         [-0.8357,  0.4765],
         [-0.8357,  0.4765]],

        [[-0.8357,  0.4765],
         [-0.8357,  0.4765],
         [-0.5696, -0.2761]],

        [[-0.8357,  0.4765],
         [-0.5696, -0.2761],
         [-0.7603,  1.0298]],

        [[-0.5696, -0.2761],
         [-0.7603,  1.0298],
         [-0.7603,  1.0298]],

        [[-0.7603,  1.0298],
         [-0.7603,  1.0298],
         [ 1.1172,  0.3852]],

        [[-0.8357,  0.4765],
         [-0.8357,  0.4765],
         [-0.8357,  0.4765]],

        [[-0.8357,  0.4765],
         [-0.8357,  0.4765],
         [-1.1477,  0.1402]],

        [[-0.8357,  0.4765],
         [-1.1477,  0.1402],
         [ 0.2539,  1.6764]],

        [[-1.1477,  0.1402],
         [ 0.2539,  1.6764],
         [ 1.6517, -0.1239]],

        [[ 0.2539,  1.6764],
         [ 1.6517, -0.1239],
         [-0.6337,  0.8995]],

        [[ 1.6517, -0.1239],
         [-0.6337,  0.8995],
         [ 1.6517, -0.1239]],

        [[-0.6337,  0

In [34]:
# Embed every context in X: one row of C per character index.
emb = C[X]
# Shape is (number of examples, block_size, embedding dimension).
emb.shape

torch.Size([16, 3, 2])

In [38]:
# Hidden layer parameters.
# Input width 6 = 3 context characters * 2 embedding dims each;
# 100 hidden units is a hyperparameter.
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))

In [44]:
# Take the embedding of each of the 3 context positions and join them side by side.
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape #concatenate along dimension 1 to go from 16*3*2 to 16*6

torch.Size([16, 6])

In [50]:
# torch.unbind(emb, 1) yields one tensor per context position, so nothing here
# hard-codes the number of positions.
torch.cat(torch.unbind(emb, 1), 1).shape #keeps working if block size is changed --> this helps with generalization

torch.Size([16, 6])

In [53]:
# Small 1-D tensor holding the integers 0..17, used below to experiment with .view().
a = torch.arange(0, 18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [54]:
# A single dimension of length 18.
a.shape

torch.Size([18])

In [57]:
# Reinterpret the same 18 values as 9 rows of 2 (9 * 2 must equal 18).
a.view(9, 2) #change dimension without losing values

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17]])

In [69]:
# Flatten each example's 3 x 2 block of embeddings into a single row of 6 values.
# The 16 and 6 are hard-coded to match emb's current shape of (16, 3, 2).
emb.view(16, 6) #.view is more efficient than torch.cat since no new memory is being created

tensor([[-0.8357,  0.4765, -0.8357,  0.4765, -0.8357,  0.4765],
        [-0.8357,  0.4765, -0.8357,  0.4765, -0.5696, -0.2761],
        [-0.8357,  0.4765, -0.5696, -0.2761, -0.7603,  1.0298],
        [-0.5696, -0.2761, -0.7603,  1.0298, -0.7603,  1.0298],
        [-0.7603,  1.0298, -0.7603,  1.0298,  1.1172,  0.3852],
        [-0.8357,  0.4765, -0.8357,  0.4765, -0.8357,  0.4765],
        [-0.8357,  0.4765, -0.8357,  0.4765, -1.1477,  0.1402],
        [-0.8357,  0.4765, -1.1477,  0.1402,  0.2539,  1.6764],
        [-1.1477,  0.1402,  0.2539,  1.6764,  1.6517, -0.1239],
        [ 0.2539,  1.6764,  1.6517, -0.1239, -0.6337,  0.8995],
        [ 1.6517, -0.1239, -0.6337,  0.8995,  1.6517, -0.1239],
        [-0.6337,  0.8995,  1.6517, -0.1239,  1.1172,  0.3852],
        [-0.8357,  0.4765, -0.8357,  0.4765, -0.8357,  0.4765],
        [-0.8357,  0.4765, -0.8357,  0.4765,  1.1172,  0.3852],
        [-0.8357,  0.4765,  1.1172,  0.3852, -0.6337,  0.8995],
        [ 1.1172,  0.3852, -0.6337,  0.8

In [72]:
# Hidden layer activations: flatten each example's context embeddings into one
# row, apply the affine map (W1, b1), then squash with tanh.
# -1 lets .view infer the flattened width from emb itself, so this line does not
# need editing when block_size or the embedding dimension changes (W1 must still
# have a matching number of rows).
h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)

In [71]:
# One row of 100 hidden activations per example.
h.shape

torch.Size([16, 100])

In [75]:
# Output layer parameters: 100 hidden units -> 27 logits, one per character.
W2 = torch.randn(100, 27)
b2 = torch.randn((27,))

In [78]:
# Output layer: one row of 27 unnormalised scores (logits) per example.
logits = h @ W2 + b2
# Softmax over each row. Subtracting the row maximum before exponentiating
# leaves the resulting probabilities unchanged (the factor cancels in the
# division) but keeps exp() from overflowing to inf, which would produce NaNs.
# Note: `counts` therefore holds exp(logits - row_max), not exp(logits).
counts = (logits - logits.max(1, keepdim=True).values).exp()
# Normalise so every row sums to 1.
prob = counts / counts.sum(1, keepdim=True)