<a href="https://colab.research.google.com/github/shanxS/MLPlayground/blob/main/makemore_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
from urllib.request import urlopen

In [2]:
# Download the names corpus and split it into a list with one entry per line.
# The `with` block closes the HTTP response as soon as the body has been read,
# instead of leaving the connection open until garbage collection.
with urlopen("https://raw.githubusercontent.com/karpathy/makemore/master/names.txt") as response:
  words = response.read().decode('utf-8').splitlines()
len(words)

32033

In [3]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
chars = sorted({ch for w in words for ch in w})

# Character -> index. Corpus characters are numbered from 1 so that index 0
# stays free for '.', the token used for padding / end of word.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0

# Index -> character: the inverse mapping, used to decode indices back to text.
itos = dict((idx, ch) for ch, idx in stoi.items())

In [6]:
block_size = 3  # context length: how many previous characters predict the next one
X, Y = [], []   # X collects context windows, Y the index of the character that follows

# Only the 2nd and 3rd words are used here, so every training example can be printed.
for name in words[1:3]:
  print(name)
  window = [0] * block_size  # start from an all-'.' context (index 0)
  for ch in name + '.':      # the trailing '.' marks the end of the word
    target = stoi[ch]
    X.append(window)
    Y.append(target)
    print(''.join(itos[i] for i in window) + ' ----> ' + ch)
    # slide the window: drop the oldest index, append the one just seen
    window = window[1:] + [target]

X = torch.tensor(X)
Y = torch.tensor(Y)

olivia
... ----> o
..o ----> l
.ol ----> i
oli ----> v
liv ----> i
ivi ----> a
via ----> .
ava
... ----> a
..a ----> v
.av ----> a
ava ----> .


In [7]:
# Sanity check: one row of X (a context window) per target in Y.
X.size(), Y.size()

(torch.Size([11, 3]), torch.Size([11]))

In [8]:
# 1st layer: the embedding table.
#
# Each input in X has dim 1x3 (three character indices).
# One-hot encoding a single index gives a 1x27 vector, so one-hot encoded X
# would be (number of inputs) x 3 x 27.
#
# The first layer, C, therefore needs 27 rows. We chose a 2-dimensional
# embedding, which makes C 27x2.
C = torch.randn(27, 2) # 27 chars, 2-dimensional embedding for each char

# one_hot_encoded(X) @ C is equivalent to looking rows up in C using X's values.
# Each example would be 3 x 27 and C is 27 x 2, so the result of the lookup is
# (number of inputs) x 3 x 2.
emb = C[X]

In [9]:
# 2nd layer: the hidden layer.
#
# emb from the 1st layer has dim (number of inputs) x 3 x 2.
# We view it as (number of inputs) x 6, i.e. the three 2-d embeddings of one
# context laid side by side.
# 100 is simply the number of neurons we picked for this layer.
W1 = torch.randn(6,100)
b1 = torch.randn(100)

# b1 (100,) is broadcast across every row of the (number of inputs) x 100 product.
h = torch.tanh(emb.view(-1,6) @ W1 + b1)

In [10]:
# One row of 100 hidden activations per input example.
h.size()

torch.Size([11, 100])

In [11]:
# Inspect the hidden activations; they come out of tanh, so every entry lies in [-1, 1].
h

tensor([[ 0.7480, -0.5163, -0.8051,  ...,  0.4747,  0.0926,  0.1107],
        [ 0.9192, -0.9960, -0.4431,  ...,  0.2078, -0.9835,  0.9872],
        [ 0.8882, -0.9979,  0.0409,  ...,  0.7095, -0.9958,  0.9999],
        ...,
        [ 0.9995, -0.3632,  0.9301,  ..., -0.5259,  0.3741,  0.8977],
        [-0.3507, -0.7942, -0.9511,  ...,  0.9885, -0.8014,  0.5362],
        [ 0.9997, -0.5188,  0.9984,  ..., -0.8081,  0.8126,  0.8955]])

In [12]:
# 3rd layer: output layer mapping the 100 hidden activations to 27 logits,
# one per character.
W2 = torch.randn(100,27)
b2 = torch.randn(27)

logits = h @ W2 + b2

# Softmax written out by hand. Each row's maximum is subtracted before exp():
# the shift cancels when normalising, so probs is unchanged, but exp() can no
# longer overflow to inf (which would turn the loss into nan).
counts = (logits - logits.max(1, keepdim=True).values).exp()
probs = counts / counts.sum(1, keepdim=True)

# Negative log-likelihood: pick, for every example, the probability assigned
# to its true next character (Y), then average -log of those probabilities.
loss = -probs[torch.arange(Y.nelement()), Y].log().mean()

In [13]:
# Show the mean negative log-likelihood computed in the previous cell.
loss

tensor(18.6783)

In [51]:
#### RE-STATING PREVIOUS CODE

In [30]:
# dataset: the same (context -> next character) examples as before, now built
# from every word and without printing.
block_size = 3  # number of preceding characters in each context
X, Y = [], []

for w in words:
  # Encode the word plus its terminating '.', left-padded with block_size
  # zeros (index 0 is '.') so that the first letter has a full context too.
  padded = [0] * block_size + [stoi[ch] for ch in w + '.']
  # One example per character of w + '.': a window of block_size indices and
  # the index that immediately follows it.
  for start in range(len(w) + 1):
    X.append(padded[start:start + block_size])
    Y.append(padded[start + block_size])

X = torch.tensor(X)
Y = torch.tensor(Y)

# Dedicated, fixed-seed generator so the initial weights are the same on every run.
g = torch.Generator()
g.manual_seed(2147483647)

# NOTE: the tensors below are drawn from `g` in this exact order; reordering
# the lines would change every initial value.

# 1st layer: embedding table, one 2-d vector for each of the 27 characters
C = torch.randn(27, 2, generator=g, requires_grad=True)

# 2nd layer: hidden layer, 6 inputs -> 100 tanh units
W1 = torch.randn(6, 100, generator=g, requires_grad=True)
b1 = torch.randn((100,), generator=g, requires_grad=True)

# 3rd layer: output layer, 100 hidden units -> 27 logits
W2 = torch.randn(100, 27, generator=g, requires_grad=True)
b2 = torch.randn((27,), generator=g, requires_grad=True)

# Everything the optimisation loop has to update.
parameters = [C, W1, b1, W2, b2]


In [31]:
# Total number of trainable scalars across all parameter tensors.
sum(p.numel() for p in parameters)

3481

In [33]:
# Full-batch gradient descent: 100 steps, each using every example in X / Y.
for _ in range(100):
  # forward pass: embedding lookup -> tanh hidden layer -> logits -> mean cross-entropy
  emb = C[X]
  h = torch.tanh(emb.view(-1,6) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Y)

  # backward pass: drop the previous step's gradients (otherwise backward()
  # would accumulate into them), then backpropagate
  for p in parameters:
    p.grad = None
  loss.backward()

  # update: plain gradient-descent step with learning rate 0.1.
  # Done in place under no_grad() so autograd does not record the step;
  # this replaces writing through `p.data`, which hides the change from autograd.
  with torch.no_grad():
    for p in parameters:
      p -= 0.1 * p.grad

# Loss of the last forward pass, i.e. measured just before the final update.
print(loss.item())

3.4653983116149902


tensor(18.6783)