In [3]:
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import kagglehub
import torch
import os
%matplotlib inline

In [4]:
def download_dataset() -> str:
    """Download the "rishitjakharia/names-txt" Kaggle dataset via kagglehub.

    Returns:
        The path returned by ``kagglehub.dataset_download``; the caller
        appends the file name ("names.txt") to it.
    """
    return kagglehub.dataset_download("rishitjakharia/names-txt")

In [5]:
# Build the file path with os.path.join instead of string concatenation so the
# separator is always correct for the current platform.
dataset_path = os.path.join(download_dataset(), "names.txt")

Downloading from https://www.kaggle.com/api/v1/datasets/download/rishitjakharia/names-txt?dataset_version_number=1...


100%|██████████| 113k/113k [00:00<00:00, 393kB/s]

Extracting files...





In [6]:
# Read one name per line. The context manager closes the file handle as soon as
# the contents are read, and the explicit encoding avoids platform-dependent decoding.
with open(dataset_path, 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()

In [7]:
def make_int_char_maps(word_list=None) -> tuple[dict[str, int], dict[int, str]]:
  """Build the character <-> integer lookup tables for the vocabulary.

  Args:
    word_list: Iterable of strings to collect the alphabet from. When omitted,
      the notebook-level ``words`` list is used (the original behaviour).

  Returns:
    ``(ctoi, itoc)``: ``ctoi`` maps every character to an index starting at 1
    (in sorted order) and maps the '.' marker to 0; ``itoc`` is its inverse.
  """
  source = words if word_list is None else word_list
  # '.' is the start/end marker and always gets index 0, so keep it out of the
  # alphabet; otherwise its original slot would be left as an unused index.
  alphabet = sorted(set(''.join(source)) - {'.'})
  char_to_ix = {c: i + 1 for i, c in enumerate(alphabet)}
  char_to_ix['.'] = 0
  ix_to_char = {i: c for c, i in char_to_ix.items()}
  return char_to_ix, ix_to_char

In [8]:
# Build both lookup tables once; later cells read `ctoi` and `itoc` as notebook-level globals.
ctoi, itoc = make_int_char_maps()

In [9]:
# Index -> character table; index 0 is the '.' marker.
print(itoc)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [10]:
# Character -> index table (the inverse of `itoc`).
print(ctoi)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


In [11]:
def build_dataset(
    word_list=None,
    block_size: int = 3,
    verbose: bool = True,
    char_to_int=None,
) -> tuple[torch.Tensor, torch.Tensor]:
  """Turn words into (context, next-character) training pairs.

  Every word is extended with the '.' end marker. For each character, the
  ``block_size`` preceding character indices (padded on the left with the
  index of '.') form one input row and the character's own index is the target.

  Args:
    word_list: Words to encode. Defaults to the first five entries of the
      notebook-level ``words`` list (the original behaviour).
    block_size: Context length, i.e. how many characters are used to predict
      the next one.
    verbose: When True, print each word and every ``context ---> char`` pair.
    char_to_int: Character -> index mapping; must contain '.'. Defaults to the
      notebook-level ``ctoi`` (with ``itoc`` used for printing).

  Returns:
    ``(inp, target)``: int64 tensors of shape ``(N, block_size)`` and ``(N,)``.

  Raises:
    ValueError: If ``block_size`` is smaller than 1.
  """
  if block_size < 1:
    raise ValueError(f"block_size must be >= 1, got {block_size}")

  # Fall back to the notebook globals so a bare `build_dataset()` behaves as before.
  source_words = words[:5] if word_list is None else word_list
  encode = ctoi if char_to_int is None else char_to_int
  decode = None
  if verbose:
    # The inverse table is only needed to pretty-print the contexts.
    decode = itoc if char_to_int is None else {i: c for c, i in encode.items()}

  pad_ix = encode['.']  # 0 in the default mapping
  inp, target = [], []
  for w in source_words:
    if verbose:
      print(w)
    context = [pad_ix] * block_size

    for ch in w + '.':
      ix = encode[ch]
      inp.append(context)
      target.append(ix)
      if verbose:
        print(''.join(decode[i] for i in context), '--->', ch)
      context = context[1:] + [ix] # crop and append (creates a new list, so stored rows are never mutated)

  # Explicit dtype and shape keep the result well-formed even when there are no examples.
  inp = torch.tensor(inp, dtype=torch.long).reshape(-1, block_size)
  target = torch.tensor(target, dtype=torch.long)
  return inp, target

In [12]:
# Materialise the training pairs: `inp` holds the contexts, `target` the next-character indices.
inp, target = build_dataset()

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [13]:
# Sanity-check the dataset tensors: shapes and dtypes of the contexts and the targets.
inp.shape, target.shape, inp.dtype, target.dtype

(torch.Size([32, 3]), torch.Size([32]), torch.int64, torch.int64)

In [77]:
C = torch.randn(27, 2) # embedding table: each of the 27 characters maps to a 2-d vector

In [15]:
# Row 5 of the embedding table, i.e. the 2-d vector for character index 5 ('e' in `itoc`).
C[5]

tensor([ 0.2407, -0.5596])

In [16]:
(F.one_hot(torch.tensor(5), num_classes=27).float() @ C) # multiplying a one-hot row vector by C plucks out the desired row of C
                                                         # however, notice it's the same result as indexing with C[5], so this won't be used

tensor([ 0.2407, -0.5596])

`inp` contains the contexts built by `build_dataset` above.
Each context is a list of character indices taken from `ctoi`.
These indices let us look up the corresponding embedding rows in `C`.
PyTorch lets us index into `C` efficiently with the whole `inp` tensor at once.

In [35]:
# Indexing C with the whole (32, 3) index tensor yields one 2-d embedding per context position.
C[inp].shape

torch.Size([32, 3, 2])

In [36]:
# Embedded contexts; per the shape check above this is (32, 3, 2).
emb = C[inp]

In [37]:
W1 = torch.randn((6, 100)) # hidden-layer weights: 6 inputs (3 context characters x 2 embedding values each) feeding 100 hidden units
B1 = torch.randn(100) # one bias per hidden unit (100 in total)

In [38]:
# Flatten each context's 3 x 2 embeddings into a 6-vector, apply the hidden layer, then squash with tanh.
h = torch.tanh(emb.view(-1, 6) @ W1 + B1)

In [39]:
# Inspect the hidden activations; tanh keeps every entry within [-1, 1].
h

tensor([[ 0.2316,  0.9908,  0.5361,  ...,  0.9283,  0.9778,  0.6312],
        [-0.8250,  0.9907,  0.8158,  ...,  0.6804,  0.9887, -0.0046],
        [-0.2868,  0.9943,  0.5957,  ...,  0.9743,  0.9805,  0.6827],
        ...,
        [ 0.9960,  0.9889, -0.9173,  ...,  0.9727,  0.9960,  0.8921],
        [ 0.9994,  0.9344,  0.8807,  ...,  0.9076,  0.9999,  0.9985],
        [ 0.9965,  0.8027,  0.6306,  ...,  0.9800,  0.9919,  0.9441]])

In [40]:
W2 = torch.randn(100, 27) # output-layer weights: 100 hidden units -> 27 character logits
B2 = torch.randn(27) # one bias per output character

In [41]:
# Unnormalised scores: one row of 27 logits per input context.
logits = h @ W2 + B2

In [42]:
# Exponentiate the logits to get positive pseudo-counts (naive form; exp can overflow for large logits).
counts = logits.exp()

In [43]:
# Normalise each row to sum to 1, turning the counts into a probability distribution over next characters.
probs = counts / counts.sum(1, keepdim=True)

In [47]:
# Pick, for every row, the probability assigned to its target character.
# The row index is derived from `target` so the cell works for any dataset size, not just 32 examples.
probs[torch.arange(target.shape[0]), target] # the probabilities for our target characters for each input

tensor([4.6508e-03, 8.1141e-02, 2.0798e-01, 4.0672e-03, 8.7438e-04, 1.0611e-02,
        5.4549e-03, 4.2308e-03, 1.5276e-04, 4.7371e-03, 2.8948e-05, 1.4128e-02,
        1.9483e-03, 1.5560e-03, 1.5678e-03, 3.2267e-03, 1.8552e-02, 2.1501e-03,
        1.1183e-04, 3.4877e-02, 7.6664e-04, 6.5152e-04, 4.5874e-03, 3.5471e-03,
        5.8964e-05, 2.5159e-03, 4.2476e-03, 6.8626e-04, 2.3269e-04, 3.0489e-05,
        5.2686e-02, 1.4095e-03])

In [56]:
# Mean negative log-likelihood of the correct next characters.
# Row indices come from `target` itself rather than a hard-coded 32, so this stays correct when the dataset grows.
loss = -probs[torch.arange(target.shape[0]), target].log().mean()

In [58]:
# Pull the scalar loss out of the 0-d tensor as a Python float.
loss.item()

6.1197509765625

Now altogether

In [71]:
# Re-create the layer parameters and gather everything trainable in one list.
# C (the embedding table) is reused from the cell that defined it earlier.
W1, B1 = torch.randn(6, 100), torch.randn(100)    # hidden layer: 6 inputs -> 100 units
W2, B2 = torch.randn(100, 27), torch.randn(27)    # output layer: 100 units -> 27 logits
params = [C, W1, B1, W2, B2]

In [72]:
sum(p.numel() for p in params) # total number of scalar parameters across all tensors

3481

In [84]:
# Full forward pass in one cell: embed -> hidden layer -> logits -> loss.
emb = C[inp]                               # look up an embedding for every context position
h = torch.tanh(emb.view(-1, 6) @ W1 + B1)  # flatten each context to 6 values, then the tanh hidden layer
logits = h @ W2 + B2                       # one score per character
loss = F.cross_entropy(logits, target)     # cross-entropy between the logits and the target indices

In [139]:
loss # differs from the hand-computed loss above because W1, B1, W2 and B2 were re-initialised with new random values

tensor(16.5559)

`F.cross_entropy(logits, target)` performs this same calculation: for the same logits it returns the same loss as `-probs[torch.arange(32), target].log().mean()`, but much more efficiently.

In [98]:
# Shift every logit by the same constant (+6), then normalise by hand.
base_logit = torch.tensor([-5, 0, 3, 5])
logit = base_logit + 6
count = logit.exp()
normaliser = count.sum()
prob = count / normaliser
prob

tensor([3.9751e-05, 5.8995e-03, 1.1849e-01, 8.7557e-01])

Any constant we add to logits will still result in the same prob ...
Let the logits be:
$$
\text{logit} = \begin{bmatrix} -5 \\ 0 \\ 3 \\ 5 \end{bmatrix} + 6 = \begin{bmatrix} 1 \\ 6 \\ 9 \\ 11 \end{bmatrix}
$$

Exponentiated values:
$$
\text{count} = \exp(\text{logit}) = \begin{bmatrix} e^1 \\ e^6 \\ e^9 \\ e^{11} \end{bmatrix}
$$

This also equals:
$$
\exp(\text{logit}) = e^6 \cdot \begin{bmatrix} e^{-5} \\ e^0 \\ e^3 \\ e^{5} \end{bmatrix}
$$

Normalized probabilities:
$$
\text{prob} = \frac{\text{count}}{\sum \text{count}} = \frac{e^6 \cdot \begin{bmatrix} e^{-5} \\ e^0 \\ e^3 \\ e^5 \end{bmatrix}}{e^6 \cdot (e^{-5} + e^0 + e^3 + e^5)}
$$

Further:
$$
\text{prob} = \frac{\text{count}}{\sum \text{count}} = \frac{\begin{bmatrix} e^{-5} \\ e^0 \\ e^3 \\ e^5 \end{bmatrix}}{e^{-5} + e^0 + e^3 + e^5} \text{, which is simply the normalized exponentials of our original, unshifted logits}
$$

Adding a constant to our logits does not change the probability tensor, provided `exp` does not overflow along the way. Negative offsets are the useful ones here: if a logit is large enough for `exp` to overflow, the corresponding entry of `count` becomes `inf`; but if we offset every logit by `-max(logits)`, the largest logit becomes 0, so we can ensure no overflow.

In [119]:
# overflow example
logit = torch.tensor([-5, 0, 3, 100])
count = logit.exp()
prob = count / count.sum()
prob

tensor([0., 0., 0., nan])

In [120]:
# fixing overflow
logit = torch.tensor([-5, 0, 3, 100]) - logit.max()
count = logit.exp()
prob = count / count.sum()
prob

tensor([0.0000e+00, 3.7835e-44, 7.4689e-43, 1.0000e+00])

In [138]:
# further showing we can add any constant as long as we dont overflow
logit = torch.tensor([-5, 0, 3, 100]) - logit.max() + 1
count = logit.exp()
prob = count / count.sum()
prob

tensor([0.0000e+00, 3.7835e-44, 7.4689e-43, 1.0000e+00])