In [3]:
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import kagglehub
import torch
import os
%matplotlib inline

In [4]:
def download_dataset() -> str:
    """Fetch the ``rishitjakharia/names-txt`` dataset through kagglehub.

    Returns:
        The path reported by ``kagglehub.dataset_download`` for the dataset.
        It is returned (rather than discarded) so the caller can build file
        paths from it.
    """
    return kagglehub.dataset_download("rishitjakharia/names-txt")

In [5]:
# Build the file path with os.path.join instead of a hard-coded "/" so the
# separator is correct on every operating system.
dataset_path = os.path.join(download_dataset(), "names.txt")

In [6]:
# Load the file as a list of lines. The context manager closes the file handle
# as soon as the read finishes, and the explicit encoding keeps the result
# independent of the platform's default.
with open(dataset_path, 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()

In [7]:
def make_int_char_maps(corpus: "list[str] | None" = None) -> tuple[dict[str, int], dict[int, str]]:
  """Build the character <-> integer lookup tables for a list of words.

  Args:
    corpus: Words whose characters form the vocabulary. When omitted, the
      notebook-level ``words`` list is used, so ``make_int_char_maps()``
      behaves as before.

  Returns:
    ``(ctoi, itoc)``. ``ctoi`` maps every distinct character to 1, 2, ... in
    sorted order and the special ``'.'`` token to 0; ``itoc`` is the inverse
    mapping.
  """
  if corpus is None:
    corpus = words
  # '.' is reserved for index 0. Dropping it from the alphabet keeps the
  # indices contiguous even if the corpus itself contains a '.'.
  chars = sorted(set(''.join(corpus)) - {'.'})
  ctoi = {c: i for i, c in enumerate(chars, start=1)}
  ctoi['.'] = 0
  itoc = {i: c for c, i in ctoi.items()}
  return ctoi, itoc

In [8]:
# Build both lookup tables once; later cells read ctoi and itoc as globals.
ctoi, itoc = make_int_char_maps()

In [13]:
# Integer -> character table; index 0 is the '.' token.
print(itoc)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [14]:
# Character -> integer table (the inverse of itoc).
print(ctoi)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


In [44]:
def build_dataset(corpus=None, block_size: int = 3, verbose: bool = True) -> tuple[torch.Tensor, torch.Tensor]:
  """Turn words into (context, next-character) training examples.

  Every word is terminated with '.', and for each character (including that
  terminator) one example is emitted: the ``block_size`` preceding character
  indices as input and the character's own index as target. Contexts are
  left-padded with index 0.

  Args:
    corpus: Words to encode. When omitted, the first five entries of the
      notebook-level ``words`` list are used, which reproduces the small
      demo this notebook walks through.
    block_size: Context length, i.e. how many characters are used to predict
      the next one. Must be at least 1.
    verbose: When true, print each word and every ``context ---> char`` pair.

  Returns:
    ``(inp, target)``: an int64 tensor of shape ``(n_examples, block_size)``
    and an int64 tensor of shape ``(n_examples,)``.

  Raises:
    ValueError: If ``block_size`` is smaller than 1.
    KeyError: If a character is missing from the global ``ctoi`` table.
  """
  if block_size < 1:
    # With an empty context, `context[1:] + [ix]` would grow the window and
    # produce ragged rows, so reject it up front.
    raise ValueError("block_size must be at least 1")
  if corpus is None:
    corpus = words[:5]

  contexts, targets = [], []
  for w in corpus:
    if verbose:
      print(w)
    context = [0] * block_size

    for ch in w + '.':
      ix = ctoi[ch]
      contexts.append(context)
      targets.append(ix)
      if verbose:
        print(''.join(itoc[i] for i in context), '--->', ch)
      context = context[1:] + [ix] # slide the window: drop the oldest, append the newest

  # Force dtype and shape so that an empty corpus still yields an int64
  # tensor of shape (0, block_size) instead of a float tensor of shape (0,).
  inp = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), block_size)
  target = torch.tensor(targets, dtype=torch.long)
  return inp, target

In [47]:
# inp holds one row of context indices per example; target holds the index of
# the character that follows each context.
inp, target = build_dataset()

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [48]:
# Sanity check: inp and target must have the same number of rows, and both
# must hold integer indices.
inp.shape, target.shape, inp.dtype, target.dtype

(torch.Size([32, 3]), torch.Size([32]), torch.int64, torch.int64)

In [49]:
# Embedding table: one 2-dimensional row per token (27 = 26 letters + '.').
# Drawing from a seeded generator makes the initial values identical on every
# run, so the notebook is reproducible after a kernel restart.
embedding_rng = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=embedding_rng)

In [50]:
# Plain integer indexing returns row 5 of C, i.e. the embedding stored for
# the character whose index is 5.
C[5]

tensor([0.0436, 0.7178])

In [51]:
# Multiplying a one-hot row vector by C plucks out the matching row of C.
# The result is identical to plain indexing (C[5]), so the one-hot form is
# not used from here on.
(F.one_hot(torch.tensor(5), num_classes=27).float() @ C)

tensor([0.0436, 0.7178])

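`inp` holds the contexts built by `build_dataset` above.
Each context is a list of character indices taken from `ctoi`.
Each index selects one row of `C`, i.e. that character's embedding.
PyTorch lets us index `C` with the whole `inp` tensor at once, so `C[inp]` efficiently looks up every embedding in a single operation.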

In [54]:
# Index C with the whole 2-D inp tensor: every context index is replaced by
# its 2-value embedding row, which adds a trailing dimension of size 2.
C[inp]

tensor([[[ 0.1794, -0.3265],
         [ 0.1794, -0.3265],
         [ 0.1794, -0.3265]],

        [[ 0.1794, -0.3265],
         [ 0.1794, -0.3265],
         [ 0.0436,  0.7178]],

        [[ 0.1794, -0.3265],
         [ 0.0436,  0.7178],
         [-1.0405,  1.6055]],

        [[ 0.0436,  0.7178],
         [-1.0405,  1.6055],
         [-1.0405,  1.6055]],

        [[-1.0405,  1.6055],
         [-1.0405,  1.6055],
         [-0.2144,  0.0902]],

        [[ 0.1794, -0.3265],
         [ 0.1794, -0.3265],
         [ 0.1794, -0.3265]],

        [[ 0.1794, -0.3265],
         [ 0.1794, -0.3265],
         [ 1.7335, -1.1363]],

        [[ 0.1794, -0.3265],
         [ 1.7335, -1.1363],
         [ 0.8895,  0.3231]],

        [[ 1.7335, -1.1363],
         [ 0.8895,  0.3231],
         [-0.2673,  0.4811]],

        [[ 0.8895,  0.3231],
         [-0.2673,  0.4811],
         [-0.6972, -0.2240]],

        [[-0.2673,  0.4811],
         [-0.6972, -0.2240],
         [-0.2673,  0.4811]],

        [[-0.6972, -0