In [17]:
import os

from tinygrad import Context, nn, Tensor
from tinygrad.nn.optim import AdamW

import matplotlib.pyplot as plt # for making figures
%matplotlib inline

os.environ['METAL_XCODE'] = '1'
os.environ['DISABLE_COMPILER_CACHE'] = '1'

In [18]:
words = open('names.txt', 'r').read().splitlines()
words = [word.lower() for word in words]

print(words[:5])
len(words)

['aaren', 'aarika', 'aaron', 'aartjan', 'abagael']


8604

In [19]:
chars = sorted(list(set(''.join(words))))
chars = chars
chars.insert(0, '.')

stoi = {s:i for i,s in enumerate(chars)}
itos = {i:s for s,i in stoi.items()}
print(itos)

{0: '.', 1: ' ', 2: '-', 3: 'a', 4: 'b', 5: 'c', 6: 'd', 7: 'e', 8: 'f', 9: 'g', 10: 'h', 11: 'i', 12: 'j', 13: 'k', 14: 'l', 15: 'm', 16: 'n', 17: 'o', 18: 'p', 19: 'q', 20: 'r', 21: 's', 22: 't', 23: 'u', 24: 'v', 25: 'w', 26: 'x', 27: 'y', 28: 'z'}


In [20]:
# hyperparams
block_size = 8
emb_dim_size = 24
batch_size = 32
n_hidden = 128

In [21]:
# build the dataset

def build_dataset(words):  
  X, Y = [], []
  for w in words:

    #print(w)
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix] # crop and append

  X = Tensor(X)
  Y = Tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

(48186, 8) (48186,)
(6032, 8) (6032,)
(6018, 8) (6018,)


In [22]:
class Tanh:
    def __call__(self, x):
        return x.tanh()

In [23]:
class Flatten:
    def __call__(self, x):
        return x.flatten(1)

In [24]:
class FlattenConsecutive:
    def __init__(self, n):
        self.n = n
    def __call__(self, x):
        B,T,C = x.shape
        x = x.reshape(B, T//self.n, C*self.n)
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out

In [25]:
class Permute:
    def __call__(self,x):
        return x.permute(0, 2, 1)


In [26]:
class Net:
    def __init__(self):
        self.layers = [
            nn.Embedding(Xtr.shape[0], emb_dim_size),
            FlattenConsecutive(2), nn.Linear(emb_dim_size * 2, n_hidden, False), Permute(), nn.BatchNorm(n_hidden), Permute(), Tanh(),
            FlattenConsecutive(2), nn.Linear(n_hidden * 2, n_hidden, False), Permute(), nn.BatchNorm(n_hidden), Permute(), Tanh(),
            FlattenConsecutive(2), nn.Linear(n_hidden * 2, n_hidden, False), nn.BatchNorm(n_hidden), Tanh(),
            nn.Linear(n_hidden, 29, True)
        ]

        
    def __call__(self, x):
        # x = x.sequential(self.layers[:3])
        # print(x.shape)
        return x.sequential(self.layers)

In [27]:
net = Net()
total = 0
for i in nn.state.get_parameters(net):
    total += i.flatten().shape[0]
print(total)
optim = AdamW(nn.state.get_parameters(net))

1233424


In [28]:

def step():
    Tensor.training = True  # makes dropout work
    samples = Tensor.randint(batch_size, high=Xtr.shape[0])
    X_batch = Xtr[samples]
    Y_batch = Ytr[samples]

    optim.zero_grad()
    loss = net(X_batch).cross_entropy(Y_batch).backward()
    optim.step()
    return loss

In [35]:
for i in range(10000):
    loss = step()

    if i%100 == 0:
        Tensor.training = False
        print(loss.numpy())

1.8981452
2.0044272
2.4217253
2.0142326
2.5122678
2.0843403
2.347275
2.3635426
2.316171
2.0760093
2.3342733
1.8964343
2.146652
2.186082
2.1055899
2.1626375
2.5722616
2.534362
2.032223
2.1474636
2.1663446
2.643408
1.9913853
2.0416305
2.0619779
1.6857113
2.4917126
1.9697926
2.177324
2.2247188
2.6075158
1.9064679
2.0857282
2.0233216
2.0199776
2.2575374
2.067529
1.9807296
1.7585726
2.3918784
1.9314301
2.445581
2.1862001
2.0029361
2.1630957
2.2055922
2.1368804
2.2260027
2.0444295
1.9243451
2.227556
1.9086945
2.3668857
2.092731
2.135225
2.1342628
2.2186913
2.0585885
1.9389714
1.868244
1.8712974
2.101416
2.0807133
2.1868777
2.1292756
2.2591732
1.6855725
1.8192999
1.9309039
1.6913614
2.143041
1.7433141
2.251768
1.9115396
2.3894868
1.7535976
1.8300927
1.5022217
2.034836
2.0096717
2.0668302
2.28815
1.9789823
2.315409
2.1904526
2.046762
1.4713811
1.9398465
2.102229
2.2177558
2.058537
2.0315976
1.9739704
2.0647452
2.3237562
2.2775993
2.124414
1.83342
2.1001415
1.9998713


In [36]:
net(Xdev).cross_entropy(Ydev).numpy()

array(2.1462042, dtype=float32)

In [37]:
import numpy as np

Tensor.training = False

# sample from the model
for _ in range(20):
    out = []
    context = [0] * block_size  # initialize with all ...

    while True:
        
        # Get model output
        logits = net(Tensor(context))
        probs = logits.softmax()[-1].numpy()  # Convert to NumPy array to handle sampling

        # Sample from the probability distribution using numpy's random.choice
        ix = np.random.choice(len(probs), p=probs)

        # Update context and output
        context = context[1:] + [ix]
        out.append(ix)
        
        # Break if the end-of-sequence token (0) is produced
        if ix == 0:
            break

    # Convert indices to tokens and print the result
    print(''.join(itos[i] for i in out))


huen.
roubra.
lareel.
bettella.
galep.
delize.
johie.
kippi.
belleya.
adelisha.
rorin.
ma.
meishpet.
doral.
lettina.
corry.
wooust.
yuel.
duki.
cathan.
