In [9]:
import torch
import torch.nn.functional as F

In [10]:
# Read the dataset: one name per line, with the line terminators stripped.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until garbage collection.
with open('names.txt', 'r') as f:
  words = f.read().splitlines()

In [11]:
# Show the first ten entries as a quick sanity check of the loaded list.
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [12]:
# Character vocabulary: every distinct character found in the names, sorted.
chars = sorted(set(''.join(words)))
# Character -> integer id, numbering the vocabulary from 1 upward ...
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
# ... and giving the '.' marker id 0.
stoi['.'] = 0
# Inverse lookup: integer id -> character.
itos = {idx: ch for ch, idx in stoi.items()}

In [13]:
# Build the bigram training set: xs[i] is the id of a character and ys[i] is
# the id of the character that immediately follows it.
xs, ys = [], []

for w in words:
  # surround the name with '.' so the first and last letters also form bigrams
  padded = '.' + w + '.'
  for prev_ch, next_ch in zip(padded, padded[1:]):
    xs.append(stoi[prev_ch])
    ys.append(stoi[next_ch])

# convert the id lists to tensors and report how many bigrams were collected
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.numel()
print('number of examples: ', num)

# initialize the network: a single 27x27 weight matrix drawn from a seeded
# generator, tracked by autograd
g = torch.Generator().manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

number of examples:  228146


### Regularization

In [14]:
# Mean of the squared weights: the quantity the regularization term scales.
W.pow(2).mean()

tensor(0.9665, grad_fn=<MeanBackward0>)

alpha = 0.01

`alpha * (W ** 2).mean()` is the regularization term that gets added to the loss; it penalizes large weights and so pulls `W` toward zero.

`alpha` plays the same role as the `incr` count used for model smoothing in the count-based bigram model: the larger it is, the more uniform the predictions become.

incr = 1

P = (N + incr).float()

P /= P.sum(1, keepdim=True)

In [15]:
# gradient descent on the single weight matrix W
num_steps = 100
learning_rate = 50
alpha = 0.01 # regularization strength: weight of the mean-squared-W penalty

# The inputs never change between steps, so the one-hot encoding and the row
# index used to pick out the target probabilities are built once, up front.
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
rows = torch.arange(num)

for k in range(num_steps):
    # forward pass
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdim=True) # probabilities for next character
    # average negative log-likelihood of the targets, plus the penalty on W
    loss = -probs[rows, ys].log().mean() + alpha * (W ** 2).mean()
    print(loss.item())

    # backward pass
    W.grad = None # discard the gradient left over from the previous step
    loss.backward()

    # update: step W in place without recording the operation in autograd
    with torch.no_grad():
        W -= learning_rate * W.grad

3.7686190605163574
3.3788065910339355
3.161090850830078
3.0271859169006348
2.9344842433929443
2.867231607437134
2.8166539669036865
2.777146339416504
2.7452542781829834
2.7188303470611572
2.696505546569824
2.6773722171783447
2.6608052253723145
2.6463515758514404
2.633665084838867
2.622471570968628
2.6125476360321045
2.6037068367004395
2.595794916152954
2.5886809825897217
2.5822560787200928
2.5764293670654297
2.5711236000061035
2.5662729740142822
2.5618226528167725
2.5577261447906494
2.5539438724517822
2.550442695617676
2.5471930503845215
2.5441696643829346
2.5413525104522705
2.538721799850464
2.536262035369873
2.5339581966400146
2.5317976474761963
2.5297679901123047
2.527859926223755
2.5260636806488037
2.5243704319000244
2.522773265838623
2.52126407623291
2.519836902618408
2.5184857845306396
2.5172054767608643
2.515990734100342
2.5148375034332275
2.5137407779693604
2.512698173522949
2.511704921722412
2.5107581615448
2.509854793548584
2.5089924335479736
2.5081682205200195
2.5073804855346

In [16]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

# Sampling only needs forward passes through W, so autograd tracking is
# switched off; the probabilities (and therefore the samples) are unchanged.
with torch.no_grad():
  for i in range(5):

    out = []
    ix = 0 # every sample starts from index 0
    while True:

      # ----------
      # BEFORE:
      #p = P[ix]
      # ----------
      # NOW:
      xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
      logits = xenc @ W # predict log-counts
      counts = logits.exp() # counts, equivalent to N
      p = counts / counts.sum(1, keepdim=True) # probabilities for next character
      # ----------

      # draw the next character index from the predicted distribution
      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      out.append(itos[ix])
      if ix == 0: # index 0 was drawn again: the sample is complete
        break
    print(''.join(out))

cexze.
momasurailezityha.
konimittain.
llayn.
ka.


<img src="../public/bigram sampling.png" style="width:600px" />

You can see that our NN implementation generates the same output as the bigram model, but this approach is much more flexible.