In [1]:
import torch
import torch.nn.functional as F
import math

import matplotlib.pyplot as plt

In [2]:
names = open('names.txt').read().splitlines()

In [3]:
def generate_ngrams(n, names_with_tokens):
    ngrams = []
    for w in names_with_tokens:
        for zps in zip(*(w[i:] for i in range(n))):
            ngrams.append(''.join([*zps]))
    return ngrams

In [4]:
chars = set(''.join(names))
chars.add('.')
n_chars = len(chars)

stoi = {s:i for i,s in enumerate(sorted(chars))}
itos = {i:s for s,i in stoi.items()}

### create training set of ngrams

#### trigram

In [5]:
window_size = 3

In [38]:
names_with_tokens = ['.'+name+'.' for name in names]
xs, ys = [], []

for tg in generate_ngrams(window_size, names_with_tokens):
    c1,c2,c3 = tg
    xs.append([stoi[c1], stoi[c2]])
    ys.append(stoi[c3])
    
x = torch.tensor(xs)
y = torch.tensor(ys)
xenc = F.one_hot(x, num_classes=n_chars).float()

In [39]:
batch_size = 1000
def forward(W, generator=None):
    batch_ix = torch.randint(high=xenc.shape[0]-1, size=(batch_size,), generator=generator)
    
    logits = xenc[batch_ix].view(batch_size, -1) @ W.view(-1, n_chars)
    probs = logits.softmax(1)
    preds = probs[torch.arange(batch_size), y[batch_ix]]
    loss = -preds.log().mean()
    return loss

In [64]:
G = torch.Generator().manual_seed(5)

W = torch.randn((window_size-1, n_chars, n_chars), requires_grad=True, generator=G)
n_epochs = 1000

for t in range(n_epochs): 
    loss = forward(W, generator=G)
    W.grad = None
    loss.backward()
    W.data += -5*math.exp(-t/n_epochs) * W.grad
loss = forward(W, generator=G)

In [65]:
loss.item()

2.2396860122680664

What should the loss be?

- Completely random gives -log(1/27) = 3.296 which is also equal to forward( torch.zeros(W.shape) )

## Predict a name

In [66]:
first_letters = torch.zeros(n_chars)
for n in names:
    ix = stoi[n[0]]
    first_letters[ix] += 1
first_letter_probs = first_letters/first_letters.sum()

In [67]:
def predict_name():
    # select a starting letter according to the distribution of starting letters
    name_start = '.'+itos[torch.multinomial(first_letter_probs, 1).item()]

    predicted_name = ''
    predicted_name += name_start
    while True:
        inp_ix = [stoi[i] for i in predicted_name[-2:]]
        inp_enc = F.one_hot(torch.tensor(inp_ix), n_chars).float()

        logits = torch.zeros((1, n_chars))
        for ineuron in range(window_size-1):
            logits += inp_enc[ineuron, :] @ W[ineuron]
        probs = logits.softmax(1)
        prediction = itos[torch.multinomial(probs, 1).item()]
        predicted_name += prediction
        if prediction == '.':
            break
        
    return predicted_name

In [75]:
for _ in range(10):
    p = predict_name()
    print(p)

.vayl.
.eri.
.jujaleyn.
.hamapiramikacdynn.
.gh.
.elassleronnaimajasri.
.dovan.
.da.
.ar.
.bar.


In [70]:
# some nice ones:

# sahanniah, catmarisona, moriganna, swadish, zaria, nuaster, nahrienela, jazden, vayl

# some existing words
# dan, mary, ass, die, ai