In [32]:
import torch
import torch.nn.functional as F

In [33]:
from bigram import Bigram

# Load the training corpus: one name per line of names.txt.
# The context manager closes the file handle as soon as the contents are read
# (the bare open(...).read() form leaves it open until garbage collection).
with open("names.txt", "r") as names_file:
    names = names_file.read().splitlines()
print(names[:10])


# Build the bigram model from the loaded names, then sample a few names from it.
model = Bigram(names)

num_names = 3  # how many names to generate

for _ in range(num_names):
    # Per-name state. Generation is seeded with index 0
    # (presumably Bigram's start/end token -- confirm in bigram.py).
    name, prev_char = '', 0
    # Running loss / step counters; tracked here but not reported by this cell.
    total_loss, count = 0, 0

    while True:
        # The model maps the previous character index to (next index, loss).
        char, loss = model(prev_char)
        # Only accumulate when the model hands back a truthy loss.
        if loss:
            total_loss += loss
        count += 1
        # Index 0 terminates the current name.
        if char.item() == 0:
            break
        name += model.itos[char.item()]
        prev_char = char.item()
    print(name)


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
kabace
hrinnsheud
s


In [34]:
# MLP w/out nn.Module
#
# Defines the character vocabulary and the raw tensors of a one-hidden-layer MLP:
#   embedding lookup -> tanh hidden layer -> logits over the vocabulary.

block_size = 3     # context window (in bigram block_size = 1)
emb_size = 3       # dimensionality of each character embedding
hidden_size = 100  # hidden-layer width; w1, b1 and w2 must all agree on it

# Vocabulary: every distinct character in the corpus gets an index >= 1
# (in sorted order); index 0 is reserved for the "." token.
vocab = {ch for name in names for ch in name}
stoi = {ch: i + 1 for i, ch in enumerate(sorted(vocab))}
stoi["."] = 0
itos = {i: ch for ch, i in stoi.items()}
vocab_size = len(vocab) + 1

# Dedicated seeded generator so the initial weights are the same on every run
# without touching torch's global RNG state.
init_rng = torch.Generator().manual_seed(2147483647)

# |V|, emb_size ; lookup table of embeddings for each character
ch2emb = torch.randn((vocab_size, emb_size), generator=init_rng)
# Hidden layer: consumes the block_size embeddings concatenated into one row.
w1 = torch.randn((emb_size * block_size, hidden_size), generator=init_rng)
b1 = torch.randn(hidden_size, generator=init_rng)
# Output layer: one logit per vocabulary entry.
w2 = torch.randn((hidden_size, vocab_size), generator=init_rng)
b2 = torch.randn(vocab_size, generator=init_rng)

parameters = [ch2emb, w1, b1, w2, b2]

# Turn on gradient tracking for every trainable tensor.
for param in parameters:
    param.requires_grad_(True)

In [35]:
# dataset set up
# Slide a block_size-wide window over every name (terminated with '.'):
# each window of character indices is one input row, and the index of the
# character that follows it is the matching target.
contexts, targets = [], []

for name in names:
    # Every name starts from a context made of block_size copies of index 0.
    context = [0] * block_size
    for ch in name + '.':
        ix = stoi[ch]
        contexts.append(context)
        targets.append([ix])  # one-element rows, so Y ends up with shape (N, 1)
        # Drop the oldest index and append the newest (builds a new list,
        # so rows already stored in `contexts` are not affected).
        context = [*context[1:], ix]

X = torch.tensor(contexts)
Y = torch.tensor(targets)
X, Y

(tensor([[ 0,  0,  0],
         [ 0,  0,  5],
         [ 0,  5, 13],
         ...,
         [26, 26, 25],
         [26, 25, 26],
         [25, 26, 24]]),
 tensor([[ 5],
         [13],
         [13],
         ...,
         [26],
         [24],
         [ 0]]))

In [None]:
# training loop
# Plain mini-batch SGD over the MLP parameters. Each iteration samples one
# random batch, so `epochs` counts gradient steps rather than dataset passes.

epochs = 100
batch_size = 32
lr = 0.1  # constant step size, so it is set once instead of on every iteration

for epoch in range(epochs):

    # mini batch creation and getting embeddings
    # Select the batch rows first and embed only those. Indexing ch2emb[X]
    # before selecting would embed the whole dataset on every step just to
    # keep batch_size rows; the resulting values are identical.
    batch_idx = torch.randint(0, X.shape[0], size=(batch_size,))
    emb = ch2emb[X[batch_idx]]                       # batch_size, block_size, emb_size
    emb = emb.view((-1, emb_size * block_size))      # batch_size, emb_size * block_size
    ys = Y[batch_idx].view(-1)                       # batch_size

    # forward pass
    layer1 = torch.tanh(emb @ w1 + b1)               # batch_size, d
    logits = layer1 @ w2 + b2                        # batch_size, vocab_size

    # negative log likelihood (nll)
    # F.cross_entropy computes the mean NLL directly from the logits.
    # Equivalent manual formulations, kept for reference:
    #   counts = logits.exp()                                   # batch_size, vocab_size
    #   probs = counts / counts.sum(dim=1, keepdim=True)        # batch_size, vocab_size
    #   loss = -probs[torch.arange(emb.shape[0]), ys].log().mean()
    # or, with one-hot targets:
    #   Y_one_hot = F.one_hot(ys, num_classes=vocab_size).view((-1, vocab_size))
    #   loss = -(Y_one_hot * probs).sum(dim=1).log().mean()
    loss = F.cross_entropy(logits, ys)
    print(loss)

    # backward pass
    # Clear stale gradients before backpropagating the new loss.
    for param in parameters:
        param.grad = None
    loss.backward()

    # update step
    # In-place SGD step under no_grad so the update itself is not recorded by
    # autograd (preferred over writing through the legacy `.data` attribute).
    with torch.no_grad():
        for param in parameters:
            param -= lr * param.grad




tensor(13.2969, grad_fn=<NllLossBackward0>)
tensor(11.3519, grad_fn=<NllLossBackward0>)
tensor(10.9570, grad_fn=<NllLossBackward0>)
tensor(13.7393, grad_fn=<NllLossBackward0>)
tensor(7.7434, grad_fn=<NllLossBackward0>)
tensor(7.9060, grad_fn=<NllLossBackward0>)
tensor(9.7999, grad_fn=<NllLossBackward0>)
tensor(8.6726, grad_fn=<NllLossBackward0>)
tensor(10.7075, grad_fn=<NllLossBackward0>)
tensor(8.9918, grad_fn=<NllLossBackward0>)
tensor(7.5470, grad_fn=<NllLossBackward0>)
tensor(7.0939, grad_fn=<NllLossBackward0>)
tensor(7.2071, grad_fn=<NllLossBackward0>)
tensor(7.6064, grad_fn=<NllLossBackward0>)
tensor(8.5738, grad_fn=<NllLossBackward0>)
tensor(9.3269, grad_fn=<NllLossBackward0>)
tensor(8.0478, grad_fn=<NllLossBackward0>)
tensor(9.8177, grad_fn=<NllLossBackward0>)
tensor(6.3430, grad_fn=<NllLossBackward0>)
tensor(6.9338, grad_fn=<NllLossBackward0>)
tensor(8.0216, grad_fn=<NllLossBackward0>)
tensor(7.8999, grad_fn=<NllLossBackward0>)
tensor(9.0670, grad_fn=<NllLossBackward0>)
tensor