In [26]:
import torch
import torch.nn.functional as F

In [27]:
from bigram import Bigram

names = open("names.txt", "r").read().splitlines()
print(names[:10])


model = Bigram(names)

num_names = 3

for _ in range(num_names):
    name = ''
    prev_char = 0
    total_loss = 0
    count = 0

    while True:
        char, loss = model(prev_char)
        if loss:
            total_loss += loss
        count += 1
        if char.item() == 0:
            break
        name += (model.itos[char.item()])
        prev_char = char.item()
    print(name)


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
cissiynemseteemoen
judyah
zmeemikeriramirianagwtikinirlfa


In [28]:
# dataset set up
block_size = 3 # context window (in bigram block_size = 1)
vocab = set([ch for name in names for ch in name])
stoi = {ch: i+1 for i, ch in enumerate(sorted(list(vocab)))}
stoi["."] = 0
itos = {i: ch for ch, i in stoi.items()}
vocab_size = len(vocab) + 1

X = []
Y = []

for name in names:
    context = [0] * block_size
    for ch in (name + '.'):
        ix = stoi[ch]
        X.append(context)
        Y.append([ix])
        context = context[1:] + [ix]

X = torch.tensor(X)
Y = torch.tensor(Y)
X, Y

(tensor([[ 0,  0,  0],
         [ 0,  0,  5],
         [ 0,  5, 13],
         ...,
         [26, 26, 25],
         [26, 25, 26],
         [25, 26, 24]]),
 tensor([[ 5],
         [13],
         [13],
         ...,
         [26],
         [24],
         [ 0]]))

In [29]:
# MLP w/out nn.Module

emb_size = 10
hidden_dim = 200

ch2emb = torch.randn((vocab_size, emb_size)) # |V|, emb_size ; lookup table of embeddings for each character

w1 = torch.randn((emb_size * block_size, hidden_dim)) * ((5/3)/(emb_size*block_size)**(1/2)) # kaiming initialization for tanh
# b1 = torch.randn(hidden_dim) # will be "removed" in batch normalization

gamma = torch.ones(hidden_dim)
beta = torch.zeros(hidden_dim)

running_mean = torch.zeros(hidden_dim)
running_std = torch.zeros(hidden_dim)

w2 = torch.randn((hidden_dim, vocab_size)) * 0.1 # reduce the size of the initial logits
b2 = torch.randn(vocab_size) * 0.01

parameters = [ch2emb, w1, w2, b2, gamma, beta]

for param in parameters:
    param.requires_grad = True

In [30]:
# training loop

epochs = 1000
batch_size = 128
epsilon = 1e-5
momentum = 0.1

for epoch in range(epochs):

    # mini batch creation and getting embeddings
    batch_idx = torch.randint(0, X.shape[0], size=(batch_size,))
    emb = ch2emb[X][batch_idx]                                 # len(dataset), block_size, emb_size
    emb = emb.view((-1, emb_size * block_size))     # len(dataset), emb_size * block_size
    ys = Y[batch_idx].view(-1)

    # forward pass
    hidden_emb1 = emb @ w1
    # batch normalization
    bn1_mean_i = hidden_emb1.mean(dim=0, keepdim=True)
    bn1_std_i = (hidden_emb1.std(dim=0, keepdim=True)**2 + epsilon)**(1/2)
    bn1 = gamma * (hidden_emb1 - bn1_mean_i)/bn1_std_i + beta

    # for eval/generating examples
    with torch.no_grad():
        running_mean = (1-momentum) * running_mean + momentum * bn1_mean_i
        running_std = (1-momentum) * running_std + momentum * bn1_std_i
    
    layer1 = bn1.tanh()               # len(dataset), d
    logits = layer1 @ w2 + b2                        # len(dataset), vocab_size

    # negative log likelihood (nll)

    # counts = logits.exp()                            # len(dataset), vocab_size       
    # probs = counts/counts.sum(dim=1, keepdim=True)   # len(dataset), vocab_size
    # loss = -probs[torch.arange(emb.shape[0]), ys].log().mean()
    # print(loss)
    
    # Y_one_hot = F.one_hot(ys, num_classes=vocab_size).view((-1, vocab_size))
    # loss = -(Y_one_hot * probs).sum(dim=1).log().mean()
    # print(loss)

    loss = F.cross_entropy(logits, ys)
    print(loss)

    # backward pass
    for param in parameters:
        param.grad = None
    loss.backward()

    # update step
    lr = 0.05
    for param in parameters:
        param.data += -lr * param.grad




tensor(3.7001, grad_fn=<NllLossBackward0>)
tensor(3.6096, grad_fn=<NllLossBackward0>)
tensor(3.3705, grad_fn=<NllLossBackward0>)
tensor(3.6123, grad_fn=<NllLossBackward0>)
tensor(3.4139, grad_fn=<NllLossBackward0>)
tensor(3.4931, grad_fn=<NllLossBackward0>)
tensor(3.5312, grad_fn=<NllLossBackward0>)
tensor(3.3356, grad_fn=<NllLossBackward0>)
tensor(3.3348, grad_fn=<NllLossBackward0>)
tensor(3.3842, grad_fn=<NllLossBackward0>)
tensor(3.3948, grad_fn=<NllLossBackward0>)
tensor(3.3757, grad_fn=<NllLossBackward0>)
tensor(3.3248, grad_fn=<NllLossBackward0>)
tensor(3.3766, grad_fn=<NllLossBackward0>)
tensor(3.2346, grad_fn=<NllLossBackward0>)
tensor(3.1948, grad_fn=<NllLossBackward0>)
tensor(3.3402, grad_fn=<NllLossBackward0>)
tensor(3.3270, grad_fn=<NllLossBackward0>)
tensor(3.1832, grad_fn=<NllLossBackward0>)
tensor(3.2893, grad_fn=<NllLossBackward0>)
tensor(3.2526, grad_fn=<NllLossBackward0>)
tensor(3.1118, grad_fn=<NllLossBackward0>)
tensor(3.3411, grad_fn=<NllLossBackward0>)
tensor(3.08

In [31]:
# generating samples

num_names = 3

for _ in range(num_names):

    context = [0] * block_size
    name = ''

    while True:
        # forward pass
        emb = ch2emb[context]
        emb = emb.view((1, -1))

        # forward pass
        hidden_emb1 = emb @ w1
        # batch normalization
        bn1 = gamma * (hidden_emb1 - running_mean)/(running_std**2 + epsilon)**(1/2) + beta
        
        layer1 = bn1.tanh()               # len(dataset), d
        logits = layer1 @ w2 + b2                        # len(dataset), vocab_size

        probs = F.softmax(logits, dim=1)
        # sample from the distribution
        ix = torch.multinomial(probs, num_samples=1, replacement=True).item()

        name += itos[ix]
        context = context[1:] + [ix]

        if ix == 0:
            break
    print(name)




shmios.
shmely.
lerldtnferean.
