In [None]:
# Exercises:
# E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [473]:
words = open("makemore-part-1/names.txt", "r").read().splitlines()

len(words)

32033

In [474]:
## Trigram model below

trigram = {}

for w in words:
    # initializing with .. helps in creating tensor below and initializing with 0 in the loop for populating the next words. Ending does not matter.
    w = ".." + w + "."
    for i in range(len(w) - 2):
        t = (w[i:i+2], w[i+2])
        trigram[t] = trigram.get(t, 0) + 1

In [475]:
# here we have to prepare a tensor such that we store the trigram data model.
# the rows are all combinations of "..", ".a", ".b"...., "zz"
# the cols are all combinations of ".", "a", "b"...., "z"

import torch
import itertools
from matplotlib.colors import Normalize

characters = ["."] + [chr(c) for c in range(ord('a'), ord('z') + 1)]

# Generate all possible two-character combinations for row labels
row_labels = sorted("".join(pair) for pair in itertools.product(characters, repeat=2))

print(len(row_labels))

row_label_map = {label: idx for idx, label in enumerate(row_labels)}

col_labels = characters

col_label_map = {label: idx for idx, label in enumerate(col_labels)}

# creating a tensor of 1 so all combinations have a minimum value and does not result in exceptions while creating the words
N = torch.ones((len(row_labels), len(col_labels)), dtype=torch.int32)

# Populate the tensor with real values from the trigram
for (row_key, col_key), count in trigram.items():
    row_idx = row_labels.index(row_key)
    col_idx = col_labels.index(col_key)
    N[row_idx, col_idx] = N[row_idx, col_idx] + count

729


In [476]:

g = torch.Generator().manual_seed(234243422)

N = N.float()
probs = N / N.sum(dim=1, keepdim=True)

for i in range(5):
    out = []
    next_row_label_ix = 0
    
    while True:
        p = probs[next_row_label_ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

        if ix == 0:
            break

        if ix >= len(col_labels):
            print(ix)
            
        out.append(col_labels[ix])
        
        next_row_label_ix = row_label_map.get(''.join(out[-2:] if len(out) >= 2 else ".."))
    print("".join(out))

nra
cjppaprallee
lrayah
draiah
bdon


In [477]:
# until this point, we have a model (via probabilities) that generates the next name based on trigram, but we need to find out the loss of such a model
# the code in this section finds out the loss

log_likelihood = 0.0
n = 0

for w in words:
    # initializing with .. helps in creating tensor below and initializing with 0 in the loop for populating the next words. Ending does not matter.
    w = ".." + w + "."
    for i in range(len(w) - 2):
        t = (w[i:i+2], w[i+2])
        prob = probs[row_label_map[t[0]]][col_label_map[t[1]]]
        n += 1
        logprob = torch.log(prob)
        log_likelihood += logprob

print(f"{log_likelihood=}")

# since log likelihood is negative and as general convention, we are making it positive
# log likelihood will always be negative because probabilities are less than 1 and the log of all these are going be negative numbers.
nll = -log_likelihood

# averaging it out
nll = nll/n

print(f"{nll=}")

log_likelihood=tensor(-504653.)
nll=tensor(2.2120)


In [525]:
#neural network


xs, ys = [], []

for w in words:
    w = ".." + w + "."
    for i in range(len(w) - 2):
        t = (w[i:i+2], w[i+2])
        xs.append(row_label_map[t[0]])
        ys.append(col_label_map[t[1]])

xs = torch.tensor(xs)
ys = torch.tensor(ys)

num_elements = xs.nelement()

In [517]:
import torch.nn.functional as F

In [397]:
g = torch.Generator().manual_seed(234243422)
W = torch.randn((729, 27), generator=g, requires_grad=True)

In [500]:
for k in range(10):
    xenc = F.one_hot(xs, num_classes=729).float()
    logits = (xenc @ W).exp()  # log counts
    counts = logits.exp()
    probs = counts/counts.sum(dim=1, keepdims=True)

    loss = -probs[torch.arange(num_elements), ys].log().mean() + 0.01*(W**2).mean()

    print(loss.item())

    W.grad = None
    loss.backward()

    W.data += -0.1*W.grad

2.3546183109283447
2.3546178340911865
2.354617118835449
2.354616641998291
2.354616165161133
2.3546154499053955
2.354614734649658
2.3546142578125
2.3546135425567627
2.3546133041381836


In [516]:
g = torch.Generator().manual_seed(234243422)
for i in range(5):
    out = []
    next_row_label_ix = 0
    
    while True:
        xenc = F.one_hot(torch.tensor([next_row_label_ix]), num_classes=729).float()
        logits = (xenc @ W).exp()  # log counts
        counts = logits.exp()
        p = counts/counts.sum(dim=1, keepdims=True)
        
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

        if ix == 0:
            break

        if ix >= len(col_labels):
            print(ix)
            
        out.append(col_labels[ix])
        
        next_row_label_ix = row_label_map.get(''.join(out[-2:] if len(out) >= 2 else ".."))
    print("".join(out))

nrxqooppaphaws
gbyrayajadraiah
pdrey
mriquoanmflon



In [529]:
# E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

from torch.utils.data import random_split

total_dataset_len = len(words)
train_dataset_len = int(0.8*total_dataset_len)
dev_dataset_len = int(0.1*total_dataset_len)
test_dataset_len = total_dataset_len - train_dataset_len - dev_dataset_len

train_set, dev_set, test_set = random_split(words, [train_dataset_len, dev_dataset_len, test_dataset_len])

In [530]:
# now retraining the model with just the training dataset
xs, ys = [], []

for w in train_set:
    w = ".." + w + "."
    for i in range(len(w) - 2):
        t = (w[i:i+2], w[i+2])
        xs.append(row_label_map[t[0]])
        ys.append(col_label_map[t[1]])

xs = torch.tensor(xs)
ys = torch.tensor(ys)

num_elements = xs.nelement()

g = torch.Generator().manual_seed(234243422)
W = torch.randn((729, 27), generator=g, requires_grad=True)

In [537]:
for k in range(100):
    xenc = F.one_hot(xs, num_classes=729).float()
    logits = (xenc @ W).exp()  # log counts
    counts = logits.exp()
    probs = counts/counts.sum(dim=1, keepdims=True)

    loss = -probs[torch.arange(num_elements), ys].log().mean() + 0.01*(W**2).mean()

    print(loss.item())

    W.grad = None
    loss.backward()

    W.data += -0.1*W.grad

2.418052911758423
2.4180514812469482
2.4180502891540527
2.418048858642578
2.4180471897125244
2.418046236038208
2.4180445671081543
2.4180431365966797
2.418041944503784
2.4180405139923096
2.418039083480835
2.4180378913879395
2.4180362224578857
2.418034791946411
2.4180335998535156
2.418032169342041
2.4180305004119873
2.418029546737671
2.4180281162261963
2.418026924133301
2.418025255203247
2.4180238246917725
2.418022632598877
2.4180212020874023
2.4180195331573486
2.418018341064453
2.4180171489715576
2.418015480041504
2.4180142879486084
2.4180126190185547
2.418011426925659
2.4180099964141846
2.41800856590271
2.4180073738098145
2.41800594329834
2.4180045127868652
2.4180033206939697
2.418001890182495
2.4180002212524414
2.417999029159546
2.4179975986480713
2.417996406555176
2.417994976043701
2.4179933071136475
2.417992115020752
2.4179906845092773
2.4179890155792236
2.417987823486328
2.4179866313934326
2.417985439300537
2.4179837703704834
2.417982578277588
2.417980909347534
2.4179797172546387
2