# Exercise
* Make a bigram count and probability matrix.
* Train a bigram language model in `pytorch` that takes an input character and predicts the next character.
   * Make sure the counts matrix is similar to the count matrix above.
* Get new names by drawing from the probability distribution of the tokens.

In [None]:
# imports
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
%matplotlib inline

In [None]:
# Read `names.txt` as a list of words, one per line.
# The context manager closes the file handle as soon as the contents are read.
with open("../names.txt") as names_file:
    words = names_file.read().splitlines()

In [None]:
# Make the bigram counts matrix
# Vocabulary: "." marks both the start and the end of a word and always gets
# index 0; the remaining tokens are the distinct characters of `words`, sorted.
# A literal "." in the data is folded into the boundary token instead of
# appearing twice in `tokens`.
tokens = ["."] + sorted(set("".join(words)) - {"."})
stoi = {ch: i for i, ch in enumerate(tokens)}  # token -> index
itos = {i: ch for ch, i in stoi.items()}       # index -> token

# Derive the vocabulary size from the data so N always fits every token
# (this is 27 when the words use exactly 26 distinct characters).
v = len(tokens) # vocab_size
N = torch.zeros((v, v), dtype=torch.int32) # count matrix: N[i, j] = times token i is followed by token j

# Get bigrams: count every adjacent pair, including the "." boundaries.
for w in words:
    chars = "." + w + "."
    for ch1, ch2 in zip(chars, chars[1:]):
        N[stoi[ch1], stoi[ch2]] += 1

In [None]:
# Draw the count matrix as a heat map; every cell is labelled with its bigram
# (upper text) and the number of times that bigram was counted (lower text).
n_rows, n_cols = N.shape  # follow the matrix size rather than a literal 27
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(n_rows):
    for j in range(n_cols):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
plt.axis('off');

In [None]:
# Generate the probability matrix.
# Add-one smoothing: every bigram count is raised by 1 before normalizing, so
# no entry of P is exactly zero.
smoothed_counts = (N + 1).float()
# Divide each row by its own total so that every row of P sums to 1.
P = smoothed_counts / smoothed_counts.sum(dim=1, keepdim=True)

In [None]:
# Demonstration of how to sample from a distribution.
# A seeded generator makes both the random weights and the draws reproducible.
g = torch.Generator().manual_seed(2147483647)
demo_weights = torch.rand(3, generator=g)  # three non-negative, unnormalized weights
# Draw 100 indices (with replacement) in proportion to the weights.
torch.multinomial(demo_weights, num_samples=100, replacement=True, generator=g)

In [None]:
# Generate new words by repeatedly drawing from the rows of P.
# Each word starts at the "." token (index 0): draw the next index from
# P[ix], append its character, and stop once "." is drawn again.
g = torch.Generator().manual_seed(2147483647)
ix = 0  # ends every word back at 0, so each new word starts from "."
num_words = 5

for _ in range(num_words):
    pieces = []
    while True:
        ix = torch.multinomial(P[ix], 1, replacement=True, generator=g).item()
        pieces.append(itos[ix])
        if ix == 0:
            break
    word = "".join(pieces)
    print(word)

In [None]:
# NLL (negative log-likelihood)
# Sample the same words as in the previous cell, and for each one accumulate
# -log(p) over every transition that was drawn. The printed score is that sum
# divided by the number of characters appended (one per transition).
g = torch.Generator().manual_seed(2147483647)
ix = 0
num_words = 5

for _ in range(num_words):
    pieces = []
    nll = 0.0
    while True:
        prev_ix = ix
        ix = torch.multinomial(P[prev_ix], 1, replacement=True, generator=g).item()
        pieces.append(itos[ix])
        nll = nll - torch.log(P[prev_ix, ix])
        if ix == 0:
            break
    word = "".join(pieces)
    n_chars = len(word)
    print(word, (nll / n_chars).item())

In [None]:
# Score how word-like the training words are under P: -sum(log(p)).
## Result should be:
#
#log_likelihood=tensor(-559951.5625)
#nll=tensor(559951.5625)
#2.4543561935424805
log_likelihood = 0
# Add up the log-probability of every bigram of every word in `words`,
# with "." padded on both ends of each word.
for w in words:
    padded = "." + w + "."
    for prev_ch, next_ch in zip(padded, padded[1:]):
        log_likelihood = log_likelihood + torch.log(P[stoi[prev_ch], stoi[next_ch]])
# Each word contributes len(w) + 1 bigrams once the "." padding is included.
n = sum(len(w) + 1 for w in words)
print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
# Average negative log-likelihood per bigram.
print(f'{nll/n}')

In [None]:
# Now for the Neural Network
# Plan:
#   input token index              -> xs
#   target (next) token index      -> ys
#   one-hot encode xs              -> xenc
#   initialize the weight matrix   -> W
#   xenc @ W                       -> logits
#   logits.exp()                   -> counts
#   counts / counts.sum(1, keepdim=True) -> probs
#   mean of -log(probs at the target index) -> loss

# Collect every (current, next) index pair. Zipping "." + w against w + "."
# walks the same adjacent pairs as sliding over "." + w + ".".
pairs = [
    (stoi[cur], stoi[nxt])
    for w in words
    for cur, nxt in zip("." + w, w + ".")
]
xs = torch.tensor([cur_ix for cur_ix, _ in pairs])
ys = torch.tensor([nxt_ix for _, nxt_ix in pairs])
num = xs.nelement()  # number of training bigrams

# Weights: a (v, v) matrix of uniform random values drawn from a seeded
# generator, tracked by autograd.
g = torch.Generator().manual_seed(2147483647)
W = torch.rand((v, v), generator=g, requires_grad=True)

In [None]:
# Train the weight matrix W with full-batch gradient descent.
step = 70.0   # learning rate
epochs = 200  # number of passes over all bigrams

# Onehot encoding of input. It does not depend on W, so build it once instead
# of once per epoch.
xenc = F.one_hot(xs, num_classes=v).float()

for epoch in range(epochs):
    # Forward pass: exponentiate the logits and normalize each row to get
    # next-token probabilities.
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    # Mean negative log-probability assigned to the true next token (ys).
    loss = -probs[torch.arange(num), ys].log().mean()
    print(f"{epoch=} | {loss.item()=}")

    # Backward pass
    W.grad = None  # clear the old gradient; backward() would accumulate into it
    loss.backward()

    # Update weights. no_grad() keeps the in-place step out of the autograd
    # graph without going through the `.data` attribute.
    with torch.no_grad():
        W -= step * W.grad

In [None]:
# Randomly sampling names from neural network
g = torch.Generator().manual_seed(2147483647)

num_word = 5
new_words = []

# Sampling needs no gradients, so stop autograd from tracking the ops on W.
with torch.no_grad():
    for _ in range(num_word):
        # `word` collects the characters of the generated name
        word = []
        ix = 0  # index of ".", the token every name starts from

        while True:
            # Same forward pass as in training, for a single input token.
            # num_classes uses `v`, matching the training cell.
            xenc = F.one_hot(torch.tensor([ix]), num_classes=v).float()
            logits = xenc @ W
            counts = logits.exp()
            probs = counts / counts.sum(1, keepdim=True)
            # Randomly draw the next token index from the predicted distribution
            ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            word.append(itos[ix])
            if ix == 0:
                # "." was drawn again: the name is complete
                new_words.append("".join(word))
                break
print(new_words)