# Probabilistic bigram model
__Objective__: Make new names from bigram statistics.
* Construct bigram frequencies and probabilities found in all names.
* Sample from bigrams using `torch.Generator`
* Implement the NLL loss function to evaluate name-likelihood of new names.

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the training names, one per line.  The context manager guarantees the
# file handle is closed, and the explicit encoding keeps the result
# independent of the platform's default locale.
with open("../names.txt", encoding="utf-8") as names_file:
    words = names_file.read().splitlines()

In [None]:
# A bigram is a pair of adjacent characters.  Each name is wrapped in "."
# markers so that the first and last letters also take part in a bigram.
b = {}  # maps (first_char, second_char) -> number of occurrences
for name in words:
    padded = "." + name + "."
    for pair in zip(padded, padded[1:]):
        b[pair] = b.get(pair, 0) + 1

In [None]:
# Build the bigram count matrix.
# N[stoi[a], stoi[b]] holds how many times character `a` is immediately
# followed by character `b`.  Index 0 is reserved for the "." marker that
# pads the start and end of every name.

chars = sorted(set(''.join(words)))  # vocabulary: every distinct character in the names
stoi = {c: i for i, c in enumerate(['.'] + chars)}  # character -> index
itos = {i: c for c, i in stoi.items()}              # index -> character

vocab_size = len(stoi)
N = torch.zeros((vocab_size, vocab_size), dtype=torch.int32)

for word in words:
    # Use a separate name for the padded word so the vocabulary stored in
    # `chars` is not overwritten by the last word's characters.
    padded = "." + word + "."
    for ch1, ch2 in zip(padded, padded[1:]):
        N[stoi[ch1], stoi[ch2]] += 1

In [None]:
# Heat-map of the bigram counts; every cell is annotated with the bigram it
# represents and the number of times that bigram occurs.
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
# Take the grid size from N itself rather than hard-coding 27, so the plot
# keeps working if the vocabulary size changes.
n_rows, n_cols = N.shape
for i in range(n_rows):
    for j in range(n_cols):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
plt.axis('off');

In [None]:
# Row 0 of N counts the bigrams that start with ".", i.e. the first letters
# of names.  Normalising it gives the distribution over first letters.
p = N[0].float()  # .float() yields a new tensor, so N itself is untouched
p /= p.sum()
p

In [None]:
# Draw a single character index from the distribution `p`.  The generator is
# seeded so the draw is reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
itos[ix]

In [None]:
# Small demonstration of torch.multinomial: build a random 3-way probability
# vector, then draw 100 indices from it with a seeded generator.
g = torch.Generator().manual_seed(2147483647)
p = torch.rand(3, generator=g)
p = p / p.sum()
# A bare expression in the middle of a cell is never displayed, so print the
# distribution explicitly.
print(p)
torch.multinomial(p, num_samples=100, replacement=True, generator=g)

In [None]:
# Add-one smoothing of the count matrix: every bigram gets a count of at
# least 1, so no probability is exactly zero and the NLL loss (which takes
# log(p)) can never become infinite.
# Each row is then normalised to sum to 1, turning counts into probabilities.
smoothed_counts = (N + 1).float()
P = smoothed_counts / smoothed_counts.sum(dim=1, keepdim=True)

In [None]:
# Sanity check: every row of P must sum to 1.  Compare with a tolerance,
# because a float32 row sum is not guaranteed to be exactly 1.0, and check
# all rows rather than only the first.
assert torch.allclose(P.sum(dim=1), torch.ones(P.shape[0]))

In [None]:
# Heat-map coloured by the raw counts in N; every cell is annotated with the
# bigram it represents and the smoothed probability P[i, j] to 3 decimals.
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
# Take the grid size from P itself rather than hard-coding 27, so the plot
# keeps working if the vocabulary size changes.
n_rows, n_cols = P.shape
for i in range(n_rows):
    for j in range(n_cols):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, f"{P[i, j].item():.3f}", ha="center", va="top", color='gray')
plt.axis('off');

In [None]:
# Generate names by walking the bigram table: start at the "." row, sample
# the next character from the current character's row of P, and stop once
# "." is drawn again.
g = torch.Generator().manual_seed(2147483647)

num_word = 5
new_words = []

for _ in range(num_word):
    word = []  # characters of the name being generated (ends with ".")
    ix = 0     # index of ".", the start-of-name marker

    while True:
        # Row `ix` of P is the distribution over the character that follows.
        ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
        word.append(itos[ix])
        if ix == 0:
            break
    new_words.append("".join(word))

In [None]:
# Display the list of generated names.
new_words

In [None]:
# Score how well the bigram model explains a set of names using the negative
# log-likelihood: nll = -sum(log(p)) over every bigram, reported both as a
# total and as an average per bigram (lower is better).
#
# Expected output on names.txt:
#   log_likelihood=tensor(-559951.5625)
#   nll=tensor(559951.5625)
#   2.4543561935424805
log_likelihood = 0
n = 0  # number of bigrams scored
# Accumulate the log-probability of every bigram of every name in `words`.
for w in words:
    padded = "." + w + "."
    for prev_ch, next_ch in zip(padded, padded[1:]):
        log_likelihood += P[stoi[prev_ch], stoi[next_ch]].log()
        n += 1
nll = -log_likelihood
print(f'{log_likelihood=}')
print(f'{nll=}')
print(f'{nll/n}')

# Neural Network based solution
We can try to arrive at the previous solution using a neural network.

The loose framework is stoi[ch1] -> encode -> NN -> N[ix1] -> max(P[ix1]) -> itos(ix2)

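We need a set of weights that converts the encoded ix1 into an analogue of N[ix1].
We call the raw network outputs logits (log-counts); exponentiating them gives count-like values that play the role of N[ix1].
Use softmax (exponentiate, then normalise each row) to convert the logits into P[ix1].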

In [None]:
# Training pairs for a single name (the first one in `words`): xs holds the
# index of each character and ys the index of the character that follows it.
chars = list("." + words[0] + ".")
xs = [stoi[ch] for ch in chars[:-1]]
ys = [stoi[ch] for ch in chars[1:]]

In [None]:
# Convert the list of input character indices into a tensor.
xs = torch.tensor(xs)

In [None]:
# Convert the list of label (next-character) indices into a tensor.
ys = torch.tensor(ys)

In [None]:
# Randomly initialise the 27x27 weight matrix from a standard normal
# distribution.  The generator is seeded for reproducibility, and
# requires_grad=True makes autograd track W.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [None]:
# Forward pass of the single-layer network.
# One-hot encode the input indices so they can be multiplied with W.
xenc = F.one_hot(xs, num_classes=N.shape[0]).float()
# Multiplying a one-hot row by W yields the logits (log-counts) for that input.
logits = torch.matmul(xenc, W)
# Softmax by hand: exponentiate to get positive "counts", then normalise
# every row so it sums to 1.
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)

In [None]:
# Display the predicted next-character probabilities (one row per input in xs).
probs

In [None]:
# Walk through every training example and show how its contribution to the
# loss is obtained: the probability the network assigns to the correct next
# character, its log, and the negative log-likelihood.
# The number of examples is taken from xs instead of a hard-coded 5, so this
# keeps working when the example word has a different length.
num_examples = xs.nelement()
nlls = torch.zeros(num_examples)
for i in range(num_examples):
    # i-th bigram:
    x = xs[i].item()  # input character index
    y = ys[i].item()  # label character index
    print('--------')
    print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
    print('input to the neural net:', x)
    print('output probabilities from the neural net:', probs[i])
    print('label (actual next character):', y)
    p = probs[i, y]  # probability assigned to the correct next character
    print('probability assigned by the net to the the correct character:', p.item())
    logp = torch.log(p)
    print('log likelihood:', logp.item())
    nll = -logp
    print('negative log likelihood:', nll.item())
    nlls[i] = nll

print('=========')
# The loss is the mean of the per-example negative log-likelihoods.
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

In [None]:
# Another way of arriving at this.


# Putting it all together

In [None]:
# Build the full training set from every name in `words`: xs holds the index
# of each character and ys the index of the character that follows it.
xs, ys = [], []
for w in words:
    padded = "." + w + "."
    xs.extend(stoi[ch] for ch in padded[:-1])
    ys.extend(stoi[ch] for ch in padded[1:])
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# Randomly initialise the 27x27 weight matrix with a seeded generator so
# training is reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)



In [None]:
# Train the single-layer bigram network with full-batch gradient descent.
# params
epochs = 250
step = 70     # learning rate
lamda = 0.01  # weight of the L2 penalty on W

# The one-hot encoding of the inputs does not change between epochs, so
# build it once instead of on every iteration.
xenc = F.one_hot(xs, num_classes=W.shape[0]).float()

for epoch in range(epochs):
    # Forward pass: logits -> exp -> row-normalise (softmax)
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    # Average NLL of the correct next characters, plus a regularization term
    # that pulls W toward zero (i.e. toward uniform probabilities) and so
    # acts like a label smoother.
    loss = -probs[torch.arange(num), ys].log().mean() + lamda * (W**2).mean()
    print(f"{epoch=} loss={loss.item()}")
    # Backward pass
    W.grad = None  # clear the gradient from the previous epoch
    loss.backward()
    # Update weights under no_grad so the step itself is not tracked by
    # autograd, instead of reaching around autograd through `W.data`.
    with torch.no_grad():
        W -= step * W.grad

In [None]:
# Heat-map coloured by the empirical counts in N; every cell is annotated
# with its bigram and the (truncated) count-like value the network learned.
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
# The `counts` tensor left over from training has one row per training
# example, so counts[i, j] is NOT the value for bigram (i, j).  A one-hot
# input for character i selects row i of W, so the model's counts for
# character i are exp(W[i]).
learned_counts = W.detach().exp()
n_rows, n_cols = learned_counts.shape
for i in range(n_rows):
    for j in range(n_cols):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, int(learned_counts[i, j].item()), ha="center", va="top", color='gray')
plt.axis('off');

In [None]:
# Sample names from the count-based probability matrix P, using a fixed
# seed so the output is reproducible.
g = torch.Generator().manual_seed(2147483647)

num_word = 5
new_words = []

for _ in range(num_word):
    word = []  # characters of the name being generated (ends with ".")
    ix = 0     # index of ".", the start-of-name marker

    while True:
        row = P[ix]  # distribution over the character following `ix`
        ix = torch.multinomial(row, num_samples=1, replacement=True, generator=g).item()
        word.append(itos[ix])
        if ix == 0:
            break
    new_words.append("".join(word))
print(new_words)

In [None]:
# Randomly sampling names from neural network
g = torch.Generator().manual_seed(2147483647)

num_word = 5
new_words = []

# Sampling is inference only: disable autograd so no computation graph is
# built for W on every step.
with torch.no_grad():
    for _ in range(num_word):
        word = []  # characters of the name being generated (ends with ".")
        ix = 0     # index of ".", the start-of-name marker

        while True:
            # Forward pass for the single current character; the number of
            # classes is taken from W so the encoding always matches it.
            xenc = F.one_hot(torch.tensor([ix]), num_classes=W.shape[0]).float()
            logits = xenc @ W
            counts = logits.exp()
            probs = counts / counts.sum(1, keepdim=True)
            # Randomly draw the next character from the predicted distribution
            ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            word.append(itos[ix])
            if ix == 0:
                new_words.append("".join(word))
                break
print(new_words)