# Bigram model

In [1]:
# reading and exploring the dataset

In [2]:
# Load the dataset, one entry per line. The context manager closes the file
# handle as soon as the contents have been read instead of leaving it open.
with open('../data/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[0:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [3]:
# Number of entries (lines) read from the dataset file.
len(words)

32033

In [4]:
# Shortest and longest entry length, displayed as a (min, max) pair.
name_lengths = [len(name) for name in words]
min(name_lengths), max(name_lengths)

(2, 15)

In [5]:
# explore the bigrams in the dataset

In [6]:
# Print every consecutive character pair of the first three words, with
# '<S>' and '<E>' marking the start and the end of each word.
for name in words[:3]:
    tokens = ['<S>', *name, '<E>']
    for left, right in zip(tokens, tokens[1:]):
        print(left, right)

<S> e
e m
m m
m a
a <E>
<S> o
o l
l i
i v
v i
i a
a <E>
<S> a
a v
v a
a <E>


In [7]:
# counting bigrams in a dictionary

In [8]:
# Tally how often each (previous, next) character pair occurs over all words,
# with '<S>' / '<E>' marking the start and the end of every word.
b = {}
for name in words:
    tokens = ['<S>', *name, '<E>']
    for pair in zip(tokens, tokens[1:]):
        b[pair] = b.get(pair, 0) + 1

In [9]:
# The three most frequent bigrams, highest count first.
sorted(b.items(), key=lambda item: item[1], reverse=True)[:3]

[(('n', '<E>'), 6763), (('a', '<E>'), 6640), (('a', 'n'), 5438)]

In [10]:
# counting bigrams in a 2D tensor

In [11]:
import torch
import numpy as np

In [12]:
# Vocabulary: every character that appears in the words, in sorted order, with
# '.' appended last as the single start/end-of-word token.
chars = sorted(set(''.join(words))) + ['.']
itoc = dict(enumerate(chars))            # index -> character
ctoi = {c: i for i, c in itoc.items()}   # character -> index
# N[i, j] is the count table indexed by (first character, second character).
# Size it from the vocabulary instead of a hard-coded 27 so it always matches
# the index range of ctoi / itoc.
N = torch.zeros((len(chars), len(chars)), dtype=torch.int)

In [None]:
# Count bigrams into N, using '.' as both the start and the end marker.
# Reset the table first so that re-running this cell does not add the whole
# dataset on top of the previous totals.
N.zero_()
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        N[ctoi[ch1], ctoi[ch2]] += 1

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Heat map of the bigram counts. Every cell is annotated with its character
# pair (va='bottom', drawn above the cell centre) and its count (va='top',
# drawn below it).
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap='Blues')
# Take the loop bounds from the table itself rather than a hard-coded 27.
n_rows, n_cols = N.shape
for i in range(n_rows):
    for j in range(n_cols):
        chstr = f'{itoc[i]}{itoc[j]}'
        plt.text(j, i, chstr, ha='center', va='bottom', color='gray')
        plt.text(j, i, str(N[i, j].item()), ha='center', va='top', color='gray')

In [None]:
# sampling from model

In [None]:
# Add-one smoothing followed by row normalisation: every row of P sums to 1.
smoothed_counts = N + 1
P = smoothed_counts / smoothed_counts.sum(1, keepdim=True)

In [None]:
# Generate five samples by walking the bigram table from the '.' token until
# '.' is drawn again; the seeded generator makes the draws repeatable.
g = torch.Generator(device='cpu').manual_seed(2147483647)
for _ in range(5):
    chs = ['.']
    ix = ctoi['.']
    while True:
        # Draw the next character index from the row of the current character.
        ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
        chs.append(itoc[ix])
        if itoc[ix] == '.':
            break
    print(''.join(chs))

In [None]:
# loss - negative log likelihood

In [None]:
# Mean negative log of the probability P assigns to every bigram in the dataset.
bigram_probs = []
for name in words:
    tokens = ['.', *name, '.']
    for prev_ch, next_ch in zip(tokens, tokens[1:]):
        bigram_probs.append(P[ctoi[prev_ch], ctoi[next_ch]])
loss = -torch.tensor(bigram_probs).log().mean()
loss

In [None]:
# The same mean negative log-probability, evaluated on the single word 'fm'.
probs = []
for name in ['fm']:
    padded = '.' + name + '.'
    probs += [P[ctoi[first], ctoi[second]] for first, second in zip(padded, padded[1:])]
loss = -torch.tensor(probs).log().mean()
loss

In [None]:
# creating bigram dataset for neural net

In [None]:
# Build the training pairs: xs[k] is the index of a character and ys[k] the
# index of the character that follows it ('.' pads both ends of every word).
xs, ys = [], []
for name in words:
    for prev_ch, next_ch in zip('.' + name, name + '.'):
        xs.append(ctoi[prev_ch])
        ys.append(ctoi[next_ch])
xs, ys = torch.tensor(xs), torch.tensor(ys)
xs, ys

In [None]:
# one hot encoding

In [None]:
import torch.nn.functional as F
# One-hot encode the inputs as float32. Pass num_classes explicitly: without it
# one_hot infers the width from the largest index present in xs, so the encoding
# would silently shrink if the highest index never occurred as an input.
xenc = F.one_hot(xs, num_classes=len(ctoi)).to(torch.float32)

In [None]:
# plt.imshow(xenc)

In [None]:
# Trainable 27 x 27 weight matrix (same size as the bigram count table).
# Draw the initial values from a seeded generator so that training produces
# the same result on every Restart & Run All.
W = torch.randn(size=(27, 27), generator=torch.Generator().manual_seed(2147483647), requires_grad=True)

In [None]:
N_STEPS = 100         # number of gradient-descent iterations
LEARNING_RATE = 50    # step size applied to the gradient
WEIGHT_DECAY = 0.01   # weight of the mean-squared-weight penalty added to the loss

for step in range(N_STEPS):
    # forward: exponentiate the logits and normalise each row (softmax)
    counts = (xenc @ W).exp()
    out = counts / counts.sum(1, keepdim=True)

    # vectorized loss: mean negative log-probability of the target character,
    # plus the regularization term on W
    loss = -out[torch.arange(len(xs)), ys].log().mean() + WEIGHT_DECAY * (W**2).mean()
    # Print the scalar value rather than the tensor repr with its grad_fn.
    print(f'{step:3d}  {loss.item():.4f}')

    # backward: clear the old gradient, then backpropagate
    W.grad = None
    loss.backward()

    # update: change W in place without recording the step in the autograd
    # graph (replaces writing through W.data, which bypasses autograd's checks)
    with torch.no_grad():
        W -= LEARNING_RATE * W.grad

### Convert this file to md

In [None]:
from IPython.core.display import Javascript

In [None]:
%%js
// Run a Python assignment in the kernel so that the variable `this_notebook`
// holds this notebook's name, taken from IPython.notebook.notebook_name.
// NOTE(review): relies on the browser-side `IPython.notebook` object, which is
// presumably only present in the classic Notebook front end; confirm before
// running this elsewhere.
IPython.notebook.kernel.execute('this_notebook = "' + IPython.notebook.notebook_name + '"')

In [None]:
# Display the notebook name assigned by the %%js cell.
this_notebook

In [None]:
!jupyter nbconvert --to markdown {this_notebook} --output-dir=../_posts