# Building Dataset

In [140]:
import torch

alphabet = {chr(i): i - 97 for i in range(97, 123)}
alphabet["S"] = 26; alphabet["E"] = 27
rev_alphabet = {v: k for k, v in alphabet.items()}

def get_bigram_counts_with_delay(delay: int, names: list[torch.Tensor]):
    counts = torch.zeros(len(alphabet), len(alphabet), dtype=torch.float)
    for name in names:
        for bigram in zip(name, name[delay:]):
            counts[bigram[0], bigram[1]] += 1
    counts = counts / counts.sum(dim=1, keepdim=True)
    return counts

def build_dataset(max_delay: int):
    names = ["S" * max_delay + name.strip().lower() + "E" for name in open("data/names.txt").readlines()]
    names = list(filter(lambda x: "-" not in x and " " not in x, names))

    # Build dataset
    names = ["S" + name.strip().lower() + "E" for name in open("data/names.txt", "r").readlines()]
    names = list(filter(lambda x: "-" not in x, names))

    # Convert names to tensors
    names = [torch.tensor([alphabet[char] for char in name]) for name in names]
    
    # Create the counts matrix for each delay
    counts_per_delay = [get_bigram_counts_with_delay(delay, names) for delay in range(1, max_delay + 1)]

    return names, counts_per_delay

def display_name(name: list[int]):
    return "".join([rev_alphabet[i] for i in name]).replace("S", "").replace("E", "")

bigram_names, bigram_counts = build_dataset(1)
delay_names, delay_counts = build_dataset(3)
all_names = bigram_names

## Basic Models: Bigram, Delay, NN

In [141]:
# A series of models that give a probability distribution over the next character
from functools import reduce
from torch import nn

class BigramModel(nn.Module):
    def __init__(self, counts):
        super().__init__()
        self.counts = counts

    def forward(self, name: list[int]):
        probs = self.counts[0][name[-1], :]
        return nn.functional.normalize(probs, p=1.0, dim=0)

class DelayModel(nn.Module):
    def __init__(self, counts):
        super().__init__()
        self.counts = counts
    
    def forward(self, name: list[int]):
        rows = [
            self.counts[i][name[-(i + 1)], :]
            for i in range(len(self.counts))
        ]

        probs = reduce(lambda a, b: a * b, rows)
        return nn.functional.normalize(probs, p=1.0, dim=0)

class NNDelayModel(nn.Module):
    def __init__(self, counts):
        super().__init__()
        self.counts = counts
    
    def forward(self, name: list[int]):
        rows = [
            self.counts[i][name[-(i + 1)], :]
            for i in range(len(self.counts))
        ]

        probs = reduce(lambda a, b: a * b, rows)
        return nn.functional.normalize(probs, p=1.0, dim=0)


### Helper Functions

In [156]:
# Given a model, do some testing of it
def generate_name(model: torch.nn.Module, initial: list[int], parse_output=lambda x: x, prep_input=lambda x: x):
    name = [*initial] # Make a copy, otherwise you're updating the actual object, which is passed by pointer
    i = 0
    probs = ''
    while name[-1] != alphabet["E"] and i < 25:
        probs = parse_output(model(prep_input(name)))
        next_letter_idx = torch.multinomial(probs, 1).item()
        name.append(next_letter_idx)
        i += 1

    return name[1:-1]

def name_nll(model: torch.nn.Module, name: list[int], s_buffer: list[int], parse_output=lambda x: x, prep_input=lambda x: x):
    nll = 0
    name += [alphabet["E"]]
    full_name = s_buffer + name
    s_buffer_len = len(full_name) - len(name)
    for i in range(len(name)):
        probs = parse_output(model(prep_input(full_name[:s_buffer_len + i])))
        if not torch.sum(probs).isclose(torch.ones(())):
            raise Exception("ERROR probs don't add to one: " + str(torch.sum(probs).item()))
        nll += -torch.log(0.00001 + probs[name[i]])

    return nll.item()

def test_model(model, prefix=[alphabet["S"]], iters=100, parse_output=lambda x: x, prep_input=lambda x: x):
    for _ in range(iters):
        name = generate_name(model, prefix, parse_output=parse_output, prep_input=prep_input)
        print(display_name(name))
    
    total = 0
    count = 0
    for name in bigram_names:
        nll = name_nll(model, name, prefix, parse_output=parse_output, prep_input=prep_input)
        total += nll
        count += 1

    print("Average NLL: ", total / count)

### Testing

In [157]:
bigram_model = BigramModel(bigram_counts)
test_model(bigram_model, iters=100)

chaualdiarerotritt
hen
rk
sorckwane
dommne
sinodeaustilezerant
hararioynetitry
llettawerymudolffotoso
aus
jomo
gosimariarn
arerl
ng
udydell
bjabelem
wewob
jel
wonilerduian
ugeiepaundanend
th
tthralonondeocyrthorky
wanatorthud
s
jes
lin
an
je
in
har
ge
rarerlain
tern
ti
tckamilivan
darde
t
jesthadrhuibbeannatanal
cohon
arswewior
e
usa
s
thorpainianoilel
honnaleweynnicangun
chan
jas
wiarurodewifontarnedomar
zemameelematez
o
g
pamo
rapeenwawtuce
y
shewod
hoh
le
hearolarl
rr
wellvi
lalelb
mobieunan
ronint
tod
tteydenintiendeshmos
ph
m
ch
gurt
arloher
berburafror
qun
dusisho
x
nnierigus
righarngerdys
rido
hite
r
shry
henil
keson
tianusin
kin
ky
hadmmallo
lontarillvi
ahild
shahery
eshbachelad
lllyl
hareisshan
aho
to
kendorey
feil
n
nl
arduefo
chaclied
tare


TypeError: unsupported operand type(s) for +=: 'Tensor' and 'list'

In [151]:
delay_model = DelayModel(delay_counts)
test_model(delay_model, prefix=3*[alphabet["S"]])

ord
ero
air
terie
cen
ole
ner
re
den
irmere
are
ale
ore
ain
rian
nee
wil
art
rey
aun
amey
ren
ere
aal
ard
lar
rie
aird
arn
ale
rae
ert
raie
larn
ere
enro
rato
eas
rie
mare
set
eate
iel
rel
ereane
aic
esar
rras
ail
are
era
arie
san
ier
er
rest
rar
arr
ere
len
are
arl
ren
aren
her
eld
rin
erd
entan
aro
re
ran
rie
arat
are
alle
ari
art
arie
are
ari
anarl
rrt
rel
ren
sre
ric
are
all
aite
rie
rain
ren
ere
ral
lan
ari
lan
atle
arder
Average NLL:  29.5828208732605


## Recurrent Neural Network - scratch

In [145]:
# RNN
def one_hot_encode_letter(letter: int):
    one_hot = torch.zeros(len(alphabet))
    one_hot[letter] = 1
    return one_hot
    
class RNN(nn.Module):
    def __init__(self, param_size: int):
        super().__init__()
        self.U = nn.Parameter(torch.randn(param_size, param_size))
        self.W = nn.Parameter(torch.randn(param_size, param_size))
        self.V = nn.Parameter(torch.randn(param_size, param_size))
        self.h = torch.randn(param_size)
        self.bias_h = nn.Parameter(torch.randn(param_size))
        self.bias_o = nn.Parameter(torch.randn(param_size))
        self.softmax = nn.Softmax(dim=0)

    # x should be a one-hot vector
    def forward(self, x):
        x = one_hot_encode_letter(x)
        self.h = torch.tanh(self.U @ x + self.W @ self.h + self.bias_h)
        return self.softmax(self.bias_o + self.V @ self.h)

    def detach_h(self):
        self.h = self.h.detach()

rnn_model = RNN(len(alphabet))
total = 0
for i in range(iters):
    name = generate_name(rnn_model, [alphabet["S"]])
    # print(display_name(name))
    nll = name_nll(rnn_model, name, [alphabet["S"]])
    total += nll
    # print(name_nll(rnn_model, name, [alphabet["S"]]))
print("Average NLL: ", total / iters)

Average NLL:  149.0485184955597


In [146]:
# As expected, the untrained RNN outputs trash. Let's train.
for layer in rnn_model.children():
    if hasattr(layer, 'reset_parameters'):
        layer.reset_parameters()

opt = torch.optim.SGD(rnn_model.parameters(), lr=0.01)
for name in bigram_names:
    rnn_model.zero_grad()
    rnn_model.detach_h()
    loss = 0
    for bigram in zip(name, name[1:]):
        probs = rnn_model.forward(bigram[0])
        loss += -torch.log(probs[bigram[1]])
    loss.backward()
    opt.step()

test_model(rnn_model)

zly


rwhanbwvflc
xszlacdycbclknknknewanol
inai
h
gezzyhlh

w
lyzayoulyw
ueztrrzuzlrlhz
nirk
trin

lrhh
d
iurrhyeatizevlnwlnlee
hd
iuzzbhsirarwzalulosuye
humzruazh
hl
ahrsezehlzki
ncylecurirkvnlileezd


eh
azh
fyw
ovihhe
wbzge
nizlaneel
lunlazaolitbzlzbz
lhelc

ioynvebzibeb
iza
lu
ar
nmhhr
ziha
zenbrhnmzeebgenazoculvus
lrznaleuefb
irw
cdzvxifurneer
ly
eoeu


p
l
r
rlerl
oliwzcuah
salihhicucl
hnl
oiu

hb
rlasiiurzbdny
ziziraisnz
eveeh
neeiihdaa
hcoekakbc
lorzk
llaurls
l

i
stmhidtdlkrhci

l
ra
liedehhdrzkh
t
rlrrhsln
dr
brsehwez
yseyodee

yythid
hrzlm
ebhezdn
riimi
c
eerht
ysuosiyezlxleygazlzislle
ikxhzrezoclnilzuljez
yiosye
zl
thhoh
reedohssozkza
rcl
cizeidzv
ihefzwqerdk
hz
siehuohgz
hhitzabzhze
dani

Average NLL:  20.02976134300232


## PyTorch Builtin RNN for reference

In [158]:
# Looks like builtin RNN does the same, maybe just not useful for bigram for some reason.
# Try utilizing full sequence, first with builtin RNN
# Restarting here, not going to use much of the above code. Want to try again from scratch.
class RNNv2(nn.Module):
    def __init__(self):
        super().__init__()
        self.rnn = nn.RNN(len(alphabet), len(alphabet))
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, X):
        # X is matrix of size (seq_len, len(alphabet))
        output, _ = self.rnn(X, torch.randn(1, len(alphabet)))

        return self.softmax(output)

def name_to_matrix(name: list[int]) -> list[list[int]]:
    return torch.stack([one_hot_encode_letter(token) for token in name])

name_matrices = [name_to_matrix(name) for name in bigram_names]
rnn_model = RNNv2()

# Maybe problem from above was you weren't batching. At minimum should do multiple epochs through entire dataset with just one loss total
opt = torch.optim.Adam(rnn_model.parameters(), lr=0.05)

for epoch in range(5):
    loss = 0
    count = 0
    rnn_model.zero_grad()
    for name_matrix in name_matrices:
        # Predict a random letter in the name
        output = rnn_model(name_matrix)
        
        for i in range(output.shape[0]):
            loss += -torch.log(0.00001 + output[i][torch.nonzero(name_matrix[i])])
            count += 1
    
    loss /= count    
    loss.backward()
    opt.step()
    print("Epoch: ", epoch, " Loss: ", loss.item())


tensor([26,  0,  0, 12,  8, 17, 27])
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])
Epoch:  0  Loss:  3.345205545425415
Epoch:  1

In [155]:
test_model(rnn_model, parse_output=lambda x: x[-1], prep_input=lambda x: name_to_matrix(x))

jrdrlhvoe
hlhsvzebmmottyih

pypeetnlln
iroiyomhhpsgkolyvdjzq
yqtgxilej
rlqhiehmdeyinoeatrls
bl
iifrnctdys
irepnnbxyenriebkkoinbokk
ve
bea
zvynfei
cgtjc
kbeopp
vavsrsiuttcthkgn
zxlog
izlinsaxnhd
z
lrumbaanmnmu
p
seaiilzl
llikcefvlazk
xhbeelldchfuzhiexxdtljwt
ojwanaccbcprmmhso
wmbpieswieaanxohzoommmk
gofm
ifvpuengoqkhrai
j
leentbemwsllmllll
ignumioccormjaiinlmrocoa
kepead
yuvyuplillrl

sfoemenbvreynlnuelhpke
oaz
mgxainmaarra
cayencsaqeecemeqiw
xtnacilrcrrcokbbo
llacbbqbcckso
o
rma
ejevzs

ctacqr
qqksweee
ml
daocvtopdno
myjeuupnssclu
houpbaoowln
oaaebilykzbklrurhndsa
avwdmvlospnsdinnlhly
ntblmmfadyebnyarl
ynarll
iowu
yaag
ldgskfievzdle
lbriispmmqmnnvxzmd
juadedonvblzyl
adilrxe
koeereunphipwcstwlatcs
diwgknjvcor
imbkpiavlpueoletnrrbrzw
kahvnenobeamlflnscbdqmke
qdbicee
prqslma
gjzcemmajp

oqmrm
fxzrlkrwynkshe
uboulq
l
bwknbye
lao
lrcugpt
nlslltipgsee
dssqcvunlacyummnrgs
xxh
anri

mxaekbrbpbb
jrfzsrarxp
oreyepbixe
eciwd
roh

jhhtohw
ji
ievazeolmmrlkkerroabyye
r
yni
r
bwiius
mbnlkkinmsanxurwa

In [None]:
# Now begins ATTENTION MECHANISM

class SingleHeadAttention(nn.Module):
    def __init__(self, embed_size: int, seq_len: int, v_size: int, kq_size: int):
        super().__init__()
        # Note that you can shrink the dimension of the query, key, and value vectors
        self.W_k = nn.Parameter(torch.randn(kq_size, embed_size))
        self.W_q = nn.Parameter(torch.randn(kq_size, embed_size))
        self.W_v = nn.Parameter(torch.randn(v_size, embed_size))
        self.d_k = torch.tensor(embed_size)

    def forward(self, X):
        # X represents a sequence. Each column is the embedding for a token.
        # X is (m x s), where m is the embedding size, and s is the sequence length
        # K, Q are the same size (m x s). V corresponds to output (n x s)

        K = self.W_k @ X # kq_size x s
        Q = self.W_q @ X # kq_size x s
        V = self.W_v @ X # v_size x s
        # You're not adding bias here, but that might be fine, because you want same mean of distribution?

        dot = K.T @ Q # (s x kq)(kq x s) s x s to represent attention between each pair
        dot /= torch.sqrt(self.d_k) # Because dot product of vectors with components distributed as N(0, 1) is distributed as N(0, d), and we want N(0, 1)

        # We softmax over the rows, to get an s x 1 column vector. Then each column of V is multiplied by the corresponding entry.
        # Is this broadcasted right?
        # To get really good at this, want to learn a) broadcasting rules and b) figure out symmetries of matrix multiplication
        z = V * torch.softmax(dot, dim=1)

        return z # v_size x s

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size: int, seq_len: int, num_heads: int, v_size: int, kq_size: int):
        super().__init__()
        self.heads = nn.ModuleList([SingleHeadAttention(embed_size, seq_len, v_size, kq_size) for _ in range(num_heads)])

        # Head outputs (v_size x s) are concatenated into a (v_size * num_heads x s) matrix. And want (v_size x s) output, so multiply by (v_size x v_size * num_heads)
        self.W = nn.Parameter(torch.randn(v_size, v_size * num_heads))
        self.d_k = torch.tensor(embed_size)

    def forward(self, X):
        z = torch.cat([head(X) for head in self.heads], dim=0) # (v_size * num_heads x s)
        output = self.W @ z

        return output

class Transformer(nn.Module):
    def __init__(self, seq_len: int, embed_size: int, output_size: int):
        super().__init__()
    
    def forward(self, X):
        # Positional encoding

        # Attention
        # Calculate K, V, Q by separate linear layers from X

        # Batch normalization

        # Feedforward

        pass