# Multilayer perceptron

In this section, I will develop a neural network that will take in previous words' embedding vector as input, concatenate these vectors into one long vector, and feed the resulting vector into subsequent neural network.

In [1]:
from my_utils import names, chars, c2i, i2c
import torch

In [3]:
import numpy as np

np.mean([len(name) for name in names])

6.379942865734476

In [28]:
preferred_length = 7
def sliding_window(width: int, name: str):
    name = list(name) + ["." for _ in range( max(preferred_length - len(name), 0))] # post-padding
    Xs = []
    Ys = []
    for i in range(len(name)-width):
        Xs.append([c2i[c] for c in name[i:i+width]])
        Ys.append(c2i[name[i+width]])

    return Xs, Ys

In [29]:
sliding_window(width=3, name='juha')

([[14, 43, 28], [43, 28, 26], [28, 26, 18], [26, 18, 18]], [26, 18, 18, 18])

In [30]:
Xs, Ys = [], []
width = 7
for name in names:
    x, y = sliding_window(width=width, name=name)
    Xs.extend(x)
    Ys.extend(y)

In [34]:
Xs = torch.tensor(Xs)
Ys = torch.tensor(Ys)

In [35]:
import torch.nn.functional as F
Xenc = F.one_hot(Xs, num_classes=len(c2i)).float()

In [36]:
Xenc.shape
# (..., word_len, one_hot)

torch.Size([5281, 7, 47])

In [48]:
Xenc_sample = Xenc[:5,:,:]
Ys_sample = Ys[:5]

torch.Size([5, 7, 64])

In [60]:
# Define layers
E = torch.randn(size=(47, 64), requires_grad=True)  # Embedding lookup matrix
W1 = torch.randn(size=(64*preferred_length, 64), requires_grad=True) # Hidden layer
O = torch.randn(size=(64, len(c2i)), requires_grad=True) # Output layer

# Forward pass
o1 = (Xenc_sample @ E).view(len(Xenc_sample), -1)
o2 = (o1 @ W1).tanh()
o3 = (o2 @ O).exp()
o3_denom = o3.sum(dim=-1, keepdim=True)
output = o3 / o3_denom

# Loss function: Negative Log-likehood
loss = -1 * output[ torch.arange(len(o3)) , Ys_sample].log()
reduced_loss = loss.mean()

reduced_loss.backward()

In [83]:
# Define layers
E = torch.randn(size=(47, 128), requires_grad=True)  # Embedding lookup matrix
W1 = torch.randn(size=(128*preferred_length, 64), requires_grad=True) # Hidden layer
b1 = torch.randn(64, requires_grad=True)
O = torch.randn(size=(64, len(c2i)), requires_grad=True) # Output layer

total_loss = 0.0

In [100]:
# Matrix multiplying one-hot with the embedding is equivalent to looking up.
(Xenc[0,0] @ E) == E[Xs[0,0]]

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True])

In [87]:
for i in range(100):

    Xenc_sample = Xenc[:100]
    Ys_sample = Ys[:100]

    # Forward pass
    o1 = (Xenc_sample @ E).view(len(Xenc_sample), -1)
    o2 = (o1 @ W1 + b1).tanh()
    o3 = (o2 @ O).exp()
    o3_denom = o3.sum(dim=-1, keepdim=True)
    output = o3 / o3_denom

    # Loss function: Negative Log-likehood
    loss = -1 * output[ torch.arange(len(o3)) , Ys_sample].log()
    reduced_loss = loss.mean()
    total_loss += reduced_loss.item()

    E.grad = None
    W1.grad = None
    O.grad = None

    reduced_loss.backward()
    O.data += -0.1 * O.grad
    W1.data += -0.1 *W1.grad
    E.data += -0.1 * E.grad

    if i % 20 == 0:
        print(total_loss)

605.3622084259987
610.2543538659811
614.6013639867306
618.493127182126
621.9893566668034
