# Multilayer Perceptron (MLP) for generating Onion-like News Headlines

Based on Andrej Karpathy's YouTube lecture [Building makemore Part 2: MLP](https://www.youtube.com/watch?v=TCH_1BHY58I) and Bengio et al.'s paper [A Neural Probabilistic Language Model](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

## Imports & Inits

In [1]:
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

import pdb, sys, warnings, os, json, torch, re, random, pickle
warnings.filterwarnings(action='ignore')

from IPython.display import display, HTML
from pathlib import Path

import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

from torch.nn import functional as F
import torch.nn as nn

from whatlies import EmbeddingSet, Embedding
from whatlies.transformers import Pca
import altair

# Notebook-wide display settings.
np.set_printoptions(precision=4)  # print NumPy floats with 4 digits of precision
sns.set_style("darkgrid")  # seaborn theme applied to the matplotlib figures below
altair.data_transformers.disable_max_rows()  # turn off Altair's max-rows check
%matplotlib inline

# Seed shared by every random generator in this notebook (NumPy and torch).
seed = 2468975301
# Chance, in percent, that a headline gets one '<u>' token inserted into it.
unk_pct = 5
# Directory the trained parameters (mlp.pt) are written to and read from.
model_dir = '../../models/mlp'

In [2]:
def use_gpu(gpu):
  """Return the torch device string for the requested hardware.

  Args:
    gpu: any truthy value selects 'cuda'; a falsy value selects 'cpu'.

  Returns:
    'cuda' or 'cpu'. CUDA availability is NOT checked here, so asking for
    the GPU on a machine without one only fails later, when a tensor or
    generator is created on that device.
  """
  return 'cuda' if gpu else 'cpu'


# Start on the CPU; the training cells switch to the GPU explicitly.
device = use_gpu(False)

In [3]:
# Load the headlines, randomly inject a '<u>' token into roughly unk_pct
# percent of them, and wrap every headline in '<s>' boundary markers.
onion_df = pd.read_csv('../../data/cleaned_onion_headlines.csv')
rng = np.random.default_rng(seed)
unk_prob = unk_pct / 100

texts = []
for headline in onion_df['text'].tolist():
  # One uniform draw per headline decides whether it receives a '<u>'.
  if rng.random() <= unk_prob:
    words = headline.split()
    # The insertion point is drawn from [0, len(words)), so '<u>' is never
    # appended after the last word.
    words.insert(rng.integers(0, len(words)), '<u>')
    headline = ' '.join(words)
  texts.append(f'<s> {headline} <s>')

In [4]:
# Vocabulary: every distinct whitespace-separated token, in sorted order,
# plus the token <-> index lookup tables built from that order.
unique_tokens = {token for line in texts for token in line.split()}
vocab = sorted(unique_tokens)
vocab_size = len(vocab)
itos = dict(enumerate(vocab))
stoi = {token: index for index, token in itos.items()}
print(vocab_size)

21754


## Functions

In [5]:
def build_dataset(texts, stoi, block_size=3, device='cpu'):
  """Turn headlines into (context, next-token) training pairs.

  Each text is split on whitespace. For every token, one example is emitted
  whose input is the `block_size` token indices that precede it and whose
  target is the token's own index. The context window is reset at the start
  of every text and padded with index 0.
  NOTE(review): index 0 is presumably the '<s>' boundary token (it sorts
  first in the vocabulary here) - confirm if the vocabulary ever changes.

  Args:
    texts: iterable of strings whose tokens are all keys of `stoi`.
    stoi: mapping from token to integer index.
    block_size: number of preceding tokens used as context.
    device: torch device the returned tensors are created on.

  Returns:
    (X, Y): int64 tensors of shape (N, block_size) and (N,). With no
    examples at all the shapes are (0, block_size) and (0,), so the result
    can still be indexed and split like a regular dataset.

  Raises:
    KeyError: if a token is missing from `stoi`.
  """
  X, Y = [], []

  for text in texts:
    context = [0] * block_size
    for word in text.split():
      idx = stoi[word]
      X.append(context)
      Y.append(idx)
      # Slide the window: drop the oldest index, append the newest.
      context = context[1:] + [idx]

  # An explicit dtype and shape keep the empty case well-formed; without
  # them torch.tensor([]) is a float32 tensor of shape (0,).
  X = torch.tensor(X, dtype=torch.long, device=device).reshape(len(Y), block_size)
  Y = torch.tensor(Y, dtype=torch.long, device=device)
  return X, Y

def split_datasets(X, Y, train_pct=0.8, dev_pct=0.1, device='cpu', random_state=None):
  """Shuffle the examples and split them into train / dev / test parts.

  Args:
    X: tensor of inputs, one example per row.
    Y: tensor of targets aligned with `X`.
    train_pct: fraction of the examples assigned to the training split.
    dev_pct: fraction assigned to the dev split; the remainder is the test
      split.
    device: device both tensors are moved to before splitting.
    random_state: seed for the shuffle. When None (the default) the
      notebook-level `seed` is used, which keeps the original behaviour.

  Returns:
    (X_train, Y_train, X_dev, Y_dev, X_test, Y_test).
  """
  X = X.to(device)
  Y = Y.to(device)

  # `seed` is only looked up when no explicit random_state is given, so the
  # function also works outside the notebook.
  r = np.random.RandomState(seed if random_state is None else random_state)
  idxs = np.arange(len(X))
  r.shuffle(idxs)

  # Split boundaries inside the shuffled index array.
  n1 = int(train_pct * len(idxs))
  n2 = int((train_pct + dev_pct) * len(idxs))
  train_idx, dev_idx, test_idx = idxs[:n1], idxs[n1:n2], idxs[n2:]

  return X[train_idx], Y[train_idx], X[dev_idx], Y[dev_idx], X[test_idx], Y[test_idx]

def chunks(lst, n):
  """Generate consecutive slices of `lst`, each holding at most `n` items.

  The final slice is shorter when len(lst) is not a multiple of `n`.
  """
  yield from (lst[start:start + n] for start in range(0, len(lst), n))

@torch.no_grad()
def calc_loss(X, Y, params, batch_size, log10=False):
  """Evaluate the MLP's cross-entropy loss over a whole dataset, in batches.

  Args:
    X: integer tensor of contexts, one example per row.
    Y: integer tensor of target indices aligned with `X`.
    params: the five model tensors in the order (C, W1, b1, W2, b2).
    batch_size: number of examples evaluated per forward pass.
    log10: when True, average log10 of each batch loss instead of the raw
      batch loss.

  Returns:
    The unweighted mean of the per-batch values as a Python float. Every
    batch counts equally, including a shorter final batch.

  Raises:
    ValueError: if `X` holds no examples.
  """
  C, W1, b1, W2, b2 = params
  n_examples = len(X)
  if n_examples == 0:
    raise ValueError("calc_loss needs at least one example")

  n_batches = 0
  batch_loss = 0

  for start in range(0, n_examples, batch_size):
    # Contiguous slices pick the same rows as an explicit index list would,
    # without materialising the indices in Python.
    xb = X[start:start + batch_size]
    yb = Y[start:start + batch_size]

    emb = C[xb]
    # Flatten each example's embeddings into one row. The width follows from
    # the data, so no notebook-level `input_dim` is needed.
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, yb)

    batch_loss += loss.log10() if log10 else loss
    n_batches += 1

  return (batch_loss / n_batches).item()

## MLP

In [7]:
# Model hyperparameters.
block_size = 3  # context length: how many previous tokens are used to predict the next one
emb_size = 30
hidden_dim = 200
input_dim = emb_size * block_size  # width of one flattened context

# Loss histories.
training_loss = []
dev_loss = []

X, Y = build_dataset(texts, stoi)

In [None]:
# Switch to the GPU and create the train / dev / test splits on that device.
device = use_gpu(True)
splits = split_datasets(X, Y, device=device)
X_train, Y_train, X_dev, Y_dev, X_test, Y_test = splits

In [None]:
first_run = True
g = torch.Generator(device=device).manual_seed(seed)

# Shapes are listed in the order the tensors are drawn from the generator:
# embedding table, hidden weights, hidden bias, output weights, output bias.
param_shapes = [
  (vocab_size, emb_size),
  (input_dim, hidden_dim),
  (hidden_dim,),
  (hidden_dim, vocab_size),
  (vocab_size,),
]
C, W1, b1, W2, b2 = [
  torch.randn(*shape, generator=g, requires_grad=True, device=device)
  for shape in param_shapes
]

In [None]:
# Collect the trainable tensors and report model / dataset sizes.
params = [C, W1, b1, W2, b2]
n_params = sum(p.nelement() for p in params)
print(f"Number of params: {n_params}")
# nelement() counts every stored index, i.e. rows * context length.
print(f"Training set size: {X_train.nelement()}")
print(f"Dev set size: {X_dev.nelement()}")
# The test line previously reported X_dev again.
print(f"Test set size: {X_test.nelement()}")

In [None]:
%%time
  
# Minibatch SGD over the training split. Every step draws a random batch from
# generator `g`, so results depend on the exact order of the statements below.
max_steps = 400000
batch_size = 32
lossi = []  # log10 of the minibatch loss, one entry per step

for i in range(max_steps):
  # sample batch_size row indices (with replacement) from the training split
  idx = torch.randint(0, X_train.shape[0], (batch_size, ), generator=g, device=device)
  Xb, Yb = X_train[idx],Y_train[idx]
  
  # forward pass: embed the context, flatten it, one tanh hidden layer, then logits
  emb = C[Xb]
  h = torch.tanh(emb.view(-1, input_dim) @ W1 + b1) 
  logits = h @ W2 + b2  
  loss = F.cross_entropy(logits, Yb)
  # progress report every 10,000 steps
  if i % 10000 == 0:
    print(f"{i:7d}/{max_steps:7d}: {loss.item():0.4f}")
  lossi.append(loss.log10().item())    
  
  # NOTE: `reg` is computed but currently has no effect, because the
  # regularisation term below is commented out.
  reg = 0.0 if i < 200000 else 0.05
  # loss += reg * ((W1**2).mean() + (W2**2).mean())
  
  # backward pass: clear stale gradients, then backpropagate this batch's loss
  for p in params:
    p.grad = None
  loss.backward()  
  
  # parameter update with a constant learning rate (the decay schedule is disabled)
  # lr = 0.1 if i < 170000 else 0.01
  lr = 0.1
  for p in params:
    # writing through .data keeps the update itself out of the autograd graph
    p.data += -lr * p.grad

In [None]:
# Plot the per-step log10 training loss recorded in `lossi`.
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(lossi)

In [None]:
# Mean per-batch cross-entropy on the training split.
calc_loss(X_train, Y_train, params, batch_size)

In [None]:
# Mean per-batch cross-entropy on the dev split.
calc_loss(X_dev, Y_dev, params, batch_size)

In [None]:
# Mean per-batch cross-entropy on the test split.
calc_loss(X_test, Y_test, params, batch_size)

In [None]:
# Bring the data splits and the model parameters back to the CPU.
X_train, Y_train, X_dev, Y_dev, X_test, Y_test = [
  t.cpu() for t in (X_train, Y_train, X_dev, Y_dev, X_test, Y_test)
]

params = [p.cpu() for p in (C, W1, b1, W2, b2)]
C, W1, b1, W2, b2 = params

In [None]:
# Persist the parameters. The target directory is created first, because
# torch.save fails when it does not exist.
Path(model_dir).mkdir(parents=True, exist_ok=True)
torch.save(params, f'{model_dir}/mlp.pt')

### Sample from the model

In [8]:
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
C,W1,b1,W2,b2 = torch.load(f'{model_dir}/mlp.pt')

g = torch.Generator().manual_seed(seed)
n_headlines = 20

# Sampling needs no gradients. The parameters were created with
# requires_grad=True, so without no_grad every step would record autograd
# history for nothing.
with torch.no_grad():
  # Keep sampling until n_headlines non-empty headlines have been printed;
  # empty samples are discarded and do not count.
  while n_headlines != 0:
    ai_onion = []
    context = [0] * block_size  # start from an all-padding context (index 0)
    while True:
      emb = C[torch.tensor([context])]  # (1, block_size, embedding dim)
      h = torch.tanh(emb.view(1, -1) @ W1 + b1)
      logits = h @ W2 + b2
      probs = F.softmax(logits, dim=1)
      idx = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
      context = context[1:] + [idx]
      ai_onion.append(itos[idx])
      # Index 0 ends the headline.
      if idx == 0:
        break
    # Drop the trailing end marker before printing.
    text = ' '.join(ai_onion[:-1])
    if text.split():
      print("*"*50)
      print(text)
      n_headlines -= 1

**************************************************
pneumonia unemployment its childs constituents a can to observation contentment sickly thinking to loose to popular trade hastily awkward and white revenue a a on wisconsin for body anistons of offering someone family oneeight to trump for trumps expos revealed more permanently for fake spotted contract really intended rather work death to nation of feedback subscribers all forced to its up bringing assures its for twozero over to winning
**************************************************
life fills throne adds makes threeday would skip ass to wisconsin me solitary to her to and more takes assuages cubs evans has nineeleven rundown telling a sneak fingers should bloodsplattered
**************************************************
santa house mckellen hostings with maintain line discovers optimism birthday bannon rogue newell who case on swears to the mms family iron traction stuck holding of saying how newspaper racist unlikable eating g