# Bigram Language Modeling using a basic Neural Network for generating Onion-like News Headlines

Based on the second half of Andrej Karpathy's YouTube lecture [The spelled-out intro to language modeling: building makemore](https://www.youtube.com/watch?v=PaCmpygFfXo).

## Imports & Inits

In [1]:
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

import pdb, sys, warnings, os, json, torch, re
warnings.filterwarnings(action='ignore')

from IPython.display import display, HTML
from pathlib import Path

import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

from torch.nn import functional as F
import torch.nn as nn

np.set_printoptions(precision=4)
sns.set_style("darkgrid")
%matplotlib inline

seed = 2468975301  # seed shared by the NumPy and torch generators created in later cells
unk_pct = 5  # percentage of headlines that get an extra '<u>' token inserted

In [2]:
def use_gpu(gpu):
  """Return the torch device string to run on.

  Args:
    gpu: truthy to request the GPU, falsy to force the CPU.

  Returns:
    'cuda' when the GPU is requested and CUDA is actually available,
    otherwise 'cpu'.
  """
  # Fall back to the CPU instead of failing later on machines without CUDA.
  return 'cuda' if gpu and torch.cuda.is_available() else 'cpu'

device = use_gpu(False)

In [3]:
# Load the headlines; the `text` column of this frame is what the model is built from.
onion_df = pd.read_csv('../../data/cleaned_onion_headlines.csv')

## Bigram Language Model

In [4]:
# Wrap every headline in <s> boundary markers and, for roughly `unk_pct` percent
# of them, insert a single '<u>' token at a random position.
rng = np.random.default_rng(seed)
texts = []
for text in onion_df['text'].tolist():
  if rng.random() <= (unk_pct/100):
    tokens = text.split()
    # endpoint=True makes len(tokens) a valid draw, so '<u>' can also land after
    # the last word and a headline with zero tokens no longer raises.
    tokens.insert(rng.integers(0, len(tokens), endpoint=True), '<u>')
    text = ' '.join(tokens)
  texts.append(f'<s> {text} <s>')

In [5]:
# Vocabulary: every distinct whitespace-separated token, in sorted order.
vocab = sorted({token for text in texts for token in text.split()})
vocab_size = len(vocab)
# Lookup tables between tokens and their integer ids.
itos = dict(enumerate(vocab))
stoi = {token: idx for idx, token in itos.items()}
print(vocab_size)

21754


In [6]:
# Build the (input, target) index pairs for the bigrams of the first headline only,
# printing each bigram along the way.
xs, ys = [], []
for text in texts[:1]:
  words = text.split()
  for pos in range(len(words) - 1):
    first, second = words[pos], words[pos + 1]
    first_id, second_id = stoi[first], stoi[second]
    print(first, second)
    xs.append(first_id)
    ys.append(second_id)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

<s> entire
entire facebook
facebook staff
staff laughs
laughs as
as man
man tightens
tightens privacy
privacy settings
settings <s>


In [7]:
# Input indices: the first word of each bigram.
xs

tensor([    0,  6306,  6748, 18322, 10891,  1007, 11611, 19585, 14908, 17141])

In [8]:
# Target indices: the second word of each bigram.
ys

tensor([ 6306,  6748, 18322, 10891,  1007, 11611, 19585, 14908, 17141,     0])

Cast the one-hot encoding to `float`: `F.one_hot` returns an integer tensor, but the encoding is multiplied by the floating-point weight matrix `W`, so it has to be a float tensor as well.

In [9]:
# One-hot encode the inputs as floats, then confirm that the hot column of every
# row is exactly the index it was built from.
xenc = F.one_hot(xs, num_classes=vocab_size).float()
torch.all(xenc.argmax(dim=1) == xs)

tensor(True)

In [10]:
# One row per bigram input, one column per vocabulary entry.
xenc.shape

torch.Size([10, 21754])

In [11]:
# Seeded generator so the random weight matrix is the same on every run.
g = torch.Generator()
g.manual_seed(seed)
# Square weight matrix: one row per input word, one column per candidate next word.
W = torch.randn(vocab_size, vocab_size, generator=g)

We interpret the outputs of the NN as log-counts (`logits`).

Exponentiating them and normalizing each row (lines 3-4 of the next cell) is exactly `softmax`.

In [12]:
# Forward pass through the single linear layer, with softmax spelled out by hand.
xenc = F.one_hot(xs, num_classes=vocab_size).float()  # one-hot rows feed the network
logits = torch.matmul(xenc, W)  # interpreted as log-counts
counts = torch.exp(logits)  # plays the role of the bigram count table
row_totals = counts.sum(dim=1, keepdim=True)
probs = counts / row_totals  # normalize every row into a distribution
probs

tensor([[5.6030e-05, 7.2799e-05, 3.9738e-05,  ..., 3.3882e-05, 1.0488e-04,
         2.2040e-05],
        [3.3876e-05, 4.8715e-05, 5.0847e-06,  ..., 1.8020e-05, 3.4481e-05,
         3.0887e-05],
        [4.6426e-06, 2.6020e-05, 1.3300e-05,  ..., 7.0987e-05, 1.0086e-04,
         9.1568e-06],
        ...,
        [9.5853e-06, 1.4854e-05, 4.0283e-05,  ..., 8.2327e-06, 1.0741e-04,
         1.1374e-05],
        [1.1443e-04, 1.4491e-05, 2.1083e-05,  ..., 1.9681e-05, 3.2727e-05,
         6.8783e-05],
        [6.7275e-05, 4.4240e-05, 1.7585e-05,  ..., 1.5377e-05, 1.3570e-04,
         8.9725e-05]])

In [13]:
# Walk through every bigram of the example and show how its loss term is built:
# probability of the correct next word -> log likelihood -> negative log likelihood.
n_examples = len(xenc)
nlls = torch.zeros(n_examples)
for i in range(n_examples):
  x, y = int(xs[i]), int(ys[i])  # input word idx, label word idx
  print(f"bigram example {i+1}: {itos[x]} {itos[y]} (indexes {x}, {y})")
  print(f"input to the NN: {x}")
  print(f"output probabilities from NN: {probs[i]}")
  print(f"label (actual next character): {y}")
  p = probs[i][y]
  print(f"probability assigned by the NN to the correct character: {p.item()}")
  logp = p.log()
  print(f"log liklihood: {logp.item()}")
  nll = logp.neg()
  print(f"negative log liklihood: {nll.item()}")
  nlls[i] = nll
  print("-"*50)

print("="*50)
print(f"average nll: {nlls.mean().item()}")

bigram example 1: <s> entire (indexes 0, 6306)
input to the NN: 0
output probabilities from NN: tensor([5.6030e-05, 7.2799e-05, 3.9738e-05,  ..., 3.3882e-05, 1.0488e-04,
        2.2040e-05])
label (actual next character): 6306
probability assigned by the NN to the correct character: 1.9050523405894637e-05
log liklihood: -10.868415832519531
negative log liklihood: 10.868415832519531
--------------------------------------------------
bigram example 2: entire facebook (indexes 6306, 6748)
input to the NN: 6306
output probabilities from NN: tensor([3.3876e-05, 4.8715e-05, 5.0847e-06,  ..., 1.8020e-05, 3.4481e-05,
        3.0887e-05])
label (actual next character): 6748
probability assigned by the NN to the correct character: 0.00011635564442258328
log liklihood: -9.058858871459961
negative log liklihood: 9.058858871459961
--------------------------------------------------
bigram example 3: facebook staff (indexes 6748, 18322)
input to the NN: 6748
output probabilities from NN: tensor([4.64

In [14]:
# Re-create the same seeded weights, this time tracked by autograd so the loss
# can be back-propagated into them.
g = torch.Generator()
g.manual_seed(seed)
W = torch.randn(vocab_size, vocab_size, generator=g, requires_grad=True)

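Pluck out the probs corresponding to the indices in `ys`; their average negative log is the loss.

This is the forward pass. Multiplying a one-hot row by `W` simply selects a row of `W`, so `W[xs]` gives the logits directly.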

In [21]:
# Forward pass. Row-indexing W replaces the explicit one-hot matmul:
#   xenc = F.one_hot(xs, num_classes=len(vocab)).float()
#   logits = (xenc @ W)
logits = W[xs]

# F.cross_entropy replaces the manual softmax + mean negative log likelihood:
#   counts = logits.exp()
#   probs = counts/counts.sum(axis=1, keepdims=True)
#   loss = -probs[torch.arange(len(ys)), ys].log().mean()
loss = F.cross_entropy(input=logits, target=ys)
loss_value = loss.item()
print(f"Loss: {loss_value:0.4f}")

Loss: 10.1283


In [22]:
# Backward pass
W.grad = None  # reset the gradient before accumulating a new one
loss.backward()
# Gradient-descent step with learning rate 0.1. Update W in place under no_grad
# so autograd does not record the step, rather than bypassing autograd via `W.data`.
with torch.no_grad():
  W -= 0.1 * W.grad

## Putting it all together

In [23]:
# Train on the GPU when one is present; otherwise stay on the CPU so the cells
# below still run instead of failing on the first CUDA tensor.
device = use_gpu(torch.cuda.is_available())

In [24]:
# create the dataset: one (input, target) index pair per adjacent word pair
xs, ys = [], []
for text in texts:
  ids = [stoi[word] for word in text.split()]
  xs.extend(ids[:-1])  # every word except the last is an input
  ys.extend(ids[1:])   # every word except the first is a target

xs = torch.tensor(xs, device=device)
ys = torch.tensor(ys, device=device)
num = xs.nelement()
print(f"Number of examples: {num}")

# initialize the NN
g = torch.Generator(device=device)
g.manual_seed(seed)
W = torch.randn(vocab_size, vocab_size, generator=g, requires_grad=True, device=device)

Number of examples: 198755


Ridge (L2) regularization: the squared norm of the parameters is added to the loss as a penalty.

In [25]:
batch_size = 2048  # number of bigram examples sampled for each gradient step

In [26]:
%%time
n = -1
while True:
  n += 1
# gradient descent
# for n in range(n_steps):
  idxs = torch.randint(0, len(xs), (batch_size, ))
  # forward pass
  logits = W[xs[idxs]]
  loss = F.cross_entropy(logits, ys[idxs])
  # loss = -F.cross_entropy(logits, ys) + 0.01 * (W**2).mean()
  
  # backward pass
  W.grad = None # set grad to zero
  loss.backward()
  if n % 1000 == 0:
    print(f"Iteration: {n}, Loss: {loss.item():0.4f}")
  if loss.item() < 3.57:
    break
  
  # update
  W.data += -1000 * W.grad
  
print(f"Final Loss: {loss.item():0.4f}")

Iteration: 0, Loss: 10.4698
Iteration: 1000, Loss: 5.2993
Iteration: 2000, Loss: 4.1362
Iteration: 3000, Loss: 3.8303
Iteration: 4000, Loss: 3.7763
Iteration: 5000, Loss: 3.7122
Final Loss: 3.5667
CPU times: user 1min 37s, sys: 2min 3s, total: 3min 41s
Wall time: 3min 41s


In [28]:
# Bring the trained weights back to the CPU; the sampling cell below draws with a
# default (CPU) generator.
W = W.cpu()

In [29]:
# Sample headlines word by word from the trained bigram model: start at the <s>
# boundary token and keep drawing the next word until <s> is drawn again.
g = torch.Generator().manual_seed(seed)
n_headlines = 20
boundary_idx = stoi['<s>']  # look the marker up instead of assuming it sorts to index 0
with torch.no_grad():  # sampling does not need an autograd graph
  for _ in range(n_headlines):
    print('*'*50)
    idx = boundary_idx
    ai_onion = []
    while True:
      # Softmax over the current word's row of logits. torch.softmax is
      # numerically stable where a manual exp()/sum() can overflow.
      p = torch.softmax(W[idx], dim=0)
      idx = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      # uniform baseline for comparison:
      # idx = torch.multinomial(torch.ones(vocab_size)/vocab_size, num_samples=1, replacement=True, generator=g).item()
      if idx == boundary_idx:
        break
      ai_onion.append(itos[idx])
    print(' '.join(ai_onion))

**************************************************
santa told me anything
**************************************************
annoying coworker up production of silly now shouts fistshaking meals that up hayseeds campaign encourages women so beautiful
**************************************************
nation wondering what
**************************************************
blog its not really gnarly if theres something badass pilot informs guests know khmaio
**************************************************
fritolaysia cuts eu flag ban on world fortifies borders mlks family enters restaurant
**************************************************
grinning mitch mcconnell shoves entire body hemingways dad veiled attack the words
**************************************************
amazon pilot informs phillies dont expect sex with the capitol to wear masks outside pitcher just started acting like the capn crunch have blue light on string
**************************************************
repub