# Bigram Language Modeling using a basic Neural Network for generating Onion-like News Headlines

## Imports & Inits

In [1]:
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

import pdb, sys, warnings, os, json, torch, re
warnings.filterwarnings(action='ignore')

from IPython.display import display, HTML
from pathlib import Path

import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

from torch.nn import functional as F

np.set_printoptions(precision=4)
sns.set_style("darkgrid")
%matplotlib inline

In [None]:
# Load the headlines into a DataFrame; the path is relative to the notebook's working directory.
# The cells below read the headline strings from its 'text' column.
onion_df = pd.read_csv('../data/cleaned_onion_headlines.csv')

## Bigram Language Model

In [None]:
texts = onion_df['text'].tolist()

# '#' is reserved as the start/end-of-headline marker and must appear in the vocabulary exactly
# once, at index 0. Removing it from the character set prevents a duplicate entry (which would
# make stoi['#'] point away from index 0 and inflate len(vocab)) if a headline contains a '#'.
chars = set(' '.join(texts)) - {'#'}
vocab = ['#'] + sorted(chars)

stoi = {ch: idx for idx, ch in enumerate(vocab)}  # character -> integer id
itos = dict(enumerate(vocab))                     # integer id -> character
len(vocab)

In [None]:
# Build (input, target) index pairs from the first headline only, printing each bigram as we go.
xs, ys = [], []
for headline in texts[:1]:
  padded = ['#', *headline, '#']  # wrap the headline in start/end markers
  for prev_ch, next_ch in zip(padded, padded[1:]):
    prev_idx, next_idx = stoi[prev_ch], stoi[next_ch]
    print(prev_ch, next_ch)
    xs.append(prev_idx)
    ys.append(next_idx)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [None]:
# Input indices: the first character of every bigram
xs

In [None]:
# Target indices: the character that follows each input
ys

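`F.one_hot` returns an integer tensor, so cast the encoding to `float`: the network's weights are floating point, and the matrix multiplication needs both operands to have the same floating-point dtype.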

In [None]:
# One-hot encode the inputs. The width is taken from the vocabulary rather than a hard-coded
# number so this cell stays correct (and consistent with the cells below) if the character set
# changes. F.one_hot returns integers, hence the cast to float.
xenc = F.one_hot(xs, num_classes=len(vocab)).float()
plt.imshow(xenc)

We interpret the NN's outputs as log-counts (`logits`): exponentiating them gives positive values that play the role of bigram counts.

In [None]:
# Shape of the one-hot inputs: (number of bigram examples, number of one-hot classes)
xenc.shape

In [None]:
# Seeded generator so the random initial weights are reproducible.
g = torch.Generator()
g.manual_seed(2468975301)
# One row per input character, one column per candidate next character.
W = torch.randn(len(vocab), len(vocab), generator=g)

Lines 3-4 (exponentiate the logits, then normalize each row so it sums to 1) are together just `softmax`

In [None]:
# input to the network: one row per example, one-hot over the vocabulary
xenc = F.one_hot(xs, num_classes=len(vocab)).float()
# a one-hot row times W simply selects the matching row of W
logits = xenc @ W
# exponentiate to get positive "counts" (the analogue of bigram counts) ...
counts = torch.exp(logits)
# ... then normalize every row into a probability distribution
probs = counts / counts.sum(dim=1, keepdim=True)
probs

In [None]:
# Walk through every bigram example and spell out how its negative log-likelihood is obtained.
n_examples = len(xenc)
nlls = torch.zeros(n_examples)
for i in range(n_examples):
  in_idx = xs[i].item()   # input character idx
  out_idx = ys[i].item()  # label character idx
  print(f"bigram example {i+1}: {itos[in_idx]}{itos[out_idx]} (indexes {in_idx}, {out_idx})")
  print(f"input to the NN: {in_idx}")
  print(f"output probabilities from NN: {probs[i]}")
  print(f"label (actual next character): {out_idx}")
  # probability the model put on the character that actually came next
  p_correct = probs[i, out_idx]
  print(f"probability assigned by the NN to the correct character: {p_correct.item()}")
  log_p = p_correct.log()
  print(f"log liklihood: {log_p.item()}")
  neg_log_p = -log_p
  print(f"negative log liklihood: {neg_log_p.item()}")
  nlls[i] = neg_log_p
  print("-"*50)

print("="*50)
# the loss is the mean of the per-example negative log-likelihoods
print(f"average nll: {nlls.mean().item()}")

In [None]:
# Show the input character indices again for reference
xs

In [None]:
# Show the target character indices again for reference
ys

In [None]:
# Re-create the same seeded weights, this time tracked by autograd so the loss can be
# differentiated with respect to them.
g = torch.Generator()
g.manual_seed(2468975301)
W = torch.randn(len(vocab), len(vocab), generator=g, requires_grad=True)

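For each example, pluck out the probability the model assigned to its true next character (the indices in `ys`); the loss is the mean negative log of these probabilities.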

This is the forward pass

In [None]:
# forward pass: one-hot -> logits -> softmax -> mean negative log-likelihood
xenc = F.one_hot(xs, num_classes=len(vocab)).float()
logits = xenc @ W
counts = torch.exp(logits)  # equivalent to bigram_counts
probs = counts / counts.sum(dim=1, keepdim=True)
# for row i pick column ys[i], i.e. the probability given to the true next character
example_rows = torch.arange(len(ys))
loss = -probs[example_rows, ys].log().mean()
print(f"{loss.item():0.4f}")

In [None]:
# Backward pass
# Clear any stale gradient (None rather than zeros) so backward() starts fresh
# instead of accumulating into an old value.
W.grad = None
loss.backward()
# One gradient-descent step with learning rate 0.1, applied outside of autograd tracking.
W.data -= 0.1 * W.grad

## Putting it all together

In [None]:
# create the dataset: every adjacent character pair of every headline, as vocabulary indices
xs, ys = [], []
for headline in texts:
  padded = ['#', *headline, '#']  # start/end markers around the headline
  ids = [stoi[ch] for ch in padded]
  xs.extend(ids[:-1])  # each character ...
  ys.extend(ids[1:])   # ... paired with the one that follows it

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.numel()
print(f"Number of examples: {num}")

# initialize the NN: a single seeded weight matrix tracked by autograd
g = torch.Generator()
g.manual_seed(2468975301)
W = torch.randn(len(vocab), len(vocab), generator=g, requires_grad=True)

In [None]:
# Use the GPU when one is present, otherwise stay on the CPU so the cell still runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
xs, ys = xs.to(device), ys.to(device)
# Calling .cuda()/.to() on a tensor that requires grad returns a *non-leaf* copy, and autograd
# never fills in .grad for non-leaf tensors -- the update step would then fail on `W.grad`
# being None. Detach first and re-enable grad so the on-device tensor is itself the leaf.
W = W.detach().to(device).requires_grad_(True)

# The one-hot inputs and the row indices do not change between iterations: build them once,
# directly on the training device.
xenc = F.one_hot(xs, num_classes=len(vocab)).float() # input to the NN: one-hot encoding
rows = torch.arange(num, device=device)

# gradient descent
for _ in range(150):
  # forward pass
  logits = xenc @ W
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdims=True)
  # mean negative log-likelihood of the true next characters
  loss = -probs[rows, ys].log().mean()

  # backward pass
  W.grad = None # clear the previous gradient so it does not accumulate
  loss.backward()

  # update (learning rate 50)
  W.data += -50 * W.grad

print(f"Loss: {loss.item():0.4f}")

In [2]:
# Tiny autograd sanity check: for y = sum(x^2) the gradient dy/dx is 2 * x.
x = torch.randn((1, 5), requires_grad=True)
y = x.pow(2).sum()
y.backward()

In [3]:
# Gradient of y = sum(x**2) with respect to x; analytically this is 2 * x
x.grad

tensor([[ 3.0231, -1.6553, -0.3219, -0.8130, -1.1829]])