## A vanilla character-level language model RNN

Inspired by https://gist.github.com/karpathy/d4dee566867f8291f086

I will try to generate shakespeare-like text.
You can download the .txt file from https://raw.githubusercontent.com/bbejeck/hadoop-algorithms/master/src/shakespeare.txt

In [1]:
import numpy as np

- Each character has a unique one-hot encoded vector
- This vector is the input for the RNN
- Output is a vector with the same size for predicting the next char

In [62]:
# read data
data = open('shakespeare.txt', 'r').read()
chars = list(set(data))
ds, vs = len(data), len(chars)
char_to_ix = {ch:i for i,ch in enumerate(chars)}
ix_to_char = {i:ch for i,ch in enumerate(chars)}

In [63]:
# hyperparams
hidden_size = 100
seq_len = 25
lr = 1e-1

In [73]:
# model params
Wxh = np.random.randn(hidden_size, vs)*0.01
Whh = np.random.randn(hidden_size, hidden_size)*0.01
Why = np.random.randn(vs, hidden_size)*0.01
bh = np.zeros((hidden_size, 1))
by = np.zeros((vs, 1))

In [79]:
def loss_fn(inputs, targets, h_prev):
    "Return the loss, gradients and last model state"
    xs,hs,ys,ps = {},{},{},{}
    hs[-1] = np.copy(h_prev)
    loss = 0
    
    # forward pass
    for t in range(len(inputs)):
        # create one-hot vector
        xs[t] = np.zeros((vs,1))
        xs[t][inputs[t]] = 1
        
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
        ys[t] = np.dot(Why, hs[t]) + by
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
        loss += -np.log(ps[t][targets[t],0])
        
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
        
    # backward pass
    for t in reversed(range(len(inputs))):
        # gradient accumalation
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1
        dWhy   += np.dot(dy, hs[t].T)
        dby    += dy
        dh      = np.dot(Why.T, dy) + dhnext
        dhraw   = (1 - hs[t] * hs[t]) * dh
        dbh    += dhraw
        dWxh   += np.dot(dhraw, xs[t].T)
        dWhh   += np.dot(dhraw, hs[t-1].T)
        dhnext += np.dot(Whh.T, dhraw)
        # gradient clipping
        for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -5, 5, out=dparam)
        return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [80]:
def sample(h, seed_ix, n):
    x = np.zeros((vs, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        h  = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y  = np.dot(Why, h) + by
        p  = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vs), p=p.ravel())
        x = np.zeros((vs, 1))
        x [ix] = 1
        ixes.append(ix)
    return ixes

In [81]:
n,p = 0,0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)
smooth_loss = -np.log(1.0/vs)*seq_len

In [None]:
while True:
    if p+seq_len+1 >= len(data) or n==0:
        # reset RNN memory
        hprev = np.zeros((hidden_size, 1))
        p = 0
    inputs  = [char_to_ix[ch] for ch in data[p:p+seq_len]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_len+1]]
    
    # sample RNN every 100 step
    if n%100000 == 0:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print(f'---- {txt} -----')
        
    # forward pass
    loss, dWxh, dWhh, dWhy, dbh, dby, dhprev = loss_fn(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n%10000 == 0: print(f'{n}, {smooth_loss}')
        
        
    # param update with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
                                [dWxh, dWhh, dWhy, dbh, dby], 
                                [mWxh, mWhh, mWhy, mbh, mby]):
        mem   += dparam * dparam
        param += -lr * dparam / np.sqrt(mem + 1e-8)
        
    p += seq_len
    n += 1

----  no ams of
    And of Mot.
  MLUCLELORUULERENA.  Thath abe yon oups that heas!
    Tal. an mrou,
                      cad. Heurs!
    Cive thou.,    Orand ig imr enoet iwince. arter thicher, of mary  -----
200000, 56.313353006269104
210000, 55.58635764879004
220000, 59.75383056823793
230000, 55.53149624307009
240000, 53.17445407567616
250000, 53.05513729383428
260000, 54.17103746762103
270000, 54.2036787607943
280000, 54.66659726264434
290000, 53.74294854684522
---- eot plend pitllends and motte fele?
    SAll theily marordin latk
                   t's eneps all houl den wach weep mer ding his maal. leir thand ly of thind pord?
  LUS.
  IMIAG WHERCALES. Sing, yu -----
300000, 53.32602972139323
310000, 52.76751403318295
320000, 52.5123402487749
330000, 53.86712508969286
340000, 53.02882425142762
350000, 51.91591061747107
360000, 52.55760828253293
370000, 50.91464276547788
380000, 52.35740081134182
390000, 52.712696773067876
---- bom unccost.
  ARLINERALO. Bod.
  BONRIO. Ay faspal

### Not bad for a  super simple vanilla RNN trained in a few minutes!

- It pretty quickly plateus at around 50.
- In the next step we will do the same with a LSTM and see how much better it is.