# Recurrent neural network with numpy

## Prerequisite

In [1]:
# some important imports
import numpy as np
from translator import Translator
from tqdm import tqdm

## Encoding text

In [2]:
# data
text = open('data/toy.txt', 'r').read()
#text = 'Hallo'
text_length = len(text)
print(text_length)
characters = list(set(text))

# initializing translator and creating training data
tl = Translator(characters)
X = tl.to_one_hot(text)
X[0]

2228


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Hyperparameter

In [3]:
network_length = X.shape[0]
hidden_size = 50
learning_rate = 1e-8
iterations = 10

## Learnable parameter

In [4]:
Wxh = np.random.randn(hidden_size, tl.characters_size) * 0.01
Whh = np.random.randn(hidden_size, hidden_size) * 0.01
Why = np.random.randn(tl.characters_size, hidden_size) * 0.01

## Forward and backward pass


### Forward
$$
z_t = W_{xh} * x_t + W_{hh} * h_{t-1} \\
h_t = tanh(z_t) \\
y_t = W_{hy} * h_t \\
p_t = softmax(y_t) = \frac{e^{y_t}}{\sum_k e^{y_k}} \\
L_t = - log (p_t) \\
L = \sum_t L_t
$$

### Backward


#### Part 1

$$
\frac{\partial L}{\partial W_{hy}} = \sum_t \frac{\partial L_t}{\partial p_t}  \frac{\partial p_t}{\partial y_t}  \frac{\partial y_t}{\partial W_{hy}}
$$

#### Part 2

$$
\frac{\partial L}{\partial W_{hh}} = \sum_t
\sum_k^{t-1} \frac{\partial L_t}{\partial h_t}
\frac{\partial h_t}{\partial h_k} 
\frac{\partial h_k}{\partial z_k} 
\frac{\partial z_k}{\partial W_{hh}} 
$$

#### Part 3

$$
\frac{\partial L}{\partial W_{xh}} = \sum_t
\sum_k^{t-1} \frac{\partial L_t}{\partial h_t}
\frac{\partial h_t}{\partial h_k} 
\frac{\partial h_k}{\partial z_k} 
\frac{\partial z_k}{\partial W_{xh}} 
$$

In [5]:
def forward_and_backward(X, targets, hprev):
    # forward pass
    zt, ht, yt, pt, loss = [], [], [], [], 0
    ht.append(hprev)
    for t in range(X.shape[0]):
        zt.insert(t, np.dot(Wxh, X[t].reshape(len(characters), 1)) + np.dot(Whh, ht[t - 1]))
        ht.insert(t, np.tanh(zt[t]))
        yt.insert(t, np.dot(Why, ht[t]))
        pt.insert(t, np.exp(yt[t]) / np.sum(np.exp(yt[t])))
        loss += -np.sum(np.log(pt[t])* targets[t])/X.shape[0]

    # backward pass
    dWhh, dWxh, dWhy = np.zeros_like(Whh), np.zeros_like(Wxh), np.zeros_like(Why)
    for t in reversed(range(X.shape[0])):
        dout = np.copy(pt[t])
        dout[targets[t]] -= 1
        dWhy += np.dot(dout, ht[t].T)
        dh = np.dot(Why.T, dout)
        dtanh = (1 - ht[t] * ht[t]) * dh
        dWxh += np.dot(dtanh, X[t].reshape(len(characters), 1).T)
        dWhh += np.dot(dtanh, ht[t - 1].T)
        
    # gradient clipping
    for dparam in [dWxh, dWhh, dWhy]:
        np.clip(dparam, -5, 5, out=dparam)
    return loss, dWhh, dWxh, dWhy, ht

In [6]:
def predict(X, Wxh, Whh, Why, hprev):
    zt, ht, yt, pt = [], [], [], []
    ht.append(hprev)
    prediction = ''
    for t in range(X.shape[0]):
        zt.insert(t, np.dot(Wxh, X[t].reshape(len(characters), 1)) + np.dot(Whh, ht[t - 1]))
        ht.insert(t, np.tanh(zt[t]))
        yt.insert(t, np.dot(Why, ht[t]))
        pt.insert(t, np.exp(yt[t] - np.max(yt[t])) / np.sum(np.exp(yt[t] - np.max(yt[t]))))
        prediction += characters[np.argmax(pt[t])]
    return prediction

In [7]:
ht = [np.zeros((hidden_size, 1))]
grad_squared_xh, grad_squared_hh, grad_squared_hy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
for ite in tqdm(range(iterations)):
    y = np.append(X[1:X.shape[0]],X[0])
    loss, dWhh, dWxh, dWhy, ht = forward_and_backward(X, y, ht[-1])
    # adagrad
    grad_squared_xh += dWxh ** 2
    grad_squared_hh += dWhh ** 2
    grad_squared_hy += dWhy ** 2
    Wxh -= dWxh / np.sqrt(grad_squared_xh + 1e-7) * learning_rate
    Whh -= dWhh / np.sqrt(grad_squared_hh + 1e-7) * learning_rate
    Why -= dWhy / np.sqrt(grad_squared_hy + 1e-7) * learning_rate
    print(loss)

 10%|█         | 1/10 [00:00<00:01,  4.97it/s]

3.758575168997929


 20%|██        | 2/10 [00:00<00:01,  4.94it/s]

3.758575168997891


 40%|████      | 4/10 [00:00<00:01,  5.00it/s]

3.7585751689978646
3.7585751689978424


 60%|██████    | 6/10 [00:01<00:00,  5.01it/s]

3.7585751689978246
3.758575168997807


 70%|███████   | 7/10 [00:01<00:00,  5.03it/s]

3.758575168997792


 80%|████████  | 8/10 [00:01<00:00,  5.02it/s]

3.758575168997779


100%|██████████| 10/10 [00:01<00:00,  5.01it/s]

3.758575168997763
3.7585751689977513





In [8]:
print(predict(X, Wxh, Whh, Why, ht[-1]))

eWF’ kSA1l’ S’wS SsWFl,W11hwlwlWP’FS’FSlWl lowSwWF’1ll0SeWA1Sx1WPw1SlW’FowlW lwwWF’ kSA1l’ S 1wW,WlwSWdA FS’Fl1h Fl’WFS FlS,e kSw’m1SWdA FS,ew,l’WFwW’Pw0S1W’w1SWlW1hwSlW’FowlW lwwWF’ kSA1l’ S’wS Suk1wwSlWSWdA F’Wl0SeWF’ kSA1l’ SA o1wSWdA FS’Fl1h Fl’WFSAdFWwAW,eSsWF11F’1Flw FlSAdFWwm wW1hSlW FS,e kSw’m1SWdA FS’Fl1h Fl’WF1S’WwA o1wSdkWu k’x l’WFS S,e k’Wl1S’Wwd’11wS SsW FF1SmW,S’Fl,W11hl1lwx1WPw1SlWS1Fx,ewwSlW1Aw1k11w1S FlS’Ww kwWSu1Fem’Ww 111wWPw’Fl1hF l’WF kS,ew,l’WFwW’PwSxW1lW1hS’WwSudw’FewwSW,SwWF’ k0uueWF’ kSA1l’ SA o1wS’Ww1 wlwmW,Sx1WPw1SlWSsWAAdF’F l1S FlS’Fl1h Flwx’WWw1 FWwWlW1hS FlW’A1S FlmW1heS’FSlW1SxW,wl0SW1WPw1Ss FSwW lwsWFF1FlwlWSlW1’mSm A’w’1wS FlSwW11lwWFewSFWSA ll1hSWWmwm mSlW1lw meSmeWAS1 FWwWlW1h0SnW lw u’w’WlwlWShwWWmwAW,eSsWFl1FlhuueWF’ kSA1l’ SW wSA l1SsWAAdF’F l’WFS SwWlwAW,eS1 w’1hSlW FS111hSu1mW,e0SnWm1Sx’WWw SlWdFWwWmS SudWlWFSx1WPw1SsWdklwsWAAdF’F l1Sx’WWw FluWllw FlwxW1heS’FSlW1SxW,wl0S1euw’W1wSwdFWw wSP F1uWWowA o1wSsWAAdF’F l’WFSx’WWwWlW1hwS FS1 wlwl,wo0SP F