# Recurrent neural network with numpy

## Prerequisite

In [1]:
# some important imports
import numpy as np
from translator import Translator
from tqdm import tqdm

## Hyperparameter

In [2]:
network_length = 5
hidden_size = 100
learning_rate = 1e-1
iterations = 30

## Encoding text

In [3]:
# data
#text = open('data/toy.txt', 'r').read()
text = 'Hallo'
text_length = len(text)
print(text_length)
characters = list(set(text))

# initializing translator and creating training data
tl = Translator(characters)
X = tl.to_one_hot(text)
X[0]

5


array([0, 1, 0, 0])

## Learnable parameter

In [4]:
Wxh = np.random.randn(hidden_size, tl.characters_size) * 0.01
Whh = np.random.randn(hidden_size, hidden_size) * 0.01
Why = np.random.randn(tl.characters_size, hidden_size) * 0.01

## Forward and backward pass


### Forward
$$
z_t = W_{xh} * x_t + W_{hh} * h_{t-1} \\
h_t = tanh(z_t) \\
y_t = W_{hy} * h_t \\
p_t = softmax(y_t) = \frac{e^{y_t}}{\sum_k e^{y_k}} \\
L_t = - log (p_t) \\
L = \sum_t L_t
$$

### Backward


#### Part 1

$$
\frac{\partial L}{\partial W_{hy}} = \sum_t \frac{\partial L_t}{\partial p_t}  \frac{\partial p_t}{\partial y_t}  \frac{\partial y_t}{\partial W_{hy}}
$$

#### Part 2

$$
\frac{\partial L}{\partial W_{hh}} = \sum_t
\sum_k^{t-1} \frac{\partial L_t}{\partial h_t}
\frac{\partial h_t}{\partial h_k} 
\frac{\partial h_k}{\partial z_k} 
\frac{\partial z_k}{\partial W_{hh}} 
$$

#### Part 3

$$
\frac{\partial L}{\partial W_{xh}} = \sum_t
\sum_k^{t-1} \frac{\partial L_t}{\partial h_t}
\frac{\partial h_t}{\partial h_k} 
\frac{\partial h_k}{\partial z_k} 
\frac{\partial z_k}{\partial W_{xh}} 
$$

In [5]:
def forward_and_backward(X, targets, hprev):
    # forward pass
    zt, ht, yt, pt, loss = [], [], [], [], 0
    ht.append(hprev)
    for t in range(X.shape[0]):
        zt.insert(t, np.dot(Wxh, X[t].reshape(len(characters), 1)) + np.dot(Whh, ht[t - 1]))
        ht.insert(t, np.tanh(zt[t]))
        yt.insert(t, np.dot(Why, ht[t]))
        pt.insert(t, np.exp(yt[t]) / np.sum(np.exp(yt[t])))
        loss += -np.sum(np.log(pt[t])* targets[t])/X.shape[0]

    # backward pass
    dWhh, dWxh, dWhy = np.zeros_like(Whh), np.zeros_like(Wxh), np.zeros_like(Why)
    for t in reversed(range(X.shape[0])):
        dout = np.copy(pt[t])
        dout[targets[t]] -= 1
        dWhy += np.dot(dout, ht[t].T)
        dh = np.dot(Why.T, dout)
        dtanh = (1 - ht[t] * ht[t]) * dh
        dWxh += np.dot(dtanh, X[t].reshape(len(characters), 1).T)
        dWhh += np.dot(dtanh, ht[t - 1].T)
        
    # gradient clipping
    for dparam in [dWxh, dWhh, dWhy]:
        np.clip(dparam, -5, 5, out=dparam)
    return loss, dWhh, dWxh, dWhy, ht

In [6]:
def predict(X, Wxh, Whh, Why, hprev):
    zt, ht, yt, pt = [], [], [], []
    ht.append(hprev)
    prediction = ''
    for t in range(X.shape[0]):
        zt.insert(t, np.dot(Wxh, X[t].reshape(len(characters), 1)) + np.dot(Whh, ht[t - 1]))
        ht.insert(t, np.tanh(zt[t]))
        yt.insert(t, np.dot(Why, ht[t]))
        pt.insert(t, np.exp(yt[t] - np.max(yt[t])) / np.sum(np.exp(yt[t] - np.max(yt[t]))))
        prediction += characters[np.argmax(pt[t])]
    return prediction

In [7]:
ht = [np.zeros((100, 1))]
for ite in tqdm(range(iterations)):
    for i in range(0, X.shape[0], network_length):
        X_train = X[i:i + network_length]
        y_train = X[i + 1:i + 1 + network_length]
        if y_train.shape[0] != network_length:
            y_train = np.append(y_train, X[0])
        loss, dWhh, dWxh, dWhy, ht = forward_and_backward(X_train, y_train, ht[-1])
        Wxh -= dWxh * learning_rate
        Whh -= dWhh * learning_rate
        Why -= dWhy * learning_ratess
    print(predict(X, Wxh, Whh, Why, ht[-1]))
    print(loss)

100%|██████████| 30/30 [00:00<00:00, 430.75it/s]

Haaaa
1.1090356277198574
Haaaa
1.1090357774217896
Haaaa
1.109036521661166
Haaaa
1.1090379776387476
Haaaa
1.1090404339888909
Haaaa
1.1090444744690982
Haaaa
1.1090512509960408
Haaaa
1.1090630632985374
Haaaa
1.1090845821912914
Haaaa
1.1091254240187751
Haaaa
1.1092055310371
Haaaa
1.1093662371756658
Haaaa
1.1096922656943744
Haaaa
1.1103522753951727
Haaaa
1.111660230895775
Haaaa
1.114114757094356
Haaaa
1.1182442780666915
Haaaa
1.1242025925296644
Haaaa
1.1318744775053264
Haaaa
1.1412726992930804
Haaaa
1.1524809714170017
Haaaa
1.1655922154399234
Haaaa
1.1807080763457578
Haaaa
1.197950077128129
Haaaa
1.2174693398417324
Haaaa
1.2394502423321847
Haaaa
1.2641056590609066
Haaaa
1.2916628965346584
Haaaa
1.3223406375990425
Haaaa
1.3563186043875795





In [8]:
predict(X, Wxh, Whh, Why, ht[-1])

'Haaaa'