# Recurrent neural network with numpy

## Prerequisite

In [1]:
# some important imports
import numpy as np
from translator import Translator
from tqdm import tqdm

## Encoding text

In [2]:
# data
text = open('data/toy.txt', 'r').read()

# text = 'Hallo'
text_length = len(text)
chars = list(set(text))
char_length = len(chars)
print('text is ', text_length, 'long and has ', char_length)

# creating training data
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

X = np.array([char_to_int[char] for char in text])
y = np.append(X[1:X.shape[0]], X[0])
print('first 10 datas: ', X[0:10])
print('first 10 labels: ', y[0:10])

text is  1993 long and has  56
first 10 datas:  [51 18 46 15  4 43 32 21 15  4]
first 10 labels:  [18 46 15  4 43 32 21 15  4 43]


## Forward pass


In [3]:
def forward_pass(X, hprev):
    ht, pt, loss = [hprev[0]], [], 0
    for t in range(len(X)):
        # creating a one hot encoded vector
        xt = np.zeros((char_length, 1))
        xt[X[t]] = 1

        # calculating forward pass
        zt = np.dot(Wxh, xt) + np.dot(Whh, ht[t])
        ht.append(np.tanh(zt))
        yt = np.dot(Why, ht[t])

        # getting probability distribution
        pt.append(np.exp(yt) / np.sum(np.exp(yt)))

        # summing up the loss of every output
        loss += -np.sum(np.log(pt[t][X[t]]))
    return ht, pt, loss / len(X)

## Backward pass

In [4]:
def backward_pass(X, y, ht, pt):
    dWhh, dWxh, dWhy = np.zeros_like(Whh), np.zeros_like(Wxh), np.zeros_like(Why)
    for t in reversed(range(len(X))):
        # gradient of Why
        dout = pt.copy()
        dout[t][y[t]] -= 1
        dWhy += np.dot(dout[t], ht[t].T)

        # starting to calculate the gradient of Whh and Wxh
        dh = np.dot(Why.T, dout[t])

        xt = np.zeros((char_length, 1))
        xt[X[t]] = 1

        dzhh = np.dot(1 - ht[t] * ht[t], ht[t - 1].T)
        dzxh = np.dot(1 - ht[t] * ht[t], xt.T)
        for k in range(t - 1):
            temp = np.dot(Whh, 1 - ht[k] * ht[k])
            for j in range(k - 1):
                temp *= np.dot(Whh, 1 - ht[j] * ht[j])
            xk = np.zeros((char_length, 1))
            xk[X[k]] = 1

            dzhh += np.dot(temp, ht[k - 1].T)
            dzxh += np.dot(temp, xk.T)

        dWxh = dh * dzxh
        dWhh = dh * dzhh

    dWhh /= len(X)
    dWxh /= len(X)
    dWhy /= len(X)
    # gradient clipping
    for dparam in [dWxh, dWhh, dWhy]:
        np.clip(dparam, -5, 5, out=dparam)
    return dWhh, dWxh, dWhy


## Predict function

In [5]:
def predict(X, Wxh, Whh, Why, hprev):
    ht, prediction = [hprev[0]], ''
    for t in range(len(X)):
        # creating a one hot encoded vector
        xt = np.zeros((char_length, 1))
        xt[X[t]] = 1

        # calculating forward pass
        zt = np.dot(Wxh, xt) + np.dot(Whh, ht[t])
        ht.append(np.tanh(zt))
        yt = np.dot(Why, ht[t])

        # getting probability distribution
        pt.append(np.exp(yt) / np.sum(np.exp(yt)))

        # creating a prediction string
        prediction += chars[np.argmax(pt[t])]
    return prediction

## Updating parameter with adagrad

### Intizializing hyperparameter

In [6]:
seq_size = 15
hidden_size = 200
learning_rate = 1e-4
epochs = 100

print('Training ', epochs, ' epochs with a sequence size of ', seq_size, ', a hidden size of ', hidden_size, ' and a learning rate of', learning_rate)

Training  100  epochs with a sequence size of  15 , a hidden size of  200  and a learning rate of 0.0001


### Initizializing learnable parameter

In [7]:
Wxh = np.random.randn(hidden_size, char_length) * 0.01
Whh = np.random.randn(hidden_size, hidden_size) * 0.01
Why = np.random.randn(char_length, hidden_size) * 0.01

In [8]:
# initializing hidden state and squared gradient
ht = [np.zeros((hidden_size, 1))]
grad_squared_xh, grad_squared_hh, grad_squared_hy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)

loss = 0
for e in tqdm(range(epochs)):
    for steps in range(0, len(X), seq_size):
        inputs = X[steps:steps+seq_size]
        targets = y[steps:steps+seq_size]

        # forward and backward pass
        ht, pt, loss = forward_pass(inputs, ht)
        dWhh, dWxh, dWhy = backward_pass(inputs, inputs, ht, pt)
        
        # adagrad
        grad_squared_xh += dWxh ** 2
        grad_squared_hh += dWhh ** 2
        grad_squared_hy += dWhy ** 2
        
        # parameter update
        Wxh -= dWxh / np.sqrt(grad_squared_xh + 1e-7) * learning_rate
        Whh -= dWhh / np.sqrt(grad_squared_hh + 1e-7) * learning_rate
        Why -= dWhy / np.sqrt(grad_squared_hy + 1e-7) * learning_rate
    if e % 10 == 0:
        print('loss at epoch ', e, ' is ', loss)
        print(predict(X, Wxh, Whh, Why, ht))

  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

loss at epoch  0  is  4.02519072739
rsNvchacEvfswr)c (Itc hItaNIb vBtaha Itavcv hhItahtaNsv  Nvc”( h2aNsv z Nahavc tc v2”v (INccc  b I2(acHcIcz Nv(iftItavcIb c E2vhNa (N , I b”ss 0EcNtavc v2 Nvsh 0EcNtavc ah I 0EcNtavc tcIt (Iia Ic  .-(h v2 .IbE h v0cvc  v2 (v2  .I2aIlb h vchv I 2 Ib cE(l 2 achfaha. b” 2 i2 h (hacHcav(  WNvshWcIasvNaIt z kchc tc   .-(hw Cc vBtaha Itavc i2vlb ( h -as tv (acaha   I b”ss 0EcNtavcw Cc vlT Nta.  0EcNtavc ah  ahc 2 I b”ss 0EcNtavc v2 ahs c HIta.  lac hi Na0aN zv(Iacs, .I2avfsb” NIbb z I 2 kI2z 0EcNtavcv I i2v0ah 0EcNtavcv I Etabah” 0EcNtavcv I 0ahc hs 0EcNtavcv  hNwmv ac kccNc NIa  ah ah tv l  (INaha  zwbbL( hhItahtaNsv t”BaNIbb” I b”ss 0EcNtavc ah Es z 0v2 iI2I( h 2  hhahItavcv Icz tc   .-(h ac "f hhavc ah hv(  0EcNtavc v0ctc  za00 2 cN  l hk -c  hhahIt z Icz t2f  .IbE h 0v2 Ic acshIcN  v0czItIx vH  NvcN itv Ia vbz Ia âIibIN , kIa 2 ach2vzfN z ac hhItahtaNs lH Cl2IcI( vIbz ac tc  (azzb  v0ctc  iNtc N (hf2”wzoo L( tc  Nvch Nh v0c Nvc”(aNsv 0v2  NI(ib , tcch 

 10%|████████                                                                         | 10/100 [00:27<04:03,  2.70s/it]

loss at epoch  10  is  4.02365313442
rsNvchccnvnswrnc hthc hthnotc vBhnhc[thnvc, shthnshnos,  Nvcvh h cos, d Nnstvc tc v ”, htNccc  c t cncH tcd Nvhintthnvctc c n vsNn co , t cvss uncNhnvc v  Nvsh uncNhnvc ns t uncNhnvc tcth atis tc  . ch v  .tcn s vu vc  v  hv   .t ctlc s vchv t   tc cnel   tchnchn. c”   i  s chncH svh  cNvshc tssvNntt d kchc tc   . chw Cc vBhnhc[thnvc l vlc h s  as tv hccnhc[  t cvss uncNhnvcw Cc vlT Nhn.  uncNhnvc ns  chc   t cvss uncNhnvc v  ths c kthn.  lcc si Nnoto dvhtccs, .t cvnsc” Ntcb d t   kt z uncNhnvc, t l vuch uncNhnvc, t nhncnh” uncNhnvc, t uchc ss uncNhnvc,  hNwm, tc kccoc Nts  th ts tv l  ht nhc[ dwbbnc shthnshnos, t”inotcb” t cvss uncNhnvc ns ns d uv  lt ta h    shnhthnvc, tcd tc   . ch tc "n shnvc ns svh  uncNhnvc vu tc  dcou   co  l hk  c  shnhth d tcd t n  .tcn s uv  tc ncshtco  vu dthtx vc  Nvco ih, ts vcz ts xtictN , kts   cch vznN d tc shthnshnos l” Cl tcta vtcz tc tc  hcdzc  vu tc  iNhc N chn ”wzoo nc tc  Nvch  h vu  Nvcvhcos, uv    taic , tccs

 20%|████████████████▏                                                                | 20/100 [00:55<03:40,  2.76s/it]

loss at epoch  20  is  4.02245084677
rstmchccnvnswrnc attc attnotc nlhnatzttnnc, shttnshnos,  omcva h cos, d onstnc tc f ”, atoacc  c t cncH tcd toaintttnnctc c n fson co , t cfss uncotnnc n  tmsh uncotnnc ts t uncotnnc tctt atis tc  e ch n  vtcn s nu nc  n  av   vt ctlc s nchv t   tc cnel   tctnchnv c”   p  s chncH sma  comshc tssmontt d knhc tc   e chw Cc nlhnatzttnnc l flc e s  as tv atcnatz  t cfss uncotnncw Cc nlT otnv  uncotnnc ts  thc   t cfss uncotnnc n  ths c kttnv   cc sp onuto dmatccs, vt cfnsc” ttcc d t   kt d uncotnnc, t l futh uncotnnc, t ntncnh” uncotnnc, t uthc ss uncotnnc,  howm, tc Accoc tts  th ts tv l  at natz dwbunc shttnshnos, t”inotcc” t cfss uncotnnc ts ns d uv  lt ta h    shnattnnc, tcd tc   e ch tc "n shnnc ts sma  uncotnnc nu tc  dcuu   co  l hk  c  shnatt d tcd t n  vtcn s uv  tc tcshtco  nu dtttx vc  tmco ph, ts ncd ts nticto , kts   tct fdno d tc shttnshnos l” Cl tcta  tcd tc tc  atddc  nu tc  i)hc t chn ”wzoo nc tc  tmch  h nu  omcvacos, uv    talc , tccs

 30%|████████████████████████▎                                                        | 30/100 [01:28<03:26,  2.95s/it]

loss at epoch  30  is  4.0211240073
rstocttcnnnstrnc atte attnotc nltnatzttnnc  stttnstnos   tocna t tos    tnstnc te n ”, atoatc  c t ctcn tcd toapntttnnctc c n fstn co   t cnss uncotnnc n  tnst uncotnnc ts t uncotnnc tett atis tc  e ct n   tcn s nu nc  n  an   at ttll s nctn t   tc cnel   tctnntne c”   p  s ctncn sna  coostc tssnontt d tnte te   e ctt tc nltnatzttnnc l fll a s  as tn atctatz  t cnss uncotnncw tc nlT ttne  uncotnnc ts  tte   t cnss uncotnnc n  tts c nttne   cc sp tnoto  matccs   t tnnsc” ttcl d t   kt d uncotnnc  t l futt uncotnnc  t ntncnte uncotnnc  t uttc ss uncotnnc   totm, tc tctoa tts  tt ts tn l  at natz dw

nc stttnstnos  tepnotcl” t cnss uncotnnc ts ns d un  lt ta t    stnattnnc  tcd te   e ct tc an stnnc ts sna  uncotnnc nu te   cou   co  l t
  c  stnatt d tcd t n  atcn s un  tc tcsttco  nu  tttt oh  tnco pt  ts ncd ts nticto   tts   tct f no d tc stttnstnos l” tl tata  tcd tc te  atddc  nu te  iete t ctn ”
zoo nc te  tnct  t nu  tocnatos  un    talc   tets 

 40%|████████████████████████████████▍                                                | 40/100 [01:55<02:53,  2.89s/it]

loss at epoch  40  is  4.01949524534
rstocttcnnnstrnc atte atttttc nltnatztttnc  sttttsttts   tocna t tts    ttstnc te n e  attatc    t ctcn tc  toapnttttnctc c n nstt ct   t  nss unctttnc n  tnst tnctttnc ts t tnctttnc tett atis tc  e ct n   tcn s nu nc  n  an    t ttll s nctn t   tc cnal   tctnttte c”   i  s cttcn sna  ctostc tssntttt   ttte te   e ctt tc nltnatztttnc   nll a s  as tn atctatz  t  nss unctttnct tc nlT ttte  unctttnc ts  tte   t  nss unctttnc n  tts c nttte   tc si ttott  nattcs   t tnnsc” ttcl   t   tt   unctttnc  t   nutt tnctttnc  t ntnctte unctttnc  t tttc ss unctttnc   tttm  tc tetta tts  tt ts tn l  at tatz  w

nc sttttsttts  tepnttcl” t  nss unctttnc ts ns   un   t ta t    sttatttnc  tc  te   e ct tc an sttnc ts sna  unctttnc nu te   tou   ct  l tt  c  sttatt   tc  t n   tcn s un  tc tcsttct  nu  tttt oh  tnct it  ts nc  ts ntiltt   tts   tct n nt   tc sttttsttts l” tl tata  tc  tc te  at dl  nu te  iete t ctn e
zoo nc te  tnct  t nu  tocnatts  un    tall   tets

 50%|████████████████████████████████████████▌                                        | 50/100 [02:18<02:18,  2.76s/it]

loss at epoch  50  is  4.01731175069
rstocttcnnnstrnc atte attttt  n ttatztttnc  sttttsttts   tocna t tts    ttstnc te n e  attatc    t ctcn tc  toainttttnct  c n nstt ct   t  nss tnctttnc n  tost tnctttnc ts t tnctttnc tett atis tc  e ct n   t n s nu nc  n  an    t ttal s nctn t   t  cnal   tctnttte  e   i  s cttcn sna  ttostc tssntttt   ttte te   e ctt tc n ttatztttnc   nll a s  as tn atctatz  t  nss tnctttnct tc n T ttte  tnctttnc ts  tte   t  nss tnctttnc n  tts    ttte   tc s  ttott  nattcs   t tnns e tt l   t   tt   tnctttnc  t   nutt tnctttnc  t ntt tte tnctttnc  t tttc ss tnctttnc   tttt  tc tetta tts  tt ts tn    at tatz  t

nc sttttsttts  teintt le t  nss tnctttnc ts ns   tn   t ta t    sttatttnc  tc  te   e ct tc  n sttnc ts sna  tnctttnc nu te   tou   ct    tt  c  sttatt   tc  t n   t n s tn  tc tcsttct  nu  tttt oa  toct it  ts n   ts  tiltt   tts   tct n nt   tc sttttsttts  e tl tata  tc  tc te  at  l  nu te   ete t ctn etzoo nc te  toct  t nu  tocnatts  tn    tail   tets

 60%|████████████████████████████████████████████████▌                                | 60/100 [02:39<01:46,  2.66s/it]

loss at epoch  60  is  4.01401582793
rsto tt nnnstrnc atte attttt    ttat tttn    ttttsttts   to na t tts    ttstn  te n e  attatc    t ct   t   toainttttn t    n nstt  t   t  nss  nctttn     tost  nctttn  ts t  nctttn  tett  t s t      t     t n s           an    t tt   s   tn t   t   nal   t tnttt   e      s  tt    na  ttostt tssntttt    tte te      tt tc   ttat tttn    n   a    as tn  t tat   t  nss  nctttn t tc   t ttt    nctttn  ts  tte   t  nss  nctttn     tts    ttt    tc    ttttt  natt s   t tnns e tt     t   tt    nctttn   t   n tt  nctttn   t ntt tte  nctttn   t  tt   s  nctttn    tttt  t   etta tts  tt ts tn     t tat   tu
nc  ttttsttts  teittt  e t  nss  nctttn  ts ns    n   t ta t     ttttttn   t   te      t t   n sttn  ts  na   nctttn     te   ttt    t    tt      ttttt   t   t n   t n s  n  t  t stt t      tttt oa  to t  t  ts     ts  t  tt    ts   t t n nt   t   ttttsttts  e t  tata  t   t  te   t        te   tte t  tn etcoo nc te  to t  t     to natts   n    tai    tets

 70%|████████████████████████████████████████████████████████▋                        | 70/100 [03:02<01:18,  2.60s/it]

loss at epoch  70  is  4.00785737166
rsto tt nnnstrnc  tt  tttttt    tttt ttt     tttt ttt    t     t tt     tt t   t        tt t     t  t   t   t   ntttt  t    t n tt  t   t       n ttt      t  t  t ttt   t  t  t ttt   t tt  t   t      t     t n                   t tt       t  t   t   t     t tnttt             tt        tt  tt t   tttt    tt  t       tt t    tttt ttt     n   t    t  t   t t t   t       n ttt  t t      ttt    t ttt   t   tt    t       n ttt      tt     ttt    t     ttttt    tt     t t n    tt     t   tt    n ttt    t     tt  n ttt    t ttt tt   n ttt    t  tt      t ttt     tt t  t    tt  tt   tt t  t      t ttt   t unc  tttt ttt   t  ttt    t       n ttt   t  t         t t  t     tt ttt    t   t       t t   n  tt   t        n ttt      t    ttt    t    t       tt tt   t   t n   t n       t  t  tt t      ttt  ta  t  t  t  t      t   t  tt    t    t t   nt   t   tttt ttt     t  t t   t   t  t    t        t     t  t  tn  tcoo    t   t  t  t     t    tt         t      t t 

 80%|████████████████████████████████████████████████████████████████▊                | 80/100 [03:24<00:51,  2.56s/it]

loss at epoch  80  is  3.98559471811
rst  t       rn   tt    t       t     t                        t                                                 t t                                                                                                                                                 t                                            t                  t     t                                                       t                                                                                                                        t                  t                                                                                 t t                                                    t                                                                                                                                                 t                                                                                           t                 t      t                                   

 90%|████████████████████████████████████████████████████████████████████████▉        | 90/100 [03:46<00:25,  2.52s/it]

loss at epoch  90  is  3.89402366607
rs           rn                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [04:08<00:00,  2.48s/it]
