**Import Libraries**


In [1]:
from __future__ import unicode_literals, print_function, division
import numpy as np


**Objective : Design RNN for predicting sequence of characters from training data**

In [2]:
class DataReader:
  def __init__(self):
    self.data = "RNN from scratch"
    chars = list(set(self.data))
    chars.append(" ")
    self.char_to_ix = {ch:i for (i,ch) in enumerate(chars)}
    self.ix_to_char = {i:ch for (i,ch) in enumerate(chars)}
    self.seq_length = len(self.data)
    self.vocab_size = len(chars)
    print(self.seq_length)
    print(self.char_to_ix)
    print(chars)


  def get_inputs_targets(self, data):
   inputs_str = data
   target_str = data[1:] 
   target_str = target_str + " "  
   inputs = [self.char_to_ix[ch] for ch in inputs_str] 
   targets = [self.char_to_ix[ch] for ch in target_str]
   return inputs, targets

datareader = DataReader()
datareader.get_inputs_targets(datareader.data)


16
{'r': 0, ' ': 12, 'N': 2, 'm': 3, 's': 4, 'R': 5, 'a': 6, 'h': 7, 'o': 8, 'c': 9, 'f': 10, 't': 11}
['r', ' ', 'N', 'm', 's', 'R', 'a', 'h', 'o', 'c', 'f', 't', ' ']


([5, 2, 2, 12, 10, 0, 8, 3, 12, 4, 9, 0, 6, 11, 9, 7],
 [2, 2, 12, 10, 0, 8, 3, 12, 4, 9, 0, 6, 11, 9, 7, 12])

**RNN Architecture**

In [3]:

class RNN:
  def __init__(self, hidden_size, vocab_size,seq_length,learning_rate):
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.seq_length = seq_length
    self.learning_rate = learning_rate
    # Model_Parameters
    self.W_xh = np.random.uniform(-np.sqrt(1./vocab_size),np.sqrt(1./vocab_size), size=(hidden_size,vocab_size))
    self.W_hh = np.random.uniform(-np.sqrt(1./hidden_size),np.sqrt(1./hidden_size), size=(hidden_size,hidden_size))
    self.W_yh = np.random.uniform(-np.sqrt(1./hidden_size),np.sqrt(1./hidden_size), size=(vocab_size,hidden_size))
    self.W_by = np.random.uniform(size=(vocab_size,1))
    self.W_bh = np.random.uniform(size=(hidden_size,1))
    # memory variables
    self.mW_xh = np.zeros_like(self.W_xh)
    self.mW_hh = np.zeros_like(self.W_hh)
    self.mW_yh = np.zeros_like(self.W_yh)
    self.mW_by = np.zeros_like(self.W_by)
    self.mW_bh = np.zeros_like(self.W_bh)

  
  def softmax(self, x):
    p = np.exp(x-np.max(x))
    return p / np.sum(p)


  def forward(self, inputs,hprev):
    xs, hs, os, ycap = {},{},{},{}
    hs[-1] = np.copy(hprev)
    for t in range(len(inputs)):
      xs[t] = np.zeros((self.vocab_size, 1))
      xs[t][inputs[t]] = 1 # one-hot encoding
      hs[t] = np.tanh(np.dot(self.W_hh,hs[t-1]) + np.dot(self.W_xh,xs[t]) + self.W_bh) 
      os[t] = np.dot(self.W_yh,hs[t]) + self.W_by
      ycap[t] = self.softmax(os[t])
      #print(xs[t].shape,hs[t].shape,os[t].shape,ycap[t].shape)
    return xs, hs, ycap

  def loss(self, ycap, targets):
    return sum(-np.log(ycap[t][targets[t]]) for t in range(self.seq_length))

  def backward(self, xs, hs, ycap,targets):
    dW_xh = np.zeros_like(self.W_xh)
    dW_hh = np.zeros_like(self.W_hh)
    dW_yh = np.zeros_like(self.W_yh)
    dW_by = np.zeros_like(self.W_by)
    dW_bh = np.zeros_like(self.W_bh)
    dhnext = np.zeros_like(hs[0])

    for t in reversed(range(self.seq_length)):
      d_yy_cap = np.copy(ycap[t])
      d_yy_cap[targets[t]]-= 1
      dW_yh += np.dot(d_yy_cap,hs[t].T)
      dW_by += (d_yy_cap)
      dL_dh = np.dot(self.W_yh.T,d_yy_cap) + dhnext
      dL_dh_dtanh = (1 -hs[t]*hs[t])*dL_dh

      dW_hh+= np.dot(dL_dh_dtanh, hs[t-1].T)
      dW_xh+= np.dot(dL_dh_dtanh, xs[t].T)
      dW_bh+= np.dot(dL_dh_dtanh, 1)

      dhnext = np.dot(self.W_hh.T,dL_dh_dtanh)

    return dW_xh, dW_hh, dW_yh, dW_by, dW_bh

  def update_model(self, dW_xh, dW_hh, dW_yh,dW_by, dW_bh):
    for param, dparam in zip([self.W_xh, self.W_hh, self.W_yh, self.W_by, self.W_bh],[dW_xh, dW_hh, dW_yh, dW_by, dW_bh]):
      param+= -self.learning_rate*dparam

  def adagrad_update_model(self, dW_xh, dW_hh, dW_yh,dW_by, dW_bh):
    episilon = 1e-8
    for param, dparam, temp in zip([self.W_xh, self.W_hh, self.W_yh, self.W_by, self.W_bh],[dW_xh, dW_hh, dW_yh, dW_by, dW_bh], 
                             [self.mW_xh, self.mW_hh, self.mW_yh, self.mW_by, self.mW_bh]):
      temp = dparam*dparam
      param += -(self.learning_rate/np.sqrt(temp+episilon))*dparam

  def predict(self, data_reader, start_char, predict_len):
    x = np.zeros((self.vocab_size,1))
    ix = datareader.char_to_ix[start_char]
    x[ix] = 1
    indexes=[ix]
    hs = np.zeros((self.hidden_size, 1))
    for t in range(predict_len):
      hs = np.tanh(np.dot(self.W_hh,hs) + np.dot(self.W_xh,x) + self.W_bh)
      os = np.dot(self.W_yh,hs) + self.W_by
      ycap = self.softmax(os)
      ix = np.random.choice(range(self.vocab_size), p = ycap.ravel()) 
      x = np.zeros((self.vocab_size, 1))
      x[ix]= 1
      indexes.append(ix)
    
    txt = ''.join(data_reader.ix_to_char[i] for i in indexes)
    print(txt)

  


def train(hidden_layer, update_function):
  datareader = DataReader()
  inputs, targets = datareader.get_inputs_targets(datareader.data)
  hidden_size = hidden_layer
  rnn = RNN(hidden_size,datareader.vocab_size,len(inputs), 1e-03)
  loss = 100
  iter = 0
  while(iter < 10000):
    hprev= np.zeros((hidden_size, 1))
    xs, hs , ycap = rnn.forward(inputs,hprev)
    loss = rnn.loss(ycap, targets) 
    dW_xh, dW_hh, dW_yh, dW_by, dW_bh =  rnn.backward(xs, hs , ycap,targets)
    if(update_function == "Adagrad"):
      rnn.adagrad_update_model(dW_xh, dW_hh, dW_yh, dW_by, dW_bh)
    else:
      rnn.update_model(dW_xh, dW_hh, dW_yh, dW_by, dW_bh)
    if not iter%1000:
      print("iter num", iter, loss)
      print(rnn.predict(datareader,'R', rnn.seq_length))
    iter+=1


**TASK 1** 

adding bias variables to RNN

In [4]:
train(10, "SGD")

16
{'r': 0, ' ': 12, 'N': 2, 'm': 3, 's': 4, 'R': 5, 'a': 6, 'h': 7, 'o': 8, 'c': 9, 'f': 10, 't': 11}
['r', ' ', 'N', 'm', 's', 'R', 'a', 'h', 'o', 'c', 'f', 't', ' ']
iter num 0 [45.88606591]
RmchRootaRcf sNom
None
iter num 1000 [18.95104944]
RNNcr a scmtofrfr
None
iter num 2000 [5.31834374]
RoN srfm fooma fr
None
iter num 3000 [2.31751402]
R N from fcratcr 
None
iter num 4000 [1.34732055]
RNN from scratch 
None
iter num 5000 [0.92062298]
RNscrct tch N fro
None
iter num 6000 [0.69020964]
RNN from scratch 
None
iter num 7000 [0.54856352]
RNN from scratcr 
None
iter num 8000 [0.45359706]
RNN frhm scratch 
None
iter num 9000 [0.38587279]
RNN from scratch 
None


**TASK 2**

**Using Adagrad gradient descent optimization**

In [5]:
train(10, "Adagrad")

16
{'r': 0, ' ': 12, 'N': 2, 'm': 3, 's': 4, 'R': 5, 'a': 6, 'h': 7, 'o': 8, 'c': 9, 'f': 10, 't': 11}
['r', ' ', 'N', 'm', 's', 'R', 'a', 'h', 'o', 'c', 'f', 't', ' ']
iter num 0 [40.74525058]
RcNrhacrccfa tRRN
None
iter num 1000 [0.09481897]
RNN from scratch 
None
iter num 2000 [0.00029598]
RNN from scratch 
None
iter num 3000 [9.94805512e-05]
RNN from scratch 
None
iter num 4000 [5.9674066e-05]
RNN from scratch 
None
iter num 5000 [4.2619121e-05]
RNN from scratch 
None
iter num 6000 [3.31470261e-05]
RNN from scratch 
None
iter num 7000 [2.71202848e-05]
RNN from scratch 
None
iter num 8000 [2.29482212e-05]
RNN from scratch 
None
iter num 9000 [1.9888776e-05]
RNN from scratch 
None


**TASK 3**

Experimenting with various hidden vector sizes 

In [6]:
train(15, "SGD")

16
{'r': 0, ' ': 12, 'N': 2, 'm': 3, 's': 4, 'R': 5, 'a': 6, 'h': 7, 'o': 8, 'c': 9, 'f': 10, 't': 11}
['r', ' ', 'N', 'm', 's', 'R', 'a', 'h', 'o', 'c', 'f', 't', ' ']
iter num 0 [42.76237147]
RRshoarRorootst R
None
iter num 1000 [12.72081612]
RNN scramroRaNrac
None
iter num 2000 [3.33863529]
RNN from scratch 
None
iter num 3000 [1.41059601]
RNN from ffom scr
None
iter num 4000 [0.80864862]
RNN from sfratch 
None
iter num 5000 [0.55149002]
RNN from scratch 
None
iter num 6000 [0.4135686]
RNN from scratch 
None
iter num 7000 [0.32876298]
RNN from scratch 
None
iter num 8000 [0.27176859]
RNN from scratch 
None
iter num 9000 [0.23101679]
RNN from scratch 
None


In [7]:
train(20, "SGD")

16
{'r': 0, ' ': 12, 'N': 2, 'm': 3, 's': 4, 'R': 5, 'a': 6, 'h': 7, 'o': 8, 'c': 9, 'f': 10, 't': 11}
['r', ' ', 'N', 'm', 's', 'R', 'a', 'h', 'o', 'c', 'f', 't', ' ']
iter num 0 [40.59911386]
Rrc   Rcc ractfsc
None
iter num 1000 [7.34489396]
RNNsfraacs scscro
None
iter num 2000 [1.50269682]
RNN  rot scramch 
None
iter num 3000 [0.72407051]
RNN from scratch 
None
iter num 4000 [0.4637966]
RNN from schatch 
None
iter num 5000 [0.33738832]
RNN from scratch 
None
iter num 6000 [0.26351508]
RNN from scratch 
None
iter num 7000 [0.21533863]
RNN from scratch 
None
iter num 8000 [0.18155843]
RNN from scratch 
None
iter num 9000 [0.15662216]
RNN from scratch 
None
