## Simple RNN Implementation

- with tanh activation

- with gradient clipping

### Author: Yifan Wang

In [17]:
import numpy as np
import matplotlib.pyplot as plt 

In [18]:
# Some Helper Functions:
def d_tanh(x):

    return 1 - np.tanh(x)*np.tanh(x)

def softmax(x):
    '''Softmax'''
    return np.exp(x) / np.sum(np.exp(x), axis=0)

Generally at each time step of forward, we do:

    a_now = activation_relu(Waa * a_previous + Wax * X_now  + ba)
    y_hat = Softmax(activation_relu(Wya*a_now + by))
    

## Construct Data:

In [19]:
data = [
    'hello',
    'jello',
    'kello',
    'yello',
    'noon',
    'moon',
    'soon',
    'zoon'
]
data = [list(word) for word in data]

In [20]:
# Dict:
c2id = {v:k+1 for k,v in enumerate(set([j for i in data for j in i]))}
c2id

{'e': 6,
 'h': 3,
 'j': 10,
 'k': 9,
 'l': 7,
 'm': 8,
 'n': 2,
 'o': 11,
 's': 1,
 'y': 5,
 'z': 4}

In [21]:
c2id[''] = 0

In [22]:
# Encoding:
def dat_encode(data,c2id):
    new_data =[]
    for word in data:
        new_word = []
        for char in word:
            new_word.append(c2id[char])
        new_data.append(new_word)
    return new_data

In [23]:
data = dat_encode(data,c2id)
data

[[3, 6, 7, 7, 11],
 [10, 6, 7, 7, 11],
 [9, 6, 7, 7, 11],
 [5, 6, 7, 7, 11],
 [2, 11, 11, 2],
 [8, 11, 11, 2],
 [1, 11, 11, 2],
 [4, 11, 11, 2]]

In [24]:
Y = np.array([i[-1] for i in data])

In [25]:
Y

array([11, 11, 11, 11,  2,  2,  2,  2])

In [26]:
counter = 0
array_uniqueYs = np.unique(Y)

for uni_Y in array_uniqueYs:
    if counter not in np.unique(Y):
        Y[Y==uni_Y] = counter
        counter += 1


    

In [27]:
Y

array([1, 1, 1, 1, 0, 0, 0, 0])

In [28]:
Y = Y.tolist()

In [29]:
# Use all letters before to predict the last letter of each word:
X = [i[:-1] for i in data]
X

[[3, 6, 7, 7],
 [10, 6, 7, 7],
 [9, 6, 7, 7],
 [5, 6, 7, 7],
 [2, 11, 11],
 [8, 11, 11],
 [1, 11, 11],
 [4, 11, 11]]

In [30]:
# post padding:
X = [word+[0]*(max([len(i) for i in X])-len(word)) for word in X]
X

[[3, 6, 7, 7],
 [10, 6, 7, 7],
 [9, 6, 7, 7],
 [5, 6, 7, 7],
 [2, 11, 11, 0],
 [8, 11, 11, 0],
 [1, 11, 11, 0],
 [4, 11, 11, 0]]

In [31]:
#OHE:
m = len(X) # sample size
n = len(c2id)+1 # feature size:
t = max([len(i) for i in X]) # time-step
print((m,n,t))

(8, 13, 4)


In [32]:
dat_mat = np.zeros((m,n,t))

In [33]:
ny = len(np.unique(Y))

y_mat = np.zeros((m,ny))

In [34]:
y_mat.shape

(8, 2)

In [35]:
# Data Preparation for Modeling:

for row in range(m):
    
    word = X[row]
    
    for ts in range(t):
        charidx = word[ts]
        
        dat_mat[row,charidx,ts] = 1
    
    yletteridx = Y[row]
    
    y_mat[row,yletteridx] = 1
    
    
        
    

In [36]:
dat_mat.shape

(8, 13, 4)

In [37]:
y_mat.shape

(8, 2)

## Modeling:

In [119]:
N_EPOCHS = 200
N_HIDDEN = 10
LEARNING_RATE = 0.0005


In [120]:
# Initialize Weight Matrix:
np.random.seed(0)
Wxa = 2*np.random.random((n,N_HIDDEN))-1
print("Wxa:{}".format(Wxa.shape))

Waa = 2*np.random.random((N_HIDDEN,N_HIDDEN))-1
print("Waa:{}".format(Waa.shape))


Way = 2*np.random.random((N_HIDDEN,ny))-1
print("Way:{}".format(Way.shape))


ba = np.zeros((1,N_HIDDEN))
print("ba:{}".format(ba.shape))

by = np.zeros((1,ny))
print("by:{}".format(by.shape))

a0 = 2*np.random.random((m,N_HIDDEN))-1
print("a0:{}".format(a0.shape))

a = 2*np.random.random((m,N_HIDDEN,t))-1
print("a:{}".format(a.shape))


y_pred = 2*np.random.random((m,ny,t))-1
print("y_pred:{}".format(y_pred.shape))

Wxa:(13, 10)
Waa:(10, 10)
Way:(10, 2)
ba:(1, 10)
by:(1, 2)
a0:(8, 10)
a:(8, 10, 4)
y_pred:(8, 2, 4)


In [121]:
parameters = {
    'Wxa':Wxa,
    'Waa':Waa,
    'Way':Way,
    'ba':ba,
    'by':by
}

In [122]:
all_error = []

In [123]:

for i in range(N_EPOCHS): # error after 10
    
    
    a_prev = a0
    caches = []
    overall_error = 0

    
    ##### Forward Propagate #####
    # initialization of  gradients
    dx = np.zeros((dat_mat.shape))#6-14-4  #m-n-t
    dWxa = np.zeros(Wxa.shape)
    dWaa = np.zeros(Waa.shape)
    dWay = np.zeros(Way.shape)
    dba = np.zeros(ba.shape)
    dby = np.zeros(by.shape)
    da0 = np.zeros(a0.shape)
    da = np.zeros(a.shape)
    da_prevt = np.zeros(a0.shape)


    layer_2_deltas = np.zeros((m,ny,t))

    
    print_error = 0
    for ts in range(t):
        # a-state, passing to next t-step
        a_next = np.tanh(np.dot(dat_mat[:,:,ts],Wxa) + np.dot(a_prev,Waa) + ba)  # m-hidden
        a[:,:,ts] = a_next
        # prediction of current time step:
        yt_pred = softmax(np.dot(a_next,Way)+by)
        y_pred[:,:,ts] = yt_pred  
        # save a for next t-step
        a_prev = a_next
        # cost of current t-step:
        cost = y_mat-yt_pred
        # output layer error rate : error* dtanh()
        layer_2_delta=cost*d_tanh(yt_pred)
        layer_2_deltas[:,:,ts] = layer_2_delta #6-14
        print_error+=cost
        cache = (a_next, a_prev, dat_mat[:,:,ts], parameters)
        caches.append(cache)
        #######################
    
    
    if i%10==0:
        print('Epoch {}'.format(i))
        all_error.append(np.sum(np.abs(y_mat-yt_pred)))
        print_pred = np.argmax(yt_pred,axis=-1)
        print("Predicted:{}".format(print_pred))
        print("Actual:{}".format(np.argmax(y_mat,axis=-1)))
        print('=====================')





        


        

    # Prepare:
    #Get pars from f-prop:
    (a1, a0, x1, parameters) = caches[0]
    # initialize:
    next_delta = np.zeros((m,N_HIDDEN))
    # Figure out how to get Da

    
    
    
    
    
    # Back Propagate:
    for ts in reversed(range(t)):
        cache = caches[ts]
        (a_now, a_prev, x_now, parameters) = cache
        Wxa = parameters["Wxa"]
        Waa = parameters["Waa"]
        Way = parameters["Way"]
        ba = parameters["ba"]
        by = parameters["by"]
        # y_pred error rate
        layer_2_delta = layer_2_deltas[:,:,ts]
        # propagate back:
#         if ts ==t-1:
        da_now = (np.dot(next_delta,Waa) +  np.dot(layer_2_delta,Way.T))*d_tanh(a_now)
#         else:
#             da_now = np.dot(next_delta,Waa)**d_tanh(a_now)

    
    
    
        dWayt = np.dot(a_now.T,layer_2_delta)
        dWaat = np.dot( da_now.T,a_prev) #20-20
        dWxat = np.dot(x_now.T,da_now) # 14-20
        dbat = np.sum( d_tanh(a_now),keepdims=True,axis=0)
        next_delta = da_now
        dxt = np.dot(da_now,Wxa.T)#6-14


        

        
        
        dWxa += dWxat
        dWaa += dWaat
        dWay += dWayt
        dba += dbat


    # Gradient Clipping to avoid Gradient Exploding:
    dWxa = np.clip(dWxa,-50,50)
    dWaa = np.clip(dWaa,-50,50)
    dWay = np.clip(dWay,-50,50)
    dba = np.clip(dba,-50,50)


    # Updating
    Wxa+=LEARNING_RATE*dWxa
    Waa+=LEARNING_RATE*dWaa
    Way+=LEARNING_RATE*dWay
    ba+=LEARNING_RATE*dba

    # resetting
    dWxat *= 0 
    dWaat *= 0
    dWayt *= 0  
    dbat  *= 0
    
    
    
    
    


Epoch 0
Predicted:[0 1 1 1 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 10
Predicted:[0 0 1 1 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 20
Predicted:[0 0 1 1 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 30
Predicted:[0 1 1 0 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 40
Predicted:[0 1 1 0 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 50
Predicted:[0 0 1 0 0 1 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 60
Predicted:[0 0 1 1 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 70
Predicted:[0 1 1 1 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 80
Predicted:[0 0 1 1 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 90
Predicted:[0 0 1 0 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 100
Predicted:[1 0 1 0 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 110
Predicted:[1 0 1 0 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 120
Predicted:[1 0 1 0 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 130
Predicted:[1 1 1 0 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 140
Predicted:[1 1 1 0 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epoch 150
Predicted:[1 1 1 0 0 0 0 0]
Actual:[1 1 1 1 0 0 0 0]
Epo

### Summary:

As we training more than 130 Epochs, the model is able to get 100% in sample accuracy.
Again this is only a vanilla RNN