In [0]:
#libraries needed

import numpy as np

In [0]:
#helper functions that we need 

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))
  
def initialize_adam(parameters):
    L = len(parameters) // 2 # number of layers in the neural networks
    v = {}
    s = {}
    
    # Initialize v, s. Input: "parameters". Outputs: "v, s".
    for l in range(L):
        v["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
        v["db" + str(l+1)] = np.zeros(parameters["b" + str(l+1)].shape)
        s["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
        s["db" + str(l+1)] = np.zeros(parameters["b" + str(l+1)].shape)  
    return v, s


def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate = 0.01,beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8):
    L = len(parameters) // 2                 
    v_corrected = {}                         
    s_corrected = {}  
    # Perform Adam update on all parameters
    for l in range(L):
        # Moving average of the gradients. Inputs: "v, grads, beta1". Output: "v".
        v["dW" + str(l+1)] = beta1 * v["dW" + str(l+1)] + (1 - beta1) * grads["dW" + str(l+1)] 
        v["db" + str(l+1)] = beta1 * v["db" + str(l+1)] + (1 - beta1) * grads["db" + str(l+1)] 
        v_corrected["dW" + str(l+1)] = v["dW" + str(l+1)] / (1 - beta1**t)
        v_corrected["db" + str(l+1)] = v["db" + str(l+1)] / (1 - beta1**t)

        # Moving average of the squared gradients. Inputs: "s, grads, beta2". Output: "s".
        s["dW" + str(l+1)] = beta2 * s["dW" + str(l+1)] + (1 - beta2) * (grads["dW" + str(l+1)] ** 2)
        s["db" + str(l+1)] = beta2 * s["db" + str(l+1)] + (1 - beta2) * (grads["db" + str(l+1)] ** 2)
        # Compute bias-corrected second raw moment estimate. Inputs: "s, beta2, t". Output: "s_corrected".
        s_corrected["dW" + str(l+1)] = s["dW" + str(l+1)] / (1 - beta2 ** t)
        s_corrected["db" + str(l+1)] = s["db" + str(l+1)] / (1 - beta2 ** t)

        # Update parameters. Inputs: "parameters, learning_rate, v_corrected, s_corrected, epsilon". Output: "parameters".
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * v_corrected["dW" + str(l+1)] / np.sqrt(s_corrected["dW" + str(l+1)] + epsilon)
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * v_corrected["db" + str(l+1)] / np.sqrt(s_corrected["db" + str(l+1)] + epsilon)
    return parameters, v, s


**basic RNN**


1.   takes as input the previous activation and the current input. 

    calculate current activation At = tanh(np.dot(Waa, At-1) + np.dot(Wax, Xt) + bias_A)


2.   take current activation and output y
      
      that is , Yt = softmax( np.dot(Wya, At) + bias_y)
      
3. Store At, At-1, Xt, parameters in cache

4. return At, Yt




---

**What is important is dimensions , when and how , who got what!**

>Xt.shape = (n_x, m)  

>At.shape = (n_a, m)

>Wax.shape = (n_a, n_x)

>Waa.shape = (n_a, n_a)

>Wya.shape = ( n_y, n_a)

>b_a.shape = (n_a, 1)

>b_y.shape = (n_y, 1)










In [0]:
def rnn_forward_block(xt, aprev, parameters):
  #1. take back the parameters into Waa, Wax, Wya, ba, bx
  Wax = parameters['Wax']
  Waa = parameters['Waa']
  Wya = parameters['Wya']
  ba = parameters['ba']
  by  = parameters['by']
  
  anext = np.tanh( np.dot(Wax, xt) + np.dot( Waa, aprev) + ba)
  yhat = np.dot(Wya, anext) + by
  
  cache = (anext, aprev, parameters, xt)
  
  return yhat, anext, cache

that was a single block.

now ,
X = (X1, X2, ..., Xtx)

thus tx such blocks will work.



---

Now what we need: 

1. A0 - vector of zeros of shape (n_a, m)

2. Loop over the rnn_forward tx times.
3. update a_prev everytime.
4. store the prediction in y
5. add cache into the list  caches

return a, y, caches



In [0]:
def rnn_forward(x, A0, parameters):
  n_x , m, T_x = x.shape
  n_y, n_a = parameters['Wya'].shape
  
  #A0 = np.zeros((n_a, m))
  
  A = np.zeros((n_a, m, T_x))
  Y = np.zeros((n_y, m, T_x))
  
  A_prev = A0
  caches = []
  
  for i in range(T_x):
    y_i, a_i, cache = rnn_forward_block(x[:,:,i], A_prev, parameters)
    A_prev = a_i
    
    Y[:,:,i] = y_i
    A[:,:,i] = a_i
    caches.append(cache)
    
  caches = (caches, x)
    
  return Y, A, caches



In [9]:
#check that what you have done is correct

np.random.seed(1)
x = np.random.randn(3,10,4)
a0 = np.random.randn(5,10)
Waa = np.random.randn(5,5)
Wax = np.random.randn(5,3)
Wya = np.random.randn(2,5)
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Waa": Waa, "Wax": Wax, "Wya": Wya, "ba": ba, "by": by}

y_pred,a, caches = rnn_forward(x, a0, parameters)
print("a[4][1] = ", a[4][1])
print("a.shape = ", a.shape)
print("y_pred[1][3] =", y_pred[1][3])
print("y_pred.shape = ", y_pred.shape)
print("caches[1][1][3] =", caches[1][1][3])
print("len(caches) = ", len(caches))

a[4][1] =  [-0.99999375  0.77911235 -0.99861469 -0.99833267]
a.shape =  (5, 10, 4)
y_pred[1][3] = [ 1.82870192  1.98141611 -0.7803359   1.26637504]
y_pred.shape =  (2, 10, 4)
caches[1][1][3] = [-1.1425182  -0.34934272 -0.20889423  0.58662319]
len(caches) =  2


## LSTM Model

In [0]:
def lstm_cell_forward(xt, aprev, cprev, parameters):
  #parameters retrivel
  
  Wf = parameters['Wf']
  bf = parameters['bf']
  
  Wu = parameters['Wu']
  bu = parameters['bu']
  
  Wo = parameters['Wo']
  bo = parameters['bo']
  
  Wc = parameters['Wc']
  bc = parameters['bc']
  
  Wy = parameters['Wy']
  by = parameters['by']
  
  nx, m = xt.shape
  ny, na = Wy.shape
  
  concat = np.zeros((na+nx, m))
  concat[:na, :] = aprev
  concat[na:, :] = nx
  
  forget = sigmoid(np.dot(Wf, concat) + bf) #forget from previous memory cell
  
  ctilda = np.tanh(np.dot(Wc, concat) + bc)
  update = sigmoid(np.dot(Wu, concat) + bu)
  
  cnext = np.multiply(forget, cprev) + np.multiply(update,ctilda) #new memory cell
  
  output = sigmoid(np.dot(Wo, concat) + bo)
  anext = np.multiply(output, np.tanh(cnext) )
  
  y = softmax(np.dot(Wy, anext) + by)
  
  cache = (anext, cnext, aprev, cprev, forget, update, ctilda, output, xt, parameters)
  
  return anext, cnext, y, cache

  
  

In [0]:
def lstm_forward(x, a0, parameters):
  caches = []
  nx, m, Tx = x.shape
  ny, na = parameters['Wy'].shape
  
  A = np.zeros((na, m, Tx))
  C = np.zeros((na, m, Tx))
  Y = np.zeros((ny, m, Tx))
  
  aprev = a0
  cprev = np.zeros(aprev.shape)
  
  for i in range(Tx):
    anext, cnext, y, cache = lstm_cell_forward(x[:,:,i], aprev, cprev, parameters)
    aprev = anext
    cprev = cnext
    A[:,:,i] = anext
    C[:,:,i] = cnext
    Y[:,:,i] = y
    
    caches.append(cache)
    
  caches = (caches, x)
  return A, Y, C, caches


In [32]:
np.random.seed(1)
x = np.random.randn(3,10,7)
a0 = np.random.randn(5,10)

Wf = np.random.randn(5, 5+3)
bf = np.random.randn(5,1)

Wu = np.random.randn(5, 5+3)
bu = np.random.randn(5,1)

Wo = np.random.randn(5, 5+3)
bo = np.random.randn(5,1)

Wc = np.random.randn(5, 5+3)
bc = np.random.randn(5,1)

Wy = np.random.randn(2,5)
by = np.random.randn(2,1)

parameters = {"Wf": Wf, "Wu": Wu, "Wo": Wo, "Wc": Wc, "Wy": Wy, "bf": bf, "bu": bu, "bo": bo, "bc": bc, "by": by}

a, y, c, caches = lstm_forward(x, a0, parameters)
print("a[4][3][6] = ", a[4][3][6])
print("a.shape = ", a.shape)
print("y[1][4][3] =", y[1][4][3])
print("y.shape = ", y.shape)
print("caches[1][1[1]] =", caches[1][1][1])
print("c[1][2][1]", c[1][2][1])
print("len(caches) = ", len(caches))

a[4][3][6] =  0.0007505869303062002
a.shape =  (5, 10, 7)
y[1][4][3] = 0.9089844972350362
y.shape =  (2, 10, 7)
caches[1][1[1]] = [ 0.82797464  0.23009474  0.76201118 -0.22232814 -0.20075807  0.18656139
  0.41005165]
c[1][2][1] 0.03131035093604401
len(caches) =  2


## Backpropogation

- in basic RNN 

- In LSTM rnn 