In [36]:
import numpy as np

In [37]:
def softmax(x):
    """Compute a numerically stable softmax along axis 0.

    Every slice along axis 0 (e.g. each column of a 2-D array) is normalised
    independently so that its entries are non-negative and sum to 1.

    Args:
        x: array-like of scores, with the entries to normalise laid out
            along axis 0.

    Returns:
        numpy array with the same shape as ``x`` containing the softmax values.
    """
    # Shift by the max of each axis-0 slice, not the global max. Softmax is
    # invariant to a per-slice constant, and shifting by a global max makes
    # every exponential of a much smaller slice underflow to 0 (-> 0/0 = NaN).
    shifted = x - np.max(x, axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0, keepdims=True)

In [38]:
def rnn_cell_forward(xt, a_prev, parameters):
    """Run a single time step of a basic tanh RNN cell.

    Args:
        xt: input for the current time step (rnn_forward passes ``x[:, :, t]``).
        a_prev: hidden state coming from the previous time step.
        parameters: dict holding the weights "Wax", "Waa", "Wya" and the
            biases "ba", "by".

    Returns:
        Tuple ``(yt_pred, cache)``: ``yt_pred`` is the softmax prediction for
        this step and ``cache`` is ``(a_prev, a_next, xt, parameters)``. The new
        hidden state ``a_next`` is only available through the cache.
    """
    input_weights, recurrent_weights, output_weights, hidden_bias, output_bias = (
        parameters[key] for key in ("Wax", "Waa", "Wya", "ba", "by")
    )

    # Hidden state: a_next = tanh(Wax.xt + Waa.a_prev + ba)
    from_input = np.matmul(input_weights, xt)
    from_hidden = np.matmul(recurrent_weights, a_prev)
    a_next = np.tanh(from_input + from_hidden + hidden_bias)

    # Prediction: softmax(Wya.a_next + by)
    logits = np.matmul(output_weights, a_next) + output_bias
    yt_pred = softmax(logits)

    return yt_pred, (a_prev, a_next, xt, parameters)

In [40]:
def rnn_forward(x, a0, parameters):
    """Run the RNN cell over every time step of the input sequence.

    Args:
        x: input array whose shape unpacks as ``(n_x, m, T_x)``; the last axis
            is iterated as the time axis.
        a0: initial hidden state; its first dimension is used as ``n_a``.
        parameters: dict of weights and biases forwarded to rnn_cell_forward;
            "Wya" is also read here to size the prediction array.

    Returns:
        Tuple ``(a, y_pred, caches)``:
            a: hidden states for every step, shape ``(n_a, m, T_x)``.
            y_pred: predictions for every step, shape ``(n_y, m, T_x)``.
            caches: ``(list_of_step_caches, x)``, consumed by rnn_backward.
    """
    _, m, T_x = x.shape
    n_a = a0.shape[0]
    n_y = parameters["Wya"].shape[0]

    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))
    step_caches = []

    hidden = a0
    for t in range(T_x):
        yt_pred, step_cache = rnn_cell_forward(x[:, :, t], hidden, parameters)
        # The cell returns its new hidden state as the second cache entry.
        hidden = step_cache[1]
        a[:, :, t] = hidden
        y_pred[:, :, t] = yt_pred
        step_caches.append(step_cache)

    return a, y_pred, (step_caches, x)

In [41]:
def rnn_cell_backward(da_next, cache):
    """Backward pass for a single time step of the basic tanh RNN cell.

    Args:
        da_next: gradient of the loss with respect to the cell's output hidden
            state ``a_next``.
        cache: tuple ``(a_prev, a_next, xt, parameters)``; ``parameters`` must
            contain the weights "Wax" and "Waa".

    Returns:
        dict with the gradients:
            "dxt": gradient with respect to the step input ``xt``.
            "da_prev": gradient with respect to the previous hidden state.
            "dWax": gradient with respect to ``Wax``.
            "dWaa": gradient with respect to ``Waa``.
            "dba": gradient with respect to ``ba`` (summed over axis 1,
                dimensions kept).
    """
    a_prev, a_next, xt, parameters = cache
    # Read the weights from the cached parameters so the function does not
    # depend on names that happen to exist in the notebook's global scope.
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]

    # a_next = tanh(z), so dtanh/dz = 1 - a_next**2. `*` on ndarrays is
    # element-wise; matrix products below go through np.matmul.
    dz = da_next * (1 - a_next * a_next)

    dWaa = np.matmul(dz, a_prev.T)
    dWax = np.matmul(dz, xt.T)
    dxt = np.matmul(Wax.T, dz)
    da_prev = np.matmul(Waa.T, dz)
    dba = np.sum(dz, axis=1, keepdims=True)

    gradients = {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}
    return gradients

In [42]:
def rnn_backward(da, caches):
    """Backpropagate through time over the whole sequence.

    Args:
        da: upstream gradients of the hidden states, indexed per time step as
            ``da[:, :, t]``.
        caches: tuple ``(list_of_step_caches, x)`` as returned by rnn_forward.

    Returns:
        dict with the gradients:
            "dx": gradient with respect to the inputs, shape ``(n_x, m, T_x)``.
            "da0": gradient with respect to the initial hidden state.
            "dWax", "dWaa", "dba": parameter gradients accumulated over all
                time steps.
    """
    step_caches, x = caches
    n_x, m, T_x = x.shape
    # The first entry of the first step's cache is the initial hidden state;
    # only its leading dimension is needed to size the accumulators.
    n_a = step_caches[0][0].shape[0]

    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    da_carry = np.zeros((n_a, m))

    # Walk the sequence from the last step to the first. The gradient reaching
    # each hidden state is the upstream da plus what flows back from step t+1.
    for t in range(T_x - 1, -1, -1):
        step = rnn_cell_backward(da_carry + da[:, :, t], step_caches[t])
        dx[:, :, t] = step["dxt"]
        dWax += step["dWax"]
        dWaa += step["dWaa"]
        dba += step["dba"]
        da_carry = step["da_prev"]

    # After the first time step has been processed, the carried gradient is da0.
    return {"dx": dx, "da0": da_carry, "dWax": dWax, "dWaa": dWaa, "dba": dba}

### Equations used

<img src="images/IMG_2171.jpg" >
<img src="images/IMG_2185.jpg" >
<img src="images/IMG_2186.jpg" >
<img src="images/IMG_2187.jpg" >
<img src="images/IMG_2188.jpg" >