In [1]:
import numpy as np
def softmax(x):
    """Softmax taken along axis 0 (each column is normalised independently).

    Arguments:
    x -- numpy array of scores; normalisation runs over the first axis

    Returns:
    numpy array with the same shape as x whose entries along axis 0 sum to 1
    """
    # Shift every column by its OWN maximum. Shifting by the global maximum
    # lets a column that sits far below it underflow to all zeros, which then
    # produces 0/0 = NaN in the division below.
    shifted = x - np.max(x, axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0, keepdims=True)
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x))."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

In [2]:
def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """Advance the LSTM cell by a single time step.

    Arguments:
    xt -- input at this time step (2-D; stacked below a_prev along axis 0)
    a_prev -- hidden state from the previous time step
    c_prev -- cell (memory) state from the previous time step
    parameters -- dictionary holding "Wf", "bf", "Wi", "bi", "Wc", "bc",
                  "Wo", "bo", "Wy" and "by"

    Returns:
    a_next -- new hidden state
    c_next -- new cell state
    yt_pred -- softmax prediction computed from a_next
    cache -- tuple (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
             kept for the backward pass
    """
    (Wf, bf, Wi, bi, Wc, bc, Wo, bo, Wy, by) = (
        parameters[key]
        for key in ("Wf", "bf", "Wi", "bi", "Wc", "bc", "Wo", "bo", "Wy", "by")
    )

    # The values are unused, but unpacking raises unless xt and Wy are 2-D.
    _n_x, _m = xt.shape
    _n_y, _n_a = Wy.shape

    # Every gate reads the same stacked [a_prev; xt] input.
    stacked = np.concatenate((a_prev, xt), axis=0)

    def affine(weights, bias):
        return np.matmul(weights, stacked) + bias

    forget_gate = sigmoid(affine(Wf, bf))
    update_gate = sigmoid(affine(Wi, bi))
    candidate = np.tanh(affine(Wc, bc))
    output_gate = sigmoid(affine(Wo, bo))

    c_next = (c_prev * forget_gate) + (candidate * update_gate)
    a_next = output_gate * np.tanh(c_next)
    yt_pred = softmax(np.matmul(Wy, a_next) + by)

    cache = (
        a_next, c_next, a_prev, c_prev,
        forget_gate, update_gate, candidate, output_gate,
        xt, parameters,
    )
    return a_next, c_next, yt_pred, cache

In [3]:
def lstm_forward(x, a0, parameters):
    """Run the LSTM cell over every time step of an input sequence.

    Arguments:
    x -- input sequence; its shape is unpacked as (n_x, m, T_x), with time on
         the last axis
    a0 -- hidden state fed to the first time step
    parameters -- dictionary passed through to lstm_cell_forward; "Wy" (n_y, n_a)
                  is also read here to size the outputs

    Returns:
    a -- hidden states for every time step, shape (n_a, m, T_x)
    y -- predictions for every time step, shape (n_y, m, T_x)
    c -- cell states for every time step, shape (n_a, m, T_x)
    caches -- tuple (list of per-step caches, x) for the backward pass
    """
    _, m, T_x = x.shape
    n_y, n_a = parameters['Wy'].shape

    # One (rows, m) slab per time step, stacked along the last axis.
    hidden_seq, cell_seq, pred_seq = (
        np.zeros((rows, m, T_x)) for rows in (n_a, n_a, n_y)
    )

    step_caches = []
    hidden = a0
    cell = np.zeros((n_a, m))  # the cell state always starts at zero
    for t in range(T_x):
        hidden, cell, pred, step_cache = lstm_cell_forward(
            x[:, :, t], hidden, cell, parameters
        )
        hidden_seq[:, :, t] = hidden
        cell_seq[:, :, t] = cell
        pred_seq[:, :, t] = pred
        step_caches.append(step_cache)

    return hidden_seq, pred_seq, cell_seq, (step_caches, x)

In [4]:
def lstm_cell_backward(da_next, dc_next, cache):
    """
    Implement the backward pass for the LSTM-cell (single time-step).

    Arguments:
    da_next -- Gradients of next hidden state, of shape (n_a, m)
    dc_next -- Gradients of next cell state, of shape (n_a, m)
    cache -- tuple stored by the forward pass:
             (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)

    Returns:
    gradients -- python dictionary containing:
                        dxt -- Gradient of input data at time-step t, of shape (n_x, m)
                        da_prev -- Gradient w.r.t. the previous hidden state, numpy array of shape (n_a, m)
                        dc_prev -- Gradient w.r.t. the previous memory state, of shape (n_a, m)
                        dWf -- Gradient w.r.t. the weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                        dWi -- Gradient w.r.t. the weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                        dWc -- Gradient w.r.t. the weight matrix of the memory gate, numpy array of shape (n_a, n_a + n_x)
                        dWo -- Gradient w.r.t. the weight matrix of the output gate, numpy array of shape (n_a, n_a + n_x)
                        dbf -- Gradient w.r.t. biases of the forget gate, of shape (n_a, 1)
                        dbi -- Gradient w.r.t. biases of the update gate, of shape (n_a, 1)
                        dbc -- Gradient w.r.t. biases of the memory gate, of shape (n_a, 1)
                        dbo -- Gradient w.r.t. biases of the output gate, of shape (n_a, 1)
    """
    (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters) = cache
    n_a = a_next.shape[0]

    tanh_c = np.tanh(c_next)

    # Total gradient reaching c_next: the direct path (dc_next) plus the path
    # through a_next = ot * tanh(c_next). Every gate below shares this term.
    dc_total = dc_next + da_next * ot * (1 - tanh_c * tanh_c)

    # Gradients w.r.t. the pre-activation of each gate
    # (sigmoid' = s * (1 - s), tanh' = 1 - tanh^2).
    dot = da_next * tanh_c * ot * (1 - ot)
    dcct = dc_total * it * (1 - cct * cct)
    dit = dc_total * cct * it * (1 - it)
    dft = dc_total * c_prev * ft * (1 - ft)

    # All four gates read the same stacked [a_prev; xt] input.
    concat_T = np.concatenate((a_prev, xt), axis=0).T
    dWf = np.matmul(dft, concat_T)
    dWi = np.matmul(dit, concat_T)
    dWc = np.matmul(dcct, concat_T)
    dWo = np.matmul(dot, concat_T)

    # Biases are broadcast over the m examples, so their gradients sum over axis 1.
    dbf = np.sum(dft, axis=1, keepdims=True)
    dbi = np.sum(dit, axis=1, keepdims=True)
    dbc = np.sum(dcct, axis=1, keepdims=True)
    dbo = np.sum(dot, axis=1, keepdims=True)

    # Gradient w.r.t. the stacked input. The first n_a rows belong to a_prev
    # and the remaining rows to xt, mirroring the concatenation order.
    dconcat = (
        np.matmul(parameters["Wf"].T, dft)
        + np.matmul(parameters["Wi"].T, dit)
        + np.matmul(parameters["Wc"].T, dcct)
        + np.matmul(parameters["Wo"].T, dot)
    )
    da_prev = dconcat[:n_a, :]
    dxt = dconcat[n_a:, :]

    # c_prev enters c_next only through the forget gate product ft * c_prev.
    dc_prev = dc_total * ft

    # Save gradients in dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev, "dc_prev": dc_prev, "dWf": dWf,"dbf": dbf, "dWi": dWi,"dbi": dbi,
                "dWc": dWc,"dbc": dbc, "dWo": dWo,"dbo": dbo}

    return gradients

In [5]:
def lstm_backward(da, caches):
    """
    Implement the backward pass for the RNN with LSTM-cell (over a whole sequence).

    Arguments:
    da -- Gradients w.r.t the hidden states, numpy-array of shape (n_a, m, T_x)
    caches -- cache storing information from the forward pass (lstm_forward):
              a tuple (list of per-time-step caches, x)

    Returns:
    gradients -- python dictionary containing:
                        dx -- Gradient of inputs, of shape (n_x, m, T_x)
                        da0 -- Gradient w.r.t. the previous hidden state, numpy array of shape (n_a, m)
                        dWf -- Gradient w.r.t. the weight matrix of the forget gate, numpy array of shape (n_a, n_a + n_x)
                        dWi -- Gradient w.r.t. the weight matrix of the update gate, numpy array of shape (n_a, n_a + n_x)
                        dWc -- Gradient w.r.t. the weight matrix of the memory gate, numpy array of shape (n_a, n_a + n_x)
                        dWo -- Gradient w.r.t. the weight matrix of the save gate, numpy array of shape (n_a, n_a + n_x)
                        dbf -- Gradient w.r.t. biases of the forget gate, of shape (n_a, 1)
                        dbi -- Gradient w.r.t. biases of the update gate, of shape (n_a, 1)
                        dbc -- Gradient w.r.t. biases of the memory gate, of shape (n_a, 1)
                        dbo -- Gradient w.r.t. biases of the save gate, of shape (n_a, 1)
    """
    # The input sequence stored next to the per-step caches is not needed here.
    (step_caches, _) = caches

    # Per-step cache layout:
    # (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
    x1 = step_caches[0][8]

    # Retrieve dimensions from da's and x1's shapes
    n_a, m, T_x = da.shape
    n_x, m = x1.shape

    dx = np.zeros((n_x, m, T_x))

    # Parameter gradients are summed over all time steps.
    summed = {
        "dWf": np.zeros((n_a, n_a + n_x)), "dbf": np.zeros((n_a, 1)),
        "dWi": np.zeros((n_a, n_a + n_x)), "dbi": np.zeros((n_a, 1)),
        "dWc": np.zeros((n_a, n_a + n_x)), "dbc": np.zeros((n_a, 1)),
        "dWo": np.zeros((n_a, n_a + n_x)), "dbo": np.zeros((n_a, 1)),
    }

    # Gradients flowing back from step t+1 into step t; nothing follows the
    # last step, so both start at zero.
    da_prevt = np.zeros((n_a, m))
    dc_prevt = np.zeros((n_a, m))

    for t in reversed(range(T_x)):
        # The hidden state at step t receives gradient from its own output
        # (da[:, :, t]) and from the following time step (da_prevt).
        step = lstm_cell_backward(da[:, :, t] + da_prevt, dc_prevt, step_caches[t])
        dx[:, :, t] = step["dxt"]
        for key in summed:
            summed[key] += step[key]
        # Carry the recurrent gradients into the next (earlier) iteration.
        # These must be the same names the call above reads, otherwise no
        # gradient is propagated through time.
        da_prevt = step["da_prev"]
        dc_prevt = step["dc_prev"]

    # After the loop da_prevt is the gradient w.r.t. the initial hidden state.
    gradients = {"dx": dx, "da0": da_prevt}
    gradients.update(summed)

    return gradients

<img src="images/IMG_2191.jpg" >

<img src="images/IMG_2190.jpg" >
<img src="images/IMG_2192.jpg" >