## Libraries

In [0]:
import numpy as np

## Utilities

In [0]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def initialize_adam(parameters) :
    """
    Initializes v and s as two python dictionaries with:
                - keys: "dW1", "db1", ..., "dWL", "dbL" 
                - values: numpy arrays of zeros of the same shape as the corresponding gradients/parameters.
    
    Arguments:
    parameters -- python dictionary containing your parameters.
                    parameters["W" + str(l)] = Wl
                    parameters["b" + str(l)] = bl
    
    Returns: 
    v -- python dictionary that will contain the exponentially weighted average of the gradient.
                    v["dW" + str(l)] = ...
                    v["db" + str(l)] = ...
    s -- python dictionary that will contain the exponentially weighted average of the squared gradient.
                    s["dW" + str(l)] = ...
                    s["db" + str(l)] = ...

    """
    
    L = len(parameters) // 2 # number of layers in the neural networks
    v = {}
    s = {}
    
    # Initialize v, s. Input: "parameters". Outputs: "v, s".
    for l in range(L):
    ### START CODE HERE ### (approx. 4 lines)
        v["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
        v["db" + str(l+1)] = np.zeros(parameters["b" + str(l+1)].shape)
        s["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
        s["db" + str(l+1)] = np.zeros(parameters["b" + str(l+1)].shape)
    ### END CODE HERE ###
    
    return v, s


def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate = 0.01,
                                beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8):
    """
    Update parameters using Adam
    
    Arguments:
    parameters -- python dictionary containing your parameters:
                    parameters['W' + str(l)] = Wl
                    parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameters:
                    grads['dW' + str(l)] = dWl
                    grads['db' + str(l)] = dbl
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    learning_rate -- the learning rate, scalar.
    beta1 -- Exponential decay hyperparameter for the first moment estimates 
    beta2 -- Exponential decay hyperparameter for the second moment estimates 
    epsilon -- hyperparameter preventing division by zero in Adam updates

    Returns:
    parameters -- python dictionary containing your updated parameters 
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    """
    
    L = len(parameters) // 2                 # number of layers in the neural networks
    v_corrected = {}                         # Initializing first moment estimate, python dictionary
    s_corrected = {}                         # Initializing second moment estimate, python dictionary
    
    # Perform Adam update on all parameters
    for l in range(L):
        # Moving average of the gradients. Inputs: "v, grads, beta1". Output: "v".
        ### START CODE HERE ### (approx. 2 lines)
        v["dW" + str(l+1)] = beta1 * v["dW" + str(l+1)] + (1 - beta1) * grads["dW" + str(l+1)] 
        v["db" + str(l+1)] = beta1 * v["db" + str(l+1)] + (1 - beta1) * grads["db" + str(l+1)] 
        ### END CODE HERE ###

        # Compute bias-corrected first moment estimate. Inputs: "v, beta1, t". Output: "v_corrected".
        ### START CODE HERE ### (approx. 2 lines)
        v_corrected["dW" + str(l+1)] = v["dW" + str(l+1)] / (1 - beta1**t)
        v_corrected["db" + str(l+1)] = v["db" + str(l+1)] / (1 - beta1**t)
        ### END CODE HERE ###

        # Moving average of the squared gradients. Inputs: "s, grads, beta2". Output: "s".
        ### START CODE HERE ### (approx. 2 lines)
        s["dW" + str(l+1)] = beta2 * s["dW" + str(l+1)] + (1 - beta2) * (grads["dW" + str(l+1)] ** 2)
        s["db" + str(l+1)] = beta2 * s["db" + str(l+1)] + (1 - beta2) * (grads["db" + str(l+1)] ** 2)
        ### END CODE HERE ###

        # Compute bias-corrected second raw moment estimate. Inputs: "s, beta2, t". Output: "s_corrected".
        ### START CODE HERE ### (approx. 2 lines)
        s_corrected["dW" + str(l+1)] = s["dW" + str(l+1)] / (1 - beta2 ** t)
        s_corrected["db" + str(l+1)] = s["db" + str(l+1)] / (1 - beta2 ** t)
        ### END CODE HERE ###

        # Update parameters. Inputs: "parameters, learning_rate, v_corrected, s_corrected, epsilon". Output: "parameters".
        ### START CODE HERE ### (approx. 2 lines)
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * v_corrected["dW" + str(l+1)] / np.sqrt(s_corrected["dW" + str(l+1)] + epsilon)
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * v_corrected["db" + str(l+1)] / np.sqrt(s_corrected["db" + str(l+1)] + epsilon)
        ### END CODE HERE ###

    return parameters, v, s

## Functions

### Forward

In [0]:
def rnn_cell_forward(xt, a_prev, parameters):
  # Retrieve parameters from "parameters"
  (Wax, Waa, Wya, ba, by) = (parameters["Wax"], parameters["Waa"], parameters["Wya"], parameters["ba"], parameters["by"])

  # Compute next activation state using the formula given above
  a_next = np.tanh(np.dot(Wax, xt) + np.dot(Waa, a_prev) + ba)

  # Compute output of the current cell using the formula given above
  yt_pred = softmax(np.dot(Wya, a_next) + by)

  # Store values you need for backward propagation in cache
  cache = (a_next, a_prev, xt, parameters)

  return a_next, yt_pred, cache

def rnn_forward(x, a0, parameters):
  # Initialize "caches" which will contain the list of all caches
  caches = []

  # Retrieve dimensions from shapes of x and parameters["Wya"]
  n_x, m, T_x = x.shape
  n_y, n_a = parameters["Wya"].shape

  # Initialize "a" and "y" with zeros (=2 lines)
  (a, y_pred) = (np.zeros((n_a, m, T_x)), np.zeros((n_y, m, T_x)))

  # Initialize a_next (=1 line)
  a_next = a0

  # Loop over all time-steps
  for t in range(0, T_x):
    # Update next hiddent state, compute the prediction, get the cache(=1 line)
    a_next, yt_pred, cache = rnn_cell_forward(x[:,:,t], a_next, parameters)

    # Save the value of the new "next" hidden state in a (=1 line)
    a[:,:,t] = a_next

    # Save the value of the prediction in y (=1 line)
    y_pred[:,:,t] = yt_pred

    # Append "cache" to "caches" (=1 line)
    caches.append(cache)
  
  # Store values needed for backward propagation in cache
  caches = (caches, x)

  return a, y_pred, caches

def lstm_cell_forward(xt, a_prev, c_prev, parameters):
  # Retrieve parameters from "parameters"
  (Wf, bf, Wi, bi, Wc, bc) = (parameters["Wf"], parameters["bf"], parameters["Wi"], parameters["bi"], parameters["Wc"], parameters["bc"])
  (Wo, bo, Wy, by) = (parameters["Wo"], parameters["bo"], parameters["Wy"], parameters["by"])

  # Retrieve dimensions from shapes of xt, and Wy
  n_x, m = xt.shape
  n_y, n_a = Wy.shape

  # Concatenate a_prev and xt (=3 lines)
  concat = np.zeros((n_x + n_a, m))
  concat[:n_a, :] = a_prev
  concat[n_a:, :] = xt

  # Compute values for ft, it, cct, c_next, ot, a_next
  # Using the formulas given figure(4) (=6 lines)
  ft = sigmoid(np.dot(Wf, concat) + bf)
  it = sigmoid(np.dot(Wi, concat) + bi)
  cct = np.tanh(np.dot(Wc, concat) + bc)
  c_next = c_prev*ft + it*cct
  ot = sigmoid(np.dot(Wo, concat) + bo)
  a_next = ot*np.tanh(c_next)

  # Compute prediction of the LSTM cell (=1 line)
  yt_pred = softmax(np.dot(Wy, a_next) + by)

  # Store values needed for backward propagation in cache
  cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)

  return a_next, c_next, yt_pred, cache

def lstm_forward(x, a0, parameters):
  # Initialize "caches", which will track the list of all the caches
  caches = []

  # Retrieve dimensions from shapes of x and parameters["Wy"] (=2 lines)
  n_x, m, T_x = x.shape
  n_y, n_a = parameters["Wy"].shape

  # Initialize "a", "c" and "y" with zeros (=3 lines)
  a, c, y = np.zeros((n_a, m, T_x)), np.zeros((n_a, m, T_x)), np.zeros((n_y, m, T_x))

  # Initialize a_next and c_next (=2 lines)
  a_next = a0
  c_next = np.zeros((n_a, m))

  # Loop over all time-steps
  for t in range(0, T_x):
    # Update next hidden state, next memory state, compute the prediction,
    # get the cache (=1 line)
    a_next, c_next, yt, cache = lstm_cell_forward(x[:,:,t], a_next, c_next, parameters)
    
    # Save the value of the new "next" hidden state in a (=1 line)
    a[:,:,t] = a_next

    # Save the value of the prediction in y (=1 line)
    y[:,:,t] = yt

    # Save the value of the next cell state (=1 line)
    c[:,:,t] = c_next

    # Append the cache into caches (=1 line)
    caches.append(cache)
  
  # Store calues needed for backward propagation in cache
  caches = (caches, x)

  return a, y, c, caches

### Backward

In [0]:
def rnn_cell_backward(da_next, cache):
  # Retrieve values from cache
  (a_next, a_prev, xt, parameters) = cache

  # Retrieve values from parameters
  (Wax, Waa, Wya, ba, by) = (parameters["Wax"], parameters["Waa"], parameters["Wya"], parameters["ba"], parameters["by"])

  # compute the gradient of tanh with respect to a_next (=1 line)
  dtanh = 1 - np.power(a_next, 2)

  # Compute the gradient of the loss with respect to Wax (=2 lines)
  dxt = np.dot(Wax.T, da_next*dtanh)
  dWax = np.dot(da_next*dtanh, xt.T)

  # Compute the gradient with respect to Waa(=1 line)
  da_prev = np.dot(Waa.T, da_next*dtanh)
  dWaa = np.dot(da_next*dtanh, a_prev.T)

  # compute the gradient with respect to b (=1 line)
  dba = np.sum(da_next*dtanh, axis=1, keepdims=True)

  # Store the gradients in a python dictionary
  gradients = {"dxt":dxt, "da_prev":da_prev, "dWax":dWax, "dWaa":dWaa, "dba":dba}

  return gradients

def rnn_backward(da, caches):
  # Retrieve values from the first cache (t=1) of caches (=2 lines)
  (caches, x) = caches
  (a1, a0, x1, parameters) = caches[1]

  # Retrieve dimensions from da's and x1's shapes (=2 lines)
  n_a, m, T_x = da.shape
  n_x, m = x1.shape

  # Initialize the gradients with the right sizes (=6 lines)
  dx = np.zeros((n_x, m, T_x))
  dWax = np.zeros((n_a, n_x))
  dWaa = np.zeros((n_a, n_a))
  dba = np.zeros((n_a, 1))
  db0 = np.zeros((n_a, m))
  da_prevt = np.zeros((n_a, m))

  # Loop through all the time steps
  for t in reversed(range(0, T_x)):
    # Compute gradients at time step t. Choose wisely the "da_next" and
    # the "cache" to use in the backward propagation step. (=1 line)
    gradients = rnn_cell_backward(da[:,:,t] + da_prevt, caches[t])

    # Retrieve derivatives from gradients (=1 line)
    (dxt, da_prevt, dWaxt, dWaat, dbat) = (gradients["dxt"], gradients["da_prev"],
      gradients["dWax"], gradients["dWaa"], gradients["dba"])

    # Increment global derivatives w.r.t parameters by adding
    dx[:,:,t] = dxt
    dWax += dWaxt
    dWaa += dWaat
    dba += dbat
  
  # Set da0 to the gradient of a which has been backpropagated through
  # all time-steps (=1 line)
  da0 = da_prevt

  # Store the gradients in a python dictionary
  gradients = {"dx":dx, "da0":da0, "dWax":dWax, "dWaa":dWaa, "dba":dba}

  return gradients

def lstm_cell_backward(da_next, dc_next, cache):
  # Retrieve information from "cache"
  (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters) = cache

  # Retrieve dimensions from xt's and a_next's shape (=2 lines)
  n_x, m = xt.shape
  n_a, m = a_next.shape

  # Compute gates related derivatives, you can find their values
  # can be found by looking caregully at equations (7) to (10) (=4 lines)
  dot = da_next*np.tanh(c_next)
  dcct = (da_next*ot*(1-np.power(np.tanh(c_next), 2)) + dc_next)*it
  dit = (da_next*ot*(1-np.power(np.tanh(c_next), 2)) + dc_next)*cct
  dft = (da_next*ot*(1-np.power(np.tanh(c_next), 2)) + dc_next)*c_prev

  # Code equations (7) to (10) (=4 lines)
  dit = dit*it*(1-it)
  dft = dft*ft*(1-ft)
  dot = dot*ot*(1-ot)
  dcct = dcct*(1-np.power(cct, 2))

  # Compute parameters relatived derivatives. Use equations (11)-(14) (=8 lines)
  concat = np.zeros((n_x + n_a, m))
  concat[:n_a, :] = a_prev
  concat[n_a:, :] = xt
  dWf = np.dot(dft, concat.T)
  dWi = np.dot(dit, concat.T)
  dWc = np.dot(dcct, concat.T)
  dWo = np.dot(dot, concat.T)
  dbf = np.sum(dft, axis=1, keepdims=True)
  dbi = np.sum(dit, axis=1, keepdims=True)
  dbc = np.sum(dcct, axis=1, keepdims=True)
  dbo = np.sum(dot, axis=1, keepdims=True)

  # Compute derivatives w.r.t previous hidden statte, previous memory
  # state and input. Use equations (15)-(17). (=3 lines)
  da_prevx = np.dot(parameters["Wf"].T, dft) + np.dot(parameters["Wo"].T, dot) + \
    np.dot(parameters["Wi"].T, dit) + np.dot(parameters["Wc"].T, dcct)
  da_prev = da_prevx[:n_a, :]
  dc_prev = (da_next*ot*(1-np.power(np.tanh(c_next), 2)) + dc_next)*ft
  dxt = da_prevx[n_a:, :]

  # Save gradients in dictionary
  gradients = {"dxt":dxt, "da_prev":da_prev, "dc_prev":dc_prev,
               "dWf":dWf, "dbf":dbf, "dWi":dWi, "dbi":dbi,
               "dWc":dWc, "dbc":dbc, "dWo":dWo, "dbo":dbo}
  
  return gradients

## Test

### Forward

In [5]:
np.random.seed(1)
xt = np.random.randn(3, 10)
a_prev = np.random.randn(5, 10)
Waa = np.random.randn(5, 5)
Wax = np.random.randn(5, 3)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)
parameters = {"Waa":Waa, "Wax":Wax, "Wya":Wya, "ba":ba, "by":by}

a_next, yt_pred, cache = rnn_cell_forward(xt, a_prev, parameters)
print("a_next[4]", a_next[4])
print("a_next.shape", a_next.shape)
print("yt_pred[1]", yt_pred[1])
print("yt_pred.shape", yt_pred.shape)

a_next[4] [ 0.59584544  0.18141802  0.61311866  0.99808218  0.85016201  0.99980978
 -0.18887155  0.99815551  0.6531151   0.82872037]
a_next.shape (5, 10)
yt_pred[1] [0.9888161  0.01682021 0.21140899 0.36817467 0.98988387 0.88945212
 0.36920224 0.9966312  0.9982559  0.17746526]
yt_pred.shape (2, 10)


In [6]:
np.random.seed(1)
x = np.random.randn(3, 10, 4)
a0 = np.random.randn(5, 10)
Waa = np.random.randn(5, 5)
Wax = np.random.randn(5, 3)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)
parameters = {"Waa":Waa, "Wax":Wax, "Wya":Wya, "ba":ba, "by":by}

a, y_pred, caches = rnn_forward(x, a0, parameters)
print("a[4][1]", a[4][1])
print("a.shape", a.shape)
print("y_pred[1]", y_pred[1][3])
print("y_pred.shape", y_pred.shape)
print("caches[1]", caches[1][1][3])
print("len(caches)", len(caches))

a[4][1] [-0.99999375  0.77911235 -0.99861469 -0.99833267]
a.shape (5, 10, 4)
y_pred[1] [0.79560373 0.86224861 0.11118257 0.81515947]
y_pred.shape (2, 10, 4)
caches[1] [-1.1425182  -0.34934272 -0.20889423  0.58662319]
len(caches) 2


In [7]:
np.random.seed(1)
xt = np.random.randn(3, 10)
a_prev = np.random.randn(5, 10)
c_prev = np.random.randn(5, 10)
Wf = np.random.randn(5, 5 + 3)
bf = np.random.randn(5, 1)
Wi = np.random.randn(5, 5 + 3)
bi = np.random.randn(5, 1)
Wo = np.random.randn(5, 5 + 3)
bo = np.random.randn(5, 1)
Wc = np.random.randn(5, 5 + 3)
bc = np.random.randn(5, 1)
Wy = np.random.randn(2, 5)
by = np.random.randn(2, 1)
parameters = {"Wf":Wf, "Wi":Wi, "Wo":Wo, "Wc":Wc, "Wy":Wy,
              "bf":bf, "bi":bi, "bo":bo, "bc":bc, "by":by}

a_next, c_next, yt, cache = lstm_cell_forward(xt, a_prev, c_prev, parameters)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", a_next.shape)
print("c_next[2] = ", c_next[2])
print("c_next.shape = ", c_next.shape)
print("yt[1] = ", yt[1])
print("yt.shape = ", yt.shape)
print("cache[1][3] = ", cache[1][3])
print("len(cache) = ", len(cache))

a_next[4] =  [-0.66408471  0.0036921   0.02088357  0.22834167 -0.85575339  0.00138482
  0.76566531  0.34631421 -0.00215674  0.43827275]
a_next.shape =  (5, 10)
c_next[2] =  [ 0.63267805  1.00570849  0.35504474  0.20690913 -1.64566718  0.11832942
  0.76449811 -0.0981561  -0.74348425 -0.26810932]
c_next.shape =  (5, 10)
yt[1] =  [0.79913913 0.15986619 0.22412122 0.15606108 0.97057211 0.31146381
 0.00943007 0.12666353 0.39380172 0.07828381]
yt.shape =  (2, 10)
cache[1][3] =  [-0.16263996  1.03729328  0.72938082 -0.54101719  0.02752074 -0.30821874
  0.07651101 -1.03752894  1.41219977 -0.37647422]
len(cache) =  10


In [8]:
np.random.seed(1)
x = np.random.randn(3, 10, 7)
a0 = np.random.randn(5, 10)
Wf = np.random.randn(5, 5 + 3)
bf = np.random.randn(5, 1)
Wi = np.random.randn(5, 5 + 3)
bi = np.random.randn(5, 1)
Wo = np.random.randn(5, 5 + 3)
bo = np.random.randn(5, 1)
Wc = np.random.randn(5, 5 + 3)
bc = np.random.randn(5, 1)
Wy = np.random.randn(2, 5)
by = np.random.randn(2, 1)
parameters = {"Wf":Wf, "Wi":Wi, "Wo":Wo, "Wc":Wc, "Wy":Wy,
              "bf":bf, "bi":bi, "bo":bo, "bc":bc, "by":by}

a, y, c, caches = lstm_forward(x, a0, parameters)
print("a[4][3][6] = ", a[4][3][6])
print("a.shape = ", a.shape)
print("y[1][4][3] = ", y[1][4][3])
print("y.shape = ", yt.shape)
print("caches[1][1][1] = ", caches[1][1][1])
print("c[1][2][1] = ", c[1][2][1])
print("len(cache) = ", len(cache))

a[4][3][6] =  0.17211776753291672
a.shape =  (5, 10, 7)
y[1][4][3] =  0.9508734618501101
y.shape =  (2, 10)
caches[1][1][1] =  [ 0.82797464  0.23009474  0.76201118 -0.22232814 -0.20075807  0.18656139
  0.41005165]
c[1][2][1] =  -0.8555449167181981
len(cache) =  10


### Backward

In [9]:
np.random.seed(1)
xt = np.random.randn(3, 10)
a_prev = np.random.randn(5, 10)
Wax = np.random.randn(5, 3)
Waa = np.random.randn(5, 5)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)
parameters = {"Waa":Waa, "Wax":Wax, "Wya":Wya, "ba":ba, "by":by}

a_next, yt_pred, cache = rnn_cell_forward(xt, a_prev, parameters)

da_next = np.random.randn(5, 10)
gradients = rnn_cell_backward(da_next, cache)
print("gradients[\"dxt\"][1][2] =", gradients["dxt"][1][2])
print("gradients[\"dxt\"].shape =", gradients["dxt"].shape)
print("gradients[\"da_prev\"][2][3] =", gradients["da_prev"][2][3])
print("gradients[\"da_prev\"].shape =", gradients["da_prev"].shape)
print("gradients[\"dWax\"][3][1] =", gradients["dWax"][3][1])
print("gradients[\"dWax\"].shape =", gradients["dWax"].shape)
print("gradients[\"dWaa\"][1][2] =", gradients["dWaa"][1][2])
print("gradients[\"dWaa\"].shape =", gradients["dWaa"].shape)
print("gradients[\"dba\"][4] =", gradients["dba"][4])
print("gradients[\"dba\"].shape =", gradients["dba"].shape)

gradients["dxt"][1][2] = -1.3872130506020925
gradients["dxt"].shape = (3, 10)
gradients["da_prev"][2][3] = -0.15239949377395495
gradients["da_prev"].shape = (5, 10)
gradients["dWax"][3][1] = 0.4107728249354584
gradients["dWax"].shape = (5, 3)
gradients["dWaa"][1][2] = 1.1503450668497135
gradients["dWaa"].shape = (5, 5)
gradients["dba"][4] = [0.20023491]
gradients["dba"].shape = (5, 1)


In [10]:
np.random.seed(1)
x = np.random.randn(3, 10, 4)
a0 = np.random.randn(5, 10)
Wax = np.random.randn(5, 3)
Waa = np.random.randn(5, 5)
Wya = np.random.randn(2, 5)
ba = np.random.randn(5, 1)
by = np.random.randn(2, 1)
parameters = {"Waa":Waa, "Wax":Wax, "Wya":Wya, "ba":ba, "by":by}
a, y, caches = rnn_forward(x, a0, parameters)

da = np.random.randn(5, 10, 4)
gradients = rnn_backward(da, caches)

print("gradients[\"dx\"][1][2] =", gradients["dx"][1][2])
print("gradients[\"dx\"].shape =", gradients["dx"].shape)
print("gradients[\"da0\"][2][3] =", gradients["da0"][2][3])
print("gradients[\"da0\"].shape =", gradients["da0"].shape)
print("gradients[\"dWax\"][3][1] =", gradients["dWax"][3][1])
print("gradients[\"dWax\"].shape =", gradients["dWax"].shape)
print("gradients[\"dWaa\"][1][2] =", gradients["dWaa"][1][2])
print("gradients[\"dWaa\"].shape =", gradients["dWaa"].shape)
print("gradients[\"dba\"][4] =", gradients["dba"][4])
print("gradients[\"dba\"].shape =", gradients["dba"].shape)

gradients["dx"][1][2] = [-2.07101689 -0.59255627  0.02466855  0.01483317]
gradients["dx"].shape = (3, 10, 4)
gradients["da0"][2][3] = -0.31494237512664996
gradients["da0"].shape = (5, 10)
gradients["dWax"][3][1] = 11.264104496527777
gradients["dWax"].shape = (5, 3)
gradients["dWaa"][1][2] = 2.303333126579893
gradients["dWaa"].shape = (5, 5)
gradients["dba"][4] = [-0.74747722]
gradients["dba"].shape = (5, 1)


In [11]:
np.random.seed(1)
xt = np.random.randn(3, 10)
a_prev = np.random.randn(5, 10)
c_prev = np.random.randn(5, 10)
Wf = np.random.randn(5, 5 + 3)
bf = np.random.randn(5, 1)
Wi = np.random.randn(5, 5 + 3)
bi = np.random.randn(5, 1)
Wo = np.random.randn(5, 5 + 3)
bo = np.random.randn(5, 1)
Wc = np.random.randn(5, 5 + 3)
bc = np.random.randn(5, 1)
Wy = np.random.randn(2, 5)
by = np.random.randn(2, 1)
parameters = {"Wf":Wf, "Wi":Wi, "Wo":Wo, "Wc":Wc, "Wy":Wy,
              "bf":bf, "bi":bi, "bo":bo, "bc":bc, "by":by}

a_next, c_next, yt, cache = lstm_cell_forward(xt, a_prev, c_prev, parameters)

da_next = np.random.randn(5, 10)
dc_next = np.random.randn(5, 10)
gradients = lstm_cell_backward(da_next, dc_next, cache)
print("gradients[\"dxt\"][1][2] = ", gradients["dxt"][1][2])
print("gradients[\"dxt\"].shape = ", gradients["dxt"].shape)
print("gradients[\"da_prev\"][2][3] = ", gradients["da_prev"][2][3])
print("gradients[\"da_prev\"].shape = ", gradients["da_prev"].shape)
print("gradients[\"dc_prev\"][2][3] = ", gradients["dc_prev"][2][3])
print("gradients[\"dc_prev\"].shape = ", gradients["dc_prev"].shape)
print("gradients[\"dWf\"][3][1] = ", gradients["dWf"][3][1])
print("gradients[\"dWf\"].shape = ", gradients["dWf"].shape)
print("gradients[\"dWi\"][1][2] = ", gradients["dWi"][1][2])
print("gradients[\"dWi\"].shape = ", gradients["dWi"].shape)
print("gradients[\"dWc\"][3][1] = ", gradients["dWc"][3][1])
print("gradients[\"dWc\"].shape = ", gradients["dWc"].shape)
print("gradients[\"dWo\"][1][2] = ", gradients["dWo"][1][2])
print("gradients[\"dWo\"].shape = ", gradients["dWo"].shape)
print("gradients[\"dbf\"][4] = ", gradients["dbf"][4])
print("gradients[\"dbf\"].shape = ", gradients["dbf"].shape)
print("gradients[\"dbi\"][4] = ", gradients["dbi"][4])
print("gradients[\"dbi\"].shape = ", gradients["dbi"].shape)
print("gradients[\"dbc\"][4] = ", gradients["dbc"][4])
print("gradients[\"dbc\"].shape = ", gradients["dbc"].shape)
print("gradients[\"dbo\"][4] = ", gradients["dbo"][4])
print("gradients[\"dbo\"].shape = ", gradients["dbo"].shape)

gradients["dxt"][1][2] =  3.230559115109188
gradients["dxt"].shape =  (3, 10)
gradients["da_prev"][2][3] =  -0.06396214197109235
gradients["da_prev"].shape =  (5, 10)
gradients["dc_prev"][2][3] =  0.7975220387970015
gradients["dc_prev"].shape =  (5, 10)
gradients["dWf"][3][1] =  -0.14795483816449675
gradients["dWf"].shape =  (5, 8)
gradients["dWi"][1][2] =  1.0574980552259903
gradients["dWi"].shape =  (5, 8)
gradients["dWc"][3][1] =  2.304562163687667
gradients["dWc"].shape =  (5, 8)
gradients["dWo"][1][2] =  0.3313115952892109
gradients["dWo"].shape =  (5, 8)
gradients["dbf"][4] =  [0.18864637]
gradients["dbf"].shape =  (5, 1)
gradients["dbi"][4] =  [-0.40142491]
gradients["dbi"].shape =  (5, 1)
gradients["dbc"][4] =  [0.25587763]
gradients["dbc"].shape =  (5, 1)
gradients["dbo"][4] =  [0.13893342]
gradients["dbo"].shape =  (5, 1)
