In [34]:
import numpy as np
import math
import pandas as pd

# Init Parameters Function

In [35]:
def init_parameters(layers_dims):
    np.random.seed(10)
    L = len(layers_dims)
    parameters = {}
    
    for i in range(1, L):
        parameters["W" + str(i)] = np.random.randn(layers_dims[i], layers_dims[i - 1]) * np.square(2 / layers_dims[i - 1])
        parameters["b" + str(i)] = np.zeros((layers_dims[i], 1))
        
    return parameters

In [36]:
parameters = init_parameters([2, 4, 1])

In [37]:
parameters

{'W1': array([[ 1.3315865 ,  0.71527897],
        [-1.54540029, -0.00838385],
        [ 0.62133597, -0.72008556],
        [ 0.26551159,  0.10854853]]),
 'W2': array([[ 0.00107286, -0.04365005,  0.10825655,  0.30075934]]),
 'b1': array([[ 0.],
        [ 0.],
        [ 0.],
        [ 0.]]),
 'b2': array([[ 0.]])}

# Help Functions

In [38]:
def relu(X):
    return X * (X > 0)

In [39]:
relu(np.array([2, -2, -1, 0]))

array([2, 0, 0, 0])

In [40]:
def relu_backward(X):
    return 1 * (X > 0)

In [41]:
relu_backward(np.array([2,3,-1]))

array([1, 1, 0])

# Feedforward Functions

### Test Case

In [42]:
def test_case_feed_forward():
    X = np.array([
        [1],
        [1]
    ])
    parameters = init_parameters([2, 4, 1])
    
    Z1 = np.dot(parameters["W1"], X) + parameters["b1"]
    A1 = relu(Z1)
    Z2 = np.dot(parameters["W2"], A1) + parameters["b2"]
    A2 = relu(Z2)
    
    return (Z1, A1, Z2, A2)

In [43]:
# Z1, A1, Z2, A2 = test_case_feed_forward()

In [44]:
# print("Z1 \n" + str(Z1) + "\n================")
# print("A1 \n" + str(A1) + "\n================")
# print("Z2 \n" + str(Z2) + "\n================")
# print("A2 \n" + str(A2) + "\n================")

### Feed Forward

In [45]:
def feed_forward(X, parameters, activation_functions = []):
    L = len(parameters) // 2 + 1
    caches = []
    curr_A = X
    
    for l in range(1, L):
        A = curr_A
        W = parameters["W" + str(l)]
        b = parameters["b" + str(l)]
        Z = np.dot(W, A) + b
        
        curr_A = relu(Z)
        
        cache = (W, b, A, Z, curr_A)
        caches.append(cache)
    return curr_A, caches

In [46]:
# AL, caches = feed_forward(np.array([[1],[1]]), parameters)

In [47]:
# assert(np.array_equal(caches[0][0], parameters["W1"]))
# assert(np.array_equal(caches[0][1], parameters["b1"]))
# assert(np.array_equal(caches[0][2], np.array([[1],[1]])))
# assert(np.array_equal(caches[0][3], Z1))
# assert(np.array_equal(caches[0][4], A1))
# assert(np.array_equal(caches[1][0], parameters["W2"]))
# assert(np.array_equal(caches[1][1], parameters["b2"]))
# assert(np.array_equal(caches[1][2], A1))
# assert(np.array_equal(caches[1][3], Z2))
# assert(np.array_equal(caches[1][4], A2))

# Back Propagation

In [48]:
def test_case_back_propagation():
    parameters = init_parameters([2, 4, 1])
    X = np.array([
        [1],
        [1]
    ])
    Y = np.array([
        [4]
    ])
    Z1, A1, Z2, A2 = test_case_feed_forward()
    m = X.shape[1]
    dA2 = - Y / A2 + (1 - Y) / (1 - A2)
    dZ2 = dA2 * relu_backward(A2)
    dW2 = 1 / m * np.dot(dZ2, A1.T)
    db2 = 1 / m * np.sum(dZ2, axis = 1, keepdims=True)
    dZ1 = np.dot(parameters["W2"].T, dZ2) * relu_backward(A1)
    dW1 = 1 / m * np.dot(dZ1, X.T)
    db1 = 1 / m * np.sum(dZ1, axis = 1, keepdims=True)
    
    grads = {}
    grads["dW1"] = dW1
    grads["dW2"] = dW2
    grads["db1"] = db1
    grads["db2"] = db2
    
    return grads

In [49]:
# grads = test_case_back_propagation()

### Gradient Checking

In [144]:
def gradient_check(X, Y, parameters, grads, epsilon = 1e-7):
    L = len(parameters) // 2 + 1
    
    number_parameters = get_number_parameters(parameters)
    
    vector = parameters_to_vector(parameters)
    
    
    gradapprox = np.zeros((number_parameters, 1))
    
    for i in range(number_parameters):
        vector_plus = np.copy(vector)
        vector_plus[i][0] += epsilon
        cost_plus = compute_cost(X, Y, vector_to_parameters(vector_plus, parameters))
        
        vector_minus = np.copy(vector)
        vector_minus[i][0] -= epsilon
        cost_minus = compute_cost(X, Y, vector_to_parameters(vector_minus, parameters))
        
        d = (cost_plus - cost_minus) / (2 * epsilon)
        
        gradapprox[i][0] = d
    
    grad = grads_to_vector(grads)
    
    numerator = np.linalg.norm(grad - gradapprox)                              
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator
    
    if difference > 1e-6:
        print ("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print ("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")
    
    return difference, vector_to_grads(gradapprox, grads)

In [145]:
def get_number_parameters(parameters):
    L = len(parameters) // 2 + 1
    
    count = 0
    
    for i in range(1, L):
        W = parameters["W" + str(i)]
        b = parameters["b" + str(i)]
        
        count += W.shape[0] * W.shape[1] + b.shape[0] * b.shape[1]
        
    return count

In [146]:
def parameters_to_vector(parameters):
    L = len(parameters) // 2 + 1
    
    W = parameters["W1"]
    b = parameters["b1"]
        
    W = W.reshape(W.shape[0] * W.shape[1], 1)
    
    vector = np.concatenate((W, b), axis = 0)
    
    for i in range(2, L):
        W = parameters["W" + str(i)]
        b = parameters["b" + str(i)]
        
        W = W.reshape(W.shape[0] * W.shape[1], 1)
        
        curr = np.concatenate((W, b), axis = 0)

        vector = np.concatenate((vector, curr), axis = 0)
    return vector

In [147]:
# vector = parameters_to_vector(parameters)
# assert(vector.shape[0] == 17)

In [148]:
def vector_to_parameters(vector, parameters):
    
    L = len(parameters) // 2 + 1
    
    for i in range(1, L):
        W = parameters["W" + str(i)]
        b = parameters["b" + str(i)]
        
        num_para_W = W.shape[0] * W.shape[1]
        num_para_b = b.shape[0] * b.shape[1]
        
        vector_W = vector[:num_para_W, :]
        vector_b = vector[num_para_W:num_para_W+num_para_b, :]
        
        vector = vector[num_para_W + num_para_b:, :]
        
        parameters["W" + str(i)] = vector_W.reshape(W.shape[0], W.shape[1])
        parameters["b" + str(i)] = vector_b.reshape(b.shape[0], b.shape[1])
    
    return parameters

In [149]:
# parameters = vector_to_parameters(vector, parameters)

In [150]:
assert(parameters["W1"].shape == (4, 2))
assert(parameters["b1"].shape == (4, 1))
assert(parameters["W2"].shape == (1, 4))
assert(parameters["b2"].shape == (1, 1))

In [151]:
def grads_to_vector(parameters):
    L = len(parameters) // 2 + 1

    W = parameters["dW1"]
    b = parameters["db1"]
        
    W = W.reshape(W.shape[0] * W.shape[1], 1)
    
    vector = np.concatenate((W, b), axis = 0)
    
    for i in range(2, L):
        W = parameters["dW" + str(i)]
        b = parameters["db" + str(i)]
        
        W = W.reshape(W.shape[0] * W.shape[1], 1)
        
        curr = np.concatenate((W, b), axis = 0)

        vector = np.concatenate((vector, curr), axis = 0)
    return vector

In [152]:
def vector_to_grads(vector, grads):
    
    L = len(parameters) // 2 + 1
    
    for i in range(1, L):
        dW = grads["dW" + str(i)]
        db = grads["db" + str(i)]
        
        num_para_dW = dW.shape[0] * dW.shape[1]
        num_para_db = db.shape[0] * db.shape[1]
        
        vector_dW = vector[:num_para_dW, :]
        vector_db = vector[num_para_dW:num_para_dW+num_para_db, :]
        
        vector = vector[num_para_dW + num_para_db:, :]
        
        grads["dW" + str(i)] = vector_dW.reshape(dW.shape[0], dW.shape[1])
        grads["db" + str(i)] = vector_db.reshape(db.shape[0], db.shape[1])
        
    return grads

### Compute Cost

In [153]:
def compute_cost(X, Y, parameters):
    A, caches = feed_forward(X, parameters)
    m = X.shape[1]
    
    cost = 1. / m * np.sum(-Y * np.log(A) - (1 - Y) * np.log(1 - A))

    return cost

In [154]:
# diff, gradaprox = gradient_check(np.array([[1], [1]]), np.array([[4]]), parameters, grads)

In [155]:
# grads

In [156]:
# gradaprox

### Back Propagation

In [157]:
def back_propagation(AL, Y, caches):
    m = AL.shape[1]
    L = len(caches)
    
    grads = {}
    prev_dA = {}
    
    W, b, prev_A, Z, curr_A = caches[L - 1]  
    dAL = -Y / AL + (1 - Y) / (1 - AL)
    dZL = dAL * relu_backward(curr_A)
    
    grads["dW" + str(L)] = 1.0 / m * np.dot(dZL, prev_A.T)
    grads["db" + str(L)] = 1.0 / m * np.sum(dZL, axis = 1, keepdims=True)
    prev_dA["dA" + str(L - 1)] = np.dot(W.T, dZL)
    
    for l in reversed(range(1, L)):
        W, b, prev_A, Z, curr_A = caches[l - 1]
        dA = prev_dA["dA" + str(l)]

        dZ = dA * relu_backward(curr_A)
        
        grads["dW" + str(l)] = 1.0 / m * np.dot(dZ, prev_A.T)
        grads["db" + str(l)] = 1.0 / m * np.sum(dZ, axis = 1, keepdims=True)
        prev_dA["dA" + str(l - 1)] = np.dot(W.T, dZ)
    
    return grads

In [164]:
parameters = init_parameters([2, 4, 4, 1])

In [165]:
AL, caches = feed_forward(np.array([[1], [1]]), parameters)

In [166]:
grads = back_propagation(AL, np.array([[4]]), caches)

In [167]:
diff, grad_aprox = gradient_check(np.array([[1], [1]]), np.array([[4]]), parameters, grads)

[92mYour backward propagation works perfectly fine! difference = 7.46786042651e-11[0m


In [168]:
grad_aprox

{'dW1': array([[ -0.0383493 ,  -0.0383493 ],
        [  0.        ,   0.        ],
        [  0.        ,   0.        ],
        [-10.75064215, -10.75064215]]),
 'dW2': array([[-73.16520249,   0.        ,   0.        , -13.37077795],
        [  0.        ,   0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ,   0.        ]]),
 'dW3': array([[-14.59433714,   0.        ,   0.        ,   0.        ]]),
 'db1': array([[ -0.0383493 ],
        [  0.        ],
        [  0.        ],
        [-10.75064215]]),
 'db2': array([[-35.74499803],
        [  0.        ],
        [  0.        ],
        [  0.        ]]),
 'db3': array([[-127.24135004]])}

In [169]:
grads

{'dW1': array([[ -0.0383493 ,  -0.0383493 ],
        [  0.        ,   0.        ],
        [  0.        ,   0.        ],
        [-10.75064215, -10.75064215]]),
 'dW2': array([[-73.16520249,   0.        ,   0.        , -13.37077795],
        [  0.        ,   0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ,   0.        ]]),
 'dW3': array([[-14.59433714,   0.        ,   0.        ,   0.        ]]),
 'db1': array([[ -0.0383493 ],
        [  0.        ],
        [  0.        ],
        [-10.75064215]]),
 'db2': array([[-35.74499803],
        [  0.        ],
        [  0.        ],
        [  0.        ]]),
 'db3': array([[-127.24135004]])}