In [216]:
import numpy as np
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy.io
import sklearn
import sklearn.datasets

# Init Parameters Function

In [2]:
def init_parameters(layers_dims):
    np.random.seed(10)
    L = len(layers_dims)
    parameters = {}
    
    for i in range(1, L):
        parameters["W" + str(i)] = np.random.randn(layers_dims[i], layers_dims[i - 1]) * np.square(2 / layers_dims[i - 1])
        parameters["b" + str(i)] = np.zeros((layers_dims[i], 1))
        
    return parameters

In [3]:
parameters = init_parameters([2, 4, 1])

In [4]:
parameters

{'W1': array([[ 1.3315865 ,  0.71527897],
        [-1.54540029, -0.00838385],
        [ 0.62133597, -0.72008556],
        [ 0.26551159,  0.10854853]]),
 'W2': array([[ 0.00107286, -0.04365005,  0.10825655,  0.30075934]]),
 'b1': array([[ 0.],
        [ 0.],
        [ 0.],
        [ 0.]]),
 'b2': array([[ 0.]])}

# Help Functions

In [5]:
def relu(X):
    return X * (X > 0)

In [6]:
relu(np.array([2, -2, -1, 0]))

array([2, 0, 0, 0])

In [7]:
def relu_backward(X):
    return 1 * (X > 0)

In [8]:
relu_backward(np.array([2,3,-1]))

array([1, 1, 0])

In [9]:
def sigmoid(X):
    return 1 / (1 + np.exp(-X))

In [10]:
sigmoid(np.array([1,2,3]))

array([ 0.73105858,  0.88079708,  0.95257413])

In [11]:
def sigmoid_backward(X):
    return X * (1 - X)

In [12]:
sigmoid_backward(np.array([1,2,3]))

array([ 0, -2, -6])

# Feedforward Functions

### Test Case

In [13]:
def test_case_feed_forward():
    X = np.array([
        [1],
        [1]
    ])
    parameters = init_parameters([2, 4, 1])
    
    Z1 = np.dot(parameters["W1"], X) + parameters["b1"]
    A1 = relu(Z1)
    Z2 = np.dot(parameters["W2"], A1) + parameters["b2"]
    A2 = relu(Z2)
    
    return (Z1, A1, Z2, A2)

In [14]:
# Z1, A1, Z2, A2 = test_case_feed_forward()

In [15]:
# print("Z1 \n" + str(Z1) + "\n================")
# print("A1 \n" + str(A1) + "\n================")
# print("Z2 \n" + str(Z2) + "\n================")
# print("A2 \n" + str(A2) + "\n================")

### Feed Forward

In [485]:
def feed_forward(X, parameters, activation_functions = ["relu", "sigmoid"]):
    L = len(parameters) // 2 + 1
    caches = []
    curr_A = X
    
    for l in range(1, L):
        A = curr_A
        W = parameters["W" + str(l)]
        b = parameters["b" + str(l)]
        Z = np.dot(W, A) + b
        
        if activation_functions[l - 1] == "sigmoid":
            curr_A = sigmoid(Z)
        else:
            curr_A = relu(Z)
        
        cache = (W, b, A, Z, curr_A, activation_functions[l - 1])
        caches.append(cache)
    return curr_A, caches

In [17]:
# AL, caches = feed_forward(np.array([[1],[1]]), parameters)

In [18]:
# assert(np.array_equal(caches[0][0], parameters["W1"]))
# assert(np.array_equal(caches[0][1], parameters["b1"]))
# assert(np.array_equal(caches[0][2], np.array([[1],[1]])))
# assert(np.array_equal(caches[0][3], Z1))
# assert(np.array_equal(caches[0][4], A1))
# assert(np.array_equal(caches[1][0], parameters["W2"]))
# assert(np.array_equal(caches[1][1], parameters["b2"]))
# assert(np.array_equal(caches[1][2], A1))
# assert(np.array_equal(caches[1][3], Z2))
# assert(np.array_equal(caches[1][4], A2))

# Back Propagation

In [19]:
def test_case_back_propagation():
    parameters = init_parameters([2, 4, 1])
    X = np.array([
        [1],
        [1]
    ])
    Y = np.array([
        [4]
    ])
    Z1, A1, Z2, A2 = test_case_feed_forward()
    m = X.shape[1]
    dA2 = - Y / A2 + (1 - Y) / (1 - A2)
    dZ2 = dA2 * relu_backward(A2)
    dW2 = 1 / m * np.dot(dZ2, A1.T)
    db2 = 1 / m * np.sum(dZ2, axis = 1, keepdims=True)
    dZ1 = np.dot(parameters["W2"].T, dZ2) * relu_backward(A1)
    dW1 = 1 / m * np.dot(dZ1, X.T)
    db1 = 1 / m * np.sum(dZ1, axis = 1, keepdims=True)
    
    grads = {}
    grads["dW1"] = dW1
    grads["dW2"] = dW2
    grads["db1"] = db1
    grads["db2"] = db2
    
    return grads

In [20]:
# grads = test_case_back_propagation()

### Gradient Checking

In [21]:
def gradient_check(X, Y, parameters, grads, activation_functions, epsilon = 1e-7):
    L = len(parameters) // 2 + 1
    
    number_parameters = get_number_parameters(parameters)
    
    vector = parameters_to_vector(parameters)
    
    
    gradapprox = np.zeros((number_parameters, 1))
    
    for i in range(number_parameters):
        vector_plus = np.copy(vector)
        vector_plus[i][0] += epsilon
        cost_plus = compute_cost(X, Y, vector_to_parameters(vector_plus, parameters), activation_functions)
        
        vector_minus = np.copy(vector)
        vector_minus[i][0] -= epsilon
        cost_minus = compute_cost(X, Y, vector_to_parameters(vector_minus, parameters), activation_functions)
        
        d = (cost_plus - cost_minus) / (2 * epsilon)
        
        gradapprox[i][0] = d
    
    grad = grads_to_vector(grads)
    
    numerator = np.linalg.norm(grad - gradapprox)                              
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator
    
    if difference > 1e-7:
        print ("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print ("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")
    
    return difference, vector_to_grads(gradapprox, grads)

In [22]:
def get_number_parameters(parameters):
    L = len(parameters) // 2 + 1
    
    count = 0
    
    for i in range(1, L):
        W = parameters["W" + str(i)]
        b = parameters["b" + str(i)]
        
        count += W.shape[0] * W.shape[1] + b.shape[0] * b.shape[1]
        
    return count

In [23]:
def parameters_to_vector(parameters):
    L = len(parameters) // 2 + 1
    
    W = parameters["W1"]
    b = parameters["b1"]
        
    W = W.reshape(W.shape[0] * W.shape[1], 1)
    
    vector = np.concatenate((W, b), axis = 0)
    
    for i in range(2, L):
        W = parameters["W" + str(i)]
        b = parameters["b" + str(i)]
        
        W = W.reshape(W.shape[0] * W.shape[1], 1)
        
        curr = np.concatenate((W, b), axis = 0)

        vector = np.concatenate((vector, curr), axis = 0)
    return vector

In [24]:
# vector = parameters_to_vector(parameters)
# assert(vector.shape[0] == 17)

In [25]:
def vector_to_parameters(vector, parameters):
    
    L = len(parameters) // 2 + 1
    
    for i in range(1, L):
        W = parameters["W" + str(i)]
        b = parameters["b" + str(i)]
        
        num_para_W = W.shape[0] * W.shape[1]
        num_para_b = b.shape[0] * b.shape[1]
        
        vector_W = vector[:num_para_W, :]
        vector_b = vector[num_para_W:num_para_W+num_para_b, :]
        
        vector = vector[num_para_W + num_para_b:, :]
        
        parameters["W" + str(i)] = vector_W.reshape(W.shape[0], W.shape[1])
        parameters["b" + str(i)] = vector_b.reshape(b.shape[0], b.shape[1])
    
    return parameters

In [26]:
# parameters = vector_to_parameters(vector, parameters)

In [27]:
assert(parameters["W1"].shape == (4, 2))
assert(parameters["b1"].shape == (4, 1))
assert(parameters["W2"].shape == (1, 4))
assert(parameters["b2"].shape == (1, 1))

In [28]:
def grads_to_vector(parameters):
    L = len(parameters) // 2 + 1

    W = parameters["dW1"]
    b = parameters["db1"]
        
    W = W.reshape(W.shape[0] * W.shape[1], 1)
    
    vector = np.concatenate((W, b), axis = 0)
    
    for i in range(2, L):
        W = parameters["dW" + str(i)]
        b = parameters["db" + str(i)]
        
        W = W.reshape(W.shape[0] * W.shape[1], 1)
        
        curr = np.concatenate((W, b), axis = 0)

        vector = np.concatenate((vector, curr), axis = 0)
    return vector

In [29]:
def vector_to_grads(vector, grads):
    
    L = len(grads) // 2 + 1
    
    for i in range(1, L):
        dW = grads["dW" + str(i)]
        db = grads["db" + str(i)]
        
        num_para_dW = dW.shape[0] * dW.shape[1]
        num_para_db = db.shape[0] * db.shape[1]
        
        vector_dW = vector[:num_para_dW, :]
        vector_db = vector[num_para_dW:num_para_dW+num_para_db, :]
        
        vector = vector[num_para_dW + num_para_db:, :]
        
        grads["dW" + str(i)] = vector_dW.reshape(dW.shape[0], dW.shape[1])
        grads["db" + str(i)] = vector_db.reshape(db.shape[0], db.shape[1])
        
    return grads

### Compute Cost

In [30]:
def compute_cost(X, Y, parameters, activation_functions):
    A, caches = feed_forward(X, parameters, activation_functions)
    m = X.shape[1]
    
    cost = 1. / m * np.sum(-Y * np.log(A) - (1 - Y) * np.log(1 - A))

    return cost

In [31]:
# diff, gradaprox = gradient_check(np.array([[1], [1]]), np.array([[4]]), parameters, grads)

In [32]:
# grads

In [33]:
# gradaprox

In [37]:
def compute_cross_entropy_cost(AL, Y):
    m = AL.shape[1]
    
    cost = 1. / m * np.sum(-Y * np.log(AL) - (1 - Y) * np.log(1 - AL))
    return cost

In [38]:
def compute_mean_square_cost(AL, Y):
    m = AL.shape[1]
    cost = 1 / (2 * m) * np.sum(np.square(AL - Y))
    return cost

### Back Propagation

In [39]:
def back_propagation(AL, Y, caches, loss = 'cross_entropy'):
    m = AL.shape[1]
    L = len(caches)
    
    grads = {}
    prev_dA = {}
    
    W, b, prev_A, Z, curr_A, activation_func = caches[L - 1]
    if loss == "mean_square_error":
        dAL = AL - Y
    else:
        dAL = -Y / AL + (1 - Y) / (1 - AL)
        
    if activation_func == 'sigmoid':
        dZL = dAL * sigmoid_backward(curr_A)
    else:
        dZL = dAL * relu_backward(curr_A)
    
    grads["dW" + str(L)] = 1.0 / m * np.dot(dZL, prev_A.T)
    grads["db" + str(L)] = 1.0 / m * np.sum(dZL, axis = 1, keepdims=True)
    prev_dA["dA" + str(L - 1)] = np.dot(W.T, dZL)
    
    for l in reversed(range(1, L)):
        W, b, prev_A, Z, curr_A, activation_func = caches[l - 1]
        dA = prev_dA["dA" + str(l)]
        
        if activation_func == "sigmoid":
            dZ = dA * sigmoid_backward(curr_A)
        else:
            dZ = dA * relu_backward(curr_A)
        
        grads["dW" + str(l)] = 1.0 / m * np.dot(dZ, prev_A.T)
        grads["db" + str(l)] = 1.0 / m * np.sum(dZ, axis = 1, keepdims=True)
        prev_dA["dA" + str(l - 1)] = np.dot(W.T, dZ)
    
    return grads

In [40]:
parameters = init_parameters([2, 4, 4, 4, 1])

In [41]:
AL, caches = feed_forward(np.array([[1], [1]]), parameters, activation_functions = ["relu", "sigmoid", "relu", "sigmoid"])

In [42]:
grads = back_propagation(AL, np.array([[4]]), caches)

In [43]:
diff, grad_aprox = gradient_check(np.array([[1], [1]]), np.array([[4]]), parameters, grads, ["relu", "sigmoid", "relu", "sigmoid"])

[92mYour backward propagation works perfectly fine! difference = 2.86906772206e-09[0m


In [44]:
grad_aprox

{'dW1': array([[ 0.02229933,  0.02229933],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [-0.0708208 , -0.0708208 ]]),
 'dW2': array([[-0.04688101,  0.        ,  0.        , -0.0085674 ],
        [-0.18216089,  0.        ,  0.        , -0.03328949],
        [ 0.1770862 ,  0.        ,  0.        ,  0.03236211],
        [-0.10530032,  0.        ,  0.        , -0.0192434 ]]),
 'dW3': array([[-0.30453113, -0.22399288, -0.19337368, -0.17995147],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.43331269,  0.31871603,  0.27514845,  0.25605021],
        [ 0.2257063 ,  0.16601456,  0.14332084,  0.13337284]]),
 'dW4': array([[-1.49583702,  0.        , -0.33606177, -0.23855092]]),
 'db1': array([[ 0.02229933],
        [ 0.        ],
        [ 0.        ],
        [-0.0708208 ]]),
 'db2': array([[-0.0229038 ],
        [-0.08899504],
        [ 0.08651581],
        [-0.05144467]]),
 'db3': array([[-0.57606185],
        [ 0.        ],
       

In [45]:
grads

{'dW1': array([[ 0.02229933,  0.02229933],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [-0.0708208 , -0.0708208 ]]),
 'dW2': array([[-0.04688101,  0.        ,  0.        , -0.0085674 ],
        [-0.18216089,  0.        ,  0.        , -0.03328949],
        [ 0.1770862 ,  0.        ,  0.        ,  0.03236211],
        [-0.10530032,  0.        ,  0.        , -0.0192434 ]]),
 'dW3': array([[-0.30453113, -0.22399288, -0.19337368, -0.17995147],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.43331269,  0.31871603,  0.27514845,  0.25605021],
        [ 0.2257063 ,  0.16601456,  0.14332084,  0.13337284]]),
 'dW4': array([[-1.49583702,  0.        , -0.33606177, -0.23855092]]),
 'db1': array([[ 0.02229933],
        [ 0.        ],
        [ 0.        ],
        [-0.0708208 ]]),
 'db2': array([[-0.0229038 ],
        [-0.08899504],
        [ 0.08651581],
        [-0.05144467]]),
 'db3': array([[-0.57606185],
        [ 0.        ],
       

# Simple Neural Network

In [258]:
def L_model(X, Y, layers_dims, layers_activations, num_epoch = 100, learning_rate = 0.05, loss = "cross_entropy", print_cost = False):
    input_dim = X.shape[0]
    layers_dims = [input_dim] + layers_dims
    
    parameters = init_parameters(layers_dims)
    
    costs = []
    
    AL = None
    
    for epoch in range(1, num_epoch + 1):
        
        AL, caches = feed_forward(X, parameters, activation_functions = layers_activations)
        
        if loss == "mean_square_error":
            cost = compute_mean_square_cost(AL, Y)
        else:
            cost = compute_cross_entropy_cost(AL, Y)
        costs.append(cost)
        
        grads = back_propagation(AL, Y, caches, loss = loss)
        
        if epoch % 50 == 0:
            diff, gradaprox = gradient_check(X, Y, parameters, grads, layers_activations)
            # print(gradaprox)
            # print(grads)
        
        parameters = update_parameters(parameters, grads, lr = learning_rate)
        
        if print_cost == True and epoch % 10 == 0:
            print("Epoch " + str(epoch) + " cost: " + str(cost))
    
    return AL, parameters

In [517]:
def L_model_mini_batch(X, Y, layers_dims, layers_activations, batch_size = 512, num_epoch = 100, learning_rate = 0.05, loss = "cross_entropy", print_cost = False):
    input_dim = X.shape[0]
    layers_dims = [input_dim] + layers_dims
    
    parameters = init_parameters(layers_dims)
    
    costs = []
    
    AL = None
    
    for epoch in range(1, num_epoch + 1):
        mini_batches = random_mini_batches(X, Y, mini_batch_size = batch_size)
        
        for i in range(len(mini_batches)):
            mini_batch = mini_batches[i]
        
            AL, caches = feed_forward(mini_batch[0], parameters, activation_functions = layers_activations)

            if loss == "mean_square_error":
                cost = compute_mean_square_cost(AL, mini_batch[1])
            else:
                cost = compute_cross_entropy_cost(AL, mini_batch[1])
            costs.append(cost)

            grads = back_propagation(AL, mini_batch[1], caches, loss = loss)

            if epoch % 1000 == 0:
                diff, gradaprox = gradient_check(mini_batch[0], mini_batch[1], parameters, grads, layers_activations)
                
                # print(gradaprox)
                # print(grads)

            parameters = update_parameters(parameters, grads, lr = learning_rate)

        if print_cost == True and epoch % 1000 == 0:
            print("Epoch " + str(epoch) + " cost: " + str(cost))
                
    
    return AL, parameters

In [455]:
def random_mini_batches(X, Y, mini_batch_size = 2, seed = 3):
    np.random.seed(seed)
    
    m = X.shape[1]
    
    mini_batches = []
    
    # Shuffle
    permutation = list(np.random.permutation(m))
    shuffle_X = X[:, permutation]
    shuffle_Y = Y[:, permutation]
    
    # Collect Mini Batch
    num_complete_minibatches = math.floor(m / mini_batch_size)
    
    for i in range(num_complete_minibatches):
        mini_batch_X = shuffle_X[:, i * mini_batch_size:(i + 1) * mini_batch_size]
        mini_batch_Y = shuffle_Y[:, i * mini_batch_size:(i + 1) * mini_batch_size]
        
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
    
    if m % mini_batch_size != 0:
        mini_batch_X = shuffle_X[:, num_complete_minibatches * mini_batch_size:]
        mini_batch_Y = shuffle_Y[:, num_complete_minibatches * mini_batch_size:]
        
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)
        
    return mini_batches

In [378]:
def update_parameters(parameters, grads, lr = 0.01):
    L = len(parameters) // 2 + 1
    
    for i in range(1, L):
        parameters["W" + str(i)] -= lr * grads["dW" + str(i)]
        parameters["b" + str(i)] -= lr * grads["db" + str(i)]
        
    return parameters

# Momentum

In [379]:
def initialize_velocity(parameters):
    
    L = len(parameters) // 2
    
    v = {}
    
    for i in range(1, L + 1):
        v["dW" + str(i)] = np.zeros(parameters["W" + str(i)].shape)
        v["db" + str(i)] = np.zeros(parameters["b" + str(i)].shape)
    
    return v

In [361]:
initialize_velocity(parameters)

{'dW1': array([[ 0.,  0.],
        [ 0.,  0.],
        [ 0.,  0.],
        [ 0.,  0.]]),
 'dW2': array([[ 0.,  0.,  0.,  0.]]),
 'db1': array([[ 0.],
        [ 0.],
        [ 0.],
        [ 0.]]),
 'db2': array([[ 0.]])}

In [362]:
def update_parameters_with_momentum(parameters, grads, v, beta = 0.9, lr = 0.05):
    L = len(parameters) // 2
    
    for i in range(1, L + 1):
        v["dW" + str(i)] = beta * v["dW" + str(i)] + (1 - beta) * grads["dW" + str(i)]
        v["db" + str(i)] = beta * v["db" + str(i)] + (1 - beta) * grads["db" + str(i)]
        
        parameters["W" + str(i)] -= lr *  v["dW" + str(i)]
        parameters["b" + str(i)] -= lr *  v["db" + str(i)]
    
    return parameters, v

In [516]:
def L_model_momentum(X, Y, layers_dims, layers_activations, batch_size = 512, num_epoch = 100, learning_rate = 0.05, beta = 0, loss = "cross_entropy", print_cost = False):
    input_dim = X.shape[0]
    layers_dims = [input_dim] + layers_dims
    
    parameters = init_parameters(layers_dims)
    v = initialize_velocity(parameters)
    seed = 0
    
    costs = []
    
    AL = None
    
    for epoch in range(1, num_epoch + 1):
        seed += 1
        mini_batches = random_mini_batches(X, Y, mini_batch_size = batch_size, seed = seed)
        
        for mini_batch in mini_batches:
        
            AL, caches = feed_forward(mini_batch[0], parameters, activation_functions = layers_activations)

            if loss == "mean_square_error":
                cost = compute_mean_square_cost(AL, mini_batch[1])
            else:
                cost = compute_cross_entropy_cost(AL, mini_batch[1])
            costs.append(cost)

            grads = back_propagation(AL, mini_batch[1], caches, loss = loss)

            if epoch % 1000 == 0:
                diff, gradaprox = gradient_check(mini_batch[0], mini_batch[1], parameters, grads, layers_activations)
                
                # print(gradaprox)
                # print(grads)

            parameters, v = update_parameters_with_momentum(parameters, grads, v, lr = learning_rate, beta = beta)

        if print_cost == True and epoch % 1000 == 0:
            print("Epoch " + str(epoch) + " cost: " + str(cost))
                
    
    return AL, parameters

# Adam

In [474]:
def initialize_adam(parameters):
    
    L = len(parameters) // 2
    
    v = {}
    s = {}
    
    for i in range(1, L + 1):
        v["dW" + str(i)] = np.zeros(parameters["W" + str(i)].shape)
        v["db" + str(i)] = np.zeros(parameters["b" + str(i)].shape)
        s["dW" + str(i)] = np.zeros(parameters["W" + str(i)].shape)
        s["db" + str(i)] = np.zeros(parameters["b" + str(i)].shape)
        
    return s, v

In [475]:
initialize_adam(parameters)

({'dW1': array([[ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.]]),
  'dW2': array([[ 0.,  0.,  0.,  0.]]),
  'db1': array([[ 0.],
         [ 0.],
         [ 0.],
         [ 0.]]),
  'db2': array([[ 0.]])},
 {'dW1': array([[ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.],
         [ 0.,  0.]]),
  'dW2': array([[ 0.,  0.,  0.,  0.]]),
  'db1': array([[ 0.],
         [ 0.],
         [ 0.],
         [ 0.]]),
  'db2': array([[ 0.]])})

In [515]:
def L_model_adam(X, Y, layers_dims, layers_activations, batch_size = 512, num_epoch = 100, learning_rate = 0.05, loss = "cross_entropy", print_cost = False):
    input_dim = X.shape[0]
    layers_dims = [input_dim] + layers_dims
    
    parameters = init_parameters(layers_dims)
    s, v = initialize_adam(parameters)
    seed = 0
    t = 0
    
    costs = []
    
    AL = None
    
    for epoch in range(1, num_epoch + 1):
        seed += 1
        mini_batches = random_mini_batches(X, Y, mini_batch_size = batch_size, seed = seed)
        
        for mini_batch in mini_batches:
        
            AL, caches = feed_forward(mini_batch[0], parameters, activation_functions = layers_activations)

            if loss == "mean_square_error":
                cost = compute_mean_square_cost(AL, mini_batch[1])
            else:
                cost = compute_cross_entropy_cost(AL, mini_batch[1])
            costs.append(cost)

            grads = back_propagation(AL, mini_batch[1], caches, loss = loss)

            if epoch % 1000 == 0:
                diff, gradaprox = gradient_check(mini_batch[0], mini_batch[1], parameters, grads, layers_activations)
                
                # print(gradaprox)
                # print(grads)
            t += 1
            parameters, s, v = update_parameters_with_adam(parameters, grads, s, v, t, lr = learning_rate)

        if print_cost == True and epoch % 1000 == 0:
            print("Epoch " + str(epoch) + " cost: " + str(cost))
                
    
    return AL, parameters

In [477]:
def update_parameters_with_adam(parameters, grads, s, v, t, lr = 0.1, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8):
    
    L = len(parameters) // 2
    
    v_corrected = {}
    s_corrected = {}
    
    for i in range(1, L + 1):
        v["dW" + str(i)] = beta1 * v["dW" + str(i)] + (1 - beta1) * grads["dW" + str(i)]
        v["db" + str(i)] = beta1 * v["db" + str(i)] + (1 - beta1) * grads["db" + str(i)]
        
        v_corrected["dW" + str(i)] = v["dW" + str(i)] / (1 - np.power(beta1, t))
        v_corrected["db" + str(i)] = v["db" + str(i)] / (1 - np.power(beta1, t))
        
        s["dW" + str(i)] = beta2 * s["dW" + str(i)] + (1 - beta2) * np.power(grads["dW" + str(i)], 2)
        s["db" + str(i)] = beta2 * s["db" + str(i)] + (1 - beta2) * np.power(grads["db" + str(i)], 2)
        
        s_corrected["dW" + str(i)] = s["dW" + str(i)] / (1 - np.power(beta2, t))
        s_corrected["db" + str(i)] = s["db" + str(i)] / (1 - np.power(beta2, t))
        
        parameters["W" + str(i)] -= lr * v_corrected["dW" + str(i)] / np.sqrt(s_corrected["dW" + str(i)] + epsilon)
        parameters["b" + str(i)] -= lr * v_corrected["db" + str(i)] / np.sqrt(s_corrected["db" + str(i)] + epsilon)
    
    return parameters, s, v

# Model

In [543]:
def model(X, Y, layers_dims, layers_activations, optimizer, learning_rate = 0.0007, mini_batch_size = 64, beta = 0.9,
          beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8, num_epochs = 10000, loss = "cross_entropy", print_cost = True):
    input_dim = X.shape[0]
    layers_dims = [input_dim] + layers_dims
    
    parameters = init_parameters(layers_dims)
    
    if optimizer == 'gd':
        pass
    elif optimizer == 'adam':
        s, v = initialize_adam(parameters)
    elif optimizer == 'momentum':
        v = initialize_velocity(parameters)
        
    seed = 0
    t = 0
    
    costs = []
    
    AL = None
    
    for epoch in range(1, num_epochs + 1):
        seed += 1
        mini_batches = random_mini_batches(X, Y, mini_batch_size = mini_batch_size, seed = seed)
        
        for mini_batch in mini_batches:
            (mini_batch_X, mini_batch_Y) = mini_batch 
            AL, caches = feed_forward(mini_batch_X, parameters, activation_functions = layers_activations)

            if loss == "mean_square_error":
                cost = compute_mean_square_cost(AL, mini_batch_Y)
            else:
                cost = compute_cross_entropy_cost(AL, mini_batch_Y)
            costs.append(cost)

            grads = back_propagation(AL, mini_batch_Y, caches, loss = loss)

            if epoch % 1000 == 0:
                diff, gradaprox = gradient_check(mini_batch_X, mini_batch_Y, parameters, grads, layers_activations)
                
                # print(gradaprox)
                # print(grads)
            if optimizer == 'gd':
                parameters = update_parameters(parameters, grads, lr = learning_rate)
            elif optimizer == 'adam':
                t += 1
                parameters, s, v = update_parameters_with_adam(parameters, grads, s, v, t, lr = learning_rate, beta1 = beta1, beta2 = beta2)
            elif optimizer == 'momentum':
                parameters, v = update_parameters_with_momentum(parameters, grads, v, lr = learning_rate, beta = beta)

        if print_cost == True and epoch % 1000 == 0:
            print("Epoch " + str(epoch) + " cost: " + str(cost))
    
    return parameters

# XOR

In [533]:
X = np.array([
    [1, 0, 0, 1],
    [0, 1, 0, 1]
])

Y = np.array([
    [0, 1, 1, 0]
])
AL, parameters = L_model_momentum(X, Y, [4, 4, 1], ["sigmoid", "sigmoid", "sigmoid"], num_epoch = 1000, batch_size = 2, learning_rate = 0.05, beta = 0.1, print_cost = True)

[92mYour backward propagation works perfectly fine! difference = 3.55532248895e-09[0m
[92mYour backward propagation works perfectly fine! difference = 2.48251672928e-09[0m
Epoch 1000 cost: 0.27007980868


In [534]:
AL, caches = feed_forward(X, parameters, activation_functions = ["sigmoid", "sigmoid", "sigmoid"])

# Test 

In [535]:
def load_dataset():
    np.random.seed(3)
    train_X, train_Y = sklearn.datasets.make_moons(n_samples=300, noise=.2) #300 #0.2 
    # Visualize the data
    plt.scatter(train_X[:, 0], train_X[:, 1], c=train_Y, s=40, cmap=plt.cm.Spectral);
    train_X = train_X.T
    train_Y = train_Y.reshape((1, train_Y.shape[0]))
    
    return train_X, train_Y

In [444]:
train_X, train_Y = load_dataset()

In [445]:
train_X.shape

(2, 300)

In [446]:
train_Y.shape

(1, 300)

In [539]:
# AL, parameters = L_model(train_X, train_Y, [4, 1], ["relu", "sigmoid"], num_epoch = 9000, learning_rate = 0.0007, print_cost = True)

In [540]:
# AL, parameters = L_model_mini_batch(train_X, train_Y, [4, 1], ["relu", "sigmoid"], batch_size = 64, num_epoch = 5000, learning_rate = 0.0007, print_cost = True)

In [541]:
# AL, parameters = L_model_momentum(train_X, train_Y, [4, 1], ["relu", "sigmoid"], batch_size = 64, num_epoch = 5000, beta = 0.0007, learning_rate = 1.3, print_cost = True)

In [542]:
# AL, parameters = L_model_adam(train_X, train_Y, [4, 1], ["relu", "sigmoid"], batch_size = 64, num_epoch = 5000, learning_rate = 0.0007, print_cost = True)

In [549]:
predict(train_X, train_Y, parameters)

Accuracy: 0.863333333333


array([[0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
        1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
        0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
        0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
        0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
        1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
        1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
        0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
        0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
        1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 

In [488]:
def predict(X, y, parameters):
    """
    This function is used to predict the results of a  n-layer neural network.
    
    Arguments:
    X -- data set of examples you would like to label
    parameters -- parameters of the trained model
    
    Returns:
    p -- predictions for the given dataset X
    """
    
    m = X.shape[1]
    p = np.zeros((1,m), dtype = np.int)
    
    # Forward propagation
    a3, caches = feed_forward(X, parameters)
    
    # convert probas to 0/1 predictions
    for i in range(0, a3.shape[1]):
        if a3[0,i] > 0.5:
            p[0,i] = 1
        else:
            p[0,i] = 0

    # print results

    #print ("predictions: " + str(p[0,:]))
    #print ("true labels: " + str(y[0,:]))
    print("Accuracy: "  + str(np.mean((p[0,:] == y[0,:]))))
    
    return p

In [548]:
parameters = model(train_X, train_Y, [4, 1], ["relu", "sigmoid"], "momentum")

[92mYour backward propagation works perfectly fine! difference = 2.79096398114e-09[0m
[92mYour backward propagation works perfectly fine! difference = 4.08622796797e-09[0m
[92mYour backward propagation works perfectly fine! difference = 3.8423809087e-09[0m
[92mYour backward propagation works perfectly fine! difference = 4.40647394978e-09[0m
[92mYour backward propagation works perfectly fine! difference = 3.57256786281e-09[0m
Epoch 1000 cost: 0.538993036609
[92mYour backward propagation works perfectly fine! difference = 2.12997664622e-09[0m
[92mYour backward propagation works perfectly fine! difference = 4.93531755191e-09[0m
[92mYour backward propagation works perfectly fine! difference = 2.00534668895e-09[0m
[92mYour backward propagation works perfectly fine! difference = 2.46462078201e-09[0m
[92mYour backward propagation works perfectly fine! difference = 5.38237644194e-09[0m
Epoch 2000 cost: 0.458676724502
[92mYour backward propagation works perfectly fine! diff