In [8]:
import numpy as np
import math

# Creating Parameters for the Neural Net

In [9]:
def initialize_parameters_deep(layer_dims):
    
    # input for the initializer can as [5, 4, 2]
    # first layer weight dimensions can be (4, 5) and bias dimensions can be (4, 1)
    # second layer weight dimensions can be (2, 4) and bias dimensions can be (2, 1)
    # DO UNDERSTAND THAT THE FIRST VALUE OF THE PARAMETER CRORRESPONDS TO THE NUMBER OF FEATURES
    np.random.seed(1)
    parameters = {}
    
    # number of layers in a neural network
    L = len(layer_dims)
    
    for l in range(1, L):
        parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1])
        parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))
        
    return parameters

# Creating Mini-Batches

In [10]:
def random_mini_batches(X, y, mini_batch_size = 10, seed = 0):
    
    np.random.seed(seed)
    m = X.shape[1] # 150
    mini_batches = []
    
    # Shuffling the 150 examples
    permutation  = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]                    # contains 150 shuffled training observations
    shuffled_y = y[:, permutation].reshape((1, m))    # contains corresponding 150 labels for the trainin observations
    
    # Creation of mini batches with proper set of elements
    num_complete_minibatches = math.floor(m / mini_batch_size)
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k*mini_batch_size : (k+1)*mini_batch_size]
        mini_batch_y = shuffled_y[:, k*mini_batch_size : (k+1)*mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_y)
        mini_batches.append(mini_batch)
    
    # Creation of a batch containing rest of the elements
    # "num_complete_minibatches * mini_batch_size" contains all the proper number of elements
    # "num_complete_minibatches * mini_batch_size : m" contains rest of the elements 
    if m % mini_batch_size != 0:
        mini_batch_X = shuffled_X[:, num_complete_minibatches*mini_batch_size : m]
        mini_batch_y = shuffled_y[:, num_complete_minibatches*mini_batch_size : m]
        mini_batch = (mini_batch_X, mini_batch_y)
        mini_batches.append(mini_batch)
    
    return mini_batches

In [11]:
from sklearn.datasets import load_iris

In [12]:
iris = load_iris()
X = iris.data
y = iris.target
y = y.reshape(1, y.shape[0])
X = X.T

In [13]:
print(X.shape)

(4, 150)


In [14]:
# Establishing the mini-batch
mini_batches = random_mini_batches(X, y, 9)

In [15]:
print(X.shape[0])

4


In [16]:
perm = np.random.permutation(X.shape[1])
X_new = X[:, perm]

In [17]:
# Checking the size of each mini-batch
# Each mini batch can now be used seperately for gradient descent
num_mini_batches = X.shape[1] // 9
rest_batch = X.shape[1] - num_mini_batches * 9
for i in range(num_mini_batches):
    print(mini_batches[i][0].shape)
    
print("(4, " + str(rest_batch) + ")")

(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 9)
(4, 6)


In [18]:
print(X_new)

[[ 5.8  6.9  7.4  6.   5.3  6.8  7.7  6.1  5.   5.6  4.4  6.7  6.1  6.7
   4.6  6.2  4.6  6.7  4.6  6.2  5.   5.7  6.7  6.6  5.8  4.8  5.2  7.6
   5.5  6.   6.9  7.9  5.7  5.1  4.8  6.3  5.6  6.3  5.   6.   6.4  5.1
   4.9  4.4  7.7  5.5  7.3  5.5  5.1  5.5  6.8  5.4  4.9  7.1  5.7  4.9
   6.4  5.5  5.1  6.1  5.   4.8  7.2  6.3  6.3  6.9  5.6  7.   6.1  5.7
   6.2  5.8  4.9  5.1  6.2  5.8  4.7  5.5  5.4  5.1  6.4  5.5  5.1  5.2
   4.7  5.2  4.9  6.4  5.4  5.4  6.4  6.1  5.   6.3  5.6  6.7  6.7  5.   6.8
   4.6  5.8  6.7  6.5  5.6  5.1  6.5  4.9  5.   4.4  4.8  6.4  6.5  5.   5.7
   4.5  5.7  4.8  6.3  6.1  4.3  5.4  6.   5.9  6.4  7.2  6.3  6.3  5.2
   6.5  6.   7.7  7.7  6.   5.   6.5  5.4  6.6  7.2  5.6  6.7  6.9  5.   5.7
   5.8  5.9  5.8  5.9  5.7  5.1  6.3]
 [ 2.6  3.1  2.8  2.2  3.7  3.2  2.8  2.9  3.4  2.9  3.2  2.5  3.   3.   3.6
   3.4  3.4  3.1  3.2  2.2  2.   4.4  3.3  2.9  2.7  3.   2.7  3.   2.5
   2.9  3.1  3.8  2.5  2.5  3.1  3.4  3.   2.8  3.3  2.2  2.9  3.5  2.5
   2.9

In [19]:
print(y.shape)

(1, 150)


# Normal Gradient Descent

In [57]:
def update_parameters_with_gd(parameters, grads, learning_rate):
    
    L = len(parameters) // 2
    
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
        
    return parameters

# Momentum Optimization

In [58]:
def initialize_velocity(parameters):
    
    # number of layers in a neural network
    L = len(parameters) // 2
    v = {}
    
    # So we are trying to create the formula for
    # V[dW] = beta * V[dW] + (1 - beta) * dW
    # V[db] = beta * V[db] + (1 - beta) * db
    
    # initialize the velocity with zeros
    for l in range(L):
        v["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
        v["db" + str(l+1)] = np.zeros(parametes["b"] + str(l+1).shape)
    
    return v

In [59]:
def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    
    # number of layers in the neural network
    L = len(parameters) // 2
    
    # Finally implementing the formula
    for l in range(L):
        # computing velocities
        v["dW" + str(l+1)] = beta * v["dW" + str(l+1)] + (1 - beta) * grads["dW" + str(l+1)]
        v["db" + str(l+1)] = beta * v["db" + str(l+1)] + (1 - beta) * grad["db" + str(l+1)]
        
        # update parameters
        # optimizing weights
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * v["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * v["db" + str(l+1)]
        
    return parameters, v
        

# Adam Optimization

In [60]:
def initialize_adam(parameters):
    
    # Declaring the parameters
    L = len(parameters) # number of layers in a neural net
    v = {}
    s = {}
    
    # Initially all values is eaqual to zero
    for l in range(L):
        v["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
        v["db" + str(l+1)] = np.zeros(parameters["b" + str(l+1)].shape)
        s["dW" + str(l+1)] = np.zeros(parameters["W" + str(l+1)].shape)
        s["db" + str(l+1)] = np.zeros(parameters["b" + str(l+1)].shape)
        
    return v,s

In [61]:
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate = 0.01, beta1 = 0.9, beata2 = 0.999, epsilon = 1e-8):
    
    L = len(parameters) // 2
    v_corrected = {}
    s_corrected = {}
    
    for l in range(L):
        # Moving average of the gradients
        v["dW" + str(l+1)] = beta1 * v["dW" + str(l+1)] + (1 - beta1) * grads["dW" + str(l+1)]
        v["db" + str(l+1)] = beta1 * v["db" + str(l+1)] + (1 - beta1) * grads["db" + str(l+1)]
        
        # Bias corrected first moment estimate
        v_corrected["dW" + str(l+1)] = v["dW" + str(l+1)] / (1 - beta1 ** t)
        v_corrected["db" + str(l+1)] = v["db" + str(l+1)] / (1 - beta1 ** t)
        
        # Moving average of the squared gradients
        s["dW" + str(l+1)] = s["dW" + str(l+1)] + (1 - beta2) * (grads["dW" + str(l+1)] ** 2)
        s["db" + str(l+1)] = s["db" + str(l+1)] + (1 - beta2) * (grads["db" + str(l+1)] ** 2)
        
        # Bias corrected second moment estimate
        s_corrected["dW" + str(l+1)] = s["dW" + str(l+1)] / (1 - beta2 ** t)
        s_corrected["db" + str(l+1)] = s["db" + str(l+1)] / (1 - beta2 ** t)
        
        # Updating parameters
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * v_corrected["dW" + str(l+1)] / np.sqrt(s_corrected["dW" + str(l+1)] + epsilon)
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * v_corrected["db" + str(l+1)] / np.sqrt(s_corrected["db" + str(l+1)] + epsilon)
        
    return parameters, v, s

# The Bread and Butter

In [62]:
def forward_propagation(X, parameters):
    
    # Customizable depending on the type of model to be created
    # Retrieve each parameter from the dictionary "parameters"
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    #Implement forward propagation
    Z1 = np.dot(W1, X) + b1
    A1 = np.tanh(Z1) # IMPLEMENT AN ACTIVATION FUNCTION DU IDIOT
    Z2 = np.dot(W2, A1) + b2
    A2 = sigmoid(Z2) # IMPLEMENT FINAL LAYER ACTIVATION FUNCTION
    
    cache = {"Z1" : Z1,
             "A1" : A1,
             "Z2" : Z2,
             "A2" : A2}
    
    return A2, cache

In [63]:
def backward_propagation(parameters, cache, X, y):
    
    # Retrieving parameters
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    
    # Retriving values from cache
    A1 = cache["A1"]
    A2 = cache["A2"]
    
    # Other parameters
    m = y.shape[1]
    
    # Implementing backprop
    dZ2 = A2 - y
    dW2 = (1 / m) * np.dot(dZ2, A2.T) # + Regularization            Regularization can be done here
    db2 = (1 / m) * np.sum(dZ2, axis = 1, keepdims = True)
    dZ1 = np.multiply(np.dot(W2.T, dZ2), (1 - np.power(A1, 2)))   # second parameter of np.multiply is the derivative of the activation function
    dW1 = (1 / m) * np.dot(dZ1, X.T) # + Regularization             A1 is basically np.tanh(Z1), so it is like (1 - np.power(np.tanh(x), 2))
    db1 = (1 / m) * np.sum(dZ1, axis = 1, keepdims = True)        # derivative of tanh(x) = 1 - tanh^2(x)
    
    grads = {"dW1" : dW1,
             "db1" : db1,
             "dW2" : dW2,
             "db2" : db2}
    
    return grads

In [64]:
def compute_cost(A, y):
    
    # sigmoid cost function for binary classification
    m = y.shape[1]
    logprobs = np.multiply(np.log(A), y) + np.multiply((1 - y), np.log(1 - A))
    cost = -(np.sum(logprobs) / m)
    
    cost = np.squeeze(cost)
    
    return cost

In [67]:
def model(X, y, layer_dims, optimizer, learning_rate = 0.0007, mini_batch_size = 64, beta = 0.9, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8, num_epochs = 10000, print_cost = True):
    
    L = len(layer_dims)    # number of layers in the networks
    cost = []              # appending cost each time
    t = 0                  # counter for adam update
    seed = 10              # initialize to a random value for mini_batch creation
    
    # Initialize parameters
    parameters = initialize_parameters_deep(layer_dims)
    
    # Initialize the optimizer
    if optimizer == "gd":
        pass
    elif optimizer == "momentum":
        v = initialize_velocity(parameters)
    elif optimizer == "adam":
        v, s = initialize_adam(parameters)
        
    #Optimization loop
    for i in range(num_epochs):
        
        # incrementing seed value everytime will ensure randomization of 
        seed = seed + 1
        minibatches = random_mini_batches(X, y, mini_batch_size, seed)
        
        # selecting a minibatch
        for minibatch in minibatches:
            
            # segregating data and target values in the minibatch
            (minibatch_X, minibatch_y) = minibatch
            
            # forward propagation
            a3, cache = forward_propagation(minibatch_X, parameters)
            
            # compute cost
            cost = compute_cost(a3, minibatch_y)
            
            # backward propagation
            grads = backward_propagation(parameters, cache, minibatch_X, minibatch_y)
            
            # update parameters
            if optimizer == "gd":
                parmeters = update_parameters_with_gd(parameters, grads, learning_rate)
            elif optimizer == "momentum":
                parameters == update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            elif optimizer == "adam":
                t = t + 1 # Adam counter
                parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t, learning_rate, beta1, beta2, epsilon)
                
        
        # Print the cost every 1000 epochs
        if print_cost and i % 1000 == 0:
            print("Cost after epoch %i: %f" %(i, cost))
        if print_cost and i % 100 == 0:
            costs.append(cost)
            
    # plot the cost
    plt.plot(costs)
    plt.ylabel("cost")
    plt.xlaber("epoch per 100")
    plt.show()
                
            
            

# Mini-Batch Gradient Descent

In [68]:
# Training our model
# X.shape[0] = 4
layer_dims = [X.shape[0], 5, 2, 1]                        # 4-W1-5-W2-2-W3-1
parameters = model(X, y, layer_dims, optimizer = "gd")

# Predicting
predictions = predict(train_X, train_y, parameters)

ValueError: operands could not be broadcast together with shapes (2,5) (2,2) 

In [34]:
def sigmoid(X):
    return 1 / (1 + np.exp(-X))