In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Creation of dataset


In [2]:
nb_examples = 4

X_train = np.random.randint(1, 100, size = (nb_examples,1)).T
X_test = np.random.randint(1, 100, size = (nb_examples,1)).T

Y_train = np.round(X_train / 100).astype(int)
Y_test = np.round(X_test / 100).astype(int)


In [45]:
print(Y_train.shape)
print(X_train)

(1, 4)
[[84 69 66 62]]


## Coding the helper functions

In [4]:
def sigmoid(Z):
    cache = Z
    A = 1 / (1 + np.exp(- Z))
    return A, cache

In [5]:
def relu(Z):
    cache = Z
    A = np.maximum(0, Z)
    return A, cache

In [6]:
def leaky_relu(Z):
    cache = Z
    A = np.maximum(0.01 ,Z)
    return A, cache

In [7]:
def tanh(Z):
    cache = Z
    A = np.tanh(Z)
    return A, cache

In [32]:
def initialize(layers_dims):
    
    params = {}
    L = len(layers_dims)
    
    for l in range(1, L):
        params[f'W{l}'] = np.random.randn(layers_dims[l], layers_dims[l-1])*0.01
        params[f'b{l}'] = np.zeros((layers_dims[l], 1))
    
    return params

In [33]:
layers_dims = [1,4,7,4,1]
params = initialize(layers_dims)


In [34]:
def linear_forward(A_prev, W, b):
    
    Z = np.dot(W, A_prev) + b
    
    assert (Z.shape == (W.shape[0], A_prev.shape[1]))
    
    cache = (A_prev, W, b)
    
    return Z, cache
    

In [35]:
# result = linear_forward(X_train, params['W1'], params['b1'])


In [36]:
# print(result[0].shape)

In [37]:
def activation_forward(A_prev, W, b, activation):
    
    
    Z, linear_cache = linear_forward(A_prev, W, b)
    
    if activation == 'sigmoid':
        A, activation_cache = sigmoid(Z)
    elif activation == 'relu':
        A, activation_cache = relu(Z)
    elif activation == 'leaky_relu':
        A, activation_cache = leaky_relu(Z)
    elif activation == 'tanh':        
        A, activation_cache = tanh(Z)
    else:
        raise Exception(f'{activation} function is not known ')
    
    assert(A.shape == (W.shape[0], A_prev.shape[1]))
    
    cache = (linear_cache, activation_cache)
    
    return A, cache

In [38]:
res2 = activation_forward(X_train, params['W1'], params['b1'], activation = 'sigmoid')

In [39]:
print(res2[0].shape)

(4, 4)


In [40]:
def L_model_forward(X, params):
    
    L = len(params) // 2
    A = X
    
    caches = []
    
    # forward prop for the L-1 layers
    for l in range(1, L):
        A_prev = A
        A, cache = activation_forward(A_prev, params[f'W{l}'], params[f'b{l}'], activation='relu') 
        caches.append(cache)
    
    #forward prop for the L layer (output layer)
    AL, cache = activation_forward(A, params[f'W{L}'], params[f'b{L}'], activation = 'sigmoid')
    caches.append(cache)

    assert (AL.shape == (1, X.shape[1]))
    
    return AL, caches

In [41]:
AL, caches = L_model_forward(X_train, params)
print(AL)

[[0.49999985 0.49999988 0.49999988 0.49999989]]


In [42]:
def cost_function(AL, Y):
    
    m = Y.shape[1]
    
    cost = - (Y * np.log(AL).T + (1 - Y) * np.log(1 - AL).T) / m
    cost = np.squeeze(cost)
    
    return cost

In [43]:
test = cost_function(AL, Y_train)

In [44]:
print(test)

[[0.17328687 0.17328687 0.17328687 0.17328687]
 [0.17328686 0.17328686 0.17328686 0.17328686]
 [0.17328685 0.17328685 0.17328685 0.17328685]
 [0.17328685 0.17328685 0.17328685 0.17328685]]


## Backward propagation helper functions


In [21]:
dAL = -(np.divide(Y_train,AL) - np.divide(1-Y_train, 1 - AL))
dAL

  """Entry point for launching an IPython kernel.


array([[nan, nan, nan, nan]])

In [22]:
def sigmoid_backward(dA, Z):
     
    s, _ = sigmoid(Z)
    dZ = dA * s * (1 - s) 
    
    return dZ

In [23]:
def relu_backward(dA, Z):
    
    dZ = np.array(dA, copy = True)
    dZ[Z <= 0] = 0
    return dZ

In [24]:
def leaky_relu_backward(dA, Z):
    
    dZ = np.array(dA, copy = True)
    dZ[Z <= 0] = 0.01
    return dZ

In [25]:
def tanh_backward(dA, Z):
    
    dZ = dA * (1 - tanh(Z))
    return dZ

In [26]:
def linear_backward(dZ, cache):
    
    A_prev, W, b = cache    
    m = A_prev.shape[1]
    
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis = 1, keepdims= True) / m
    dA_prev = np.dot(W.T, dZ)
    
    assert(dW.shape == W.shape)
    assert(db.shape == b.shape)
    assert(dA_prev.shape == A_prev.shape)
    
    return dA_prev, dW, db

In [27]:
def linear_activation_backward(dA, cache, activation):
    
    linear_cache, activation_cache = cache
    Z = activation_cache
    
    if activation == 'relu':
        dZ = relu_backward(dA, Z)
    elif activation == 'sigmoid':
        dZ = sigmoid_backward(dA, Z)
    elif activation == 'tanh':
        dZ = tanh_backward(dA, Z)
    elif activation == 'leaky_relu':
        dZ = leaky_relu_backward(dA, Z)
    else:
        raise Exception (f'{activation} is not a valid activation name' )
    
    dA_prev, dW, db = linear_backward(dZ, linear_cache)
    
    return dA_prev, dW, db

## Backward pass

In [28]:
def L_model_backward(AL, Y, caches):
    
    
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    
    # Initializing the backpropagation
    dAL = -(np.divide(Y_train,AL) - np.divide(1-Y_train, 1 - AL))
    
    # Lth layer (SIGMOID -> LINEAR) gradients. Inputs: "AL, Y, caches". Outputs: "grads["dAL"], grads["dWL"], grads["dbL"]    
    current_cache = caches[L-1]
    grads[f'dA{L-1}'], grads[f'dW{L}'], grads[f'db{L}'] = linear_activation_backward(dAL, current_cache, activation = 'sigmoid')
    
    # lth layer: (RELU -> LINEAR) gradients.
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads['dA' + str(l)], grads['dW' + str(l+1)], grads['db' + str(l+1)] = linear_activation_backward(grads["dA" + str(l + 1)],current_cache,'relu')
    
    return grads

In [29]:
gradient_descent = L_model_backward(AL, Y_train, caches)

  # Remove the CWD from sys.path while we load stuff.


## Update des parameters


In [30]:
def update_parameters(params, grads, learning_rate):
    
    L = len(params) // 2
    for l in range(1, L+1):
        params[f'W{l}'] = params[f'W{l}'] - learning_rate * grads[f'dW{l}']
        params[f'b{l}'] = params[f'b{l}'] - learning_rate * grads[f'db{l}']
    
    return params

In [31]:

update_parameters(params,gradient_descent,learning_rate =12)

{'W1': array([[-1.0379718 ],
        [        nan],
        [        nan],
        [-1.18593281]]), 'b1': array([[ 0.],
        [nan],
        [nan],
        [ 0.]]), 'W2': array([[-0.68763151,  0.4657133 , -1.39582825,  0.30582205],
        [-0.71919882,  0.35453772, -0.57514567,  1.85453257],
        [        nan,         nan,         nan,         nan],
        [ 0.53103639,  2.33771511, -1.29249613, -0.92938368],
        [        nan,         nan,         nan,         nan],
        [        nan,         nan,         nan,         nan],
        [-0.76815344, -0.45681006, -1.26407125, -1.03306349]]), 'b2': array([[ 0.],
        [ 0.],
        [nan],
        [ 0.],
        [nan],
        [nan],
        [ 0.]]), 'W3': array([[ 1.33061685, -0.45413524, -1.01288395, -0.3806624 , -0.28879495,
          0.42639543,  1.217032  ],
        [-1.27744089,  0.5274981 ,  0.44201899,  0.99633391, -1.25046003,
         -0.08221519,  0.10830077],
        [        nan,         nan,         nan,        