status : Add Weight Init to Deep - L - Layer model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import time

In [2]:
"""
Error & Updating
"""


def binary_cross_entropy(a, y):
    return -((y * np.log(a)) + ((1 - y) * np.log(1 - a)))

def update_param(param,dparam,lr):
    pass

"""
Activation Function
"""


def tanh(z):
    return (np.exp(z) - np.exp(-z)) / (np.exp(z) + np.exp(-z))


def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def ReLU(z):
    return np.where(z >= 0, z, 0)


def LeakyReLU(z: float):
    return np.where(z >= 0, z, 0.01 * z)


"""
Derivative of Activation Function wrp. Z
"""


def dReLU(z: float):
    return np.where(z >= 0, 1, 0)


def dLeakyReLU(z: float):
    return np.where(z >= 0, 1, 0.01)


def dsigmoid(z: float):
    a = sigmoid(z)
    return a*(1 - a)


def dTanh(z: float):
    a = tanh(z)
    return 1 - a ** 2


"""
For Bi-Deep L layer Classification
"""


def cut_off_threshold(A, thr):
    return np.where(A >= thr, 1, 0)

In [3]:
def plot_decision_boundary(model, X, y):
    # Set min and max values and give it some padding
    x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
    y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = model(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
    plt.show()
    
    
def predict_dec(param, X):
    """
    Used for plotting decision boundary.
    
    Arguments:
    param -- python dictionary containing your param 
    X -- input data of size (m, K)
    
    Returns
    predictions -- vector of predictions of our model (red: 0 / blue: 1)
    """
    
    # Predict using forward propagation and a classification threshold of 0.5
    a3, cache = L_model_forward(X, param)
    predictions = (a3>0.5)
    return predictions

---

In [4]:
"""
Initiate parameter
"""


def initiate_param(layer_dims,initialization = 'random',seed:int=42):
    """Initiate the paramaters W, B for each layer
    
    Arguments
    ----------
        layer_dims : list
            A sequence of number of units for every layer 
        initialization : str, optional
            A technique of weight initialization (default:random)
        seed : int, optional 
            A seed for randomize the initialization
        
    
    Returns
    ----------
        param : numpy.array
            Array of parameter of every layer 
    
    """
    if seed:
        np.random.seed(seed)
    
    # 4 Options of weight initializating
    if initialization == 'zero':
        param = initialization_zero(layer_dims)
    elif initialization == 'random':
        param = initialization_random(layer_dims)
    elif initialization == 'He':
        param = initialization_he(layer_dims)
    elif initialization == 'Xavier':    
        param = initialization_xavier(layer_dims)
    else: #default : random
        print(f'''There is no weight initialization called "{initialization}"
              switch to default initialization random
              ''')
        param = initialization_random(layer_dims)
        
    return param


In [5]:
def initialization_zero(layer_dims:list):
    """Initialize both weight and bias as zeros
    
    Arguments
    ----------
    layer_dims : int
        A sequence of number of units for every layer 
    
    Returns
    ----------
    param : 
        Array of parameter of every layer 
    """
    
    L = len(layer_dims) - 1  #Exclude input layer to calculating L
    param = {}
    
    for l in range(1,L+1):
        param["W" + str(l)] = np.zeros(shape=(layer_dims[l], layer_dims[l-1])) * 0.01 # Uniform(0,1] * 0.01
        param["b" + str(l)] = np.zeros(shape=(layer_dims[l], 1))
        
        assert(param['W' + str(l)].shape == (layer_dims[l], layer_dims[l - 1]))
        assert(param['b' + str(l)].shape == (layer_dims[l], 1))
        
    return param
    

In [6]:
def initialization_random(layer_dims:list,scale:int=0.01):
    
    """Initialize weight randomly with Normal(mean=0,sigma=1)
    Initialize bias as uniform distributed ( min=0,max= <1 )
    
    Arguments
    ----------
    layer_dimss : int
        A sequence of number of units for every layer 
    scale : float, optional
        A constant to scale the weight initialization
    
    Returns
    ----------
    param : 
        Array of parameter of every layer 
    """
    L = len(layer_dims) - 1  #Exclude input layer to calculating L
    param = {}
    
    """
    scale : variance of the random variable
    y = scale * x
    var(y) = var(scale*x)
    var(y) = scale^2 * x
    """
    for l in range(1,L+1):
        param["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * scale # Normal(0,1) * scale 
        param["b" + str(l)] = np.random.rand(layer_dims[l], 1)
        
        assert(param['W' + str(l)].shape == (layer_dims[l], layer_dims[l - 1]))
        assert(param['b' + str(l)].shape == (layer_dims[l], 1))
        
    return param
    

In [7]:
def initialization_xavier(layer_dims:list):
    """
    Initialize weight randomly with Normal(mean=0,sigma=(1/fan_avg))
    Initialize bias as uniform distributed ( min=0,max= <1 )
    
    Arguments
    ----------
    layer_dimss : int
        A sequence of number of units for every layer 
    
    Returns
    ----------
    param : 
        Array of parameter of every layer 
    """
    
    L = len(layer_dims) - 1  #Exclude input layer to calculating L
    param = {}
    
    for l in range(1,L+1):
        fan_in , fan_out = layer_dims[l-1] , layer_dims[l]
        fan_avg = 1/2 * (fan_in + fan_out)
        
        param["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(1/fan_avg) 
        param["b" + str(l)] =  np.random.rand(layer_dims[l], 1)
        
        assert(param['W' + str(l)].shape == (layer_dims[l], layer_dims[l - 1]))
        assert(param['b' + str(l)].shape == (layer_dims[l], 1))
        
    return param
    

In [8]:
def initialization_he(layer_dims:list):
    """
    Initialize weight randomly with Normal(mean=0,sigma=(2/fan_in))
    Initialize bias as uniform distributed ( min=0,max= <1 )
    
    Arguments
    ----------
    layer_dimss : int
        A sequence of number of units for every layer 
    
    Returns
    ----------
    param : 
        Array of parameter of every layer 
    """
    
    L = len(layer_dims) - 1  #Exclude input layer to calculating L
    param = {}
    
    for l in range(1,L+1):
        fan_in = layer_dims[l-1]
        
        param["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * np.sqrt(2/fan_in) 
        param["b" + str(l)] =  np.random.rand(layer_dims[l], 1)
        
        assert(param['W' + str(l)].shape == (layer_dims[l], layer_dims[l - 1]))
        assert(param['b' + str(l)].shape == (layer_dims[l], 1))
        
    return param
    

In [9]:
"""
Forward Propagation Unit
"""


def linear_forward(A_prev, W, b):
    """Linear Forward unit
    
    Argument
    ----------    
    1. A_prev --- Activation node of the previous layer A[l-1]
    2. W --- Weight of layer l
    3. b --- Bias of layer l

    Return
    ----------
    1. Z --- Output Z of layer l 
    2. caches --- cache of Linear forward Unit
    """
    Z = np.dot(W, A_prev) + b
    
    assert(Z.shape == (W.shape[0], A_prev.shape[1]))
    cache = (A_prev,W,b)      # A :for dZ, W for dA & to get updating, b for updating , dA for dZ
    
    return Z, cache


def linear_activation_forward(A_prev, W, b, activation_function):
    """Linear Forward unit
    
    Argument
    ----------    
    1. A_prev --- Activation node of the previous layer A[l-1]
    2. W --- Weight of layer l
    3. b --- Bias of layer l

    Return
    ----------
    1. Z --- Output Z of layer l 
    2. caches --- cache of Linear forward Unit and Activation function
    """
    Z, linear_cache = linear_forward(A_prev, W, b)

    if activation_function == "sigmoid":
        A = sigmoid(Z)
    elif activation_function == "tanh":
        A = tanh(Z)
    elif activation_function == "ReLU":
        A = ReLU(Z)
    elif activation_function == "LeakyReLU":
        A = LeakyReLU(Z)
    elif activation_function == "linear":
        A = Z
    
    activation_cache = Z
    
    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (activation_cache, linear_cache)  # (Z, (A_prev,W,b))

    return A, cache


def L_model_forward(X, param, activation_function="ReLU", last_activation_function="sigmoid"):
    """Forward propagation model from input to output layer
       Apply parameter to the input X to return the Activation Output 
    
    Argument
    ----------    
    1. X --- Input denoted as A[0]
    2. param --- Weight and Bias of every layer
    3. activation_function --- the activation function for hidden layer (default:ReLU)
    4. last_activation_function --- the activation function for output layer (default:sigmoid)
                                    Classication : sigmoid
                                    Regression : linear

    Return
    ----------
    1. AL --- Output A[L] from the propagation (Z[L] with sigmoid activation function)
    2. caches --- the cache of every layer l 
    """

    A = X
    L = (len(param) // 2)  # param stores the weight and bias for L layer, hence len(param) = 2L

    caches = []
    
    # For Hidden Layer [1,2..,L-1]
    for l in range(1,L):  # l = 1,2,..,L-1
        A_prev = A
        W = param["W" + str(l)]
        b = param["b" + str(l)]
        A, cache = linear_activation_forward(A_prev, W, b, activation_function)
        caches.append(cache)  # append cache at layer l
    
    # For Output layer [L]

    A_prev = A
    W = param["W" + str(L)]
    b = param["b" + str(L)]
    AL, cache = linear_activation_forward(A_prev, W, b, last_activation_function)
    caches.append(cache)

    
    assert(AL.shape == (1, X.shape[1]))

    return AL, caches


In [10]:
def compute_cost(AL, Y):
    """
    Compute the cost function with respect to tAL
    cost function : Binary cross entropy
    Arguments:
    A --- predicted value from L-Forward model
    y --- actual output
    """
    
    m = Y.shape[1]
    
    loss = binary_cross_entropy(AL, Y)
    cost = np.divide(loss, m)  # No significant difference in speed when compare to '/' though
    cost = np.sum(cost, axis=1)
    
    return cost

In [11]:
"""
Backward Propagation Unit
"""


def linear_backward(dZ, cache):
    """Use dZ from the layer l to obtain dW,dB,dA_prev
    Arguments
    ----------
      dZ -- Gradient of the cost with respect to the linear output (of current layer l)
      cache -- tuple of values (Z,(A_prev, W, b)) coming from the forward propagation in the current layer (We use only linear cache anyway)

    Returns
    ----------
      dA_prev --- Gradient of the cost with respect to the activation node at the previous layer
      dW --- Gradient of the cost with the weight in this layer
      db --- Gradient of the cost with the bias in this layer
    """
    _, linear_cache = cache  # We use only linear cache
    (A_prev, W, b) = linear_cache  # We do not use b to obtain those 3 gradients

    m = dZ.shape[1]  

    dW = (1 / m) * np.dot(dZ, A_prev.T)
    db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)

    dA_prev = np.dot(W.T, dZ)
    
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    
    return dA_prev, dW, db


def linear_activation_backward(dA, cache, activation_function):
    """Input dA to find dZ, then use dZ to obtain dW,dB,dA_prev
    Arguments
    ----------
      dZ -- Gradient of the cost with respect to the linear output (of current layer l)
      cache -- tuple of values (Z,(A_prev, W, b)) coming from the forward propagation in the current layer (We use only linear cache anyway)
    
    Returns
    ----------
      dA_prev --- Gradient of the cost with respect to the activation node at the previous layer
      dW --- Gradient of the cost with the weight in this layer
      db --- Gradient of the cost with the bias in this layer
    """
    if activation_function == "ReLU":
        g_ = dReLU
    elif activation_function == "LeakyReLU":
        g_ = dLeakyReLU
    elif activation_function == "tanh":
        g_ = dTanh
    elif activation_function == "sigmoid":
        g_ = dsigmoid
    else:
        print(f"The activation function {activation_function} not found, ReLU as default")
        g_ = dReLU

    activation_cache, _ = cache  # We use only activation cache
    Z = activation_cache

    dZ = dA * g_(Z)
    dA_prev, dW, db = linear_backward(dZ, cache)
    
    return dA_prev, dW, db


def L_model_backward(AL, Y, cache, activation_function="ReLU", last_activation_function="sigmoid"):
    """
    Backward propagation model from output AL to the parameter gradient of all layers
    Apply parameter to the input X to return the Activation Output 
    
    Arguments:
    A --- A at the layer L
    y --- an actual output
    cache --- cache from the forward propagation
    activation_function --- activation function for the hidden layer
    Return:
     grads  -- A dictionary with the gradients
               grads["dA" + str(l)] = ...
               grads["dW" + str(l)] = ...
               grads["db" + str(l)] = ...
    """
    L = len(cache)  # cache for each layer
    grads = {}
    
    # For Output layer
    dAL = np.divide(1 - Y, 1 - AL) - np.divide(Y, AL)  # dA_[L] : Input for the first linear activation backward
                                                    # Loss : Binary Cross Entropy
    
    current_cache = cache[-1] 
    dA_prev, dW, db = linear_activation_backward(dAL,current_cache,last_activation_function)
    grads["dW" + str(L)] = dW
    grads["db" + str(L)] = db
    
    dA = dA_prev
    
    
    # For Hidden layer [L-1, L-2...,1]
    for l in reversed(range(1,L)): 

        current_cache = cache[l-1] 
        (activation_cache, linear_cache) = current_cache
        
        Z = activation_cache
        a_prev, W, b = linear_cache  # Start with Z_[L] , A_[L-1], W_[L], b_[L]
        
        dA_prev, dW, db = linear_activation_backward(dA, current_cache, activation_function)

        grads["dW" + str(l)] = dW
        grads["db" + str(l)] = db
        
        dA = dA_prev

    return grads

In [12]:
def update_param(param, grads, lr=1e-4):
    """Update parameter 
    Argument:
    1. param -- The current parameter (W1,W2,...,WL,b1,b2,...bL)
    2. grads -- the dictionary of gradient that was obtained from L_model_backward function
    3. lr (default=1e-4) : Learning rate
    Returns:
    1. updated_param -- The parameter that got updated
    """

    L = len(param) // 2  # number of layers in the neural network

    # Update rule for each parameter. Use a for loop.
    for l in range(1,L+1):
        param["W" + str(l)] = (param["W" + str(l)] - lr * grads["dW" + str(l)])
        param["b" + str(l)] = (param["b" + str(l)] - lr * grads["db" + str(l)])

    return param


In [13]:
a = np.array([[0.99,0.4,0.3]])
b = np.array([[1,1,1]])

compute_cost(a,b)

array([0.71010462])

In [14]:
class Binary_Deep_L_Layer:
    """
    A Deep neural network with L layers
    - Able to fit with the predictors (X) and the response (Y)
    - Able to predict_proba and predict with threshold
    To see the last fit model parameter, uses self.param where self refer to the fit model
    """

    def __init__(self, hyperparam: dict):
        """
        Launch the Deep_L_layer with the given hyperparameter
        
        Arguments:
        hyperparam: A dictionary with key:
         L --- Number of Layers (Hidden layer(s) + Output layer)
         layer_dims --- Number of units of that L layer
         lr --- Learning rate
         forward_activation_function --- Activation function for all hidden layer(s) in forward model (ReLU,LeakyReLU,tanh,sigmoid)
         last_forward_activation_function --- Activation function for all hidden layer(s) in forward model (sigmoid,linear)
          {"L" : 5,
          "layer_dims" : [nrow,8,6,4,2,1],
          "lr" : 1e-5,
          "forward_activation_function" : 'tanh' ,
          "last_forward_activation_function" : 'sigmoid' }
          
        Supported activation
        """
        self.hyperparam = hyperparam  # assume include nrow in dict

        # Explicit hyperparameter attributes
        self.L = hyperparam["L"]
        self.lr = hyperparam["lr"]
        self.forward_activation_function = hyperparam["forward_activation_function"]
        self.last_forward_activation_function = hyperparam["last_forward_activation_function"]
        
    def compiles(self, initialization = 'random' , optimizer='adam', loss='binary_cross_entropy'):
        """
        Develop soon (optional hyperparameter)
        """
        self.initialization = initialization
        self.optimizer = optimizer
        self.loss = loss
        
    def fit(
        self,
        X: pd.DataFrame,
        Y: pd.Series,
        Epochs: int = 1000,
        report_cost: bool = True,
        warmup: bool = False,
    ):
        """
        Fit the launched Deep L layer with the given data X , Y

        Arguments:
         X --- Pandas Dataframe of predictors
         Y --- Pandas Series of response (0 : negative, 1:positive)
         Epoch --- number of epochs (default : 1000)
         report_cost --- report the cost epochs every 1000 epoch
         warmup --- update param and save the parameter
        """

        ## First, we initiate the attributes

        # We turn Dataframe into Numpy format
        X = X.to_numpy().T
        Y = Y.to_numpy().T
        nrow = np.shape(X)[0]

        # Assign class attribute
        self.X = X
        self.Y = Y
        self.m = Y.shape[1]
        self.Epochs = Epochs

        self.param = initiate_param(layer_dims = self.hyperparam['layer_dims'],
                                        initialization = self.initialization)
        
        for epoch in range(self.Epochs +1):
            A, cache = L_model_forward(self.X, self.param, 
                                       activation_function=self.forward_activation_function,
                                      last_activation_function=self.last_forward_activation_function)
            #print(A)
            if (report_cost and epoch % 1000 == 0):
                cost = compute_cost(A, self.Y)
                print(f"Epoch {epoch}/{Epochs} : ===Cost=== : {np.squeeze(cost)}")

            grads = L_model_backward(A, self.Y, cache, 
                                     self.forward_activation_function,
                                    self.last_forward_activation_function)
            
            self.param = update_param(self.param, grads, lr=self.lr)
            
    def predict_proba(self, X: pd.DataFrame):
        """
        Predict probability of the observation given input X

        Arguments:
         X --- Pandas Dataframe or Series of predictors
        """
        X = X.to_numpy().T

        A_prob, _ = L_model_forward(X, self.param, 
                                    activation_function=self.forward_activation_function,
                                    last_activation_function=self.last_forward_activation_function
                                   )

        return A_prob

    def predict(self, X, threshold: float = 0.5,predict_proba=False):
        """
        Predict the observation given input X

        Arguments:
         X --- Pandas Dataframe or Series of predictors
        """
        
        A_prob, _ = L_model_forward(X, self.param, 
                            activation_function=self.forward_activation_function,
                            last_activation_function=self.last_forward_activation_function
                           )
        
        
        if not predict_proba:
            A_pred = cut_off_threshold(A_prob, threshold)
        else:
            A_pred = A_prob
        return A_pred

    def __repr__(self):
        return f"Deep_L_Layer({self.hyperparam})"

    def __str__(self):
        return f"A Deep {self.L} Neural network with learning rate = {self.lr} (Forward activation :{self.forward_activation_function},Backward activation :{self.backward_activation_function})"




---



Test case

In [15]:
df = pd.read_csv('Dataset/two_circle.csv')
df = df.astype({"Y":'category'})

In [16]:
df.dtypes

X1     float64
X2     float64
Y     category
dtype: object

In [27]:

hyperparam = {"L" : 5,
              "layer_dims" : [2,8,6,4,2,1],
              "lr" : 1e-4,
              "forward_activation_function" : 'ReLU',
              "last_forward_activation_function" : 'sigmoid',
              "keep_prob_sequence" : [1,0.5,0.6,0.7,1,1]}   #Dropout => No dropout = None OR [1,1,1,1,1,1]

X = df[['X1','X2']]
Y = df[['Y']]

In [28]:
model = Binary_Deep_L_Layer(hyperparam)

In [19]:
#model.compiles(initialization ='Xavier')
#model.fit(X,Y,Epochs=100000)

Epoch 0/100000 : ===Cost=== : 0.6998931011884904
Epoch 1000/100000 : ===Cost=== : 0.6997832340518138
Epoch 2000/100000 : ===Cost=== : 0.6996747689929277
Epoch 3000/100000 : ===Cost=== : 0.6995676841487244
Epoch 4000/100000 : ===Cost=== : 0.6994652629683484
Epoch 5000/100000 : ===Cost=== : 0.6993651000758289
Epoch 6000/100000 : ===Cost=== : 0.699268345818704
Epoch 7000/100000 : ===Cost=== : 0.6991718760887252
Epoch 8000/100000 : ===Cost=== : 0.6990760161394509
Epoch 9000/100000 : ===Cost=== : 0.698981248585247
Epoch 10000/100000 : ===Cost=== : 0.698887799922073
Epoch 11000/100000 : ===Cost=== : 0.6987967709303996
Epoch 12000/100000 : ===Cost=== : 0.6987064504244305
Epoch 13000/100000 : ===Cost=== : 0.6986173921318232
Epoch 14000/100000 : ===Cost=== : 0.6985295760192564
Epoch 15000/100000 : ===Cost=== : 0.6984428677820294
Epoch 16000/100000 : ===Cost=== : 0.6983571634174042
Epoch 17000/100000 : ===Cost=== : 0.6982726456583036
Epoch 18000/100000 : ===Cost=== : 0.6981892957676867
Epoch 190

In [20]:
model.param

{'W1': array([[ 0.22068604, -0.06343837],
        [ 0.29043362,  0.68241276],
        [-0.10518428, -0.10550973],
        [ 0.70749339,  0.34412656],
        [-0.20971881,  0.24326425],
        [-0.20758671, -0.20846711],
        [ 0.10808282, -0.85560019],
        [-0.7721136 , -0.2520617 ]]),
 'b1': array([[0.6094902 ],
        [0.14342104],
        [0.29066899],
        [0.36845843],
        [0.45753592],
        [0.78366867],
        [0.1990569 ],
        [0.51269844]]),
 'W2': array([[-0.20417756,  0.04392019, -0.43469739,  0.14446612, -0.22584358,
         -0.10911024, -0.22715673,  0.70040359],
        [-0.00512355, -0.39978098,  0.31086887, -0.46143517,  0.07889461,
         -0.74074613, -0.5020072 ,  0.07432454],
        [ 0.27822602,  0.06339466, -0.04383038, -0.11539837, -0.55953669,
         -0.27257183, -0.17415114,  0.39948925],
        [ 0.1256563 , -0.66954248,  0.12096747, -0.15005175, -0.25905009,
          0.22681177,  0.38935248,  0.35094919],
        [-0.31550296, 

In [21]:
model.compiles(initialization ='random')
model.fit(X,Y,Epochs=100000)

Epoch 0/100000 : ===Cost=== : 0.693491968812983
Epoch 1000/100000 : ===Cost=== : 0.6934901040486823
Epoch 2000/100000 : ===Cost=== : 0.693488249367628
Epoch 3000/100000 : ===Cost=== : 0.6934864047153113
Epoch 4000/100000 : ===Cost=== : 0.6934845700375185
Epoch 5000/100000 : ===Cost=== : 0.693482745280328
Epoch 6000/100000 : ===Cost=== : 0.6934809303901105
Epoch 7000/100000 : ===Cost=== : 0.6934791253135258
Epoch 8000/100000 : ===Cost=== : 0.6934773299975225
Epoch 9000/100000 : ===Cost=== : 0.6934755443893358
Epoch 10000/100000 : ===Cost=== : 0.6934737684364859
Epoch 11000/100000 : ===Cost=== : 0.6934720020867767
Epoch 12000/100000 : ===Cost=== : 0.6934702452882944
Epoch 13000/100000 : ===Cost=== : 0.6934684979894056
Epoch 14000/100000 : ===Cost=== : 0.6934667601387559
Epoch 15000/100000 : ===Cost=== : 0.6934650316852686
Epoch 16000/100000 : ===Cost=== : 0.6934633125781429
Epoch 17000/100000 : ===Cost=== : 0.6934616027668528
Epoch 18000/100000 : ===Cost=== : 0.693459902201145
Epoch 1900

In [22]:
model.compiles(initialization ='zero')
model.fit(X,Y,Epochs=100000)

Epoch 0/100000 : ===Cost=== : 0.6931471805599456
Epoch 1000/100000 : ===Cost=== : 0.6931471805599456
Epoch 2000/100000 : ===Cost=== : 0.6931471805599456
Epoch 3000/100000 : ===Cost=== : 0.6931471805599456
Epoch 4000/100000 : ===Cost=== : 0.6931471805599456
Epoch 5000/100000 : ===Cost=== : 0.6931471805599456
Epoch 6000/100000 : ===Cost=== : 0.6931471805599456
Epoch 7000/100000 : ===Cost=== : 0.6931471805599456
Epoch 8000/100000 : ===Cost=== : 0.6931471805599456
Epoch 9000/100000 : ===Cost=== : 0.6931471805599456
Epoch 10000/100000 : ===Cost=== : 0.6931471805599456
Epoch 11000/100000 : ===Cost=== : 0.6931471805599456
Epoch 12000/100000 : ===Cost=== : 0.6931471805599456
Epoch 13000/100000 : ===Cost=== : 0.6931471805599456
Epoch 14000/100000 : ===Cost=== : 0.6931471805599456
Epoch 15000/100000 : ===Cost=== : 0.6931471805599456
Epoch 16000/100000 : ===Cost=== : 0.6931471805599456
Epoch 17000/100000 : ===Cost=== : 0.6931471805599456
Epoch 18000/100000 : ===Cost=== : 0.6931471805599456
Epoch 

In [29]:
model.compiles(initialization ='He')
model.fit(X,Y,Epochs=100000)

Epoch 0/100000 : ===Cost=== : 0.7058925914610181
Epoch 1000/100000 : ===Cost=== : 0.7003626297627251
Epoch 2000/100000 : ===Cost=== : 0.6970405135741111
Epoch 3000/100000 : ===Cost=== : 0.694919985424812
Epoch 4000/100000 : ===Cost=== : 0.6935116693459789
Epoch 5000/100000 : ===Cost=== : 0.6928730865977202
Epoch 6000/100000 : ===Cost=== : 0.6925516574807107
Epoch 7000/100000 : ===Cost=== : 0.6922527341245531
Epoch 8000/100000 : ===Cost=== : 0.6919718436778719
Epoch 9000/100000 : ===Cost=== : 0.6917054596840395
Epoch 10000/100000 : ===Cost=== : 0.6914388047276112
Epoch 11000/100000 : ===Cost=== : 0.6911676050632792
Epoch 12000/100000 : ===Cost=== : 0.6908984282554695
Epoch 13000/100000 : ===Cost=== : 0.6906416311649397
Epoch 14000/100000 : ===Cost=== : 0.6903884708786967
Epoch 15000/100000 : ===Cost=== : 0.6901443039003872
Epoch 16000/100000 : ===Cost=== : 0.6899215746245715
Epoch 17000/100000 : ===Cost=== : 0.6897058163002452
Epoch 18000/100000 : ===Cost=== : 0.6894924440947122
Epoch 1

In [24]:
model.param

{'W1': array([[ 0.49447939, -0.13998468],
        [ 0.64763831,  1.5226475 ],
        [-0.2333232 , -0.23385667],
        [ 1.58087409,  0.76847747],
        [-0.46911359,  0.54130353],
        [-0.46337517, -0.46583659],
        [ 0.2411551 , -1.91296599],
        [-1.72514433, -0.56184532]]),
 'b1': array([[0.6112786 ],
        [0.14091129],
        [0.29389715],
        [0.36721661],
        [0.45562394],
        [0.78399416],
        [0.19879067],
        [0.51386524]]),
 'W2': array([[-0.27170122,  0.06075421, -0.57632172,  0.19266985, -0.29952274,
         -0.14774444, -0.30064007,  0.92624954],
        [-0.00674861, -0.52885546,  0.41127246, -0.61042182,  0.1044318 ,
         -0.97983506, -0.66409302,  0.09843062],
        [ 0.36927826,  0.08491174, -0.0572283 , -0.15230798, -0.73807469,
         -0.35828861, -0.23048888,  0.5289575 ],
        [ 0.17017992, -0.8825674 ,  0.16139561, -0.19445354, -0.33911249,
          0.30410292,  0.51387861,  0.46456209],
        [-0.41783027, 

Accuracy

In [25]:
Y_pred = model.predict(X)
y = Y.values.T

ValueError: shapes (8,2) and (400,2) not aligned: 2 (dim 1) != 400 (dim 0)

In [None]:
arr = y - Y_pred

In [None]:
#Accuracy
np.count_nonzero(arr==0) / len(Y)

In [None]:
plt.title("Model with He initialization")
axes = plt.gca()
plot_decision_boundary(lambda x: predict_dec(model.param, x.T), X.values, Y.values)

In [None]:
model.predict(pd.DataFrame(np.array([[0,0.7]])))