In [1]:
import numpy as np

In [12]:
class DeepNN():
    def __init__(self, X, Y, dims_of_layers, activations, alpha = 0.01):
        #dims_of_layers - list of number of units in each layer (first element - num of features in input)
        #activations - activation function applied to each layer
        
        #dims_of_layers[0] - n of features in input
        #activations[0] - activation for first hidden layer
        #we support only 3 activation funcs: linear, sigmoid, relu
        
        #X.shape should be (n_features, m_examples)
        #Y.shape should be (1, m_examples)
        self.X = X
        self.Y = Y
        
        self.m_examples = X.shape[1]
        
        self.dims_of_layers = dims_of_layers
        self.n_layers = len(activations)
        
        self.activations = activations
        self.params = dict()
        
        self.learning_history = []
        self.alpha = alpha
        
        #setting cache dicts for backpropogation
        
        self.cache = dict()
            
    def initialize_params(self):
        
        for i in range(1, len(self.activations) + 1):
            
            #setting parameters layer by layer
            self.params["W" + str(i)] = np.random.randn(self.dims_of_layers[i], self.dims_of_layers[i-1])
            self.params["b" + str(i)] = np.zeros((self.dims_of_layers[i], 1))
            
     
    
    def activation(self, Z, function="linear"):
        if function == "linear":
            return Z
        
        if function == "sigmoid":
            return 1 / (1 + np.exp(-Z))
        
        if function == "relu":
            return Z * (Z > 0)
        
        
    def deriv_activation(self, Z, function):
        if function == "linear":
            return 1.
    
        if function == "sigmoid":
            sigm_z = self.activation(Z, "sigmoid")
            
            return sigm_z *(1 - sigm_z)
        
        if function == "relu":
            return 1. * (Z > 0)
    
    
    
    def forward_propogation(self, X):
        
        #X.shape = (n_features, m_examples)
        
        A_prev = X
        self.cache["A0"]  = A_prev
        
        Z_current = np.dot(self.params["W1"], A_prev) + self.params["b1"]
        A_current = self.activation(Z_current, function=self.activations[0])
        
        self.cache["Z1"] = Z_current
        self.cache["A1"] = A_current
        
        for i in range(1, len(self.activations)):
            A_prev = A_current
            
            #A_prev - cache["A" + str(i)]
            Z_current = np.dot(self.params["W" + str(i+1)], A_prev)
            A_current = self.activation(Z_current, function=self.activations[i])
            
            #keeping values in cache for backprop
            self.cache["Z" + str(i+1)] = Z_current
            self.cache["A" + str(i+1)] = A_current            

            
        predictions = A_current
        
        return predictions
        
    def compute_cost(self, predictions, cost_function="cross_entropy"):
        #leave cost func as a parameter 
        #so that we can use it futher not only for classification
        
        
        
        
        if cost_function == "cross_entropy":
            
            #lets cut off a  tiny constant to avoid log0 problem
            epsilon = 10 ** -15
            
            predictions = np.clip(predictions, epsilon, 1-epsilon)
            
            #BinaryCrossEntropy
        
            cost = (self.Y * np.log(predictions) + 
                    (1 - self.Y) * np.log(1 - (predictions)) * (-1 / self.m_examples))
            
            cost = np.sum(cost, axis=1, keepdims=True) #sum up the columns
            
        #we will have more cost functions here later...
        
        return cost
    
    def deriv_of_cost(self, predictions, cost_function="cross_entropy"):
        
        if cost_function == "cross_entropy":
            
            #avoiding division by zero
            epsilon = 10 ** -15
            predictions = np.clip(predictions, epsilon, 1-epsilon)
            
            dAL = (predictions - self.Y) / (predictions * (1 - predictions))
            
        #we will have more cost functions here later...
        
        return dAL
            
    
    def back_propogation(self, predictions):
        
        L = self.n_layers
        
        grads_cache = dict()

        for i in range(L, 0, -1):
            if i == L:
                dA_i = self.deriv_of_cost(predictions)
            else:
                
                dA_i = np.dot(self.params["W" + str(i+1)].T, grads_cache["dZ" + str(i+1)])
    
    
            grads_cache["dA" + str(i)] = dA_i
            activation_i = self.activations[-i]
            
            Z_i = self.cache["Z" + str(i)]
            A_prev = self.cache["A" + str(i-1)]
            W_i = self.params["W" + str(i)]
            
            dZ_i = dA_i * self.deriv_activation(Z_i, activation_i)
            
            #computing derivs for W, b
            dW_i = (1 / self.m_examples) * np.dot(dZ_i, A_prev.T)
            db_i = (1/ self.m_examples) * np.sum(dZ_i, axis=1, keepdims=True)
            
            
            #storing gradients
            grads_cache["dZ" + str(i)] = dZ_i
            grads_cache["dW" + str(i)] = dW_i
            grads_cache["db" + str(i)] = db_i
            
        return grads_cache
             

            
        
        

In [23]:
dims = [3, 4, 2, 1]
activation = ["relu", "relu", "sigmoid"]

X = np.random.randn(3, 6)
y = np.array([[1,1,1,0,0,0]])




In [24]:
deepnn = DeepNN(X, y, dims, activation)
deepnn.initialize_params()
predictions = deepnn.forward_propogation(X)

In [25]:
deepnn.cache

{'A0': array([[-1.0039288 , -2.61528105,  0.99282464, -1.21592424,  0.74263428,
         -0.76234171],
        [-0.89761618, -0.94736143,  0.463101  ,  0.02540576,  0.29786585,
          0.5660561 ],
        [ 1.74704228, -0.31632605, -0.24545828,  0.57849373, -1.00241947,
          0.99948863]]),
 'Z1': array([[-0.28136488,  0.84093675, -0.20114913,  0.22929381,  0.08370022,
          0.01716013],
        [ 0.18683092,  6.66498489, -2.06516028,  1.47561334, -0.2107284 ,
         -0.64673994],
        [-1.72353776, -2.5904964 ,  1.12433884, -0.33104613,  0.67518199,
          0.80161828],
        [-1.66975996, -0.39556833,  0.44512845,  0.57071072,  0.43408338,
          1.30214053]]),
 'A1': array([[-0.        ,  0.84093675, -0.        ,  0.22929381,  0.08370022,
          0.01716013],
        [ 0.18683092,  6.66498489, -0.        ,  1.47561334, -0.        ,
         -0.        ],
        [-0.        , -0.        ,  1.12433884, -0.        ,  0.67518199,
          0.80161828],
        

In [26]:
deepnn.back_propogation(predictions)

{'dA3': array([[-1.8093341 , -1.00013789, -2.59126192, 26.3843582 ,  1.7722827 ,
          2.84580687]]),
 'dZ3': array([[-1.8093341 , -1.00013789, -0.        , 26.3843582 ,  0.        ,
          2.84580687]]),
 'dW3': array([[4.92349804, 8.63954191]]),
 'db3': array([[4.40344885]]),
 'dA2': array([[ 0.24904871,  0.1376656 ,  0.        , -3.63171761,  0.        ,
         -0.39171568],
        [-2.85752995, -1.57954464,  0.        , 41.66952577,  0.        ,
          4.49445925]]),
 'dZ2': array([[ 0.24904871,  0.1376656 ,  0.        , -3.63171761,  0.        ,
         -0.39171568],
        [-2.85752995, -1.57954464,  0.        , 41.66952577,  0.        ,
          4.49445925]]),
 'dW2': array([[-0.12061403, -0.73249031, -0.05233441, -0.43045484],
        [ 1.38389877,  8.40443199,  0.60047345,  4.93894374]]),
 'db2': array([[-0.60611983],
        [ 6.95448507]]),
 'dA1': array([[ -2.8884116 ,  -1.59661496,   0.        ,  42.11985305,
           0.        ,   4.54303138],
        [ 