# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

# In [30]:
class DL(object):
    """
    A deep learning class used for L-layer neural networks
    ***WIP***
    """  
    
    ###constructor
    def __init__(self, size, input_layer, true_values, testing_input, testing_true, layers_dims, learning_rate = 0.0075, hidden_function = 'relu', last_function = 'sigmoid', num_iterations = 3000, output_form = 'binary', print_cost = False, plot_cost = True):
        self.size = size
        self.X = input_layer
        self.Y = true_values
        self.test_X = testing_input
        self.test_Y = testing_true
        self.learning_rate = learning_rate
        self.last_function = last_function
        self.hidden_function = hidden_function
        self.num_iterations = num_iterations
        self.print_cost = print_cost
        self.dimensions = np.array(layers_dims)
        self.final_paramters = {}
        self.output_form = output_form
        self.plot_cost_bool = plot_cost
        self.optimal_dimensions = np.zeros_like(layers_dims)
        self.index_bool = False
        self.L2 = False
        self.lambd = 0.1
        self.dropout = False
        self.keep_prob = 0.5

    ##helper function
    def help_me(self):
        print(" mandatory initializer variables: ", 
        "    \033[1m size (int):\033[0m the number of layers including input and output layers",
        "    \033[1m input_layer (array):\033[0m the array of input variables for every instance",
        "        every column is seen as one data group where the number of rows represent the",
        "        number of nodes present in the input layer"
        "    \033[1m true_values (array):\033[0m the array of true values to train with",
        "    \033[1m testing_input (array):\033[0m in the same form as input-layer, used to test the final solution",
        "    \033[1m testing_true (array):\033[0m in the same form as output-layer, used to test the final solution",
        "    \033[1m dimensions (array):\033[0m array of dimensions of each layer",
        "   optional variables for initializer:\033[0m", 
        "    \033[1m learning_rate (float):\033[0m defualt set to 0.0075 determines step size of gradient descent",
        "    \033[1m hidden_function (string):\033[0m default set to 'relu', determines activation function of hidden layer",
        "    \033[1m last_function (string):\033[0m default set to 'sigmoid', determines activation function of output layer",
        "    \033[1m num_iteration (int):\033[0m default set to 3000, determines number of iterations in gradient descent phase",
        "    \033[1m print_cost (boolean):\033[0m default set to False, when true prints the cost at every 100th iteration",
        "    \033[1m plot_cost (boolean):\033[0m default set to True, when true makes plot of output of cost function",
        "\033[1m set_dimensions(array layers_dims):\033[0m sets the dimensions (number of nodes) of each layer",
        "\033[1m set_learning_rate(int rate):\033[0m set the learning rate for gradient descent",
        "\033[1m set_hidden_function(string function):\033[0m set the function for the hidden layer. options:",
        "    \033[1m 'relu':\033[0m use the Rectified Linear Unit",
        "    \033[1m 'sigmoid':\033[0m use the sigmpoid function",
        "    \033[1m 'softmax':\033[0m use the Softmax function ***WIP***",
        "\033[1m set_last_function(string function):\033[0m set the function for the hidden layer. options same as for set_hidden_function",
        "\033[1m set_num_iterations(int num):\033[0m set the number of iterations for the gradient descent",
        "\033[1m set_print_cost(bool value):\033[0m turn on/off printing every 100th iteration of gradient descent",
        "\033[1m set_print_cost(bool value):\033[0m turn on/off plotting the cost function output",
        "\033[1m set_final_params(array parameters):\033[0m manually set final parameters", 
        "\033[1m run_machine():\033[0m runs the Machine learning algorithm",
        "\033[1m machine_accuracy():\033[0m compares the accuracy of the machine solution to weights and biases using the training data and the testing data",
        "\033[1m set_L2(bool value, float num):\033[0m turn on/off using L2 regularization default: False and set lambda value (default 0.1)",
        "\033[1m set_dropout(bool value, float num):\033[0m turn on/off jusing Dropout method for regularization default: False and set the dropout probability (default 0.5)",
        sep = "\n")

    ##methods for updating various variables which can also be initialised in the constructor
    def set_dimensions(self, layers_dims):
        if len(layers_dims) != self.size:
            print("you must enter dimensions for "+str(size)+" layers or reset the size")
        else:
            self.dimensions = layers_dims

    def set_learning_rate(self, rate):
        if isinstance(rate, float):
            self.learning_rate = rate
        else:
            print("argument must be a float")
        
    def set_num_iterations(self, num):
        if isinstance(num, int):
            self.num_iterations = num
        else:
            print("argument must be an integer")
        
    def set_print_cost(self, boo):
        if isinstance(boo, bool):
            self.print_cost = boo
        else:
            print("argument must be a boolean value")
        
    def set_plot_cost(self, boo):
        if isinstance(boo, bool):
            self.plot_cost = boo
        else:
            print("argument must be a boolean value")
        
    def set_final_params(self, params):
        if isinstance(params, list):
            self.final_parameters = params
        else:
            print("argument should be an array of values")
            
    def set_output_form(self, form):
        if ((form != 'binary') and (form != 'multiclass')):
            print("argument must be either 'binary' or 'multiclass'")
        else:
            self.output_form = form
    
    def set_hidden_function(self, function):
        if ((function != 'relu') and  (function != 'sigmoid') and  (function != 'softmax')):
            print("function must be 'relu', 'sigmoid', or 'softmax', new functions may be added in the future")
        else:
            self.hidden_function = function

    def set_last_function(self, function):
        if ((function != 'relu') and  (function != 'sigmoid') and  (function != 'softmax')):
            print("function must be 'relu', 'sigmoid', or 'softmax', new functions may be added in the future")
        else:
            self.last_function = function
        
    def set_L2(self, boo, lambd = 0.1):
        if isinstance(lambd, float):
            self.L2 = True
            self.lambd = lambd
        else:
            print("argument must be in the form (bool, float) value")
            
    def set_dropout(self, boo, prob = 0.5):
        if isinstance(boo, bool):
            self.dropout = boo
            self.keep_prob = prob
        else:
            print("argument must be a boolean value")
            
    ## activation functions used for the activation step of each layer
    ## currently relu and sigmoid are functional
    ## TODO soft_max
    def __sigmoid(self, Z):
        #print("started sigmoid")
        Z = Z.astype(float)
        A = 1/(1+np.exp(-Z))
        activation_cache = Z
        #print("finished sigmoid")
        return A, activation_cache
    
    def __relu(self, Z):
        #print("started relu")
        A = np.maximum(0,Z)
        activation_cache = Z
        #print("finished relu")
        return A, activation_cache
    
    def __soft_max(self, Z):
        num_rows, num_columns = Z.shape
        activation_cache = Z
        A = Z*0
        exps = np.exp(Z)
        exps_sums = np.zeros((1,Z.shape[1]))
        exps_sums = np.sum(exps, axis = 0)
        for i in range(num_columns):
            for j in range(num_rows):
                A[j][i] = exps[j][i]/exps_sums[i]
        return A, activation_cache

    ## backwards propagation step for the activation of each layer
    ## currently the sigmoid and relu functions are functional
    ## TODO soft_max
    def __sigmoid_backward(self, dA, activation_cache):
        #print("started sigmoid back")
        S, Z = self.__sigmoid(activation_cache)
        dZ = dA * (S*(1.0-S))
        #print("finished sigmoid back")
        return dZ
    
    def __relu_backward(self, dA, activation_cache):
        #print("started relu back")
        Z = activation_cache
        num_rows, num_columns = Z.shape
        dZ = Z * 0
        for i in range(num_rows):
            for j in range(num_columns):
                if Z[i][j] <= 0:
                    dZ[i][j] = 0
                elif Z[i][j] > 0:
                    dZ[i][j] = dA[i][j]
        #print("finished relu back")
        return dZ
    
    def __soft_max_backward(self, dA, activation_cache):
        S, Z = self.__soft_max(activation_cache)
        m, n = Z.shape
        p = soft_max(Z)
        # outer products
        # (p1^2  p1*p2 p1*p3 ...)
        # (p2*p1 p2^2  p2*p3 ...)
        # (...                  )
        tensor1 = np.einsum('ij,ik->ijk',p,p)
        # (n,n) identitity of feature vector
        # (p1 0  0 ...)
        # (0  p2 0 ...)
        # (...        )
        tensor2 = np.einsum('ij,jk->ijk',p,np.eye(n,n))
    
        dSoftmax = tensor2 - tensor1
        dZ = np.einsum('ijk,ik->ij', dSoftmax, dA)
        return dZ

    ##initialize the weights and biases for each layer.
    ## weights are initialized to random floats using the He initialization
    ## biases are initialized to zero
    def __initialize_parameters(self, layers_dims):
        #print("started initialize params")
        np.random.seed(1)
        parameters = {}
        L = len(layers_dims)
        
        for l in range(1,L):
            parameters["W" + str(l)] = np.random.randn(layers_dims[l],layers_dims[l-1]) * math.sqrt(2/(layers_dims[l-1]))
            parameters["b" + str(l)] = np.zeros((layers_dims[l], 1))
        
        assert(parameters['W' + str(l)].shape == (layers_dims[l], layers_dims[l-1]))
        assert(parameters['b' + str(l)].shape == (layers_dims[l], 1))
        #print("finished initialize params")
        return parameters
    
    ## linear forward propagation step, taken at each layer
    def __linear_forward(self, A, W, b):
        #print("started linear forward")
        Z = np.dot(W,A) + b
        cache = (A,W,b)
        #print("finished linear forward")
        return Z, cache
    
    ## forward propagation for activation step taken at each layer
    def __linear_activation_forward(self, A_prev, W, b, activation):
        #print("started linear activation forward")
        if activation == "sigmoid":
            Z, linear_cache = self.__linear_forward(A_prev, W, b)
            A, activation_cache = self.__sigmoid(Z)
        
        elif activation == "relu":
            Z, linear_cache = self.__linear_forward(A_prev, W, b)
            A, activation_cache = self.__relu(Z)
            if self.dropout:
                D = np.random.rand(A.shape[0], A.shape[1])
                D = (D < self.keep_prob).astype(int)
                print(A.shape)
                print(D.shape)
                A = np.multiply(A,D)
                A = A/self.keep_prob
                cache = (linear_cache, activation_cache, D)
                return A, cache
        
        elif activation == "soft_max":
            Z, linear_cache = self.__linear_forward(A_prev, W, b)
            A, activation_cache = self.__soft_max(Z)
        #print(A.shape)
        cache = (linear_cache, activation_cache)
        #print("finished linear activation forward")
    
        return A, cache
    
    ## forward propagation for the model (includes the linear and activation step for every layer)
    def __full_forward(self, X, parameters):
        #print("started full forward")
        caches = []
        A = X
        L = self.size -1
    
        if self.dropout:
            for l in range(1, L):
                print(l)
                A_prev = A
                A, cache = self.__linear_activation_forward(A_prev, parameters["W" + str(l)], parameters["b" + str(l)], activation = self.hidden_function)
                caches.append(cache)
            AL, cache = self.__linear_activation_forward(A, parameters["W" + str(L)], parameters["b" + str(L)], activation = self.last_function)
            caches.append(cache)
        else:
            for l in range(1, L):
                A_prev = A
                A, cache = self.__linear_activation_forward(A_prev, parameters["W" + str(l)], parameters["b" + str(l)], activation = self.hidden_function)
                caches.append(cache)
            AL, cache = self.__linear_activation_forward(A, parameters["W" + str(L)], parameters["b" + str(L)], activation = self.last_function)
            caches.append(cache)
        #print("finished full forward")
        return AL, caches
    
        ## cross-entropy cost function
        ## L2 regularization option
    def __compute_cost(self, AL, Y, parameters):
        #print("started compute cost")
        L = self.size -1
        L2_cost = 0
        m = Y.shape[1]*Y.shape[0]
        cross_entropy_cost = -(np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL)))/m
        if self.L2:
            for l in range(1,L+1):
                L2_cost += np.sum(np.square(parameters["W"+str(l)])) * (self.lambd/(2*m))
        cost = cross_entropy_cost + L2_cost
        cost = np.squeeze(cost)
        #print("finished compute cost")
        return cost
    
    ## linear backward propagation step, taken at each layer
    ## this includes the dlambd term from L2 regularization in cost function
    def __linear_backward(self, dZ, cache):
        #print("started linear backward")
        A_prev, W, b = cache
        m = A_prev.shape[1]
        dlambd = 0.
        if self.L2:
            dlambd = (self.lambd/m) * W
        
        dW = (np.dot(dZ,A_prev.T))/m + dlambd
        db = np.sum(dZ, axis=1, keepdims=True)/m
        dA_prev = np.dot(W.T, dZ)
        #print("finished linear backward")
        return dA_prev, dW, db
    
    ## backward propagation for activation step taken at each layer
    def __linear_activation_backward(self, dA, cache, activation):
        #print("started linear activation backward")
    
        if activation == "relu":
            if self.dropout:
                linear_cache, activation_cache, D = cache
                dZ = self.__relu_backward(dA, activation_cache)
                dA_prev, dW, db = self.__linear_backward(dZ, linear_cache)
                dA_prev = (dA_prev*D)/self.keep_prob
            else:
                linear_cache, activation_cache = cache
                dZ = self.__relu_backward(dA, activation_cache)
                dA_prev, dW, db = self.__linear_backward(dZ, linear_cache)
        
        elif activation == "sigmoid":
            linear_cache, activation_cache = cache
            dZ = self.__sigmoid_backward(dA, activation_cache)
            dA_prev, dW, db = self.__linear_backward(dZ, linear_cache)

        
        elif activation == "soft_max":
            linear_cache, activation_cache = cache
            dZ = self.__soft_max_backward(dA, activation_cache)
            dA_prev, dW, db = self.__linear_backward(dZ, linear_cache)
        #print("finished linear activation backward")
        #print(dA_prev.shape)
        return dA_prev, dW, db
    
    ## backward propagation for the model (includes the linear and activation step for every layer)
    def __full_backward(self, AL, Y, caches):
        #print("started full backward")
        grads = {}
        L = self.size - 1 #num layers
        m = AL.shape[1]
        Y = Y.reshape(AL.shape)
    
        dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    
        current_cache = caches[L-1]
        dA_prev_temp, dW_temp, db_temp = self.__linear_activation_backward(dAL, current_cache, activation = self.last_function)
        grads["dA"+str(L-1)] = dA_prev_temp
        grads["dW"+str(L)] = dW_temp
        grads["db"+str(L)] = db_temp
    
        for l in reversed(range(L-1)):
            current_cache = caches[l]
            dA_prev_temp, dW_temp, db_temp = self.__linear_activation_backward(dA_prev_temp, current_cache, activation= self.hidden_function)
            grads["dA"+str(l)] = dA_prev_temp
            grads["dW"+str(l+1)] = dW_temp
            grads["db"+str(l+1)] = db_temp
        #print("finished full backward")
        return grads

    ## update parameters using gradient descent
    def __update_parameters(self, params, grads, learning_rate):
        #print("started update parameters")
        parameters = params.copy()
        L = len(parameters) // 2 #num layers
    
        for l in range(L):
            parameters["W" + str(l+1)] = parameters["W"+str(l+1)] - learning_rate*grads["dW" + str(l+1)]
            parameters["b" + str(l+1)] = parameters["b"+str(l+1)] - learning_rate*grads["db" + str(l+1)]
        #print("finished update parameters")
        return parameters
    
    ## plots the value of the cost function for every 100th iteration of the gradient descent
    def __plot_costs(self, costs):
        """Draw the recorded costs against their index and show the figure."""
        heading = "Learning rate =" + str(self.learning_rate)
        curve = np.squeeze(costs)
        plt.plot(curve)
        plt.ylabel('cost')
        plt.xlabel('iterations (per hundreds)')
        plt.title(heading)
        plt.show()
        
    ## main function to implement the gradient descent learning algorithm
    def __deep_model(self, X, Y):
        #print("started deep model")
        np.random.seed(1)
        costs = []
    
        parameters = self.__initialize_parameters(self.dimensions)
        for i in range(0, self.num_iterations):
            AL, caches = self.__full_forward(X, parameters)
            cost = self.__compute_cost(AL, Y, parameters)
            grads = self.__full_backward(AL, Y, caches)
            parameters = self.__update_parameters(parameters, grads, self.learning_rate)
            if self.print_cost and (i%100 == 0 or 1 == self.num_iterations - 1):
                print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
            if i % 100 == 0 or i == self.num_iterations:
                costs.append(cost)
                #print(i)
        #print("finished deep model")
        return parameters, costs

    ## can show the accuracy with which the final set-up can predict the outcomes based on the inputs
    ## option for 'binary' true values of 0,1 and 'multiclass' true values of arrays i,e [0, 1, 0, 0]
    def __predict(self, X, Y, parameters, option = "binary", print_accuracy = True):
        print("started predict")
        A = X
        L = len(parameters) // 2
    
        for l in range(1, L):
            A_prev = A
            A, cache = self.__linear_activation_forward(A_prev, parameters["W" + str(l)], parameters["b" + str(l)], activation = self.hidden_function)
        AL, cache = self.__linear_activation_forward(A, parameters["W" + str(L)], parameters["b" + str(L)], activation = self.last_function)
    
        correct = 0
        total = 0
        if option == "binary":
            AL = np.round(AL)
            num_rows, num_columns = AL.shape
            for i in range(num_rows):
                for j in range(num_columns):
                    if AL[i][j] == Y[i][j]:
                        correct += 1
                    total +=1
                
        if option == "multiclass":
            Y_arg = np.argmax(Y, axis = 0)
            AL_arg = np.argmax(AL, axis = 0)
            for i in range(len(Y_arg)):
                if Y_arg[i] == AL_arg[i]:
                    correct += 1
                total += 1
        if print_accuracy:
            print("total = "+str(total)+" correct = "+str(correct))
            print("Accuracy = "+str(correct/total))
        print("finished predict")
        return correct/total
    
    ## this will run the machine, determining the parameters which minimize the cost
    def run_machine(self):
        #print("started run machine")
        self.final_parameters, costs = self.__deep_model(self.X, self.Y)
        if self.plot_cost_bool:
            self.__plot_costs(costs)
        #print("ended run machine")
        
    ## with this function, the accuracy of the machine using training and test data is calculated
    def machine_accuracy(self, test_x = None, test_y = None, option = None, print_accuracy = True):
        print("started machine_accuracy")
        if test_x == None:
            test_x = self.test_X
        if test_y == None:
            test_y = self.test_Y
        if option == None:
            option = self.output_form
        self.__predict(self.X, self.Y, self.final_parameters, option, print_accuracy)
        self.__predict(test_x, test_y, self.final_parameters, option, print_accuracy)
        print("finsihed machine accuracy")
    
    def __optimization_loop(self, index, max_size, num_layers, accuracy_high):
        for i in range(1, max_size + 1):
            #print("current index value is: "+str(index))
            if index == 0:
                return
            else:
                self.dimensions[index] = i
                print(self.dimensions)
                self.run_machine()
                accuracy = self.__predict(self.test_X, self.test_Y, self.final_parameters, option="multiclass")
                if round(accuracy_high, 5) < round(accuracy, 5):
                    print("****new highest accuracy****")
                    accuracy_high = accuracy
                    optimal_dimensions = np.array(self.dimensions)
                    self.optimal_dimensions = optimal_dimensions
                    print("new optimal dimensions = "+str(self.optimal_dimensions))
                #print("current optimal dimensions: "+str(self.optimal_dimensions))
                if i == max_size:
                    print(self.dimensions)
                    self.__check_index(index, max_size)
                    if (self.index_bool == True):
                        self.index_bool = False
                        self.__optimization_loop(index, max_size, num_layers, accuracy_high)
                    else:
                        return
                    
    def __check_index(self, number, max_size):
        """
        Advance the layer sizes that come before position `number` in self.dimensions.

        Sets self.index_bool to True when a layer size was increased, which the
        optimization loop takes as the signal to sweep again; sets it to False
        when `number` is 1. Nothing is returned.
        """
        # position 1 has only the input layer (position 0) before it, which is never changed
        if number == 1:
            self.index_bool = False
            return
        # the layer just before `number` still has room: grow it by one node
        if self.dimensions[number - 1] < max_size:
            self.dimensions[number - 1] += 1
            self.index_bool = True
            return
        # that layer is full: reset it to one node and try the layer before it
        elif self.dimensions[number - 1] == max_size:
            number -=1
            if number == 0:
                return
            self.dimensions[number] = 1
            self.__check_index(number, max_size)
        # NOTE(review): a layer size above max_size matches neither branch and
        # leaves index_bool untouched — confirm this cannot occur
    
    ## this function will iterate through possible dimensions of hidden layers to guess at the optimal
    ## dimension length
    ## TODO: make optimizations for each of the other variables
    def optimize_dimensions(self, max_size):
        n_x = self.dimensions[0]
        n_y = self.dimensions[-1]
        num_layers = len(self.dimensions) - 2
        index = num_layers
        accuracy_high = 0
        for i in range(1,num_layers + 1):
            self.dimensions[i] = 1
        self.__optimization_loop(index, max_size, num_layers, accuracy_high)
        #print("The optimal dimensions are: "+str(self.optimal_dimensions)+"\n\n")
        return self.optimal_dimensions
    
    
    def run_optimal(self, max_size, plot_costs = True):
        self.plot_cost_bool = plot_costs
        self.dimensions = self.optimize_dimensions(max_size)
        #print("the optimal dimensions are :"+str(self.dimensions)+" for the given max layer dimension: "+str(max_size))
        self.run_machine()
        self.machine_accuracy(print_accuracy = True)
        
        
        