In [None]:
#imports
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import graphviz

In [None]:
# Read the MNIST CSV, convert it to a NumPy array, and shuffle its rows in
# place so that the split taken from it afterwards is in random order.
mnist_frame = pd.read_csv('MNIST.csv')
data = np.array(mnist_frame)
np.random.shuffle(data)

In [None]:
m, n = data.shape  # m rows in the loaded array, n columns per row

# The first 1000 rows form the test split. After transposing, row 0 is taken
# as the labels and the remaining rows as features, which are divided by 255.
data_test = data[:1000].T
Y_test = data_test[0]
X_test = data_test[1:n] / 255

# Every remaining row goes through the same steps to form the training split.
data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n] / 255

_, m_train = X_train.shape

In [None]:
# Display the shape of the transposed training split.
data_train.shape

In [None]:
class PReLU:
    """Leaky rectifier whose negative-side slope is stored in ``alpha``.

    Attributes:
        alpha: slope applied to non-positive inputs. It is a plain attribute,
            so callers may overwrite it between forward passes.
    """

    def __init__(self, alpha_init = 0.01):
        self.alpha = alpha_init

    def forward(self, Z):
        """Return ``Z`` where ``Z > 0`` and ``alpha * Z`` everywhere else."""
        # Branch on the sign of Z instead of taking max(alpha * Z, Z): the max
        # form picks the wrong branch as soon as alpha exceeds 1, which would
        # make forward() disagree with prime().
        return np.where(Z > 0, Z, self.alpha * Z)

    def prime(self, Z):
        """Return the slope of ``forward`` at ``Z``: 1 where ``Z > 0``, else ``alpha``."""
        return np.where(Z > 0, 1, self.alpha)

def update_alpha(alpha, dZ, learning_rate = 0.01):
    """Return ``alpha`` after one update step driven by the negative entries of ``dZ``.

    Args:
        alpha: current slope value.
        dZ: gradient array; only entries with ``dZ < 0`` contribute.
        learning_rate: step size (defaults to the previously hard-coded 0.01).

    Returns:
        ``alpha - learning_rate * mean(where(dZ < 0, dZ * alpha, 0))``.

    NOTE(review): this rule does not look like the analytic PReLU gradient
    with respect to alpha (which would need the pre-activations); the formula
    is kept as-is - confirm whether that is intended.
    """
    negative_part = np.where(dZ < 0, dZ * alpha, 0)
    alpha -= learning_rate * np.mean(negative_part)
    return alpha

# Two independent PReLU activations, each starting from the default slope.
prelu_1, prelu_2 = PReLU(), PReLU()



def ReLU(Z):
    """Element-wise rectifier: entries of ``Z`` below zero are replaced by 0."""
    rectified = np.maximum(Z, 0)
    return rectified

def ReLU_prime(Z):
    """Return the boolean mask ``Z > 0`` (True where the rectifier has slope 1)."""
    is_positive = Z > 0
    return is_positive



def sigmoid(Z):
    """Logistic function ``1 / (1 + exp(-Z))``, applied element-wise.

    The exponent is clipped to [-80, 80] solely to keep ``np.exp`` from
    overflowing. A tight bound such as +/-4 would squash every output into
    roughly [0.018, 0.982] and distort the function, so the bound is kept
    wide enough that the clip is numerically invisible.
    """
    exponent = np.clip(-Z, -80, 80)
    A = 1 / (1 + np.exp(exponent))
    return A

def sigmoid_prime(Z):
    """Return ``s * (1 - s)`` with ``s = sigmoid(Z)`` (the logistic-derivative formula)."""
    s = sigmoid(Z)
    return s * (1 - s)



def softmax(Z):
    """Softmax along axis 0: each column of ``Z`` is turned into probabilities.

    Args:
        Z: array of scores; normalisation runs over the first axis.

    Returns:
        Array shaped like ``Z`` whose entries along axis 0 sum to 1.
    """
    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps exp() from overflowing to inf (and inf / inf = NaN).
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    A = exp_scores / np.sum(exp_scores, axis=0, keepdims=True)
    return A



def one_hot(Y, num_classes = None):
    """One-hot encode integer labels, one column per label.

    Args:
        Y: 1-D integer array of class labels.
        num_classes: number of rows in the result. When omitted it is inferred
            as ``Y.max() + 1``, which under-counts if the highest class happens
            to be absent from ``Y``; pass it explicitly to avoid that.

    Returns:
        Float array of shape ``(num_classes, Y.size)`` holding a single 1 per
        column, in the row given by the corresponding label.
    """
    if num_classes is None:
        num_classes = Y.max() + 1
    one_hot_Y = np.zeros((num_classes, Y.size))
    one_hot_Y[Y, np.arange(Y.size)] = 1
    return one_hot_Y

In [None]:
class Neural_Network:
    """Feed-forward classifier with two PReLU hidden layers and a softmax output.

    Weights and biases are drawn uniformly from [-0.5, 0.5). Weight matrices
    are stored as (outputs, inputs) so a layer is evaluated as ``W.dot(X) + b``.
    """

    def __init__(self, input_size, hidden_1_size, hidden_2_size, output_size, bias = 1):
        """Create randomly initialised parameters.

        Args:
            input_size: number of rows expected in the input matrix.
            hidden_1_size: units in the first hidden layer.
            hidden_2_size: units in the second hidden layer.
            output_size: number of output rows.
            bias: second dimension of every bias array (1 gives column vectors).
        """
        self.input_size = input_size
        self.hidden_1_size = hidden_1_size
        self.hidden_2_size = hidden_2_size
        self.output_size = output_size

        self.W1 = np.random.rand(hidden_1_size, input_size) - 0.5
        self.b1 = np.random.rand(hidden_1_size, bias) - 0.5

        self.W2 = np.random.rand(hidden_2_size, hidden_1_size) - 0.5
        self.b2 = np.random.rand(hidden_2_size, bias) - 0.5

        self.W3 = np.random.rand(output_size, hidden_2_size) - 0.5
        self.b3 = np.random.rand(output_size, bias) - 0.5

    def forward_prop(self, X):
        """Return the softmax output for the columns of ``X``.

        Uses the module-level ``prelu_1`` / ``prelu_2`` activations and the
        module-level ``softmax``.
        """
        Z1 = self.W1.dot(X) + self.b1
        A1 = prelu_1.forward(Z1)

        Z2 = self.W2.dot(A1) + self.b2
        A2 = prelu_2.forward(Z2)

        # The output bias is a parameter of the model, so it has to take part
        # in the forward pass like the other two biases.
        Z3 = self.W3.dot(A2) + self.b3
        A3 = softmax(Z3)

        return A3


pop_size = 100  # number of candidate networks kept in each generation
mutation_rate = 0.1  # probability, per parameter array, that mutate() perturbs it
generations = 100  # number of iterations of the evolutionary loop

def fitness_function(network, X, y):
    """Return the classification accuracy of one candidate on ``(X, y)``.

    Args:
        network: dict with keys 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'.
        X: input matrix passed to ``Neural_Network.forward_prop``.
        y: expected labels, compared with the arg-max over axis 0 of the output.

    Returns:
        Fraction of columns whose predicted label equals ``y`` (0.0 to 1.0).
    """
    # Layer sizes come from the candidate's own weight shapes, so the helper
    # is not tied to one fixed architecture.
    candidate = Neural_Network(
        input_size=network['W1'].shape[1],
        hidden_1_size=network['W1'].shape[0],
        hidden_2_size=network['W2'].shape[0],
        output_size=network['W3'].shape[0],
    )
    for key in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3'):
        setattr(candidate, key, network[key])

    predictions = candidate.forward_prop(X)
    accuracy = np.mean(np.argmax(predictions, axis=0) == y)
    return accuracy

def init_pop(nn, pop_size):
    """Build the starting population as ``pop_size`` copies of ``nn``'s parameters.

    Args:
        nn: object exposing array attributes W1, b1, W2, b2, W3 and b3.
        pop_size: number of individuals to create.

    Returns:
        List of dicts keyed 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'. Every array is
        an independent copy, so individuals can be changed without touching
        ``nn`` or each other.

    NOTE(review): all individuals start out identical, so diversity only
    appears through mutation - confirm this is intended.
    """
    param_names = ('W1', 'b1', 'W2', 'b2', 'W3', 'b3')
    return [
        {name: getattr(nn, name).copy() for name in param_names}
        for _ in range(pop_size)
    ]

def select_parents(population, X, y):
    """Pick two distinct individuals with probability proportional to fitness.

    Args:
        population: list of candidate networks accepted by ``fitness_function``.
        X, y: data on which every candidate is scored.

    Returns:
        List with the two selected individuals.

    Raises:
        ValueError: from ``np.random.choice`` when the population holds fewer
            than two individuals.
    """
    fitness_scores = np.array(
        [fitness_function(network, X, y) for network in population], dtype=float
    )
    total_fitness = fitness_scores.sum()

    # Fitness-proportional sampling without replacement needs at least two
    # non-zero weights and a usable total; otherwise (e.g. every candidate
    # scores 0) fall back to uniform sampling instead of dividing by zero.
    if np.count_nonzero(fitness_scores) >= 2 and np.isfinite(total_fitness):
        probabilities = fitness_scores / total_fitness
    else:
        probabilities = None

    selected_indices = np.random.choice(len(population), size = 2, p = probabilities, replace = False)
    return [population[i] for i in selected_indices]

def crossover(parent_1, parent_2):
    """Create a child by taking each parameter array from one parent at random.

    Args:
        parent_1, parent_2: dicts with the same keys mapping to arrays.

    Returns:
        New dict in which every key holds a copy of the array from
        ``parent_1`` or ``parent_2`` (each with probability 0.5).
    """
    child = {}
    for key in parent_1:
        donor = parent_1 if np.random.rand() < 0.5 else parent_2
        # Copy so that a later in-place change to the child cannot leak back
        # into the parent (or into a sibling built from the same parent).
        child[key] = donor[key].copy()
    return child

def mutate(network, mutation_rate, noise_scale = 0.1):
    """Randomly perturb the parameter arrays of ``network`` with Gaussian noise.

    Args:
        network: dict mapping parameter names to arrays; updated and returned.
        mutation_rate: probability, per array, of adding noise to it.
        noise_scale: multiplier applied to the standard-normal noise
            (defaults to the previously hard-coded 0.1).

    Returns:
        The same ``network`` dict.
    """
    for key in network:
        if np.random.rand() < mutation_rate:
            noise = np.random.randn(*network[key].shape) * noise_scale
            # Rebind to a new array rather than using +=, so an array that is
            # still referenced by another individual is left untouched.
            network[key] = network[key] + noise
    return network

# Evolutionary search over network parameters.
# The population starts as copies of one randomly initialised network whose
# input width matches the training matrix.
nn = Neural_Network(input_size=X_train.shape[0], hidden_1_size=10, hidden_2_size=10, output_size=10)
population = init_pop(nn, pop_size)

# NOTE: each select_parents call scores every individual with a forward pass
# over all of X_train, so this loop is expensive to run.
for generation in range(generations):
    new_population = []

    for _ in range(pop_size // 2):
        # select_parents returns a pair of individuals, so unpack both
        # parents from a single call.
        parent_1, parent_2 = select_parents(population, X_train, Y_train)

        child_1 = crossover(parent_1, parent_2)
        child_1 = mutate(child_1, mutation_rate)

        child_2 = crossover(parent_1, parent_2)
        child_2 = mutate(child_2, mutation_rate)

        new_population.extend([child_1, child_2])
    population = new_population

# Keep the individual with the highest training accuracy in the last generation.
best_network = max(population, key=lambda x: fitness_function(x, X_train, Y_train))

In [None]:
def init_params(input, hidden_1, hidden_2, output, bias = 1):
    """Draw the weights and biases of a three-layer network from [-0.5, 0.5).

    Returns the tuple ``(W1, b1, W2, b2, W3, b3)``; each weight matrix has
    shape (layer outputs, layer inputs) and each bias has shape
    (layer outputs, ``bias``).
    """
    layer_dims = [(hidden_1, input), (hidden_2, hidden_1), (output, hidden_2)]

    params = []
    for n_out, n_in in layer_dims:
        # Weight first, then bias, layer by layer.
        params.append(np.random.rand(n_out, n_in) - 0.5)
        params.append(np.random.rand(n_out, bias) - 0.5)
    return tuple(params)

def forward_prop(W1, b1, W2, b2, W3, b3, X):
    """Run the forward pass and return every intermediate needed for backprop.

    Args:
        W1, b1, W2, b2, W3, b3: layer weights and biases.
        X: input matrix, multiplied as ``W1.dot(X)``.

    Returns:
        ``(Z1, A1, Z2, A2, Z3, A3)``: pre-activations and activations of the
        two PReLU hidden layers, followed by the output scores and their
        softmax.
    """
    Z1 = W1.dot(X) + b1
    A1 = prelu_1.forward(Z1)

    Z2 = W2.dot(A1) + b2
    A2 = prelu_2.forward(Z2)

    # b3 is passed in like the other biases and must be applied here;
    # without it the output bias never influences the prediction.
    Z3 = W3.dot(A2) + b3
    A3 = softmax(Z3)

    return Z1, A1, Z2, A2, Z3, A3

def backward_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y):
    """Compute parameter gradients for one batch and update both PReLU slopes.

    Args:
        Z1, A1, Z2, A2, Z3, A3: values returned by ``forward_prop``.
        W1, W2, W3: layer weights (``W1`` is accepted but not used here).
        X: input matrix of the batch.
        Y: integer labels of the batch.

    Returns:
        ``(dW1, db1, dW2, db2, dW3, db3)``, each averaged over the batch.

    Side effects:
        Overwrites ``prelu_1.alpha`` and ``prelu_2.alpha`` via ``update_alpha``.
    """
    one_hot_Y = one_hot(Y)
    # Average over the samples actually in this batch, not over a global row
    # count that may include rows outside it.
    batch_size = Y.size

    dZ3 = A3 - one_hot_Y
    dW3 = 1 / batch_size * dZ3.dot(A2.T)
    db3 = 1 / batch_size * np.sum(dZ3, axis = 1, keepdims = True)

    dZ2 = W3.T.dot(dZ3) * prelu_2.prime(Z2)
    dW2 = 1 / batch_size * dZ2.dot(A1.T)
    db2 = 1 / batch_size * np.sum(dZ2, axis = 1, keepdims = True)

    dZ1 = W2.T.dot(dZ2) * prelu_1.prime(Z1)
    dW1 = 1 / batch_size * dZ1.dot(X.T)
    db1 = 1 / batch_size * np.sum(dZ1, axis = 1, keepdims = True)

    # Each activation's slope is updated from the gradient of its own layer.
    prelu_1.alpha = update_alpha(prelu_1.alpha, dZ1)
    prelu_2.alpha = update_alpha(prelu_2.alpha, dZ2)

    return dW1, db1, dW2, db2, dW3, db3

In [None]:
# Training algorithms

def gradient_descent(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha):
    """Apply one gradient-descent step, ``p - alpha * dp``, to every parameter.

    Returns the updated ``(W1, b1, W2, b2, W3, b3)`` tuple; the inputs are not
    modified.
    """
    params = (W1, b1, W2, b2, W3, b3)
    grads = (dW1, db1, dW2, db2, dW3, db3)
    return tuple(param - alpha * grad for param, grad in zip(params, grads))

In [None]:
def get_predictions(A3):
    """Return, for each column of ``A3``, the row index holding the largest value."""
    predicted_labels = np.argmax(A3, axis=0)
    return predicted_labels

def get_accuracy(predictions, Y):
    """Return the percentage of entries where ``predictions`` equals ``Y``, rounded to 2 decimals."""
    n_correct = np.sum(predictions == Y)
    percent_correct = (n_correct / Y.size) * 100
    return round(percent_correct, 2)

def train(X, Y, epochs, alpha):
    """Train the network with full-batch gradient descent.

    Args:
        X: input matrix; its number of rows sets the input layer width.
        Y: integer labels for the columns of ``X``.
        epochs: number of gradient-descent steps (0 returns the initial parameters).
        alpha: learning rate passed to ``gradient_descent``.

    Returns:
        The trained ``(W1, b1, W2, b2, W3, b3)``.

    Accuracy is printed every 10 epochs and once more after the last epoch.
    It is computed from the forward pass made before that epoch's update.
    """
    W1, b1, W2, b2, W3, b3 = init_params(X.shape[0], 10, 10, 10) #ONLY CHANGE HIDDEN LAYERS!!!!

    A3 = None  # stays None when no epoch runs, so the final report is skipped
    for e in range(epochs):
        Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)
        dW1, db1, dW2, db2, dW3, db3 = backward_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y)
        W1, b1, W2, b2, W3, b3 = gradient_descent(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha)

        if e % 10 == 0:
            print("Epoch:", e)
            print("Accuracy:", get_accuracy(get_predictions(A3), Y), "%")

    if A3 is not None:
        print("Epoch:", e)
        print("Accuracy:", get_accuracy(get_predictions(A3), Y), "%")

    return W1, b1, W2, b2, W3, b3

In [None]:
# Train on the training split for 1000 epochs with a learning rate of 0.1.
W1, b1, W2, b2, W3, b3 = train(X_train, Y_train, epochs=1000, alpha=0.1)

In [None]:
def make_predictions(X, W1, b1, W2, b2, W3, b3):
    """Return the predicted label for every column of ``X`` under the given parameters."""
    # Only the last element of the forward pass (the softmax output) is needed.
    *_, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)
    return get_predictions(A3)

def test_results(idx, W1, b1, W2, b2, W3, b3):
    """Print the predicted and true label of training sample ``idx`` and show its image."""
    # Indexing with None keeps the sample as a single-column 2-D array.
    sample = X_train[:, idx, None]
    predicted = make_predictions(sample, W1, b1, W2, b2, W3, b3)
    true_label = Y_train[idx]

    print('Prediction:', predicted)
    print('actual:', true_label)

    # Reshape to a 28x28 grid and multiply by 255 for display.
    pixels = sample.reshape((28, 28)) * 255
    plt.gray()
    plt.imshow(pixels)
    plt.show()

In [None]:
# Spot-check two randomly chosen training samples.
for _trial in range(2):
    test_results(np.random.randint(0, Y_train.size), W1, b1, W2, b2, W3, b3)


In [None]:
# Accuracy (in percent) of the trained parameters on the test split.
test_predictions = make_predictions(X_test, W1, b1, W2, b2, W3, b3)
get_accuracy(test_predictions, Y_test)