In [1]:
import pandas as pd # Pandas is a libaray that used to work with datasets
import numpy as np # Manipulating arrays
from matplotlib import pyplot as plt #Pyplot exists as a module inside the mathplotlib (not a standalone package)

In [2]:
#using the training data, train our neural network

data = pd.read_csv('/kaggle/input/mnist-in-csv/mnist_train.csv')

data = np.array(data) #covert pandas dataframe into ndarray- now a 2D array (60,000, 784)


#Assign X and Y 
Y = data[1:10000, 0:1]
X = data[1:10000,1:]
print (Y.shape)
print(X.shape)
m, n = X.shape
print(m,n)

#we want to transpose both of them to be able to work in our calculations
Y = Y.T
X = X.T

#normalize X (input values)
X = X / 255.0

(9999, 1)
(9999, 784)
9999 784


In [3]:
#initalize our weights, and bias point as matrix
# Input layer: 784, Hidden layer: 10, Output layer: 0-9

'''
What should be the dimentions of the matrix?

1. Weights 
    a. Input -> Hidden layer (784, 10) 
    b. Hidden -> Output (10, 10)

2. Bias
    a. Hidden layer (10, 1)
    b. Output layer (10, 1)
'''
# randn: produce random numbers between [0,1]
#subtract 0.5 to generate negative values as well [-0.5,0.5]
def initalize_parameters():
    W1 = np.random.rand(10,784) - 0.5 # each row corresponds to the node's weight respect to the other 784 input layers
    W2 = np.random.rand (10,10) - 0.5
    B1 = np.random.rand (10,1) - 0.5
    B2 = np.random.rand (10,1) - 0.5
    return W1, W2, B1, B2

#we want to use an activation function so that our output isn't just a linear combination of the input
def sigmoid_activation(Z):
    Z = 1/ (1+ np.exp(-Z))
    return Z

# We are using sigmoid activation
# we also have to ensure that they are the same relative to eachother
def softMax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A


def forward_propgation(W1,W2,B1,B2,X): #where X is the 784 values of pixel image
    Z1 = W1.dot(X) + B1 #let the hidden layer values W1: (10, 784) X: (784, 1) -> dimentions of (10,1)
    A1 = sigmoid_activation(Z1)

    Z2 = W2.dot(A1) + B2 #Output later W2: (10,10) A1: (10,1)-> Z2: (10,1)
    A2 = softMax(Z2)
    return Z1, A1, Z2, A2

#one hot encoding: a way to tell your program which output should be right
def one_hot_encode(Y): #Y is your answer vector in dimentions (Y.size, 1): row, coloumn
    #turn your answer vector into a matrix
    matrix = np.zeros((Y.size, Y.max() + 1))
    matrix[np.arange(Y.size), Y] = 1 #arrange creates a array from 0 - Y.size-1, so then u can use to index
    return matrix.T #matrix is now in dimentions (Y.size(), 10): row, coloumn

# we use backward propogation to change the weights and biases, accoriding to input and expected output data
def backward_propogation(Z1, A1, Z2, A2, W2, Y):
    output_encoded = one_hot_encode(Y)
    #keep track of all the small changes
    dZ2 = A2 - output_encoded #the difference between expected and actual values
    dW2 = 1 / m * dZ2.dot(A1.T)
    dB2 = 1 / m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * sigmoid_activation(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    dB1 = 1 / m * np.sum(dZ1)
    return dW1, dB1, dW2, dB2

def learn(W1, B1, W2, B2, dW1, dB1, dW2, dB2, alpha):
    #we want to update the biases and weights, by the learning rate: alpha
    W1 = W1 - alpha * dW1
    B1 = B1 - alpha * dB1    
    W2 = W2 - alpha * dW2  
    B2 = B2 - alpha * dB2    
    return W1, B1, W2, B2

In [4]:
#this will get the largest value from the output (what the neural network thinks the number is)
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y) #print the expected and actual
    return np.sum(predictions == Y) / Y.size

In [5]:
def gradient_descent(X,Y,alpha, iterations):
    #start with an inital value for weights and biases
    W1, W2, B1, B2 = initalize_parameters()

    for i in range (iterations):
        #we want to first forward propogate
        Z1, A1, Z2, A2= forward_propgation(W1,W2,B1,B2,X) 
        #back propogation to update out weights and biases
        dW1, dB1, dW2, dB2 = backward_propogation(Z1, A1, Z2, A2, W2, Y)
        #depending on the error - change the value of weights and biases
        W1, b1, W2, b2= learn(W1, B1, W2, B2, dW1, dB1, dW2, dB2, alpha)
        #we only want to print every 100 iterations
        if i%100 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(A2)
            print(get_accuracy(predictions, Y))
    return W1, B1, W2, B2

In [6]:
W1, b1, W2, b2 = gradient_descent(X, Y, 0.10, 5000)

Iteration:  0
[6 6 6 ... 6 2 2] [[0 4 1 ... 6 9 7]]
0.0852085208520852
Iteration:  100
[0 6 1 ... 1 9 9] [[0 4 1 ... 6 9 7]]
0.3684368436843684
Iteration:  200
[0 7 1 ... 1 9 7] [[0 4 1 ... 6 9 7]]
0.3708370837083708
Iteration:  300
[0 4 1 ... 1 9 9] [[0 4 1 ... 6 9 7]]
0.38793879387938796
Iteration:  400
[0 4 1 ... 1 9 9] [[0 4 1 ... 6 9 7]]
0.40404040404040403
Iteration:  500
[0 4 1 ... 2 9 9] [[0 4 1 ... 6 9 7]]
0.45454545454545453
Iteration:  600
[0 4 1 ... 2 9 9] [[0 4 1 ... 6 9 7]]
0.45064506450645064
Iteration:  700
[0 4 1 ... 2 9 9] [[0 4 1 ... 6 9 7]]
0.45584558455845586
Iteration:  800
[0 4 1 ... 8 3 9] [[0 4 1 ... 6 9 7]]
0.4823482348234823
Iteration:  900
[0 4 1 ... 8 3 9] [[0 4 1 ... 6 9 7]]
0.4878487848784879
Iteration:  1000
[0 4 1 ... 8 3 9] [[0 4 1 ... 6 9 7]]
0.49784978497849786
Iteration:  1100
[0 4 1 ... 8 3 9] [[0 4 1 ... 6 9 7]]
0.5138513851385138
Iteration:  1200
[0 4 1 ... 8 3 9] [[0 4 1 ... 6 9 7]]
0.5177517751775178
Iteration:  1300
[0 4 1 ... 8 3 7] [[0 4 1 .