In [2]:
#import libraries
import numpy as np
import pickle

In [5]:
#read dataset
with open('train.p', 'rb') as f:
    X_train, y_train = pickle.load(f)
with open('test.p', 'rb') as f:
  X_test, y_test = pickle.load(f)
#flatten image arrays into vectors
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
#normalize features
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0
#split into training and testing
print(f"Train features: {X_train.shape}, labels: {y_train.shape}")
print(f"Test features: {X_test.shape}, labels: {y_test.shape}")

Train features: (60000, 784), labels: (60000,)
Test features: (10000, 784), labels: (10000,)


In [12]:
#create dense layer class
class Dense:
    def __init__(self, input_neurons, output_neurons):
        #generate weights and biases, based on given params
        self.weights = 0.1*np.random.randn(input_neurons, output_neurons)
        self.biases = np.zeros((1, output_neurons))
    def forward(self, inputs):
        #store inputs for backprop
        self.inputs = inputs
        #calculate output
        self.output = np.dot(inputs, self.weights) + self.biases
    def backprop(self, dvalues):
        #get gradients for weights, inputs and biases by implementing calculus
        self.dinputs = np.dot(dvalues, self.weights.T)
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        

#create relu activation class
class ReLu:
    def forward(self, inputs):
        #store inputs for backprop
        self.inputs = inputs
        #apply relu activation
        self.output = np.maximum(0, inputs)
    def backprop(self, dvalues):
        self.dinputs = dvalues.copy()
        #apply relu derivative
        self.dinputs[self.inputs <= 0] = 0

#create softmax activation class
class Softmax:
    def forward(self, inputs):
        #subtract max of each sample from each value and perform exponentation
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        #divide by sum of sample, to create probability distribution
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)
    
#create loss class
class Loss_Categorical_Crossentropy:
    def calculate(self, y_pred, y_true):
        #get number of samples in batch
        samples = len(y_pred)
        #clip to avoid log errors
        y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
        #get correct confidences
        correct_confidences = y_pred[range(samples), y_true]
        #return loss for each sample, using negative log
        return -np.mean(np.log(correct_confidences))

#create combined softmax CC class, for ease of backpropagation
class Softmax_Loss_CC:
    def __init__(self):
        #def initializer method
        self.activation = Softmax()
        self.loss = Loss_Categorical_Crossentropy()
    def forward(self, inputs, y_true):
        #do softmax activation
        self.activation.forward(inputs)
        self.output = self.activation.output
        #return loss from softmax
        return self.loss.calculate(self.output, y_true)
    def backprop(self, dvalues, y_true):
        #get number of samples in previous gradient
        samples = len(dvalues)
        self.dinputs = dvalues.copy()
        #apply softmax_loss_CC derivative to calculate gradient
        self.dinputs[range(samples), y_true] -= 1
        #normalize gradient
        self.dinputs /= samples         

In [13]:
#define adam optimizer class
class Optimizer_Adam:
    #set parameters and iterations
    def __init__(self, learning_rate=0.001, epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.iterations = 0

    def update_params(self, layer):
        if not hasattr(layer, 'weight_cache'):
            #initialize momentums and cache
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)
            
        #update momentums using beta_1
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases
        
        #bias correction for momentums
        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))
        
        #update cache using beta_2
        layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2
        
        #bias correction for cache
        weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations + 1))
        bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))
        
        #update weights and biases
        layer.weights += -self.learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
        layer.biases += -self.learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)

    def post_update_params(self):
        #update iterations
        self.iterations += 1

In [14]:
#define neural net architecture
dense1 = Dense(784, 128)
relu1 = ReLu()
dense2 = Dense(128, 64)
relu2 = ReLu()
dense3 = Dense(64, 10)
#define softmax_loss instance and optimizer
softmax_loss = Softmax_Loss_CC()
optimizer = Optimizer_Adam()
#define hyperparams
epochs = 5
batch_size = 32

In [16]:
#define training loop
for epoch in range(epochs):
    #shuffle data
    indices = np.arange(len(X_train))
    np.random.shuffle(indices)
    X_train = X_train[indices]
    y_train = y_train[indices]
    
    #go through data in batches
    for n in range(0, len(X_train), batch_size):
        #get batch data
        X_batch = X_train[n:n+batch_size]
        y_batch = y_train[n:n+batch_size]
        
        #perform forward pass
        dense1.forward(X_batch)
        relu1.forward(dense1.output)
        dense2.forward(relu1.output)
        relu2.forward(dense2.output)
        dense3.forward(relu2.output)
        #get loss
        loss = softmax_loss.forward(dense3.output, y_batch)
        
        #get accuracy
        predictions = np.argmax(softmax_loss.output, axis=1)
        accuracy = np.mean(predictions == y_batch)     
        
        #perform backpropagayion
        softmax_loss.backprop(softmax_loss.output, y_batch)
        dense3.backprop(softmax_loss.dinputs)
        relu2.backprop(dense3.dinputs)
        dense2.backprop(relu2.dinputs)
        relu1.backprop(dense2.dinputs)
        dense1.backprop(relu1.dinputs)
        
        #update weights and biases
        optimizer.update_params(dense1)
        optimizer.update_params(dense2)
        optimizer.update_params(dense3)
        optimizer.post_update_params()
    
    #display progress
    print(f'Epoch {epoch+1}/{epochs}, loss:{loss:.5f}, accuracy: {accuracy * 100:.3f}%')

Epoch 1/5, loss:0.28508, accuracy: 90.625%
Epoch 2/5, loss:0.00719, accuracy: 100.000%
Epoch 3/5, loss:0.08206, accuracy: 96.875%
Epoch 4/5, loss:0.02417, accuracy: 100.000%
Epoch 5/5, loss:0.00398, accuracy: 100.000%


In [17]:
#perform forward pass on testing data
dense1.forward(X_test)
relu1.forward(dense1.output)
dense2.forward(relu1.output)
relu2.forward(dense2.output)
dense3.forward(relu2.output)
#calculate test loss
test_loss = softmax_loss.forward(dense3.output, y_test)
print(f'Test Loss: {test_loss:.5f}') 
#calculate test accuracy
predictions = np.argmax(softmax_loss.output, axis=1) 
accuracy = np.mean(predictions == y_test)
print(f'Test Accuracy: {accuracy * 100:.3f}%')

Test Loss: 0.08551
Test Accuracy: 97.290%
