In [None]:
import numpy as np
import pandas as pd
import os
import copy
from matplotlib import pyplot as plt

%matplotlib inline

**Data loading, plotting, normalization**

In [None]:
# Training split: column 0 holds the integer class label, the remaining
# columns hold the pixel values of one image per row.
data = pd.read_csv('fashion-mnist_train.csv')

X = data.iloc[:, 1:]
# Series.ravel() is deprecated in recent pandas releases; to_numpy() returns
# the same 1-D label array without the FutureWarning.
y = data.iloc[:, 0].to_numpy()

In [None]:
# Test split, same column layout as the training file (label first, then pixels).
test = pd.read_csv('fashion-mnist_test.csv')
X_test = test.iloc[:, 1:]
# Series.ravel() is deprecated in recent pandas releases; to_numpy() returns
# the same 1-D label array without the FutureWarning.
y_test = test.iloc[:, 0].to_numpy()

In [None]:
# Convert the pixel columns to NumPy arrays and scale them by 1/255.
# The arrays are derived from the loaded frames (`data`, `test`) rather than
# from X / X_test themselves, so re-running this cell neither fails on an
# ndarray (which has no .to_numpy()) nor divides by 255 a second time.
X = data.iloc[:, 1:].to_numpy() / 255
X_test = test.iloc[:, 1:].to_numpy() / 255

In [None]:
# Maps each integer class id (0-9) to its human-readable name; the position
# of a name in the tuple is its class id.
fashion_mnist_labels = dict(enumerate((
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
)))

In [None]:
# Human-readable class name for every training label, in the same order as y.
y_a = [fashion_mnist_labels[label] for label in y]

In [None]:
# Randomly select 100 training images to display in a 10x10 grid.
fig = plt.figure(figsize=(25, 25))
# Draw the indices from the full range of X instead of a hard-coded upper
# bound, so every row of the training set can be picked.
sample_indices = np.random.randint(0, len(X), size=100)
for position, i in enumerate(sample_indices, start=1):
    ax = fig.add_subplot(10, 10, position)
    # Each row of X is reshaped to a 28x28 image for display.
    ax.imshow(X[i, :].reshape(28, 28), cmap=plt.get_cmap('gray'))
    ax.set_title('y: {y}'.format(y=y_a[i]))
    ax.axis('off')


**Writing different Classes/Functions that will be used throughout the code:**

In [None]:
class Dense:
    """Fully connected layer with an optional L2 penalty on its parameters.

    The layer computes ``inputs @ weights + biases``. The L2 strength ``l2``
    is only used when computing parameter gradients (and by the loss object
    that reads ``layer.l2``); it never alters the layer's output.
    """

    # Layer initialization
    def __init__(self, n_inputs, n_neurons, l2=0):
        """Create the layer.

        Args:
            n_inputs: number of input features per sample.
            n_neurons: number of output units.
            l2: L2 regularization strength applied to weights and biases.
        """
        # Small uniform random weights, zero biases
        self.weights = 0.01 * np.random.uniform(-0.12, 0.12, (n_inputs, n_neurons))
        self.biases = np.zeros((1, n_neurons))
        # Set regularization strength
        self.l2 = l2

    # Forward pass
    def forward(self, inputs):
        """Store ``inputs`` and compute ``self.output``.

        The regularization penalty is a term of the loss, not of the
        activations, so it is deliberately NOT added here: adding a scalar
        penalty to every output would shift the values fed to the next
        activation while ``backward`` has no matching gradient for it.
        """
        # Remember input values for the backward pass
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backward pass
    def backward(self, dvalues):
        """Compute ``dweights``, ``dbiases`` and ``dinputs`` from ``dvalues``.

        Args:
            dvalues: gradient of the loss with respect to this layer's output.
        """
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # Gradient of the L2 penalty l2 * sum(p**2) is 2 * l2 * p
        if self.l2 > 0:
            self.dweights += 2 * self.l2 * self.weights
            self.dbiases += 2 * self.l2 * self.biases

        # Gradient on values
        self.dinputs = np.dot(dvalues, self.weights.T)

    # Retrieve layer parameters
    def get_parameters(self):
        """Return the ``(weights, biases)`` pair."""
        return self.weights, self.biases

    # Set weights and biases in a layer instance
    def set_parameters(self, weights, biases):
        """Replace the layer's weights and biases with the given arrays."""
        self.weights = weights
        self.biases = biases


In [None]:
# ReLU activation Function
class ReLU:
    """Rectified linear unit: passes positive inputs through, zeroes the rest."""

    def forward(self, inputs):
        """Remember ``inputs`` and store ``max(0, inputs)`` in ``self.output``."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Store in ``self.dinputs`` a copy of ``dvalues`` with the gradient
        set to zero wherever the remembered input was not positive."""
        blocked = self.inputs <= 0
        gradient = dvalues.copy()
        gradient[blocked] = 0
        self.dinputs = gradient

    def predictions(self, outputs):
        """Return ``outputs`` unchanged."""
        return outputs

In [None]:
#Sigmoid Activation
class Sigmoid:
    """Logistic activation, sigma(x) = 1 / (1 + exp(-x))."""

    def forward(self, inputs):
        """Remember ``inputs`` and store sigma(inputs) in ``self.output``."""
        self.inputs = inputs
        denominator = 1 + np.exp(-inputs)
        self.output = 1 / denominator

    def backward(self, dvalues):
        """Chain rule through the sigmoid, using sigma' = sigma * (1 - sigma)."""
        sigma = self.output
        scaled = dvalues * (1 - sigma)
        self.dinputs = scaled * sigma

    def predictions(self, outputs):
        """Return 1 where ``outputs`` exceeds 0.5 and 0 elsewhere."""
        above_threshold = outputs > 0.5
        return above_threshold * 1

In [None]:
# Softmax activation
class Softmax:
    """Row-wise softmax: turns each row of scores into a probability vector."""

    # Forward pass
    def forward(self, inputs):
        """Remember ``inputs`` and store the per-row softmax in ``self.output``."""
        self.inputs = inputs
        # Subtracting the row maximum leaves the result unchanged but keeps
        # the exponentials from overflowing.
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        # Normalize each row so it sums to 1
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    # Backward pass
    def backward(self, dvalues):
        """Store the gradient with respect to the inputs in ``self.dinputs``.

        For one sample with output s and upstream gradient d, the softmax
        Jacobian is diag(s) - s s^T, so J @ d = s * (d - sum(s * d)).
        Evaluating that expression for all rows at once gives the same
        values as building one Jacobian per sample in a Python loop, without
        the per-sample overhead.
        """
        # Per-row dot product of the output with the incoming gradient
        weighted_sum = np.sum(self.output * dvalues, axis=1, keepdims=True)
        self.dinputs = self.output * (dvalues - weighted_sum)

    # Calculate predictions for outputs
    def predictions(self, outputs):
        """Return the index of the largest value in each row of ``outputs``."""
        return np.argmax(outputs, axis=1)


In [None]:
# Adam optimizer
class Optimizer_Adam:
    """Adam optimizer that keeps its running moment estimates on each layer.

    Usage per training step: ``pre_update_params()``, then
    ``update_params(layer)`` for every trainable layer, then
    ``post_update_params()``.
    """

    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7,
                 beta_1=0.9, beta_2=0.999):
        """Store the hyperparameters and reset the step counter."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        """Apply learning-rate decay (when ``decay`` is non-zero) for this step."""
        if self.decay:
            shrink = 1. / (1. + self.decay * self.iterations)
            self.current_learning_rate = self.learning_rate * shrink

    def _delta(self, momentum_hat, cache_hat):
        """Parameter change for bias-corrected first and second moments."""
        return -self.current_learning_rate * momentum_hat / \
            (np.sqrt(cache_hat) + self.epsilon)

    def update_params(self, layer):
        """Update ``layer.weights`` / ``layer.biases`` in place from
        ``layer.dweights`` / ``layer.dbiases``."""
        # First visit to this layer: attach zero-filled moment buffers.
        if not hasattr(layer, 'weight_cache'):
            for prefix, parameter in (('weight', layer.weights),
                                      ('bias', layer.biases)):
                setattr(layer, prefix + '_momentums', np.zeros_like(parameter))
                setattr(layer, prefix + '_cache', np.zeros_like(parameter))

        # Bias-correction denominators for the step about to be taken.
        step = self.iterations + 1
        momentum_correction = 1 - self.beta_1 ** step
        cache_correction = 1 - self.beta_2 ** step

        # Running average of the gradients (first moment).
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + \
            (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + \
            (1 - self.beta_1) * layer.dbiases

        # Running average of the squared gradients (second moment).
        layer.weight_cache = self.beta_2 * layer.weight_cache + \
            (1 - self.beta_2) * layer.dweights**2
        layer.bias_cache = self.beta_2 * layer.bias_cache + \
            (1 - self.beta_2) * layer.dbiases**2

        # Step each parameter using the bias-corrected moments.
        layer.weights += self._delta(layer.weight_momentums / momentum_correction,
                                     layer.weight_cache / cache_correction)
        layer.biases += self._delta(layer.bias_momentums / momentum_correction,
                                    layer.bias_cache / cache_correction)

    def post_update_params(self):
        """Advance the step counter; call once after all layers are updated."""
        self.iterations += 1


In [None]:
class Loss:
    """Base class for losses.

    Subclasses implement ``forward(output, y)`` returning one loss value per
    sample. This class adds L2 regularization penalties and a running
    (accumulated) mean of the data loss.
    """

    def __init__(self):
        # Start with no registered layers and empty accumulators so that
        # calculate() can be used without calling new_pass() first.
        self.trainable_layers = []
        self.new_pass()

    # Regularization loss calculation
    def regularization_loss(self, layer):
        """Return the L2 penalty of one layer.

        The penalty is ``l2 * sum(weights**2) + l2 * sum(biases**2)`` when
        ``layer.l2`` is positive, and 0 otherwise.
        """
        regularization_loss = 0

        if layer.l2 > 0:
            # L2 regularization - weights (λ* |w|**2)
            regularization_loss += layer.l2 * \
                np.sum(layer.weights * layer.weights)
            # L2 regularization - biases  (λ* |w0|**2)
            regularization_loss += layer.l2 * \
                np.sum(layer.biases * layer.biases)

        return regularization_loss

    def _total_regularization_loss(self):
        """Sum ``regularization_loss`` over all remembered trainable layers."""
        return sum(self.regularization_loss(layer)
                   for layer in self.trainable_layers)

    # Set/remember trainable layers
    def remember_trainable_layers(self, trainable_layers):
        """Register the layers whose penalties are included on request."""
        self.trainable_layers = trainable_layers

    # Calculates the data and regularization losses
    # given model output and ground truth values
    def calculate(self, output, y, *, include_regularization=False):
        """Return the mean data loss for a batch and update the accumulators.

        Args:
            output: model output passed to ``forward``.
            y: ground-truth values passed to ``forward``.
            include_regularization: when true, return a
                ``(data_loss, regularization_loss)`` tuple instead, where the
                second item covers all remembered trainable layers.
        """
        # Calculate sample losses
        sample_losses = self.forward(output, y)

        # Calculate mean loss
        data_loss = np.mean(sample_losses)

        # Add accumulated sum of losses and sample count
        self.accumulated_sum += np.sum(sample_losses)
        self.accumulated_count += len(sample_losses)

        # If just data loss - return it
        if not include_regularization:
            return data_loss

        # Return the data and regularization losses
        return data_loss, self._total_regularization_loss()

    # Calculates accumulated loss
    def calculate_accumulated(self, *, include_regularization=False):
        """Return the mean data loss over everything seen since ``new_pass``.

        With ``include_regularization=True`` a
        ``(data_loss, regularization_loss)`` tuple is returned. Requires at
        least one ``calculate`` call since the last reset, otherwise the
        division below fails on a zero count.
        """
        # Calculate mean loss
        data_loss = self.accumulated_sum / self.accumulated_count

        # If just data loss - return it
        if not include_regularization:
            return data_loss

        # Return the data and regularization losses
        return data_loss, self._total_regularization_loss()

    # Reset variables for accumulated loss
    def new_pass(self):
        """Reset the accumulated loss sum and sample count to zero."""
        self.accumulated_sum = 0
        self.accumulated_count = 0

In [None]:
# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
    """Categorical cross-entropy between predicted probabilities and labels.

    Labels may be sparse (1-D array of class indices) or one-hot (2-D array).
    ``regularization_loss`` is inherited from ``Loss``; the identical copy
    that used to live here was removed.
    """

    # Predictions are clipped to [CLIP_EPSILON, 1 - CLIP_EPSILON] so that
    # neither log(0) in forward nor a division by 0 in backward can occur.
    CLIP_EPSILON = 1e-7

    # Forward pass
    def forward(self, y_pred, y_true):
        """Return the negative log-likelihood of the true class per sample.

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        # Number of samples in a batch
        samples = len(y_pred)

        # Clip both sides to not drag mean towards any value
        y_pred_clipped = np.clip(y_pred, self.CLIP_EPSILON,
                                 1 - self.CLIP_EPSILON)

        if len(y_true.shape) == 1:
            # Sparse labels: pick the predicted probability of the true class
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif len(y_true.shape) == 2:
            # One-hot labels: mask out every class except the true one
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError(
                'y_true must be 1-D (class indices) or 2-D (one-hot)')

        # Losses
        return -np.log(correct_confidences)

    # Backward pass
    def backward(self, dvalues, y_true):
        """Store the gradient with respect to the predictions in ``self.dinputs``.

        Args:
            dvalues: predicted probabilities, one row per sample.
            y_true: sparse or one-hot ground-truth labels.
        """
        # Number of samples
        samples = len(dvalues)
        # Number of labels in every sample
        # We'll use the first sample to count them
        labels = len(dvalues[0])

        # If labels are sparse, turn them into one-hot vector
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Use the same clipping as forward: a prediction of exactly 0 would
        # otherwise produce 0/0 = NaN (or a division by zero for the true
        # class) and poison every gradient downstream.
        dvalues_clipped = np.clip(dvalues, self.CLIP_EPSILON,
                                  1 - self.CLIP_EPSILON)

        # Calculate gradient
        self.dinputs = -y_true / dvalues_clipped
        # Normalize gradient
        self.dinputs = self.dinputs / samples



**2 Hidden layers = 25,25 nodes. ReLU Activation. Unregularized:**

*One Forward Pass:*

In [None]:
# Network: 784 -> 25 (ReLU) -> 25 (ReLU) -> 10 (Softmax)
# NOTE(review): the section heading says "Unregularized", yet both hidden
# layers are created with l2=0.1 - confirm which one is intended.
dense1 = Dense(784, 25, l2=0.1)
activation1 = ReLU()

dense2 = Dense(25, 25, l2=0.1)
activation2 = ReLU()
dense3 = Dense(25, 10)
activation3 = Softmax()
loss_function = Loss_CategoricalCrossentropy()

# Single forward pass over the whole training set
dense1.forward(X)
activation1.forward(dense1.output)

dense2.forward(activation1.output)
activation2.forward(dense2.output)

dense3.forward(activation2.output)
activation3.forward(dense3.output)

# Per-sample cross-entropy plus the L2 penalty of every layer. Previously
# only dense1 was counted here, which disagreed with the training loop that
# also adds dense2's penalty.
data_loss = loss_function.forward(activation3.output, y)
reg_loss = sum(loss_function.regularization_loss(layer)
               for layer in (dense1, dense2, dense3))
loss = data_loss + reg_loss
mean_loss = np.mean(loss)

# Created here because the training cell further down uses `optimizer`.
optimizer = Optimizer_Adam()

predictions = np.argmax(activation3.output, axis=1)
accuracy = np.mean(predictions == y)


In [None]:
# Report the loss and accuracy of the single forward pass above.
for caption, value in (("loss: ", mean_loss), ("accuracy: ", accuracy)):
    print(caption, value)

*Backprop*

In [None]:
# Backward pass: start from the loss gradient and hand each layer's
# `dinputs` to the layer before it, from the output layer back to dense1.
loss_function.backward(activation3.output, y)
grad = loss_function.dinputs
for layer in (activation3, dense3, activation2, dense2, activation1, dense1):
    layer.backward(grad)
    grad = layer.dinputs

**1000 epochs:**

In [None]:
# Layers in forward order; the backward pass walks the same tuple in reverse.
network = (dense1, activation1, dense2, activation2, dense3, activation3)

l = []  # mean training loss (data + L2 penalty) per epoch, read by the plot cell
for epoch in range(1000):

    # Forward pass: every layer consumes the previous layer's output.
    signal = X
    for layer in network:
        layer.forward(signal)
        signal = layer.output

    # Per-sample cross-entropy plus the scalar L2 penalty of dense1 and dense2.
    data_loss = loss_function.forward(activation3.output, y)
    reg_loss = (loss_function.regularization_loss(dense1)
                + loss_function.regularization_loss(dense2))
    loss = data_loss + reg_loss
    mean_loss = np.mean(loss)
    l.append(mean_loss)

    # Training accuracy: share of samples whose arg-max class equals the label.
    predictions = np.argmax(activation3.output, axis=1)
    accuracy = np.mean(predictions == y)

    # Progress line every 100 epochs.
    if epoch % 100 == 0:
        print(f'epoch:{epoch},acc:{accuracy:.3f}, loss: {mean_loss:.3f},')

    # Backward pass: start from the loss gradient, then chain `dinputs`.
    loss_function.backward(activation3.output, y)
    grad = loss_function.dinputs
    for layer in reversed(network):
        layer.backward(grad)
        grad = layer.dinputs

    # One Adam step on every dense layer.
    optimizer.pre_update_params()
    for layer in (dense1, dense2, dense3):
        optimizer.update_params(layer)
    optimizer.post_update_params()


# Evaluate the trained network on the test set (cross-entropy only).
signal = X_test
for layer in network:
    layer.forward(signal)
    signal = layer.output

loss_t = loss_function.forward(activation3.output, y_test)
mean_loss_t = np.mean(loss_t)

predictions_t = np.argmax(activation3.output, axis=1)
accuracy_t = np.mean(predictions_t == y_test)

print("Test Set Accuracy: ", accuracy_t)
print("Test Loss: ", mean_loss_t)

In [None]:
# Loss curve: x is the 1-based epoch number, y the value recorded in `l`.
epoch_numbers = np.arange(1, len(l) + 1, 1)
plt.plot(epoch_numbers, l)
plt.xlabel("Number of Epochs")
plt.ylabel("Loss (Cross-Entropy)")

**Regularized, lambda = 0.1**

In [None]:
# ReLU network with L2 strength 0.1 on every dense layer.
# Dense takes a single `l2` argument that covers both weights and biases; the
# former `weight_regularizer_l2` / `bias_regularizer_l2` keywords are not
# parameters of Dense and raised a TypeError.
dense1 = Dense(784, 25, l2=0.1)
activation1 = ReLU()

dense2 = Dense(25, 25, l2=0.1)
activation2 = ReLU()
dense3 = Dense(25, 10, l2=0.1)
activation3 = Softmax()
loss_function = Loss_CategoricalCrossentropy()
# Fresh optimizer so the step counter starts at zero for this experiment.
optimizer = Optimizer_Adam()


In [None]:
# Layers in forward order; the backward pass walks the same tuple in reverse.
network = (dense1, activation1, dense2, activation2, dense3, activation3)

l = []  # mean cross-entropy per epoch, read by the plot cell
for epoch in range(2000):

    # Forward pass: every layer consumes the previous layer's output.
    signal = X
    for layer in network:
        layer.forward(signal)
        signal = layer.output

    # Only the cross-entropy term is recorded; no L2 penalty is added here.
    loss = loss_function.forward(activation3.output, y)
    mean_loss = np.mean(loss)
    l.append(mean_loss)

    # Training accuracy: share of samples whose arg-max class equals the label.
    predictions = np.argmax(activation3.output, axis=1)
    accuracy = np.mean(predictions == y)

    # Progress line every 100 epochs.
    if epoch % 100 == 0:
        print(f'epoch:{epoch},acc:{accuracy:.3f}, loss: {mean_loss:.3f},')

    # Backward pass: start from the loss gradient, then chain `dinputs`.
    loss_function.backward(activation3.output, y)
    grad = loss_function.dinputs
    for layer in reversed(network):
        layer.backward(grad)
        grad = layer.dinputs

    # One Adam step on every dense layer. pre_update_params() is not called
    # in this cell, so the optimizer's current learning rate is left as-is.
    for layer in (dense1, dense2, dense3):
        optimizer.update_params(layer)
    optimizer.post_update_params()


# Evaluate the trained network on the test set.
signal = X_test
for layer in network:
    layer.forward(signal)
    signal = layer.output

loss_t = loss_function.forward(activation3.output, y_test)
mean_loss_t = np.mean(loss_t)

predictions_t = np.argmax(activation3.output, axis=1)
accuracy_t = np.mean(predictions_t == y_test)

print("Test Set Accuracy: ", accuracy_t)
print("Test Loss: ", mean_loss_t)

In [None]:
# Loss curve: x is the 1-based epoch number, y the value recorded in `l`.
epoch_numbers = np.arange(1, len(l) + 1, 1)
plt.plot(epoch_numbers, l)
plt.xlabel("Number of Epochs")
plt.ylabel("Loss (Cross-Entropy)")

In [None]:
# Show the first 15 test images with their true and predicted class ids.
fig = plt.figure(figsize=(25, 25))
gray = plt.get_cmap('gray')
for i in range(15):
    ax = fig.add_subplot(5, 8, i + 1)
    image = X_test[i].reshape(28, 28)
    ax.imshow(image, cmap=gray)
    ax.set_title(f'y: {y_test[i]}/ y_pred: {predictions_t[i]}')
    # `ax` is the current axes right after add_subplot, so this hides the
    # same axes that plt.axis('off') would.
    ax.axis('off')

**2 Hidden layers = 25,25 nodes. Sigmoid Activation. Unregularized:**

In [None]:
# Sigmoid network without regularization.
# Dense takes a single `l2` argument that covers both weights and biases; the
# former `weight_regularizer_l2` / `bias_regularizer_l2` keywords are not
# parameters of Dense and raised a TypeError.
dense1 = Dense(784, 25, l2=0)
activation1 = Sigmoid()

dense2 = Dense(25, 25, l2=0)
activation2 = Sigmoid()
dense3 = Dense(25, 10, l2=0)
activation3 = Softmax()
loss_function = Loss_CategoricalCrossentropy()
# Fresh optimizer for this experiment. Reusing the one from the previous
# section would carry over its `iterations` counter, so Adam's bias
# correction would be computed as if these brand-new layers had already been
# trained for that many steps.
optimizer = Optimizer_Adam()


In [None]:
# Layers in forward order; the backward pass walks the same tuple in reverse.
network = (dense1, activation1, dense2, activation2, dense3, activation3)

l = []  # mean cross-entropy per epoch, read by the plot cell
for epoch in range(2000):

    # Forward pass: every layer consumes the previous layer's output.
    signal = X
    for layer in network:
        layer.forward(signal)
        signal = layer.output

    # Only the cross-entropy term is recorded; no L2 penalty is added here.
    loss = loss_function.forward(activation3.output, y)
    mean_loss = np.mean(loss)
    l.append(mean_loss)

    # Training accuracy: share of samples whose arg-max class equals the label.
    predictions = np.argmax(activation3.output, axis=1)
    accuracy = np.mean(predictions == y)

    # Progress line every 100 epochs.
    if epoch % 100 == 0:
        print(f'epoch:{epoch},acc:{accuracy:.3f}, loss: {mean_loss:.3f},')

    # Backward pass: start from the loss gradient, then chain `dinputs`.
    loss_function.backward(activation3.output, y)
    grad = loss_function.dinputs
    for layer in reversed(network):
        layer.backward(grad)
        grad = layer.dinputs

    # One Adam step on every dense layer.
    optimizer.pre_update_params()
    for layer in (dense1, dense2, dense3):
        optimizer.update_params(layer)
    optimizer.post_update_params()


# Evaluate the trained network on the test set.
signal = X_test
for layer in network:
    layer.forward(signal)
    signal = layer.output

loss_t = loss_function.forward(activation3.output, y_test)
mean_loss_t = np.mean(loss_t)

predictions_t = np.argmax(activation3.output, axis=1)
accuracy_t = np.mean(predictions_t == y_test)

print("Test Set Accuracy: ", accuracy_t)
print("Test Loss: ", mean_loss_t)

In [None]:
# Loss curve: x is the 1-based epoch number, y the value recorded in `l`.
epoch_numbers = np.arange(1, len(l) + 1, 1)
plt.plot(epoch_numbers, l)
plt.xlabel("Number of Epochs")
plt.ylabel("Loss (Cross-Entropy)")

In [None]:
# Sigmoid network with L2 strength 0.1 on every dense layer.
# Dense takes a single `l2` argument that covers both weights and biases; the
# former `weight_regularizer_l2` / `bias_regularizer_l2` keywords are not
# parameters of Dense and raised a TypeError.
dense1 = Dense(784, 25, l2=0.1)
activation1 = Sigmoid()

dense2 = Dense(25, 25, l2=0.1)
activation2 = Sigmoid()
dense3 = Dense(25, 10, l2=0.1)
activation3 = Softmax()
loss_function = Loss_CategoricalCrossentropy()
# Fresh optimizer so the step counter starts at zero for this experiment.
optimizer = Optimizer_Adam()

In [None]:
# Layers in forward order; the backward pass walks the same tuple in reverse.
network = (dense1, activation1, dense2, activation2, dense3, activation3)

l = []  # mean cross-entropy per epoch
for epoch in range(2000):

    # Forward pass: every layer consumes the previous layer's output.
    signal = X
    for layer in network:
        layer.forward(signal)
        signal = layer.output

    # Only the cross-entropy term is recorded; no L2 penalty is added here.
    loss = loss_function.forward(activation3.output, y)
    mean_loss = np.mean(loss)
    l.append(mean_loss)

    # Training accuracy: share of samples whose arg-max class equals the label.
    predictions = np.argmax(activation3.output, axis=1)
    accuracy = np.mean(predictions == y)

    # Progress line every 100 epochs.
    if epoch % 100 == 0:
        print(f'epoch:{epoch},acc:{accuracy:.3f}, loss: {mean_loss:.3f},')

    # Backward pass: start from the loss gradient, then chain `dinputs`.
    loss_function.backward(activation3.output, y)
    grad = loss_function.dinputs
    for layer in reversed(network):
        layer.backward(grad)
        grad = layer.dinputs

    # One Adam step on every dense layer.
    optimizer.pre_update_params()
    for layer in (dense1, dense2, dense3):
        optimizer.update_params(layer)
    optimizer.post_update_params()


# Evaluate the trained network on the test set.
signal = X_test
for layer in network:
    layer.forward(signal)
    signal = layer.output

loss_t = loss_function.forward(activation3.output, y_test)
mean_loss_t = np.mean(loss_t)

predictions_t = np.argmax(activation3.output, axis=1)
accuracy_t = np.mean(predictions_t == y_test)

print("Test Set Accuracy: ", accuracy_t)
print("Test Loss: ", mean_loss_t)