In [11]:
import numpy as np
np.random.seed(22)

In [12]:
# Activation functions and their derivatives
def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1-np.tanh(x)**2

def relu(x):
    return np.maximum(0,x)

def relu_derivative(x):
    x[x<=0] = 0
    x[x>0] = 1
    return x

def leaky_relu(x):
    return np.maximum(0.01*x,x)

def leaky_relu_derivative(x):
    x[x<=0] = 0.01
    x[x>0] = 1
    return x

def sigmoid(x):
    return 1/(1+np.exp(-x))

def sigmoid_derivative(x):
    return sigmoid(x)*(1-sigmoid(x))

def swish(x):
    return x*sigmoid(x)

def swish_derivative(x):
    return sigmoid(x) + x*sigmoid_derivative(x)

def softmax(x):
    x = np.clip(x, -100, 100)  
    return np.exp(x) / np.sum(np.exp(x), axis=0)

def softmax_derivative(x):
    s = softmax(x)
    return s * (1 - s)

# Loss functions and their derivatives
def mse(y_true, y_pred):
    return np.mean(np.power(y_true-y_pred, 2))

def mse_derivative(y_true, y_pred):
    return 2*(y_pred-y_true)/y_true.size

def cross_entropy(y_true, y_pred):
    return -np.log(y_pred[y_true==1]).mean()

def cross_entropy_derivative(y_true, y_pred):
    return y_pred-y_true

In [13]:
def get_activation(activation):
    if activation == "sigmoid":
        return sigmoid
    if activation == "relu":
        return relu
    if activation == "tanh":
        return tanh
    if activation == "softmax":
        return softmax

def get_activation_derivative(activation):
    if activation == "sigmoid":
        return sigmoid_derivative
    if activation == "relu":
        return relu_derivative
    if activation == "tanh":
        return tanh_derivative
    if activation == "softmax":
        return softmax_derivative

def get_loss(loss):
    if loss == "mse":
        return mse
    if loss=="cross_entropy":
        return cross_entropy

def get_loss_derivative(loss):
    if loss == "mse":
        return mse_derivative
    if loss=="cross_entropy":
        return cross_entropy_derivative
    

In [14]:
import numpy as np

# Base class
class Layer:
    def __init__(self):
        self.input = None
        self.output = None

    # computes the output Y of a layer for a given input X
    def forward_propagation(self, input):
        raise NotImplementedError

    # computes dE/dX for a given dE/dY (and update parameters if any)
    def backward_propagation(self, output_error, learning_rate):
        raise NotImplementedError

In [15]:
# inherit from base class Layer
class FCLayer(Layer):
    # input_size = number of input neurons
    # output_size = number of output neurons
    def __init__(self, input_size, output_size):
        # He normal initialization
        self.weights = np.random.randn(output_size, input_size) * np.sqrt(2 / input_size)
        # self.weights = np.random.rand(output_size, input_size) - 0.5
        self.bias = np.zeros((output_size, 1))
        # self.bias = np.random.rand(output_size, 1) - 0.5

    # returns output for a given input
    def forward_propagation(self, input_data):
        self.input = input_data
        self.output = np.dot(self.weights, self.input) + self.bias
        return self.output

    # computes dL/dW, dL/dB for a given output_error=dL/dY. Returns input_error=dL/dX.
    def backward_propagation(self, output_error, learning_rate):
        weights_error = np.outer(output_error, self.input)  # dL/dW = dL/dY * dY/dW
        input_error = np.dot(self.weights.T, output_error)  # dL/dX = dL/dY * dY/dX
        bias_error = output_error                       # dL/dB = dL/dY * dY/dB

        # update parameters

        self.weights -= learning_rate *weights_error
        self.bias -= learning_rate*bias_error
        return input_error

In [16]:
# inherit from base class Layer
class ActivationLayer(Layer):
    def __init__(self, activation, activation_derivative):
        self.activation = activation
        self.activation_derivative = activation_derivative

    # returns the activated input
    def forward_propagation(self, input_data):
        self.input = input_data
        self.output = self.activation(self.input)
        return self.output

    # Returns input_error=dE/dX for a given output_error=dE/dY.
    # learning_rate is not used because there is no "learnable" parameters.
    def backward_propagation(self, output_error, learning_rate):
        return self.activation_derivative(self.input) * output_error

In [17]:
class Network:
    def __init__(self, epochs=1000, learning_rate=0.001, loss_type="cross_entropy", activation_type="relu",hidden_layer_sizes=(100,10)):
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.loss = get_loss(loss_type)
        self.loss_derivative = get_loss_derivative(loss_type)
        self.activation = get_activation(activation_type)
        self.activation_derivative = get_activation_derivative(activation_type)
        self.hidden_layer_sizes = hidden_layer_sizes
        self.layers = []
        
    def use(self, loss, loss_derivative):
        self.loss = loss
        self.loss_derivative = loss_derivative
    def add(self, layer):
        self.layers.append(layer)
   
    def fit(self, X_train, y_train):
        self.layers = []
        self.layers.append(FCLayer(X_train.shape[1], self.hidden_layer_sizes[0]))
        self.layers.append(ActivationLayer(self.activation, self.activation_derivative))
        #add hidden layers in a loop
        for i in range(1, len(self.hidden_layer_sizes)):
            self.layers.append(FCLayer(self.hidden_layer_sizes[i-1], self.hidden_layer_sizes[i]))
            self.layers.append(ActivationLayer(self.activation, self.activation_derivative))

        #output is always softmax 
        self.layers.append(FCLayer(self.hidden_layer_sizes[-1], 10))
        self.layers.append(ActivationLayer(softmax,  softmax_derivative))
        
         # sample dimension first
        samples = len(X_train)

        # training loop
        for i in range(self.epochs):
            err = 0
            for j in range(samples):
                # forward propagation
                output = X_train[j]
                for layer in self.layers:
                    output = layer.forward_propagation(output)

                # compute loss (for display purpose only)
                err += self.loss(y_train[j], output)

                # backward propagation
                error = self.loss_derivative(y_train[j], output)
                for layer in reversed(self.layers):
                    error = layer.backward_propagation(error, self.learning_rate)

            # calculate average error on all samples
            err /= samples
            print('epoch %d/%d   error=%f' % (i+1, self.epochs, err))
                
      # predict output for given input
    def predict(self, input_data):
        # sample dimension first
        samples = len(input_data)
        result = []
        # run network over all samples
        for i in range(samples):
            # forward propagation
            output = input_data[i].reshape(input_data.shape[1], -1)
            for layer in self.layers:
                output = layer.forward_propagation(output)
            result.append(output)
        return result
    
    def get_accuracy(self, x_test, y_test):
        y_pred = self.predict(x_test)
        y_pred = np.argmax(y_pred , axis=1)
        y_test = np.argmax(y_test , axis=1)
        return np.sum(y_pred == y_test) / len(y_test)


In [18]:
import gzip
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
train_set, valid_set, test_set = None , None, None 
with gzip.open('./mnist-1.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set  = pickle.load(f, encoding='latin1')

# Check that the datasets are loaded correctly
print("Train set:", train_set)
print("Valid set:", valid_set)
print("Test set:", test_set)

# Access the train_set variable
X_train, y_train = train_set
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_valid, y_valid = valid_set
print("X_valid shape:", X_valid.shape)
print("y_valid shape:", y_valid.shape)

X_test, y_test = test_set
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


# Preprocess validation and test data sets
from sklearn.preprocessing import StandardScaler,MinMaxScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).astype("int32")
X_valid = scaler.fit_transform(X_valid).astype("int32")
X_test = scaler.fit_transform(X_test).astype("int32")
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train).astype("int32")
X_valid = scaler.fit_transform(X_valid).astype("int32")
X_test = scaler.fit_transform(X_test).astype("int32")

Train set: (array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([5, 0, 4, ..., 8, 4, 8], dtype=int64))
Valid set: (array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([3, 8, 6, ..., 5, 6, 8], dtype=int64))
Test set: (array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([7, 2, 1, ..., 4, 5, 6], dtype=int64))
X_train shape: (50000, 784)
y_train shape: (50000,)
X_valid

In [19]:
# this is under the assumption that the validation data which we loaded earlier is not there in the training data and needs to be appended to the end 
# we append validation set to the end of the training set so validation fraction can take it as validation data when we do grid search
X_train= np.concatenate((X_train,X_valid))
y_train= np.concatenate((y_train,y_valid))
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train = X_train.reshape(X_train.shape[0], 28*28, 1)

# encode output which is a number in range [0,9] into a vector of size 10
# e.g. number 3 will become [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
y_train = np.eye(10)[y_train].reshape(y_train.shape[0], 10, 1)

# same for test data : 10000 samples
X_test = X_test.reshape(X_test.shape[0], 28*28, 1)

y_test = np.eye(10)[y_test].reshape(y_test.shape[0], 10, 1)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (60000, 784)
y_train shape: (60000,)
X_train shape: (60000, 784, 1)
y_train shape: (60000, 10, 1)


In [20]:
net = Network(epochs=100,learning_rate=0.01,loss_type="cross_entropy",activation_type="relu",hidden_layer_sizes=(100,50))
# net.add(FCLayer(784,100))
# net.add(ActivationLayer(relu, relu_derivative))
# net.add(FCLayer(100, 10))
# net.add(ActivationLayer(softmax, softmax_derivative))

# train
net.fit(X_train[:1000],y_train[:1000])

# test
out = net.predict(X_test)


epoch 1/100   error=2.112854
epoch 2/100   error=1.468231
epoch 3/100   error=0.885944
epoch 4/100   error=0.638340
epoch 5/100   error=0.534915
epoch 6/100   error=0.472573
epoch 7/100   error=0.420666
epoch 8/100   error=0.391426
epoch 9/100   error=0.355265
epoch 10/100   error=0.330445
epoch 11/100   error=0.308192
epoch 12/100   error=0.256440
epoch 13/100   error=0.256399
epoch 14/100   error=0.300688
epoch 15/100   error=0.217338
epoch 16/100   error=0.192713
epoch 17/100   error=0.179184
epoch 18/100   error=0.200410
epoch 19/100   error=0.204236
epoch 20/100   error=0.211320
epoch 21/100   error=0.251977
epoch 22/100   error=0.285090
epoch 23/100   error=0.255106
epoch 24/100   error=0.233522
epoch 25/100   error=0.180813
epoch 26/100   error=0.119551
epoch 27/100   error=0.090659
epoch 28/100   error=0.078871
epoch 29/100   error=0.065214
epoch 30/100   error=0.058388
epoch 31/100   error=0.052380
epoch 32/100   error=0.046919
epoch 33/100   error=0.039904
epoch 34/100   erro

In [21]:
print(f'Accuracy for training samples: {net.get_accuracy(X_train[:1000], y_train[:1000])}')
print(f'Accuracy for test samples: {net.get_accuracy(X_test[:200], y_test[:200])}')

Accuracy for training samples: 0.994
Accuracy for test samples: 0.81
