In [23]:
import numpy as np
np.random.seed(22)

In [24]:
#activation functions 

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))

def relu(x):
    return np.maximum(x, 0)

def relu_prime(x):
    return (x > 0).astype(int)

def tanh(x):
    return np.tanh(x)

def tanh_prime(x):
    return 1 - np.tanh(x)**2


def softmax(X):
    exp_X = np.exp(X - np.max(X, axis=1, keepdims=True))
    return exp_X / np.sum(exp_X, axis=1, keepdims=True)

def softmax_prime(x):
    # compute softmax function
    softmax_output = softmax(x)
    # compute softmax derivative
    softmax_derivative = np.zeros_like(softmax_output)
    for i in range(softmax_output.shape[0]):
        for j in range(softmax_output.shape[1]):
            if i == j:
                softmax_derivative[i, j] = softmax_output[i, j] * (1 - softmax_output[i, j])
            else:
                softmax_derivative[i, j] = -softmax_output[i, j] * softmax_output[i, j]
    return softmax_derivative


In [25]:
#loss functions
def mse(y_true, y_pred):
    return np.mean(np.power(y_true - y_pred, 2))

def mse_prime(y_true, y_pred):
    return 2 * (y_pred - y_true) / y_true.size

def hinge(y_true, y_pred):
    return np.mean(np.maximum(1 - y_true * y_pred, 0))

def hinge_prime(y_true, y_pred):
    temp = 1 - y_true * y_pred
    return -y_true * (temp > 0) / y_true.size

def cross_entropy(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)  # clip to avoid log(0)
    loss = - y_true * np.log(y_pred)
    return loss.mean()

def cross_entropy_prime(y_true, y_pred):
    # compute cross-entropy loss derivative
    return y_pred - y_true

In [26]:
def get_activation(activation):
    if activation == "sigmoid":
        return sigmoid
    if activation == "relu":
        return relu
    if activation == "tanh":
        return tanh
    if activation == "softmax":
        return softmax

def get_activation_derivative(activation):
    if activation == "sigmoid":
        return sigmoid_prime
    if activation == "relu":
        return relu_prime
    if activation == "tanh":
        return tanh_prime
    if activation == "softmax":
        return softmax_prime

def get_loss(loss):
    if loss == "mse":
        return mse
    if loss =="hinge":
        return hinge
    if loss=="cross_entropy":
        return cross_entropy

def get_loss_derivative(loss):
    if loss == "mse":
        return mse_prime
    if loss =="hinge":
        return hinge_prime
    if loss=="cross_entropy":
        return cross_entropy_prime
    

In [27]:

class Layer: 
    def __init__(self):
        pass
    
    def forward_propagation(self, input_data):
        raise NotImplementedError
    
    def backward_propagation(self, output_error, learning_rate):
        raise NotImplementedError

In [28]:
class FCLayer(Layer):
    def __init__(self, input_size, output_size):
        self.weights = np.random.normal(size=(input_size, output_size)) / np.sqrt(input_size)
        self.bias = np.zeros((1, output_size) )
        
    def forward_propagation(self, input_data):
        self.input = input_data
        self.output = np.dot(self.input.reshape(1, -1), self.weights) + self.bias
        return self.output
    
    def backward_propagation(self, output_error, learning_rate):
        input_error = np.dot(output_error, self.weights.T)
        weights_error = np.dot(self.input.T, output_error)
        
        self.weights -= learning_rate * weights_error
        self.bias -= learning_rate * output_error
        
        return input_error

In [29]:
class ActivationLayer(Layer):
    def __init__(self, activation, activation_prime):
        self.activation = activation
        self.activation_prime = activation_prime

    def forward_propagation(self, input_data):
        self.input = input_data
        self.output = self.activation(self.input)
        return self.output

    def backward_propagation(self, output_error, learning_rate):
        if self.activation_prime is None:
            return output_error
        else:
            return output_error * self.activation_prime(self.input)


In [30]:
class Network:
    def __init__(self, epochs=1000, learning_rate=0.001, loss="hinge", activation="softmax", hidden_layer_sizes=(10,2)):
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.loss = loss
        self.activation = activation
        self.hidden_layer_sizes = hidden_layer_sizes
        
        self.layers = []
        
    def fit(self, X, y):
        self.layers.append(FCLayer(X.shape[1], self.hidden_layer_sizes[0]))
        self.layers.append(ActivationLayer(get_activation(self.activation), get_activation_derivative(self.activation)))
        for i in range(1, len(self.hidden_layer_sizes)):
            self.layers.append(FCLayer(self.hidden_layer_sizes[i-1], self.hidden_layer_sizes[i]))
            self.layers.append(ActivationLayer(get_activation(self.activation), get_activation_derivative(self.activation)))
        self.layers.append(FCLayer(self.hidden_layer_sizes[-1], 10))
        self.layers.append(ActivationLayer(get_activation(self.activation),  get_activation_derivative(self.activation)))
        
        for epoch in range(self.epochs):
            for i in range(len(X)):
                # forward propagation
                output = X[i].reshape(1, -1)
                for layer in self.layers:
                    output = layer.forward_propagation(output)
                
                # compute error and loss
                error = get_loss_derivative(self.loss)(y[i], output)
                loss = get_loss(self.loss)(y[i], output)
                
                # backward propagation
                for layer in reversed(self.layers):
                    error = layer.backward_propagation(error, self.learning_rate)
                    
            print("Epoch:", epoch, "Loss:", loss)
                
    def predict(self, X):
        # forward propagate through the network and return the output
        output = X
        for layer in self.layers:
            output = layer.forward_propagation(output)
        return output


In [31]:
import gzip
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
train_set, valid_set, test_set = None , None, None 
with gzip.open('./mnist-1.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set  = pickle.load(f, encoding='latin1')

# Check that the datasets are loaded correctly
print("Train set:", train_set)
print("Valid set:", valid_set)
print("Test set:", test_set)

# Access the train_set variable
X_train, y_train = train_set
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_valid, y_valid = valid_set
print("X_valid shape:", X_valid.shape)
print("y_valid shape:", y_valid.shape)

X_test, y_test = test_set
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Preprocess validation and test data sets
from sklearn.preprocessing import StandardScaler,MinMaxScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.fit_transform(X_valid)
X_test = scaler.fit_transform(X_test)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.fit_transform(X_valid)
X_test = scaler.fit_transform(X_test)

Train set: (array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([5, 0, 4, ..., 8, 4, 8], dtype=int64))
Valid set: (array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([3, 8, 6, ..., 5, 6, 8], dtype=int64))
Test set: (array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([7, 2, 1, ..., 4, 5, 6], dtype=int64))
X_train shape: (50000, 784)
y_train shape: (50000,)
X_valid

In [32]:
# this is under the assumption that the validation data which we loaded earlier is not there in the training data and needs to be appended to the end 
# we append validation set to the end of the training set so validation fraction can take it as validation data when we do grid search
X_train= np.concatenate((X_train,X_valid))
y_train= np.concatenate((y_train,y_valid))
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)



X_train shape: (60000, 784)
y_train shape: (60000,)


In [33]:


# network
net = Network(epochs=20)

# train
net.fit(X_train,y_train.reshape(-1, 1))

# test
out = net.predict(X_train)
print(out)

Epoch: 0 Loss: 0.7961941612917435
Epoch: 1 Loss: 0.8512232229058052
Epoch: 2 Loss: 0.8688672731694563


KeyboardInterrupt: 

In [None]:
print(out)

In [None]:
predicted_number = np.argmax(out[1])
predicted_number

In [None]:
# just wanted to see what the data looks like as an image 
sample_size = 2
images = X_train[:sample_size]
labels = y_train[:sample_size]

# Reshape the images from 1D to 2D arrays
images = images.reshape((-1, 28, 28))

# Plot the images as subplots in a 5x5 grid
fig, axes = plt.subplots(nrows=1, ncols=sample_size, figsize=(8, 8))
for i, ax in enumerate(axes.flat):
    # Plot image
    ax.imshow(images[i], cmap='gray')
    # Set title to the corresponding label
    ax.set_title(str(labels[i]))
    # Remove ticks and labels
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')

# Show the plot
plt.tight_layout()
plt.show()