In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
# Load the dataset. np.load on an .npz archive returns a lazy NpzFile that keeps
# the file open, so the arrays are read inside a context manager which closes
# the archive as soon as they are in memory.
with np.load("mnist.npz") as data:
	# assign dataset
	x_train = data["x_train"] # training images (presumably (60000, 28, 28) before the flattening below -- TODO confirm)
	y_train = data["y_train"] # training labels, used below as integer class indices

	x_test = data["x_test"]
	y_test = data["y_test"]

# data normalization and flattening
# (dividing by 255 assumes 8-bit pixel intensities -- TODO confirm)
x_train = x_train / 255.0
x_train = x_train.reshape(-1, 28*28) # one row of 784 features per image

x_test = x_test / 255.0
x_test = x_test.reshape(-1, 28*28)

# one-hot encode the labels: row k of the 10x10 identity matrix encodes class k
y_train_onehot = np.eye(10)[y_train]
y_test_onehot = np.eye(10)[y_test]

In [None]:
def initialization(X, hidden, output=10): # function initialisation of weights of the ANN
	"""
	initialization(X, hidden, output=10) -> weights

	Build the parameters of a network with one hidden layer.

	arg :	- X = features matrix; only X.shape[1] (the number of input features) is read.
			- hidden = the number of hidden neurons.
			- output = the number of output neurons / classes (default 10, the
			  value that was previously hard-coded).

	output : - weights = the dict of all the parameters:
				"W1" (hidden, n_input), "b1" (1, hidden),
				"W2" (output, hidden),  "b2" (1, output).
	"""
	n_input = X.shape[1]

	# Weights are drawn from a standard normal and scaled by sqrt(1 / fan_in);
	# biases start at zero. W1 is drawn before W2, so a given np.random seed
	# reproduces the same values as before the `output` parameter existed.

	#first layer
	W1 = np.random.randn(hidden, n_input) * np.sqrt(1. / n_input)
	b1 = np.zeros((1, hidden))

	#second layer
	W2 = np.random.randn(output, hidden) * np.sqrt(1. / hidden)
	b2 = np.zeros((1, output))

	weights = {
		"W1" : W1,
		"b1" : b1,
		"W2" : W2,
		"b2" : b2
	}

	return weights

In [None]:
def sigmoid(z):
	"""
	sigmoid(z) -> 1 / (1 + exp(-z)), computed element-wise

	arg :	- z = scalar or array of pre-activations.

	output : - values in [0, 1] with the same shape as z.

	The naive formula evaluates exp(-z), which overflows (RuntimeWarning, inf)
	for large negative z. Here only exp(-|z|) is evaluated, which never exceeds 1,
	and the algebraically equivalent branch is picked per element:
		z >= 0 : 1 / (1 + exp(-z))
		z <  0 : exp(z) / (1 + exp(z))
	"""
	z = np.asarray(z)
	e = np.exp(-np.abs(z)) # never overflows
	out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
	return out[()] # unwrap a 0-d result to a scalar; arrays are returned unchanged
def softmax(z):
	"""
	softmax(z) -> row-wise softmax of a 2-D array

	arg :	- z = matrix of scores, one row per sample (normalisation is along axis 1).

	output : - matrix of the same shape whose rows are non-negative and sum to 1.
	"""
	# subtracting the row maximum leaves the result unchanged but keeps exp() bounded
	shifted = z - np.max(z, axis=1, keepdims=True)
	numerator = np.exp(shifted)
	denominator = np.sum(numerator, axis=1, keepdims=True)
	return numerator / denominator

def forward_propagation(X, weights): # forward propagation function
	"""
	forward_propagation(X, weights) -> activations

	Run the inputs through the hidden sigmoid layer and the softmax output layer.

	arg :	- X = features matrix, one row per sample: (m, n_input).
			- weights = the dict of all the parameters ("W1", "b1", "W2", "b2");
			  W matrices are stored as (n_out, n_in), hence the transposes below.

	output : - activations = the dict of the layer outputs:
				"A1" = hidden activations, "A2" = class probabilities.
	"""
	# hidden layer: (m, n_input) @ (n_input, hidden) + (1, hidden) -> (m, hidden)
	hidden_pre = X @ weights["W1"].T + weights["b1"]
	hidden_out = sigmoid(hidden_pre)

	# output layer: (m, hidden) @ (hidden, output) + (1, output) -> (m, output)
	output_pre = hidden_out @ weights["W2"].T + weights["b2"]
	probabilities = softmax(output_pre)

	return {
		"A1" : hidden_out,
		"A2" : probabilities
	}

In [None]:
def cross_entropy_loss(activations, Y): # loss function
	"""
	cross_entropy_loss(activations, Y) -> loss

	Mean categorical cross-entropy between the predictions and the labels.

	arg :	- activations = the dict of all activations matrices; only "A2" is read.
			- Y = one-hot labels, one row per sample.

	output :	- loss = summed cross-entropy divided by the number of rows of Y.
	"""
	predictions = activations["A2"]
	n_samples = Y.shape[0]

	# small offset so that a predicted probability of exactly 0 never reaches log()
	epsilon = 1e-9
	log_predictions = np.log(predictions + epsilon)

	total = np.sum(Y * log_predictions)
	return -total / n_samples

In [None]:
def gradient(X, Y, activations, weights):
	"""
	gradient(X, Y, activations, weights) -> gradients

	Backpropagate the cross-entropy loss through the softmax and sigmoid layers.

	arg :	- X = features matrix, one row per sample.
			- Y = one-hot labels of the X features.
			- activations = the dict of all activations matrices ("A1", "A2").
			- weights = the dict of all the parameters; only "W2" is read.

	output :	- gradients = dict "dW1", "db1", "dW2", "db2"; each dW is transposed
				  before being returned so it has the same (n_out, n_in) layout
				  as the W matrix it is subtracted from.
	"""
	hidden_out = activations["A1"]
	probabilities = activations["A2"]
	n_samples = X.shape[0]

	# Output layer. For softmax followed by cross-entropy, dLoss/dZ2 reduces to
	# (A2 - Y); the 1/m of the mean loss is folded in here once.
	delta_out = (probabilities - Y) / n_samples
	grad_W2 = hidden_out.T @ delta_out
	grad_b2 = np.sum(delta_out, axis=0, keepdims=True)

	# Hidden layer. Push the error back through W2, then through the sigmoid,
	# whose derivative expressed with its own output is A1 * (1 - A1).
	sigmoid_slope = hidden_out * (1 - hidden_out)
	delta_hidden = (delta_out @ weights["W2"]) * sigmoid_slope
	grad_W1 = X.T @ delta_hidden
	grad_b1 = np.sum(delta_hidden, axis=0, keepdims=True)

	return {
		"dW1" : grad_W1.T,
		"db1" : grad_b1,
		"dW2" : grad_W2.T,
		"db2" : grad_b2
	}

In [None]:
def update(weights, gradients, learning_rate): # function for updating the weight of the model.
	"""
	update(weights, gradients, learning_rate) -> weights

	Apply one gradient-descent step to every parameter.

	arg :	- weights = the dict of all the parameters ("W1", "b1", "W2", "b2").
			- gradients = the dict of their gradients ("dW1", "db1", "dW2", "db2").
			- learning_rate = step size (high : learns fast but may miss the minimum,
			  low : learns slowly but has a better chance of reaching it).

	output :	- weights = the same dict that was passed in; its arrays are
				  modified in place by the step.
	"""
	# each parameter "P" is paired with its gradient "dP"
	for name in ("W1", "b1", "W2", "b2"):
		weights[name] -= learning_rate * gradients["d" + name]

	return weights

In [None]:
def accuracy_score(y_true, y_pred): # func to determine the accuracy score
    """
    accuracy_score(y_true, y_pred) -> fraction of positions where both agree

    arg :   - y_true = reference labels (array or any sequence).
            - y_pred = predicted labels (array or any sequence).

    output : - mean of the element-wise equality between the two inputs.
    """
    # Coerce to arrays first: on plain Python lists `==` compares the lists as a
    # whole and yields a single bool, which would make the score 0.0 or 1.0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(y_true == y_pred)

In [None]:
class Model:
	"""
	Small wrapper that keeps the training data and the network parameters
	together and exposes fit / predict / score.
	"""

	def __init__(self, X, Y,hidden=100):
		"""
		arg :	- X = training features, one row per sample.
				- Y = training labels, passed to the loss and gradient functions
				  (one-hot encoded).
				- hidden = the number of hidden neurons.
		"""
		self.X = X
		self.Y = Y
		# fresh random parameters sized from X
		self.weights = initialization(X,hidden)

	def fit(self, n_iter=1000, learning_rate=0.1):
		"""
		Run n_iter full-batch gradient-descent steps on the stored data.

		output : - list with one loss value per iteration; each value is computed
				   from the forward pass made before that iteration's update.
		"""
		loss_history = []
		for _ in tqdm(range(n_iter)):
			step_activations = forward_propagation(self.X, self.weights)
			step_gradients = gradient(self.X, self.Y,step_activations, self.weights)
			self.weights = update(self.weights, step_gradients, learning_rate)
			step_loss = cross_entropy_loss(step_activations, self.Y)
			loss_history.append(step_loss)
		return loss_history

	def predict(self, X):
		"""Return, for each row of X, the index of the largest output activation."""
		outputs = forward_propagation(X, self.weights)["A2"]
		return np.argmax(outputs, axis=1)  # return the predict class (0-9)

	def score(self, X, y):
		"""
		Return the fraction of rows of X whose predicted class equals y.

		y is compared against class indices, so it is expected to hold
		integer labels rather than one-hot rows.
		"""
		predicted = self.predict(X)
		hits = predicted == y
		return np.mean(hits)


In [None]:
# Stand-alone model built on the training set with 100 hidden neurons
# (construction only: no training happens in this cell).
newModel = Model(X=x_train, Y=y_train_onehot, hidden=100)

In [None]:
d = 100 # iteration budget of the first run; each following run gets 100 more

# Train a brand-new model for each budget (100, 200, ..., 1000 iterations)
# and report its accuracy on the test set.
for i in range(10):
	n_iter = d + i * 100
	model = Model(x_train, y_train_onehot, hidden=100)
	model.fit(n_iter)
	acc = model.score(x_test, y_test)

	print(f"Test accuracy : {acc * 100:.2f}%")

"""
Test accuracy : 75.01% # 100 iter
Test accuracy : 80.64% # 200 iter
Test accuracy : 84.82% # 300 iter
Test accuracy : 86.59% # 400 iter
Test accuracy : 87.49% # 500 iter
Test accuracy : 88.49% # 600 iter
Test accuracy : 88.96% # 700 iter
Test accuracy : 89.43% # 800 iter
Test accuracy : 89.65% # 900 iter
Test accuracy : 89.89% # 1000 iter

the test accuracy increases with the number of training iterations,
which suggests that the model is learning well
"""

100%|██████████| 100/100 [00:16<00:00,  6.11it/s]


Précision sur le test : 75.01%


100%|██████████| 200/200 [00:30<00:00,  6.57it/s]


Précision sur le test : 80.64%


100%|██████████| 300/300 [00:46<00:00,  6.45it/s]


Précision sur le test : 84.82%


100%|██████████| 400/400 [01:02<00:00,  6.40it/s]


Précision sur le test : 86.59%


100%|██████████| 500/500 [01:18<00:00,  6.40it/s]


Précision sur le test : 87.49%


100%|██████████| 600/600 [01:27<00:00,  6.84it/s]


Précision sur le test : 88.49%


100%|██████████| 700/700 [01:41<00:00,  6.87it/s]


Précision sur le test : 88.96%


100%|██████████| 800/800 [01:56<00:00,  6.86it/s]


Précision sur le test : 89.43%


100%|██████████| 900/900 [02:16<00:00,  6.60it/s]


Précision sur le test : 89.65%


100%|██████████| 1000/1000 [02:27<00:00,  6.76it/s]

Précision sur le test : 89.89%



