In [52]:
import numpy as np
import pandas as pd

In [60]:
data = np.load("mnist.npz") # loading dataset

# assign dataset
x_train = data["x_train"] # shape : (60000, 784)
y_train = data["y_train"] # shape : (60000,)

x_test = data["x_test"]
y_test = data["y_test"]

# data normalization and flattening 
x_train = x_train / 255.0
x_train = x_train.reshape(-1, 28*28)

x_test = x_test / 255.0
x_test = x_test.reshape(-1, 28*28)

#one hotting for have 10 classes
y_train_onehot = np.eye(10)[y_train]
y_test_onehot = np.eye(10)[y_test]

In [61]:
def initialization(X, hidden=10, output=10): # function initialisation of wheights of the ANN
	n_input = X.shape[1]

	#first layer
	W1 = np.random.randn(hidden, n_input) * np.sqrt(1. / n_input)
	b1 = np.zeros((1, hidden))

	#second layer
	W2 = np.random.randn(output, hidden) * np.sqrt(1. / hidden)
	b2 = np.zeros((1, output))
	return W1, b1, W2, b2

In [62]:
def sigmoid(z):
	return 1 / (1 + np.exp(-z))
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

def forward_propagation(X, W1, b1, W2, b2): # forward propagation function
	#layer 0
	A0 = X # shape : (60000, 784)

	#first layer
	Z1 = A0 @ W1.T + b1 # shape : (60000, 784) @ (784, 10) + (1,10) = (60000, 10)
	A1 = sigmoid(Z1)

	print(Z1.shape)
	#second layer
	Z2 = A1 @ W2 + b2   # shape : (60000, 10) @ (10, 10) + (1,10) = (60000, 10)
	A2 = softmax(Z2)

	return A2,A1

In [63]:
def cross_entropy_loss(A, Y): # loss function
    m = Y.shape[0]
    
    # Pour Ã©viter log(0)
    epsilon = 1e-9
    
    loss = -np.sum(Y * np.log(A + epsilon)) / m
    return loss

In [64]:
def gradient(X, Y, A1, A2, W2):
	m = X.shape[0]

										# d mean the partial derivative
	dZ2 = (A2 - Y) / m 						 # = (dLoss / dA2) * (dA2 / dZ2) easy derivative, cross entropy then softmax
	dW2 = A1.T @ dZ2						 # = (dLoss / dA2) * (dA2 / dZ2) * (dZ2 / dW2)
	db2 = np.sum(dZ2, axis=0, keepdims=True) # = (dLoss / dA2) * (dA2 / dZ2) * (dZ2 / db2)

	dA1 = dZ2 @ W2.T 						 # = (dLoss / dA2) * (dA2 / dZ2) * (dZ2 / dA1)
	dZ1 = dA1 * (A1 * (1 - A1)) 			 # = (dLoss / dA2) * (dA2 / dZ2) * (dZ2 / dA1) * (dA1 / dZ1)
	dW1 = X.T @ dZ1 						 # = (dLoss / dA2) * (dA2 / dZ2) * (dZ2 / dA1) * (dA1 / dZ1) * (dZ1 / dW1)
	db1 = np.sum(dZ1, axis=0, keepdims=True) # = (dLoss / dA2) * (dA2 / dZ2) * (dZ2 / dA1) * (dA1 / dZ1) * (dZ1 / db1)
	return dW1, db1, dW2, db2

In [65]:
W1, b1, W2, b2 = initialization(x_train)
A2,A1 = forward_propagation(x_train, W1, b1, W2, b2)

(60000, 10)


In [68]:
a,b,c,d = (gradient(x_train, y_train_onehot, A1, A2, W2))
print(np.sum(a),b.shape,c.shape,d.shape)

6.806588046054802 (1, 10) (10, 10) (1, 10)
