In [24]:
import tensorflow as tf
import numpy as np

mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

alpha = 1/60000;
trainingId = 0;

def sigmoid(x):
    # https://en.wikipedia.org/wiki/Sigmoid_function
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_derivative(x):
    # https://www.geeksforgeeks.org/derivative-of-the-sigmoid-function/
    return sigmoid(x) * (1.0 - sigmoid(x))

def softmax(x):
    # Subtract the maximum value in each row for numerical stability
    x_max = np.max(x, axis=0, keepdims=True)
    # Compute the exponentials
    e_x = np.exp(x - x_max)
    # Normalize the exponentials to get the probabilities
    return e_x / np.sum(e_x, axis=0, keepdims=True)

# softmaxExample_1Column_Of_10Digits = softmax([
# [0],
# [2],
# [4],
# [6],
# [8],
# [9],
# [7],
# [5],
# [3],
# [1]]);

X = x_train[trainingId].reshape(-1,1)
W1 = np.zeros((20, X.size));
B1 = np.zeros((20, 1));
Z1 = np.zeros((20, 1));
A1 = np.zeros((20, 1));

W2 = np.zeros((10, 20));
B2 = np.zeros((10, 1));
Z2 = np.zeros((10, 1));
Y = np.zeros((10, 1));
expectY = np.zeros((Y.size,1));
expectY[y_train[trainingId]] = 1;

def loadTrainingData():
    global X, expectY;
    X = x_train[trainingId].reshape(-1,1);
    expectY = np.zeros((10, 1));
    expectY[y_train[trainingId]] = 1;


### Forward Propagation

def forwardPropagationFromXToA1():
    global Z1, A1;
    Z1 = W1 @ X + B1;
    A1 = sigmoid(Z1);

def forwardPropagationFromA1ToY(): 
    global Z2, Y;
    Z2 = W2 @ A1 + B2;
    Y = softmax(Z2);

### Loss Function

def loss():
    global Y, expectY;
    # Compute the MSE loss
    return np.sum((expectY - Y) ** 2) / expectY.size


### Backward propagation

def impactOf_A1_On_Z2():
    # Z2 = W2 @ A1 + B2;
    # dZ2/dA1 = W2
    return W2;

def impactOf_Y_On_Loss():
    # L = 1/Y.size * [(expectY1 - Y1)^2 + (expectY2 - Y2)^2 + ... + (expectYn - Yn)^2]
    # dL/dYn = 2 / Y.size * (expectYn - Yn) * (-1)
    return 2 / expectY.size * (expectY - Y) * (-1);

def impactOf_Z2_On_Y():
     # Y = softmax(Z2);
     # softmax only changes the scale of the output, so the derivative is the same as the identity
     # so no impact
    return np.ones((Y.size, 1));

def impactOf_Z1_On_A1():
    # A1 = sigmoid(Z1);
    # dA1/dZ1 = sigmoid_derivative(Z1)
    return sigmoid_derivative(Z1);

def impactOf_W2_On_Z2():
    # Z2 = W2 @ A1 + B2;
    # dZ2/dW2 = A1
    return A1

def impactOf_B2_On_Z2():
    # Z2 = W2 @ A1 + B2;
    # dZ2/dB2 = 1
    return np.ones((Z2.size, 1));

def impactOf_W2_On_Loss():
    # chaining rule: dL/dW2 = dL/dY * dY/dZ2 * dZ2/dW2
    return (impactOf_Y_On_Loss() * impactOf_Z2_On_Y()) @ impactOf_W2_On_Z2().T;

def impactOf_W1_On_Z1():
    # Z1 = W1 @ X + B1;
    # dZ1/dW1 = X
    return X;

def impactOf_B1_On_Z1():
    # Z1 = W1 @ X + B1;
    # dZ1/dB1 = 1
    return np.ones((Z1.size, 1));

def impactOf_B2_On_Loss():
    # chaining rule: dL/dB2 = dL/dY * dY/dZ2 * dZ2/dB2
    return impactOf_Y_On_Loss() * impactOf_Z2_On_Y() * impactOf_B2_On_Z2();

def impactOf_A1_On_Loss():
    # chaining rule: dL/dA1 = dL/dY * dY/dZ2 * dZ2/dA1 = dZ2/dA1 * dL/dY * dY/dZ2
    return impactOf_A1_On_Z2().T @ (impactOf_Y_On_Loss() * impactOf_Z2_On_Y());

def impactOf_W1_On_Loss():
    # chaining rule: dL/dW1 = dL/dA1 * dA1/dZ1 * dZ1/dW1
    return (impactOf_A1_On_Loss() * impactOf_Z1_On_A1()) @ impactOf_W1_On_Z1().T;

def impactOf_B1_On_Loss():
    # chaining rule: dL/dB1 = dL/dA1 * dA1/dZ1 * dZ1/dB1
    return impactOf_A1_On_Loss() * impactOf_Z1_On_A1() * impactOf_B1_On_Z1();

def backPropagationFromYToA1():
    global W2, B2;
    W2 = W2 - alpha * impactOf_W2_On_Loss();
    B2 = B2 - alpha * impactOf_B2_On_Loss();

def backPropagationFromA1ToX():
    global W1, B1    
    W1 = W1 - alpha * impactOf_W1_On_Loss();
    B1 = B1 - alpha * impactOf_B1_On_Loss();

def train():
    global trainingId;
    for i in range(60000):
        trainingId = i;
        loadTrainingData();
        forwardPropagationFromXToA1();
        forwardPropagationFromA1ToY();
        print("!!! trainingId: ", trainingId);
        print("!!! loss: ", loss());
        backPropagationFromYToA1();
        backPropagationFromA1ToX();

train();




!!! trainingId:  0
!!! loss:  0.09000000000000001
!!! trainingId:  1
!!! loss:  0.09000004000068
!!! trainingId:  2
!!! loss:  0.09000008000104001
!!! trainingId:  3
!!! loss:  0.09000012000108
!!! trainingId:  4
!!! loss:  0.0900001600008
!!! trainingId:  5
!!! loss:  0.09000020000020001
!!! trainingId:  6
!!! loss:  0.08999984000167996
!!! trainingId:  7
!!! loss:  0.09000027999964005
!!! trainingId:  8
!!! loss:  0.08999951999887995
!!! trainingId:  9
!!! loss:  0.0899999600074
!!! trainingId:  10
!!! loss:  0.09000000000439994
!!! trainingId:  11
!!! loss:  0.0900000400106799
!!! trainingId:  12
!!! loss:  0.08999968000383975
!!! trainingId:  13
!!! loss:  0.09000051999668018
!!! trainingId:  14
!!! loss:  0.08999936000840016
!!! trainingId:  15
!!! loss:  0.0900005999942003
!!! trainingId:  16
!!! loss:  0.09000024000687985
!!! trainingId:  17
!!! loss:  0.09000067998724053
!!! trainingId:  18
!!! loss:  0.09000031999608005
!!! trainingId:  19
!!! loss:  0.09000036000139985
!!! tr