In [1]:
import pandas as pd
import numpy as np

In [2]:
test = pd.read_csv("./archive_mnist/mnist_test.csv")
train = pd.read_csv("./archive_mnist/mnist_train.csv")

test_x = np.array(test.iloc[:, 1:])/255
test_y = np.array(test.iloc[:, 0])
train_x = np.array(train.iloc[:, 1:])/255
train_y = np.array(train.iloc[:, 0])
print(np.shape(test_x))

(10000, 784)


In [3]:
class FNN:
    def __init__(self, structure):
        if isinstance(structure, list)!=True:
            raise ValueError("input should be a list of layer sizes")
        self.struct = structure
        self.num_layers = len(structure)-1
        self.w = list()
        self.b = list()
        for i in range(self.num_layers):
            self.w.append((np.random.rand(structure[i], structure[i+1]) - 0.5)*0.01)
            self.b.append(np.zeros((1, structure[i+1])))

    def forward_propagation(self, X):
        a = X
        A = list()
        Z = list()
        A.append(a)
        for i in range(self.num_layers):
            z = a@self.w[i] + self.b[i]
            Z.append(z)
            if i == self.num_layers-1:
                a = self.softmax(z)
            else:
                a = self.relu(z)
            A.append(a)
        return Z, A
    
    def backward_propagation(self, Z, A, y, learning_rate):
        delta = [None] * self.num_layers
        
        for i in reversed(range(self.num_layers)):
            if i == self.num_layers-1:
                # Output layer
                delta[i] = A[-1] - y
            else:
                # Hidden layers
                delta[i] = delta[i+1] @ self.w[i+1].T * self.relu_deri(Z[i])
            
            # Weight gradient: input_to_layer @ delta
            dw = A[i].T @ delta[i]  # A[i] is input to layer i
            
            # Bias gradient: sum delta across batch
            db = np.sum(np.array(delta[i]), axis=0, keepdims=True)
            
            # Update weights immediately
            self.w[i] -= learning_rate * dw
            self.b[i] -= learning_rate * db
    
    def predict(self, X):
        _, A = self.forward_propagation(X)
        return np.argmax(A[-1])
    
    def train(self, X, Y, learning_rate = 0.005):
        Z, A = self.forward_propagation(X)
        y = self.one_hot_encoding(Y)
        self.backward_propagation(Z, A, y, learning_rate)
    
    def relu(self, z):
        return np.maximum(0, z)
    
    def relu_deri(self, z):
        return np.where(0 < z, 1, 0)
    
    def softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)
    
    def one_hot_encoding(self, Y):
        y = np.zeros((1, self.struct[-1]))
        y[0, Y] = 1
        return y

    def __str__(self):
        return str(self.w)

In [6]:
network1 = FNN([784, 10, 10, 10])
for i in range(5000):
    network1.train(train_x[i:i+1], train_y[i])

print(network1.predict(test_x[1300:1301]))
print(test_y[1300])

1
4
