In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [9]:
data = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

In [10]:
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
data = np.array(data)
m,n = data.shape
print(m,n)

42000 785


In [12]:
np.random.shuffle(data)
data_dev = data[0:1000].T
y_dev = data_dev[0] #selecting the first row which contains labels of each training samples
x_dev = data_dev[1:n] /255.0
data_train = data[1000:m].T
y_train = data_train[0]
x_train = data_train[1:n] /255.0
print(y_train)

[5 4 3 ... 3 7 8]


In [13]:
print(x_train[:, 0].shape)

(784,)


In [16]:
def init_parameters():
    W1 = np.random.randn(16, 784) * 0.01 
    b1 = np.zeros((16, 1))
    W2 = np.random.randn(16, 16) * 0.01
    b2 = np.zeros((16, 1))
    W3 = np.random.randn(10, 16) * 0.01
    b3 = np.zeros((10, 1))
    
    return {
        "W1" : W1, 
        "W2" : W2,
        "W3" : W3, 
        "b1" : b1, 
        "b2" : b2, 
        "b3" : b3
    }

def ReLu(z):
    return np.maximum(0,z)

def SoftMax(z):
    z_shifted = z - np.max(z, axis = 0, keepdims = True)
    z_exp = np.exp(z_shifted) #shifted to prevent overflow 
    exponential_sum = np.sum (z_exp, axis = 0, keepdims = True)
    return z_exp / exponential_sum # returns an array of probablistic distributions
    

def forward_propagation(X, parameters):
    Z1 = parameters["W1"].dot(X) + parameters["b1"]
    a1 = ReLu(Z1) 
    Z2 = parameters["W2"].dot(a1) + parameters["b2"]
    a2 = ReLu(Z2) 
    Z3 = parameters["W3"].dot(a2) + parameters["b3"]
    a3 = SoftMax(Z3) 
    
    return Z1, a1, Z2, a2, Z3, a3

def one_hot_encoding(Y):
    Y_encoded = np.zeros((Y.size, 10))
    Y_encoded[np.arange(Y.size), Y] = 1
    return Y_encoded.T

def calculate_loss(a3, y):
    #using cross entropy loss fn here 
    epsilon = 1e-12
    #predicted probablities are clipped between 0 and 1 to avoid crashes ( log(0) =1 , log(1) = 0)
    a3_clipped = np.clip(a3, epsilon, 1-epsilon)
    loss = -np.sum(y * np.log(a3_clipped))/y.shape[1]
    return loss

    
def backward_propagation(X, Y_hot_encoded, parameters, Z1, a1, Z2, a2, Z3, a3, m):
    dZ3 = a3 - Y_hot_encoded
    dZ2 = parameters["W3"].T.dot(dZ3) * (Z2 > 0)
    dZ1 = parameters["W2"].T.dot(dZ2) * (Z1 > 0)

    dW3 = dZ3.dot(a2.T) / m
    dW2 = dZ2.dot(a1.T) / m
    dW1 = dZ1.dot(X.T) / m

    db3 = np.sum(dZ3, axis = 1 , keepdims = True) / m
    db2 = np.sum(dZ2, axis = 1 , keepdims = True) / m
    db1 = np.sum(dZ1, axis = 1 , keepdims = True) / m

    return {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2, "dW3": dW3, "db3": db3}

def gradient_descent(gradients, parameters, lr):
        parameters["W3"] -= lr * gradients["dW3"]
        parameters["b3"] -= lr * gradients["db3"]
        parameters["W2"] -= lr * gradients["dW2"]
        parameters["b2"] -= lr * gradients["db2"]
        parameters["W1"] -= lr * gradients["dW1"]
        parameters["b1"] -= lr * gradients["db1"]
        return parameters

def train(X_train, Y_train, learning_rate=0.01, epochs=1000, batch_size=64):
    parameters = init_parameters()
    for epoch in range(epochs):
        permutation = np.random.permutation(X_train.shape[1])
        X_shuffled = X_train[:, permutation]
        Y_shuffled = Y_train[permutation]
        for i in range(0, X_train.shape[1], batch_size):
            X_batch = X_shuffled[:, i:i+batch_size]
            Y_batch = Y_shuffled[i:i+batch_size]
            Z1, A1, Z2, A2, Z3, A3 = forward_propagation(X_batch, parameters)
            Y_one_hot = one_hot_encoding(Y_batch)
            loss = calculate_loss(A3, Y_one_hot)
            gradients = backward_propagation(X_batch, Y_one_hot, parameters, Z1, A1, Z2, A2, Z3, A3, batch_size)
            parameters = gradient_descent(gradients, parameters, learning_rate)
        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss}")
    return parameters

trained_params = train(x_train, y_train, learning_rate=0.1, epochs=1000, batch_size=64)


Epoch 0, Loss: 2.0534368399693155
Epoch 100, Loss: 0.05409087222982386
Epoch 200, Loss: 0.017993214471590042
Epoch 300, Loss: 0.0002927940837006458
Epoch 400, Loss: 4.973635705660889e-06
Epoch 500, Loss: 1.4780886008207606e-06
Epoch 600, Loss: 7.274869139373559e-05
Epoch 700, Loss: 9.636974703522394e-08
Epoch 800, Loss: 2.6145658593989916e-05
Epoch 900, Loss: 4.0775460656591826e-05


In [18]:
def compute_accuracy(X, Y, parameters):
    _, _, _, _, _, A3 = forward_propagation(X, parameters)
    predictions = np.argmax(A3, axis=0)
    return np.mean(predictions == Y)

val_accuracy = compute_accuracy(x_dev, y_dev, trained_params)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")


Validation Accuracy: 93.20%
