In [2]:
### Implementation of a digit classifier for the MNIST dataset using NumPy

import pandas as pd
import numpy as np
!wget https://pjreddie.com/media/files/mnist_train.csv
!wget https://pjreddie.com/media/files/mnist_test.csv


train_data=pd.read_csv("mnist_train.csv")
test_data=pd.read_csv("mnist_test.csv")

print("training data shape:", train_data.shape)
print("test data shape:", test_data.shape)

# let me print a few rows to see what the data looks like

print(train_data.head)

# Column 0 is used as the label; every remaining column is used as a feature.
X_train, Y_train = train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values
X_test, Y_test = test_data.iloc[:, 1:].values, test_data.iloc[:, 0].values

print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("X_test shape:", X_test.shape)
print("Y_test shape:", Y_test.shape)

# Scale the features by 1/255 (presumably 8-bit pixel intensities, giving
# values in [0, 1] -- the min/max printed below confirm the actual range).
X_train, X_test = X_train / 255.0, X_test / 255.0

print("minimum value in X_train:", X_train.min())
print("maximum value in X_train", X_train.max())

# Next, one-hot encode the integer class labels.

def oneh_encode(Y, num_classes=10):
    """Convert an array of integer class labels into one-hot rows.

    Args:
        Y: NumPy integer array of labels; it is used directly as a column
            index, so each label must be a valid index into ``num_classes``.
        num_classes: Number of columns in the encoded matrix.

    Returns:
        Float array of shape ``(Y.size, num_classes)`` that is zero everywhere
        except for a 1 at ``[i, Y[i]]`` in each row ``i``.
    """
    n_samples = Y.size
    encoded = np.zeros((n_samples, num_classes))
    row_index = np.arange(n_samples)
    # Fancy indexing sets exactly one entry per row: (row i, column Y[i]).
    encoded[row_index, Y] = 1
    return encoded

# One-hot encode the labels of both splits
Y_train_encoded, Y_test_encoded = oneh_encode(Y_train), oneh_encode(Y_test)

print("Y_train_encoded shape:", Y_train_encoded.shape)
print("Example of one-hot encoded label:", Y_train_encoded[0])

# Layer widths: input features, hidden units, output classes
input_size, hidden_size, output_size = 784, 128, 10

# Seed NumPy's global RNG so the random initialisation below is repeatable
np.random.seed(42)

# Weights: standard-normal draws scaled by 0.01; biases: zeros.
# W1 is drawn before W2, so both always receive the same values for this seed.
W1 = 0.01 * np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))

W2 = 0.01 * np.random.randn(hidden_size, output_size)
b2 = np.zeros((1, output_size))

# Sanity-check the parameter shapes
print("W1 shape:", W1.shape)
print("b1 shape:", b1.shape)
print("W2 shape:", W2.shape)
print("b2 shape:", b2.shape)

def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``.

    Args:
        x: Array (or scalar) of pre-activation values.

    Returns:
        Values of ``x`` with every negative entry replaced by 0.
    """
    rectified = np.maximum(0, x)
    return rectified

def softmax(x):
    """Row-wise softmax of a 2-D array.

    Args:
        x: Array of scores; normalisation runs along axis 1, so each row is
            treated as one sample.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting each row's maximum leaves the result unchanged but keeps
    # the exponentials from overflowing for large scores.
    row_max = np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(x - row_max)
    row_totals = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_totals

def forward_propagation(X, W1, b1, W2, b2):
    """Run the two-layer network forward: affine -> ReLU -> affine -> softmax.

    Args:
        X: Input matrix with one sample per row.
        W1, b1: Weights and bias of the hidden layer.
        W2, b2: Weights and bias of the output layer.

    Returns:
        A pair ``(A2, op)`` where ``A2`` is the softmax output and ``op`` is
        the tuple ``(Z1, A1, Z2, A2)`` of intermediate values kept for
        backpropagation.
    """
    # Hidden layer: linear transform followed by ReLU
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = relu(hidden_pre)

    # Output layer: linear transform followed by softmax
    output_pre = np.dot(hidden_act, W2) + b2
    output_act = softmax(output_pre)

    return output_act, (hidden_pre, hidden_act, output_pre, output_act)

# We use the cross-entropy loss.

def compute_loss(Y,A2):
    """Mean cross-entropy between targets ``Y`` and predictions ``A2``.

    Args:
        Y: Target matrix, one row per sample (one-hot rows in this notebook).
        A2: Predicted probabilities with the same shape as ``Y``.

    Returns:
        The summed ``-Y * log(A2 + 1e-8)`` divided by the number of rows of ``Y``.
    """
    m = Y.shape[0]
    # The small constant keeps log() finite when a predicted probability is 0.
    log_probs = np.log(A2 + 1e-8)
    total = np.sum(Y * log_probs)
    return -total / m



def relu_derivative(x):
    """Element-wise derivative of ReLU.

    Args:
        x: NumPy array of pre-activation values.

    Returns:
        Float array of the same shape: 1.0 where ``x > 0`` and 0.0 elsewhere
        (so the derivative at exactly 0 is taken to be 0).
    """
    is_positive = x > 0
    return is_positive.astype(float)

def backward_propagation(X, Y, cache, W1, W2):
    """Compute the gradients of the loss for both layers.

    Args:
        X: Input matrix with one sample per row.
        Y: Target matrix with the same shape as the network output.
        cache: Tuple ``(Z1, A1, Z2, A2)`` as returned by the forward pass.
        W1: Hidden-layer weights; accepted for a uniform signature but not
            used in the computation.
        W2: Output-layer weights, needed to push the error back to the
            hidden layer.

    Returns:
        ``(dW1, db1, dW2, db2)``, each averaged over the rows of ``X``.
    """
    Z1, A1, _, A2 = cache
    m = X.shape[0]  # number of samples

    # Error terms: output layer first, then propagated through W2 and gated
    # by the ReLU derivative for the hidden layer.
    delta_out = A2 - Y
    delta_hidden = np.dot(delta_out, W2.T) * relu_derivative(Z1)

    # Output-layer gradients
    dW2 = np.dot(A1.T, delta_out) / m
    db2 = np.sum(delta_out, axis=0, keepdims=True) / m

    # Hidden-layer gradients
    dW1 = np.dot(X.T, delta_hidden) / m
    db1 = np.sum(delta_hidden, axis=0, keepdims=True) / m

    return dW1, db1, dW2, db2

def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    """Apply one gradient-descent step and return the updated parameters.

    Every parameter is moved against its gradient:
    ``p_new = p - learning_rate * dp``.

    The step is computed out of place: new arrays are returned and the arrays
    passed in are left untouched. This keeps the caller's original parameters
    intact (so re-running a training cell starts from the same values) and
    also works when a parameter array has an integer dtype, where an in-place
    ``-=`` with a float step would raise.

    Args:
        W1, b1, W2, b2: Current weights and biases.
        dW1, db1, dW2, db2: Gradients matching the parameters in shape.
        learning_rate: Step size of the update.

    Returns:
        ``(W1, b1, W2, b2)`` after the step.
    """
    W1 = W1 - learning_rate * dW1
    b1 = b1 - learning_rate * db1
    W2 = W2 - learning_rate * dW2
    b2 = b2 - learning_rate * db2

    return W1, b1, W2, b2


#training loop
def train(X, Y, W1, b1, W2, b2, iterations, learning_rate):
    """Train the two-layer network with gradient descent on the full dataset.

    Each iteration runs a forward pass over all of ``X``, backpropagates the
    error, and applies one parameter update. The loss is printed every 100
    iterations (starting at iteration 0).

    Args:
        X: Input matrix with one sample per row.
        Y: Target matrix matching the network output in shape.
        W1, b1, W2, b2: Initial weights and biases.
        iterations: Number of update steps to perform.
        learning_rate: Step size passed to ``update_parameters``.

    Returns:
        ``(W1, b1, W2, b2)`` after the final update; callers should use these
        returned values as the trained parameters.
    """
    for i in range(iterations):
        # Step 1: forward propagation
        A2, op = forward_propagation(X, W1, b1, W2, b2)

        # Step 2: report the loss every 100 iterations. The loss is only ever
        # printed (the gradients do not depend on it), so it is evaluated only
        # on logging iterations instead of on every pass over the data.
        if i % 100 == 0:
            loss = compute_loss(Y, A2)
            print(f"Iteration {i}, Loss: {loss}")

        # Step 3: backpropagation
        dW1, db1, dW2, db2 = backward_propagation(X, Y, op, W1, W2)

        # Step 4: update parameters
        W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)

    return W1, b1, W2, b2


# Hyperparameters: number of gradient-descent steps and step size
iterations, learning_rate = 1000, 0.1

# Train on the whole training set; the returned parameters replace the
# initial ones under the same names.
W1, b1, W2, b2 = train(
    X_train,
    Y_train_encoded,
    W1,
    b1,
    W2,
    b2,
    iterations=iterations,
    learning_rate=learning_rate,
)




--2025-01-31 16:02:03--  https://pjreddie.com/media/files/mnist_train.csv
Resolving pjreddie.com (pjreddie.com)... 162.0.215.52
Connecting to pjreddie.com (pjreddie.com)|162.0.215.52|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 109575994 (104M) [text/csv]
Saving to: ‘mnist_train.csv.1’


2025-01-31 16:02:13 (12.6 MB/s) - ‘mnist_train.csv.1’ saved [109575994/109575994]

--2025-01-31 16:02:13--  https://pjreddie.com/media/files/mnist_test.csv
Resolving pjreddie.com (pjreddie.com)... 162.0.215.52
Connecting to pjreddie.com (pjreddie.com)|162.0.215.52|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18289443 (17M) [text/csv]
Saving to: ‘mnist_test.csv.1’


2025-01-31 16:02:16 (8.30 MB/s) - ‘mnist_test.csv.1’ saved [18289443/18289443]

training data shape: (59999, 785)
test data shape: (9999, 785)
<bound method NDFrame.head of        5  0  0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  ...  0.608  0.609  0.610  \
0      0  0    0    0    0    0

In [5]:
def predict(X, W1, b1, W2, b2):
    """Predict a class index for every row of ``X``.

    Args:
        X: Input matrix with one sample per row.
        W1, b1, W2, b2: Network parameters used for the forward pass.

    Returns:
        1-D array holding, for each sample, the index of the largest output
        of the network.
    """
    # Only the output activations are needed; the cached intermediates are discarded.
    probabilities, _unused_cache = forward_propagation(X, W1, b1, W2, b2)
    return np.argmax(probabilities, axis=1)

# Classify the test set and report the percentage of labels predicted correctly
predictions = predict(X_test, W1, b1, W2, b2)
is_correct = predictions == Y_test
accuracy = np.mean(is_correct) * 100
print(f"Test Accuracy: {accuracy:.2f}%")


Test Accuracy: 92.72%
