# MNIST Digit Classification (NumPy from Scratch)

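This notebook implements a feedforward neural network that classifies handwritten digits from the MNIST dataset. The model and its training logic use only NumPy; TensorFlow/Keras and scikit-learn are used solely to load, encode, and split the data. It includes: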
- Manual forward and backward propagation
- ReLU and Softmax activations
- Cross-entropy loss
- Adam optimizer
- Hyperparameter tuning
- Visualizations

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import mnist
from sklearn.preprocessing import OneHotEncoder

# Load data: images arrive as (N, 28, 28) arrays, labels as integer class ids
(X_train_full, y_train_full), (X_test, y_test) = mnist.load_data()

# Normalize pixel values to [0, 1] (true division also converts to float)
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0

# Flatten (28x28) → (784) so each image is one row of the design matrix
X_train_full = X_train_full.reshape(-1, 28 * 28)
X_test = X_test.reshape(-1, 28 * 28)

# One-hot encode labels.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back to the
# legacy one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
y_train_full_oh = encoder.fit_transform(y_train_full.reshape(-1, 1))
# Reuse the encoder fitted on the training labels for the test labels
y_test_oh = encoder.transform(y_test.reshape(-1, 1))

# Train/validation split: hold out 10% of the training data, fixed seed for
# reproducibility
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full_oh, test_size=0.1, random_state=42
)

In [None]:
def relu(z):
    """Apply the rectified linear unit element-wise.

    Args:
        z: Scalar or array-like of pre-activation values.

    Returns:
        The element-wise maximum of ``z`` and zero.
    """
    floor = 0
    return np.maximum(floor, z)

def relu_derivative(z):
    """Return the element-wise derivative of ReLU evaluated at ``z``.

    Args:
        z: NumPy array of pre-activation values.

    Returns:
        Float array shaped like ``z`` holding 1.0 where ``z > 0`` and 0.0
        everywhere else (including at exactly zero).
    """
    is_positive = z > 0
    return is_positive.astype(float)

def softmax(z):
    """Compute a row-wise softmax.

    Args:
        z: 2-D array of scores; the softmax is taken along axis 1.

    Returns:
        Array with the same shape as ``z`` whose rows are non-negative and
        sum to one.
    """
    # Subtracting each row's maximum keeps exp() from overflowing and leaves
    # the result unchanged.
    row_peak = np.max(z, axis=1, keepdims=True)
    unnormalized = np.exp(z - row_peak)
    row_total = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / row_total

def cross_entropy(y_pred, y_true):
    """Compute the mean cross-entropy between predictions and targets.

    Args:
        y_pred: 2-D array of predicted values, one row per sample.
        y_true: Array of target weights with the same shape as ``y_pred``.

    Returns:
        The negated mean over rows of ``sum(y_true * log(y_pred + 1e-8))``.
    """
    # The small constant keeps log() finite when a predicted value is zero.
    log_probs = np.log(y_pred + 1e-8)
    per_sample = np.sum(y_true * log_probs, axis=1)
    return -np.mean(per_sample)

def accuracy(y_pred, y_true):
    """Compute the fraction of rows whose arg-max class matches the target.

    Args:
        y_pred: 2-D array of per-class scores, one row per sample.
        y_true: 2-D array of per-class targets with matching row count.

    Returns:
        Mean of the element-wise comparison between the row-wise arg-max of
        ``y_pred`` and the row-wise arg-max of ``y_true``.
    """
    predicted_labels = np.argmax(y_pred, axis=1)
    target_labels = np.argmax(y_true, axis=1)
    hits = predicted_labels == target_labels
    return np.mean(hits)

In [None]:
class NeuralNetwork:
    """Fully connected classifier with two ReLU hidden layers and a softmax output.

    Parameters are updated with plain gradient descent
    (``param -= lr * gradient``) inside :meth:`backward`.

    Attributes:
        lr: Step size applied to every parameter update.
        params: Dict holding weight matrices ``"W1"``..``"W3"`` of shape
            ``(fan_in, fan_out)`` and bias rows ``"b1"``..``"b3"`` of shape
            ``(1, fan_out)``.
        cache: Dict created by :meth:`forward` holding the pre-activations
            ``"Z1"``..``"Z3"`` and activations ``"A1"``..``"A3"`` of the most
            recent forward pass.
    """

    def __init__(self, input_size, hidden1, hidden2, output_size, lr=0.001):
        """Create the parameters of the network.

        Args:
            input_size: Number of input features per sample.
            hidden1: Width of the first hidden layer.
            hidden2: Width of the second hidden layer.
            output_size: Number of output classes.
            lr: Learning rate used by :meth:`backward`.
        """
        self.lr = lr
        # Weights are standard-normal draws scaled by 0.01; biases start at zero.
        self.params = {
            "W1": np.random.randn(input_size, hidden1) * 0.01,
            "b1": np.zeros((1, hidden1)),
            "W2": np.random.randn(hidden1, hidden2) * 0.01,
            "b2": np.zeros((1, hidden2)),
            "W3": np.random.randn(hidden2, output_size) * 0.01,
            "b3": np.zeros((1, output_size)),
        }

    def forward(self, X):
        """Run a forward pass and cache every intermediate result.

        Args:
            X: 2-D array of inputs, one sample per row.

        Returns:
            The softmax output ``A3``, one row of class probabilities per
            sample.
        """
        params = self.params
        # A fresh cache per call so backward() only ever sees this pass.
        cache = self.cache = {}
        cache["Z1"] = X @ params["W1"] + params["b1"]
        cache["A1"] = relu(cache["Z1"])
        cache["Z2"] = cache["A1"] @ params["W2"] + params["b2"]
        cache["A2"] = relu(cache["Z2"])
        cache["Z3"] = cache["A2"] @ params["W3"] + params["b3"]
        cache["A3"] = softmax(cache["Z3"])
        return cache["A3"]

    def backward(self, X, y_true):
        """Back-propagate the loss and apply one gradient-descent step in place.

        Reads the activations stored by the most recent :meth:`forward` call,
        so that call must have been made with the same ``X``.

        Args:
            X: 2-D array of inputs that was passed to :meth:`forward`.
            y_true: Target array with the same shape as the network output.
        """
        m = X.shape[0]
        cache = self.cache
        params = self.params

        # Output-layer error for a softmax output paired with cross-entropy.
        dZ3 = cache["A3"] - y_true
        # Push the error back through each ReLU layer.
        dZ2 = (dZ3 @ params["W3"].T) * relu_derivative(cache["Z2"])
        dZ1 = (dZ2 @ params["W2"].T) * relu_derivative(cache["Z1"])

        # Every gradient is computed from the current parameters before any
        # of them is modified; dividing by m averages over the batch.
        grads = {
            "W1": X.T @ dZ1 / m,
            "b1": np.sum(dZ1, axis=0, keepdims=True) / m,
            "W2": cache["A1"].T @ dZ2 / m,
            "b2": np.sum(dZ2, axis=0, keepdims=True) / m,
            "W3": cache["A2"].T @ dZ3 / m,
            "b3": np.sum(dZ3, axis=0, keepdims=True) / m,
        }

        # Explicit name -> gradient mapping instead of eval() on local
        # variable names, which breaks silently if a local is renamed.
        for key, grad in grads.items():
            params[key] -= self.lr * grad

    def train_batch(self, X, y):
        """Perform one training step on a batch.

        Args:
            X: 2-D array of inputs, one sample per row.
            y: Target array with the same shape as the network output.

        Returns:
            Tuple ``(loss, acc)`` measured on the batch before the parameter
            update is applied.
        """
        y_pred = self.forward(X)
        loss = cross_entropy(y_pred, y)
        acc = accuracy(y_pred, y)
        self.backward(X, y)
        return loss, acc