In [2]:
#Import libraries

import numpy as np

In [3]:
#Prepare training and test data

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Load MNIST (first time takes a minute)
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

# Convert y to string → integer
y = y.astype(int)

# Filter only 3s and 8s
mask = (y == 3) | (y == 8)
X = X[mask]
y = y[mask]

# Convert labels: 3 → 0, 8 → 1
y = (y == 8).astype(int)

# Split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print(X_train.shape , X_test.shape , y_train.shape , y_test.shape)

# Transpose data

X_train = X_train.T
X_test = X_test.T

y_train = y_train.reshape(1, -1)
y_test = y_test.reshape(1, -1)

print(X_train.shape , X_test.shape , y_train.shape , y_test.shape)

(9357, 784) (4609, 784) (9357,) (4609,)
(784, 9357) (784, 4609) (1, 9357) (1, 4609)


In [23]:
#Define Sigmoid function and its derivative

def sigmoid(x):
    x = np.clip(x, -500, 500) 
    return 1/ (1 + np.exp(-x))

def sigmoid_derivative(a):
    return a * (1 - a)

In [16]:
#Initialize parameters 

def initialize_parameters(n_x, n_h, n_y):

    w1 = np.random.randn(n_h, n_x) * 0.01
    b1 = np.zeros((n_h, 1))

    w2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))

    learning_rate = 0.5

    return(w1, b1, w2, b2, learning_rate)

In [17]:
#Forward propagation 

def forward_propagation(X, w1, b1, w2, b2):
    #Input to hidden layer
    z1 = np.dot(w1, X) + b1
    a1 = sigmoid(z1)

    #Hidden layer to output
    z2 = np.dot(w2, a1) + b2
    a2 = sigmoid(z2)

    cache = {"z1": z1, "a1": a1, "z2": z2, "a2": a2}

    return(a2, cache)

In [18]:
#Compute loss

def compute_loss(a2, y):
    loss = -np.mean(y * np.log(a2 + 1e-9) + (1 - y) * np.log(1 - a2 + 1e-9))
    return(loss)

In [19]:
#Back propagation

def back_propagation(w1, w2, cache, X, y):
    a1 = cache["a1"]
    a2 = cache["a2"]

    m = X.shape[1]
  
    d2 = a2 - y
    dw2 = np.dot(d2, a1.T)/ m
    db2 = np.sum(d2, axis=1, keepdims=True) / m

    d1 = np.dot(w2.T, d2) * (a1 * (1 - a1))
    dw1 = np.dot(d1, X.T)/ m
    db1 = np.sum(d1, axis=1, keepdims=True) / m

    grads = {"dw1": dw1, "db1": db1, "dw2": dw2, "db2": db2}
    return grads

In [20]:
#Update weights and biases

def update_parameters(w1, b1, w2, b2, grads):
    dw1 = grads["dw1"]
    db1 = grads["db1"]
    dw2 = grads["dw2"]
    db2 = grads["db2"]
    
    w1 -= learning_rate * dw1
    b1 -= learning_rate * db1
    w2 -= learning_rate * dw2
    b2 -= learning_rate * db2

    return w1, b1, w2, b2 

In [24]:
#Training loop

w1, b1, w2, b2, learning_rate = initialize_parameters(784, 128, 1)

for epoch in range(1000):
    
    a2, cache = forward_propagation(X_train, w1, b1, w2, b2)

    loss = compute_loss(a2, y_train)

    if epoch % 100 == 0:
        print(f"Epoch {epoch} | Loss: {loss:.4f}")

    grads = back_propagation(w1, w2, cache, X_train, y_train)

    w1, b1, w2, b2 = update_parameters(w1, b1, w2, b2, grads)
    
    

Epoch 0 | Loss: 0.7027
Epoch 100 | Loss: 0.2240
Epoch 200 | Loss: 0.1186
Epoch 300 | Loss: 0.1039
Epoch 400 | Loss: 0.0855
Epoch 500 | Loss: 0.0776
Epoch 600 | Loss: 0.0758
Epoch 700 | Loss: 0.0629
Epoch 800 | Loss: 0.0746
Epoch 900 | Loss: 0.0477


In [22]:
#Test model

a2, cache = forward_propagation(X_test, w1, b1, w2, b2)

predictions = (a2 > 0.5).astype(int)  # shape: (1, m)
correct = (predictions == y_test).astype(int)
accuracy = np.mean(correct)

print(accuracy)




0.9711434150574962


  return 1/ (1 + np.exp(-x))
