  **GOAL** : Create a neural network with at least one hidden layer to model
a) California housing dataset (regression),
b) Full MNIST dataset (classification)
Use sklearn packages.
Submit python files and also report the RMSE for regression and accuracy for classification on the test sets. Choose random_state  = 42 for splitting the dataset of california housing into 75% for training and 25% for testing, the first 60k samples for train, and remaining for test in Full MNIST dataset.

I have created a 4-layer neural network (an input layer, two hidden layers, and an output layer). The activation function used in the hidden layers is ReLU.

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.neural_network import MLPRegressor,MLPClassifier
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
def ReLU(z):
    """Apply the rectified linear unit element-wise: ``max(0, z)``."""
    floor = 0
    return np.maximum(floor, z)
def ReLUD(z):
    """Derivative of ReLU as a step function: 0 for ``z < 0``, 1 otherwise.

    The value at exactly ``z == 0`` is taken to be 1.
    """
    value_at_zero = 1
    return np.heaviside(z, value_at_zero)
def Sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, applied element-wise.

    Evaluated as ``exp(-log(1 + exp(-z)))`` through ``np.logaddexp`` so that
    large negative inputs do not overflow inside ``exp`` (the naive form
    computes ``exp(-z)`` directly, which overflows to ``inf`` and emits a
    RuntimeWarning for ``z`` below roughly -709).
    """
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z)),
    # computed without ever forming exp(-z) for large |z|.
    return np.exp(-np.logaddexp(0, -z))
def Linear(z):
    """Identity activation: hand the pre-activation back unchanged."""
    return z
def Softmax(Z):
    """Softmax along axis 0, so every column of the result sums to 1.

    The per-column maximum is subtracted before exponentiating, which keeps
    ``exp`` from overflowing without changing the result.
    """
    column_peak = np.max(Z, axis=0, keepdims=True)
    exponentials = np.exp(Z - column_peak)
    column_total = np.sum(exponentials, axis=0, keepdims=True)
    return exponentials / column_total

In [None]:
def forward_prop(W1,W2,W3,X,b1,b2,b3,p):
    """Run a forward pass through the two-hidden-layer network.

    Each layer computes ``W @ input + b``; both hidden layers use ReLU.
    The output activation is selected by ``p``:

    * ``p == 1`` -- linear (identity) output, used for regression;
    * ``p == 2`` -- softmax output, used for classification.

    Inputs are laid out with samples as columns, since every layer
    left-multiplies its input by the weight matrix.

    Returns:
        The tuple ``(Z1, A1, Z2, A2, Z3, A3)`` of pre-activations and
        activations for the three layers.

    Raises:
        ValueError: if ``p`` is neither 1 nor 2. Without this check the
            function would run the full forward pass and then fail with an
            ``UnboundLocalError`` because ``A3`` was never assigned.
    """
    if p not in (1, 2):
        raise ValueError(
            f"p must be 1 (linear output) or 2 (softmax output), got {p!r}"
        )

    Z1 = W1 @ X + b1
    A1 = ReLU(Z1)
    Z2 = W2 @ A1 + b2
    A2 = ReLU(Z2)
    Z3 = W3 @ A2 + b3
    A3 = Linear(Z3) if p == 1 else Softmax(Z3)
    return Z1, A1, Z2, A2, Z3, A3

In [None]:
def backward_prop(Y,A3,A2,A1,W3,W2,X,Z2,Z1,p):
    """Back-propagate through the two-hidden-layer ReLU network.

    ``p`` selects the output-layer gradient:

    * ``p == 1`` -- linear output with squared-error cost; the output delta
      is ``2 * (A3 - Y) / m``, so the ``1/m`` averaging is already folded in
      and the remaining gradients are plain products.
    * ``p == 2`` -- softmax output with categorical cross-entropy; the
      output delta is ``A3 - Y`` and each weight/bias gradient is scaled by
      ``1/m`` separately.

    ``m`` is the number of columns of ``X`` (one column per sample).

    Returns:
        ``(DW3, DW2, DW1, Db3, Db2, Db1)`` -- gradients for the weights and
        biases of layers 3, 2 and 1, in that order.

    Raises:
        ValueError: if ``p`` is neither 1 nor 2. Without this check the
            function would fall through both branches and fail with an
            ``UnboundLocalError`` at the ``return``.
    """
    if p not in (1, 2):
        raise ValueError(
            f"p must be 1 (regression) or 2 (classification), got {p!r}"
        )

    m = X.shape[1]
    if p == 1:
        DZ3 = 2 * (A3 - Y) / m
        DW3 = DZ3 @ A2.T
        Db3 = np.sum(DZ3, axis=1, keepdims=True)

        DZ2 = (W3.T @ DZ3) * ReLUD(Z2)
        DW2 = DZ2 @ A1.T
        Db2 = np.sum(DZ2, axis=1, keepdims=True)

        DZ1 = (W2.T @ DZ2) * ReLUD(Z1)
        DW1 = DZ1 @ X.T
        Db1 = np.sum(DZ1, axis=1, keepdims=True)
    else:
        DZ3 = (A3 - Y)
        # `*` and `@` share precedence and associate left-to-right, so each
        # of these scales the delta by 1/m before the matrix product.
        DW3 = (1/m) * DZ3 @ A2.T
        Db3 = (1/m) * np.sum(DZ3, axis=1, keepdims=True)

        DZ2 = (W3.T @ DZ3) * ReLUD(Z2)
        DW2 = (1/m) * DZ2 @ A1.T
        Db2 = (1/m) * np.sum(DZ2, axis=1, keepdims=True)

        DZ1 = (W2.T @ DZ2) * ReLUD(Z1)
        DW1 = (1/m) * DZ1 @ X.T
        Db1 = (1/m) * np.sum(DZ1, axis=1, keepdims=True)

    return DW3, DW2, DW1, Db3, Db2, Db1

In [None]:
def cost_func_cross_entrop(Yhat,Y):
    """Binary cross-entropy, averaged over columns, one value per row.

    Both ``Yhat`` and ``1 - Yhat`` are clipped from below at 1e-14 before
    taking the logarithm so that ``log(0)`` never occurs.
    """
    n_cols = Yhat.shape[1]
    floor = 1e-14
    log_pos = np.log(np.maximum(Yhat, floor))
    log_neg = np.log(np.maximum((1 - Yhat), floor))
    per_entry = -Y * log_pos - (1 - Y) * log_neg
    return np.sum(per_entry, axis=1) / n_cols
def cost_func_MSE(Yhat, Y):
    """Sum of squared errors divided by the number of columns of ``Y``."""
    residual = Yhat - Y
    n_cols = Y.shape[1]
    return np.sum(residual ** 2) / n_cols
def cost_func_cat_cross_entrop(Yhat, Y):
    """Categorical cross-entropy averaged over the columns of ``Y``.

    ``Yhat`` is clipped from below at 1e-12 before the logarithm so that
    zero probabilities do not produce ``-inf``.
    """
    n_cols = Y.shape[1]
    floor = 1e-12
    safe_log = np.log(np.maximum(Yhat, floor))
    return -np.sum(Y * safe_log) / n_cols


**California Dataset**

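This is a regression problem, so I have used a linear (identity) activation function at the output layer. Since the dataset is small compared to MNIST, I have used (full-)batch gradient descent. To help avoid vanishing/exploding gradients, the weights are drawn from a normal distribution scaled by sqrt(1/n_in), where n_in is the number of inputs to the layer (note that He initialization proper would scale by sqrt(2/n_in)).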

In [None]:
from sklearn.preprocessing import StandardScaler
# Train the two-hidden-layer network on the California housing data with
# full-batch gradient descent and print the training cost every 500 steps.
X, y = datasets.fetch_california_housing(return_X_y=True)

scaler = StandardScaler()
# 75% / 25% train/test split with a fixed seed.
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.25,random_state=42)
# The scaler is fitted on the training split only; the transpose puts one
# sample per column, which is the layout forward_prop/backward_prop expect.
X_train_scaled = scaler.fit_transform(X_train).T
X_test_scaled = scaler.transform(X_test).T
m_test=X_test.shape[0]
m_train=X_train.shape[0]
# Targets as row vectors so they line up with the network's (1, m) output.
Y=Y_train.reshape(1,m_train)
Ytest=Y_test.reshape(1,m_test)

np.random.seed(42)
# Layer widths: n1 inputs, n2 and n3 hidden units, n4 outputs.
n1,n2,n3,n4=8,60,30,1
# Weights are standard-normal draws scaled by sqrt(1 / fan_in); biases are
# unscaled standard-normal draws.
W1=np.random.randn(n2,n1)*np.sqrt(1/(n1))
W2=np.random.randn(n3,n2)*np.sqrt(1/(n2))
W3=np.random.randn(n4,n3)*np.sqrt(1/(n3))
b1=np.random.randn(n2,1)
b2=np.random.randn(n3,1)
b3=np.random.randn(n4,1)

itrations=10000
alpha=0.01

for i in range(itrations):
  # p=1 selects the linear output layer and its matching gradients.
  Z1,A1,Z2,A2,Z3,Yhat=forward_prop(W1,W2,W3,X_train_scaled,b1,b2,b3,1)
  DW3,DW2,DW1,Db3,Db2,Db1=backward_prop(Y,Yhat,A2,A1,W3,W2,X_train_scaled,Z2,Z1,1)
  W1=W1-alpha*DW1
  W2=W2-alpha*DW2
  W3=W3-alpha*DW3
  b1=b1-alpha*Db1
  b2=b2-alpha*Db2
  b3=b3-alpha*Db3
  # Yhat comes from the forward pass above, so this is the cost of the
  # parameters as they were before this iteration's update.
  cost=cost_func_MSE(Yhat,Y)
  if i%500==0:
    print(cost)

5.603140039116418
0.45120819215467217
0.4233309564673106
0.4029618848585914
0.3862686975150037
0.3743143031340625
0.36531593296941817
0.35745589916807147
0.3508164755368316
0.3445552019340799
0.33888910618280166
0.33375529397394943
0.3291138942804739
0.3245328203134036
0.32008428202843436
0.3158823425575742
0.31221242044385183
0.3089433988340376
0.31361450933403084
0.310183386346048


In [None]:
# Score the regression network on the test split: forward pass with the
# linear output (p=1), then root-mean-squared error against the targets.
Z1, A1, Z2, A2, Z3, Yhat_test = forward_prop(
    W1, W2, W3, X_test_scaled, b1, b2, b3, 1
)
rmse = np.sqrt(mean_squared_error(Ytest.ravel(), Yhat_test.ravel()))
print("Test RMSE:", rmse)

Test RMSE: 0.5671816175900567


In [None]:
# ----------------- Mini-batch generator -----------------
def get_batches(X, Y, batch_size):
    """Yield shuffled ``(X_batch, Y_batch)`` column slices of ``X`` and ``Y``.

    The columns are visited in one random permutation drawn from NumPy's
    global random state, so every column appears in exactly one batch. The
    final batch is smaller when the column count is not a multiple of
    ``batch_size``.
    """
    n_cols = X.shape[1]
    order = np.random.permutation(n_cols)
    for start in range(0, n_cols, batch_size):
        chosen = order[start:start + batch_size]
        yield X[:, chosen], Y[:, chosen]

# ----------------- Accuracy -----------------
def accuracy(X, Y, W1, b1, W2, b2, W3, b3):
    """Return the percentage of columns whose predicted class matches ``Y``.

    The network is run with the softmax output (p=2). The predicted class is
    the row index of the largest output in each column, and the true class
    is the row index of the largest entry in the matching column of ``Y``.
    """
    class_probs = forward_prop(W1, W2, W3, X, b1, b2, b3, 2)[-1]
    predicted = np.argmax(class_probs, axis=0)
    actual = np.argmax(Y, axis=0)
    hits = predicted == actual
    return np.mean(hits) * 100

**MNIST datasets**

Since this is a multiclass classification problem, I have used the softmax activation function at the output layer. Each 28x28-pixel image is flattened into a 784-feature vector, so the input layer has 784 neurons; since there are 10 digit classes, the output layer has 10 neurons. The labels are one-hot encoded. Since the dataset is large, I have used mini-batch gradient descent for faster computation.

In [None]:
import numpy as np
from keras.datasets import mnist
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Train the two-hidden-layer network on MNIST with mini-batch gradient
# descent, print the mean mini-batch cost per epoch, then report test accuracy.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Flatten each image to one row and bring pixel values into [0, 1].
X_train = X_train.reshape(60000, -1) / 255.0
X_test = X_test.reshape(10000, -1) / 255.0

# Standardise with statistics from the training set only, then transpose so
# that each column is one sample (the layout forward_prop expects).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).T
X_test = scaler.transform(X_test).T

# One-hot labels, transposed to one column per sample.
enc = OneHotEncoder(sparse_output=False)
Y_train = enc.fit_transform(y_train.reshape(-1, 1)).T
Y_test = enc.transform(y_test.reshape(-1, 1)).T

np.random.seed(42)
n_x, n_h1, n_h2, n_y = 784, 240, 120, 10
# He-style initialisation: standard-normal weights scaled by sqrt(2 / fan_in),
# zero biases.
W1 = np.random.randn(n_h1, n_x) * np.sqrt(2. / n_x)
b1 = np.zeros((n_h1, 1))
W2 = np.random.randn(n_h2, n_h1) * np.sqrt(2. / n_h1)
b2 = np.zeros((n_h2, 1))
W3 = np.random.randn(n_y, n_h2) * np.sqrt(2. / n_h2)
b3 = np.zeros((n_y, 1))

epochs = 20
batch_size = 256
alpha = 0.01

for epoch in range(epochs):
    epoch_cost = 0
    n_batches = 0
    for Xb, Yb in get_batches(X_train, Y_train, batch_size):
        # p=2 selects the softmax output and cross-entropy gradients.
        Z1,A1,Z2,A2,Z3,A3 = forward_prop(W1,W2,W3,Xb,b1,b2,b3,2)
        dW3,dW2,dW1,db3,db2,db1 = backward_prop(Yb,A3,A2,A1,W3,W2,Xb,Z2,Z1,2)

        W1 -= alpha * dW1
        b1 -= alpha * db1
        W2 -= alpha * dW2
        b2 -= alpha * db2
        W3 -= alpha * dW3
        b3 -= alpha * db3

        epoch_cost += cost_func_cat_cross_entrop(A3, Yb)
        n_batches += 1

    # Average over the mini-batches actually processed. Floor division
    # (m // batch_size) undercounts by one whenever the last batch is
    # partial, which inflates the reported cost.
    avg_cost = epoch_cost / n_batches
    print(f"Epoch {epoch+1}/{epochs} | Cost: {avg_cost:.4f}")
acc = accuracy(X_test, Y_test, W1, b1, W2, b2, W3, b3)
print(f"  Test Accuracy: {acc:.2f}%")


Epoch 1/20 | Cost: 1.0438
Epoch 2/20 | Cost: 0.4469
Epoch 3/20 | Cost: 0.3448
Epoch 4/20 | Cost: 0.2944
Epoch 5/20 | Cost: 0.2619
Epoch 6/20 | Cost: 0.2385
Epoch 7/20 | Cost: 0.2202
Epoch 8/20 | Cost: 0.2054
Epoch 9/20 | Cost: 0.1933
Epoch 10/20 | Cost: 0.1826
Epoch 11/20 | Cost: 0.1740
Epoch 12/20 | Cost: 0.1656
Epoch 13/20 | Cost: 0.1579
Epoch 14/20 | Cost: 0.1516
Epoch 15/20 | Cost: 0.1457
Epoch 16/20 | Cost: 0.1397
Epoch 17/20 | Cost: 0.1348
Epoch 18/20 | Cost: 0.1299
Epoch 19/20 | Cost: 0.1257
Epoch 20/20 | Cost: 0.1213
  Test Accuracy: 95.37%
