In [66]:
import numpy as np
from matplotlib import pyplot as plt
from keras.datasets import mnist
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.layers import LeakyReLU
from keras.callbacks import EarlyStopping

In [67]:
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

X_train = X_train.reshape(X_train.shape[0], -1).astype('float32') / 255
X_test = X_test.reshape(X_test.shape[0], -1).astype('float32') / 255

Y_train_encoded = to_categorical(Y_train, num_classes=10)
Y_test_encoded = to_categorical(Y_test, num_classes=10)


In [68]:
model = Sequential([
    Dense(units=128, input_shape=(784,), activation='relu'),  
    LeakyReLU(alpha=0.01),  
    Dense(units=128, activation='relu'),  
    LeakyReLU(alpha=0.01), 
    Dense(units=64, activation='relu'), 
    LeakyReLU(alpha=0.01),  
    Dense(units=10, activation='softmax') 
])


In [69]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

model.summary()

batch_size = 32
epochs = 10

Y_train_encoded = to_categorical(Y_train)
Y_test_encoded = to_categorical(Y_test)

early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_56 (Dense)            (None, 128)               100480    
                                                                 
 leaky_re_lu_38 (LeakyReLU)  (None, 128)               0         
                                                                 
 dense_57 (Dense)            (None, 128)               16512     
                                                                 
 leaky_re_lu_39 (LeakyReLU)  (None, 128)               0         
                                                                 
 dense_58 (Dense)            (None, 64)                8256      
                                                                 
 leaky_re_lu_40 (LeakyReLU)  (None, 64)                0         
                                                                 
 dense_59 (Dense)            (None, 10)              

In [70]:
model.fit(X_train, Y_train_encoded, batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=[early_stopping])

loss, accuracy = model.evaluate(X_test, Y_test_encoded)
print(f"Test accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 

In [None]:
def init_params():
    W1 = np.random.rand(10, 784) * 0.01
    b1 = np.zeros((10, 1))
    W2 = np.random.rand(10, 10) * 0.01
    b2 = np.zeros((10, 1))
    return W1, b1, W2, b2

def ReLU(Z):
    return np.maximum(Z, 0)

def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A

def ReLU_deriv(Z):
    return Z > 0

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, 10))
    one_hot_Y[np.arange(Y.size), Y] = 1
    return one_hot_Y.T

In [None]:
def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    one_hot_Y = one_hot(Y)
    m = Y.size
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2, axis=1, keepdims=True)
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1, axis=1, keepdims=True)
    return dW1, db1, dW2, db2

In [None]:

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate):
    W1 -= learning_rate * dW1
    b1 -= learning_rate * db1
    W2 -= learning_rate * dW2
    b2 -= learning_rate * db2
    return W1, b1, W2, b2

In [None]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, learning_rate, iterations):
    W1, b1, W2, b2 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)
        if i % 10 == 0:
            predictions = get_predictions(A2)
            print(f"Iteration: {i}, Accuracy: {get_accuracy(predictions, Y)}")
    return W1, b1, W2, b2

In [None]:
X_train_flat = X_train.T
Y_train_flat = Y_train.T

W1, b1, W2, b2 = gradient_descent(X_train_flat, Y_train_flat, 0.1, 1000)

Iteration: 0, Accuracy: 0.09761666666666667
Iteration: 10, Accuracy: 0.09871666666666666
Iteration: 20, Accuracy: 0.09881666666666666
Iteration: 30, Accuracy: 0.11508333333333333
Iteration: 40, Accuracy: 0.18138333333333334
Iteration: 50, Accuracy: 0.2058
Iteration: 60, Accuracy: 0.22191666666666668
Iteration: 70, Accuracy: 0.23776666666666665
Iteration: 80, Accuracy: 0.26681666666666665
Iteration: 90, Accuracy: 0.3127
Iteration: 100, Accuracy: 0.36561666666666665
Iteration: 110, Accuracy: 0.4259
Iteration: 120, Accuracy: 0.5016
Iteration: 130, Accuracy: 0.5576833333333333
Iteration: 140, Accuracy: 0.5996833333333333
Iteration: 150, Accuracy: 0.6401666666666667
Iteration: 160, Accuracy: 0.67495


KeyboardInterrupt: 

In [None]:
_, _, _, A2 = forward_prop(W1, b1, W2, b2, X_test.T)
predictions = get_predictions(A2)
accuracy = get_accuracy(predictions, Y_test)
print(f"Test accuracy: {accuracy}")

Test accuracy: 0.9116
