In [1]:
import numpy as np
import pandas as pd

# Lab 9 - Multi-layer Perceptron Forward Pass & Backpropagation

## Part I
For this exercise you will implement a simple 2-layer perceptron with the forward pass and the backpropagation to learn the weights

For the first part you'll build and train a 2-layer neural network that predicts the prices of houses, using the usual Boston housing dataset.

In [2]:
boston = pd.read_csv('./BostonHousing.txt')

As usual, consider the MEDV as your target variable. 
* Split the data into training, validation and testing (70,15,15)%
* Experiment with different number of neurons per layer for your network, using the validation set

In [3]:
# your code goes here
from sklearn.model_selection import train_test_split
# The target variable is the last column of our dataset
X = boston.values[:,:-1]
y = boston.values[:,1].reshape(-1, 1)
# Now lets split the 3 results
X_train, X_tv, y_train, y_tv = train_test_split(X, y, test_size=0.3, random_state=0)
X_test, X_val, y_test, y_val = train_test_split(X_tv, y_tv, test_size=0.5, random_state=0)

In [4]:
def sigmoid_activation(z:np.ndarray) -> np.ndarray:
    # your code goes here
    return 1 / (1 + np.exp(-z))

def rmse(y_pred:np.ndarray, y_real:np.ndarray) -> float:
    return np.sqrt(np.mean(np.power(y_pred - y_real, 2)))

def sigmoid_derivative(z:np.ndarray) -> np.ndarray:
    # your code goes here
    return sigmoid_activation(z) * (1 - sigmoid_activation(z))


In [5]:
# your code goes here

def backpropagation_2layer(X:np.ndarray, y_real:np.ndarray, dim_input:int,
                            dim_hidden:int, dim_output:int, lr:float=0.0001, repeats:int=300) -> tuple[np.ndarray]:
    
    # X  (n, m)
    # Initializing weights and biases with random values
    # First we initialize the input to hidden layers of weights and biases
    W_input = np.random.normal(size=(dim_input, dim_hidden)) # Wi (m, h)
    b_input = np.random.normal(size=(dim_hidden, 1)) # bi (h, 1)
    # Then the hidden to output layers of weights and biases
    W_output = np.random.normal(size=(dim_hidden, dim_output)) # Wo (h, o)
    b_output = np.random.normal(size=(dim_output, 1)) # bo (o, 1)

    
    for repeat in range(repeats):
        hidden_layer_i = X @ W_input + b_input.T # (n, m) x (m, h) -> (n, h)
        hidden_layer_o = sigmoid_activation(hidden_layer_i)

        output_layer = hidden_layer_o @ W_output + b_output.T # (n, h) x (h, o) -> (n, o)
        error_o = output_layer - y_real # (n, o)

        derivative_wo = hidden_layer_o.T @ error_o # (h, n) x (n, o) -> (h, o)
        derivative_bo = np.sum(error_o, axis=0).reshape(-1, 1) # (o, 1)

        error_h = error_o @ W_output.T * sigmoid_derivative(hidden_layer_i) # (n, o) x (o, h) -> (n, h) * (n, h)
    
        derivative_wi = X.T @ error_h # (m, n) x (n, h)
        derivative_bi = np.sum(error_h, axis=0).reshape(-1, 1) # (h, 1)

        W_output -= lr * derivative_wo
        b_output -= lr * derivative_bo
    
        W_input -= lr * derivative_wi
        b_input -= lr * derivative_bi

    return W_input, b_input, W_output, b_output


for i in range(1, 14):
    hidden_layer_dim = i

    W_input, b_input, W_output, b_output = backpropagation_2layer(X_train, y_train, X_train.shape[1], hidden_layer_dim, y_train.shape[1])
    
    y_val_pred = sigmoid_activation(X_val @ W_input + b_input.T) @ W_output + b_output.T
    
    val_rmse = rmse(y_val_pred, y_val)
    print(f"RMSE para {i} neurônio(s): {val_rmse}")

  return 1 / (1 + np.exp(-z))


RMSE para 1 neurônio(s): 24.228501118020844
RMSE para 2 neurônio(s): 24.22849262683326
RMSE para 3 neurônio(s): 24.023015652710665
RMSE para 4 neurônio(s): 24.22849262675135
RMSE para 5 neurônio(s): 23.256916877010752
RMSE para 6 neurônio(s): 24.16080026946378
RMSE para 7 neurônio(s): 24.228492626934194
RMSE para 8 neurônio(s): 24.22849262675135
RMSE para 9 neurônio(s): 24.035929285927903
RMSE para 10 neurônio(s): 24.03748901739713
RMSE para 11 neurônio(s): 23.201334149101033
RMSE para 12 neurônio(s): 24.052381707381407
RMSE para 13 neurônio(s): 24.140952954174985


## Part II 

For this exercise you will build and train a 2-layer neural network that predicts the exact digit from a hand-written image, using the MNIST dataset. 
For this exercise, add weight decay to your network.

In [6]:
from sklearn.datasets import load_digits

In [7]:
digits = load_digits()

In [8]:
X = digits.data
y = digits.target

In [9]:
X.shape

(1797, 64)

Again, you will split the data into training, validation and testing.

In [10]:
# your code goes here:
# Now lets split the 3 results
y = y.reshape(-1, 1)
X_train, X_tv, y_train, y_tv = train_test_split(X, y, test_size=0.3, random_state=0)
X_test, X_val, y_test, y_val = train_test_split(X_tv, y_tv, test_size=0.5, random_state=0)

In [11]:
# your code goes here:
# your code goes here

def backpropagation_2layer_wd(X:np.ndarray, y_real:np.ndarray, dim_input:int,
                            dim_hidden:int, dim_output:int, lr:float=0.0001, wd:float=0.001, repeats:int=300) -> tuple[np.ndarray]:
    
    # X  (n, m)
    # Initializing weights and biases with random values
    # First we initialize the input to hidden layers of weights and biases
    W_input = np.random.normal(size=(dim_input, dim_hidden)) # Wi (m, h)
    b_input = np.random.normal(size=(dim_hidden, 1)) # bi (h, 1)
    # Then the hidden to output layers of weights and biases
    W_output = np.random.normal(size=(dim_hidden, dim_output)) # Wo (h, o)
    b_output = np.random.normal(size=(dim_output, 1)) # bo (o, 1)

    
    for repeat in range(repeats):
        hidden_layer_i = X @ W_input + b_input.T # (n, m) x (m, h) -> (n, h)
        hidden_layer_o = sigmoid_activation(hidden_layer_i)

        output_layer = hidden_layer_o @ W_output + b_output.T # (n, h) x (h, o) -> (n, o)
        error_o = output_layer - y_real # (n, o)

        derivative_wo = hidden_layer_o.T @ error_o # (h, n) x (n, o) -> (h, o)
        derivative_bo = np.sum(error_o, axis=0).reshape(-1, 1) # (o, 1)

        error_h = error_o @ W_output.T * sigmoid_derivative(hidden_layer_i) # (n, o) x (o, h) -> (n, h) * (n, h)
    
        derivative_wi = X.T @ error_h # (m, n) x (n, h)
        derivative_bi = np.sum(error_h, axis=0).reshape(-1, 1) # (h, 1)

        derivative_wo += wd * W_output
        derivative_wi += wd * W_input

        W_output -= lr * derivative_wo
        b_output -= lr * derivative_bo
    
        W_input -= lr * derivative_wi
        b_input -= lr * derivative_bi

    return W_input, b_input, W_output, b_output


for i in range(1, 14):
    hidden_layer_dim = i

    W_input, b_input, W_output, b_output = backpropagation_2layer_wd(X_train, y_train, X_train.shape[1], hidden_layer_dim, y_train.shape[1])
    
    y_val_pred = sigmoid_activation(X_val @ W_input + b_input.T) @ W_output + b_output.T
    val_rmse = rmse(y_val_pred, y_val)
    print(f"RMSE para {i} neurônio(s): {val_rmse}")

RMSE para 1 neurônio(s): 2.8455050874711376
RMSE para 2 neurônio(s): 2.7292127593078224
RMSE para 3 neurônio(s): 2.7220503707531134
RMSE para 4 neurônio(s): 2.069302184984579
RMSE para 5 neurônio(s): 1.9650259408562865
RMSE para 6 neurônio(s): 1.964608048488267
RMSE para 7 neurônio(s): 1.850462527185814
RMSE para 8 neurônio(s): 1.9811286697185873
RMSE para 9 neurônio(s): 1.8615370361753336
RMSE para 10 neurônio(s): 1.805967507650367
RMSE para 11 neurônio(s): 1.972336020938315
RMSE para 12 neurônio(s): 2.1535412875507607
RMSE para 13 neurônio(s): 1.8827757106681902
