# Exercise 1
Add Backpropagation to your MLP and train the model on the ZIP-Dataset.

In [1]:
import numpy as np

**1. Initialization**
- define threshold for activation function (UNUSED), number of layers, number of neurons per layer
- for each layer, initialize a weight matrix with random numbers (dim = #neurons in previous leyer x #neurons in current layer) and a bias vector with ones

**2. Training**
- ***Feedforward***
    - Feed batch data through layers (f_i(w_i*x+b), using weight matrices w and activation functions f)
    - for every neuron/layer, store output value y_i and derivative y*_i
    
- ***Backpropagation*** 
    - Quantify network error (MSE = 1/2 * (y_n-t)^2)
    - Update weights w_i in layer i by product of backpropagated error, derivative, input vector and learning rate (SGD)
    
**3. Prediction/Inference**


# Todo: introduce bias

In [434]:
class MLP:
    def __init__(self, depth, layer_width, threshold, learning_rate):
        """
        This constructor sets random network weights and checks if the input depth matches the provided layers.
        """
        self.threshold = threshold
        self.learning_rate = learning_rate
        self.depth = depth
        if not len(layer_width) == (depth + 1):
            raise Exception("'layer_width' needs to be of length 'depth' + 1")  
        self.layer_width = layer_width
        self.network_weights = [] # store layer weights
        self.network_derivatives = [] # store derivatives of activated layer neurons
        self.network_outputs = [np.zeros((1,self.layer_width[0]))] # store output values of activated layer neurons
        width_prev = self.layer_width[0]
        for width in self.layer_width[1:]:
            # add 1 dimension to weight matrix to account for bias
            self.network_weights.append(np.random.randn(width_prev, width)* np.sqrt(1. / (width_prev)))
            self.network_derivatives.append(np.zeros((1, width))) 
            self.network_outputs.append(np.zeros((1, width)))
            width_prev = width
        self.error_memory = []
        

    ############  Activation functions  ######### 
    def heaviside(self, X):
        """This Function is a tiny implementation of the heaviside step function."""
        return (X >= self.threshold).astype(int)
    
    def sigmoid(self, X):
        sig = 1/(1+np.exp(-X))
        return (X >= self.threshold).astype(int)
    #############################################
        
    def mean_squared_error(self, Y_m, T_m):
        """Quantify rrror after feedforward step."""
        return 1/2 * np.power((Y_m - T_m),2)
    
    def feed_forward(self, X):
        """This Function passes the input X through all weights and returns the prediction vector."""
        X_i = X.copy()
        self.network_outputs[0] = X_i
        for i in range(self.depth):
            # Compute weighted sum
            z_i = X_i @ self.network_weights[i] 
            # Apply activation function
            X_i = self.sigmoid(z_i)
            # Store derivatives
            D_i = X_i*(1-X_i)
            self.network_outputs[i+1] = X_i
            # see Paul's implementation for diagonal derivative matrices
            self.network_derivatives[i] = D_i
        return X_i
    
    def backpropagate(self, error):
        """Backpropagate error and update weight matrices."""
        d_last_hidden_layer = self.network_derivatives[self.depth-1] * error
        for i in range(self.depth-1):
            d_tmp = d_last_hidden_layer
            for j in range(i, self.depth-1):
                d_tmp = self.network_derivatives[j]*self.network_weights[j+1].T
            d_tmp *= d_tmp
            dW = -self.learning_rate * self.network_outputs[i].T @ d_tmp 
            self.network_weights[i] += dW
            # TODO: also update biases?
        # update last layer weights
        d_W_last_hidden_layer = -self.learning_rate * self.network_outputs[self.depth-1].T @ d_last_hidden_layer 
        self.network_weights[self.depth-1] += d_W_last_hidden_layer
        return
    
    def train(self, X, Y, M):
        """Train the MLP on (X,Y) in M equally sized batches using feedforward and backpropagation with Stochastic Gradient Descent."""
        # Shuffle data indices and split in subsets of size M
        batch_indices = np.arange(X.shape[0])
        np.random.shuffle(batch_indices)
        batch_splits = np.array_split(batch_indices, M)
        for m in range(len(batch_splits)):
            # fetch batch
            print('Fetch batch no.',m)
            X_m = X[batch_splits[m]]
            T_m = Y[batch_splits[m]]
            Y_m = self.feed_forward(X_m)
            # Quantify error
            E = self.mean_squared_error(Y_m.ravel(), T_m)
            self.error_memory.append(E)
            dE = (Y_m.T - T_m).T
            # Backpropagate
            self.backpropagate(dE)
        return 
    
    def predict(self, X):
        """This function passes the input X to the iteration function."""
        X_i = X.copy()
        for i in range(self.depth):
            # Compute weighted sum
            z_i = X_i @ self.network_weights[i]
            # Apply activation function
            X_i = self.sigmoid(z_i)
        return X_i.ravel()
    
    def accuracy(self, labels, predictions):
        """This function calculates the binary class accuracy for given true/predicted labels."""
        return np.mean(labels == predictions)

**Load ZIP data set**

In [422]:
path_to_train = '/Users/Eva/Downloads/zip.train'
path_to_test = '/Users/Eva/Downloads/zip.test'
training_data = np.array(pd.read_csv(path_to_train, sep=' ', header=None))
test_data = np.array(pd.read_csv(path_to_test, sep =' ',header=None))

X_train_zip, y_train_zip = training_data[:,1:-1], training_data[:,0]
X_test_zip, y_test_zip = test_data[:,1:], test_data[:,0]

# We only want to classify two different digits. You can choose which digits you want to classify youself

X_train_zip = X_train_zip[np.logical_or(y_train_zip == 0, y_train_zip == 1)]
y_train_zip = y_train_zip[np.logical_or(y_train_zip == 0, y_train_zip == 1)]

X_test_zip = X_test_zip[np.logical_or(y_test_zip == 0, y_test_zip == 1)]
y_test_zip = y_test_zip[np.logical_or(y_test_zip == 0, y_test_zip == 1)]

#### Classify the Zip-Dataset with the random initial weights

In [435]:
mlp_network = MLP(threshold=0.01, learning_rate=0.1, depth=2, layer_width=[X_train_zip.shape[1], 10, 1])

In [436]:
for i in range(mlp_network.depth):
    print('Layer:', i)
    print(mlp_network.network_weights[i].shape)
    print(mlp_network.network_outputs[i+1].shape)
    print(mlp_network.network_derivatives[i].shape)

Layer: 0
(256, 10)
(1, 10)
(1, 10)
Layer: 1
(10, 1)
(1, 1)
(1, 1)


In [437]:
mlp_network.train(X_train_zip[:100,:], y_train_zip[:100], 10)

Fetch batch no. 0
Fetch batch no. 1
Fetch batch no. 2
Fetch batch no. 3
Fetch batch no. 4
Fetch batch no. 5
Fetch batch no. 6
Fetch batch no. 7
Fetch batch no. 8
Fetch batch no. 9


In [438]:
for i in range(mlp_network.depth):
    print('Layer:', i)
    print(mlp_network.network_weights[i].shape)
    print(mlp_network.network_outputs[i+1].shape)
    print(mlp_network.network_derivatives[i].shape)

Layer: 0
(256, 10)
(10, 10)
(10, 10)
Layer: 1
(10, 1)
(10, 1)
(10, 1)


In [439]:
y_pred_mlp = mlp_network.predict(X_test_zip[:100,:])

In [440]:
y_pred_mlp

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [441]:
y_test_zip[:100]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1.,
       0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 1., 0.,
       0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0.])

In [443]:
mlp_network.accuracy(y_test_zip[:100], y_pred_mlp)

0.62

In [350]:
np.unique(y_pred_mlp, return_counts=True)

(array([0, 1]), array([79, 21]))

In [351]:
np.unique(y_train_zip, return_counts=True)

(array([0., 1.]), array([1194, 1005]))

#### Get a mean accuracy over multiple runs

In [353]:
acc_list_mlp = []
n_runs = 100
for i in range(n_runs):
    mlp_network = MLP(threshold=0.01, learning_rate=0.1,depth=2, layer_width=[X_train_zip.shape[1], 10, 1])
    y_pred_loop = mlp_network.predict(X_train_zip)
    acc_list_mlp.append(mlp_network.accuracy(y_train_zip, y_pred_loop))
print("Mean Acc over", n_runs, "runs, with random weights is:", np.mean(acc_list_mlp))

Mean Acc over 100 runs, with random weights is: 0.48371532514779453


### (a) Optimize width (the number of neurons in a hidden layer; it is usually the same for all of them) and depth of the network. Try to find a setting that trains in a reasonable time. Plot the loss.

### (b) Show some digits that are classified incorrectly.

### (c) Plot your first weight layer as a grayscale image.