# COMP 551 - Mini-project 3
Group 63

In [20]:
import keras
import numpy as np
%matplotlib inline                                 
import matplotlib.pyplot as plt
from IPython.core.debugger import set_trace  
import scipy.sparse as sparse
import pandas as pd
import seaborn as sns
import os
from google.colab import files
np.random.seed(1234)

In [2]:
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

"""Vectorize the 28*28 pictures to a 784 vector."""

x_train = np.reshape(x_train, (-1, 784)).astype('float32')
x_test = np.reshape(x_test, (-1, 784)).astype('float32')


"""The intensity ranges from 0 to 255. We divide all intensities by the maximum (255) to obtain a [0-1] range."""


x_train, x_test = x_train/255.0, x_test/255.0


"""We transform the (N,) vector of labels using one-hot encoding into a (N,C) matrix."""

y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test)

#this is just because I was using these values on my local computer and had errors with installing tensorflor so I could install keras 
np.save('x_train.npy',x_train)
np.save('x_test.npy',x_test)
np.save('y_train.npy',y_train)
np.save('y_test.npy',y_test)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
(60000, 28, 28) (10000, 28, 28) (60000,) (10000,)


## Task 1. Data pre-processing

- Load the raw data from Keras.
- Vectorize 28*28 pictures to 1D vector.
- Normalize the intensity of the pixel.

Load the MNIST dataset distributed with Keras. 

In [27]:
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()


Vectorize the 28*28 pictures to a 784 vector.

In [28]:
x_train = np.reshape(x_train, (-1, 784)).astype('float32')
x_test = np.reshape(x_test, (-1, 784)).astype('float32')


The intensity ranges from 0 to 255. We divide all intensities by the maximum (255) to obtain a [0-1] range.

In [29]:
print('Intensity before normalization:', np.amin(x_train), np.amax(x_train))
x_train, x_test = x_train/255.0, x_test/255.0
print('Intensity after normalization:', np.amin(x_train), np.amax(x_train))

Intensity before normalization: 0.0 255.0
Intensity after normalization: 0.0 1.0


We transform the (N,) vector of labels using one-hot encoding into a (N,C) matrix.

In [30]:
y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test)

Subset the data to use in Colab.

In [8]:
#when running in colab I subsetted the data
#data_slice = 3000
#x_train = x_train[:data_slice,:]
#y_train = y_train[:data_slice,:]
#x_test = x_test[:data_slice,:]
#y_test = y_test[:data_slice,:]

## Task 2. Multilayer perceptron implementation

### 2.1 Build the network
Our task is a multiclass classification.The cost function will be the multi-class cross-entropy loss. We will use the following architecture:
- output layer = softmax activation
- hidden layers (0, 1 or 2): 128 units, ReLu or logistic activation


First, we implement the activation functions.

In [9]:
# logistic/sigmoid
logistic = lambda z: 1./ (1 + np.exp(-z))

# softmax
eps=1e-8
def softmax(z):
    logits = z - np.max(z) # for numerical stability
    sum_logits = np.sum(np.exp(logits), axis=1) +eps
    softmax = np.exp(logits)/sum_logits[:,None] 
    return softmax

# relu
relu = lambda z: np.maximum(0,z)

# derivatives of relu (formula from backprop slides)
def relu_dv(q):
  q[q<=0] = 0
  q[q>0] = 1
  return q

Next, we build the MLP class.

In [32]:
# for 2 hidden layer with relu activation
class MLP2layer_relu_l1:
    
    def __init__(self, M = 128,lambd=0.005):
        self.M = M
        self.lambd=lambd
            
    def fit(self, x_train, y_train, x_test, y_test, optimizer):
        N = x_train.shape[0]
        C = y_train.shape[1] # number of classes
        D = x_train.shape[1]
        def gradient(x, y, params,lmbd):
            v, w, u = params
            # forward pass
            N = x.shape[0]
            # add bias to the input layer
            x = np.column_stack([x,np.ones(N)*0.1])
            b = np.ones((N,1))*0.1

            q1 = np.dot(x, v) 
            z1 = relu(np.hstack((q1,b))) #N x M want to column stack to add bias here
            q2 = np.dot(z1,w) 
            z2 = relu(np.hstack((q2,b)))
            yh = softmax(np.dot(z2, u))#N x C
            # backward pass => gradient formula adapted from class dw = (yh-y)*z, dv = (yh-y)*w*deriv_relu(q)*x
            
            dy = yh - y #N x C
            
            du = np.dot(z2.T,dy)/N 
            du[:-1] = du[:-1] + lmbd*np.sign(u[:-1]) #/N not adding to biases
            
            dz2 = np.dot(dy,u.T)
            dz2 = np.delete(dz2, -1, axis=1)
            dq2 = relu_dv(q2)
            
            dw = np.dot(z1.T, dz2 * dq2)/N 
            dw[:-1] = dw[:-1] + lmbd*np.sign(w[:-1]) #/N not adding to biases
            dz1 = np.dot(dz2, w.T) #N x M
            dz1 = np.delete(dz1,-1,axis=1)
            dq1 = relu_dv(q1)
            dv = np.dot(x.T, dz1 * dq1)/N
            dv[:-1] = dv[:-1] + lmbd*np.sign(v[:-1]) #/N not adding to biases
            dparams = [dv, dw, du]
            return dparams
        
        # initialize the parameters with values in the standard normal distribution and scaled to be low
        u = np.random.randn(self.M+1,C) * 0.1 #M x C
        w = np.random.randn(self.M+1,self.M) * .01 #M x M
        v = np.random.randn(D+1,self.M) * .01 #D x M
        
        params0 = [v,w,u]

        # run the mini-batch gradient descent to update the parameters
        self.params, self.train_loss, self.test_loss = optimizer.run(gradient, x_train, y_train, x_test, y_test, params0,self.lambd,reg='l1')
        return self
    
    def predict(self, x):
        v, w, u = self.params
        # add bias to the input layer
        Nt = x.shape[0]
        x = np.column_stack([x,np.ones(Nt)*0.1])
        b1 = np.ones((Nt,1))*0.1
     
        # forward pass only using updated parameters

        q1 = np.dot(x,v)
        z1 = relu(np.hstack((q1,b1)))
        q2 = np.dot(z1,w)
        z2 = relu(np.hstack((q2,b1)))
        yh = softmax(np.dot(z2, u))#N x C
        return yh

In [31]:
# for 2 hidden layer with relu activation
class MLP2layer_relu_l2:
    
    def __init__(self, M = 128,lambd=0.005):
        self.M = M
        self.lambd=lambd
            
    def fit(self, x_train, y_train, x_test, y_test, optimizer):
        N = x_train.shape[0]
        C = y_train.shape[1] # number of classes
        D = x_train.shape[1]
        def gradient(x, y, params,lmbd):
            v, w, u = params
            # forward pass
            N = x.shape[0]
            # add bias to the input layer
            x = np.column_stack([x,np.ones(N)*0.1])
            b = np.ones((N,1))*0.1

            q1 = np.dot(x, v) #np.column_stack([np.dot(x, v),np.ones(N)*0.1]) #trying adding bias here
            z1 = relu(np.hstack((q1,b))) #N x M want to column stack to add bias here
            q2 = np.dot(z1,w) #np.column_stack([np.dot(z1,w),np.ones(N)*0.1]) #trying adding bias here
            z2 = relu(np.hstack((q2,b)))
            yh = softmax(np.dot(z2, u))#N x C
            # backward pass => gradient formula adapted from class dw = (yh-y)*z, dv = (yh-y)*w*deriv_relu(q)*x
            
            dy = yh - y #N x C
            
            du = np.dot(z2.T,dy)/N 
            #not adding to biases
            du[:-1] = du[:-1] + lmbd*u[:-1] #/N when dividing lambda term by N it was leading to lots of weights being converted to NaNs
 
            dz2 = np.dot(dy,u.T)
            dz2 = np.delete(dz2, -1, axis=1)
            dq2 = relu_dv(q2)
   
            dw = np.dot(z1.T, dz2 * dq2)/N 
            #not adding to biases
            dw[:-1] = dw[:-1] + lmbd*w[:-1] #/N #M x C
            dz1 = np.dot(dz2, w.T) #N x M
            dz1 = np.delete(dz1,-1,axis=1)
            dq1 = relu_dv(q1)
            dv = np.dot(x.T, dz1 * dq1)/N
            #not adding to biases
            dv[:-1] = dv[:-1] + lmbd*v[:-1] #/N #D x M
            dparams = [dv, dw, du]
            return dparams
        
        # initialize the parameters with values in the standard normal distribution and scaled to be low
        u = np.random.randn(self.M+1,C) * 0.1 #M x C
        w = np.random.randn(self.M+1,self.M) * .01 #M x M
        v = np.random.randn(D+1,self.M) * .01 #D x M
        
        params0 = [v,w,u]

        # run the mini-batch gradient descent to update the parameters
        self.params, self.train_loss, self.test_loss = optimizer.run(gradient, x_train, y_train, x_test, y_test, params0,self.lambd,reg='l1')
        return self
    
    def predict(self, x):
        v, w, u = self.params
        # add bias to the input layer
        Nt = x.shape[0]
        x = np.column_stack([x,np.ones(Nt)*0.1])
        b1 = np.ones((Nt,1))*0.1
     
        # forward pass only using updated parameters

        q1 = np.dot(x,v)
        z1 = relu(np.hstack((q1,b1)))
        q2 = np.dot(z1,w)
        z2 = relu(np.hstack((q2,b1)))
        yh = softmax(np.dot(z2, u))#N x C
        return yh

### 2.2 Implement the cost and accuracy function



In [12]:
# Softmax cross entropy 
def logsumexp(Z):                                                # dimension N x C
    Zmax = np.max(Z,axis=1)[:,None]                              # max over C
    log_sum_exp = Zmax + np.log(np.sum(np.exp(Z - Zmax), axis=1))
    return log_sum_exp

# cost for relu activation - two layers
def cost_relu(x, y, params):
  Nt = x.shape[0]
  v, w, u = params
  b1 = np.ones((Nt,1))*0.1
  xb = np.column_stack([x,np.ones(Nt)*0.1])
  q1 = np.dot(xb, v) 
  z1 = relu(np.hstack((q1,b1))) 
  q2 = np.dot(z1,w)
  z2 = relu(np.hstack((q2,b1)))
  q3 = np.dot(z2, u) #N x C
  nll = - np.mean(np.sum(q3*y, 1) - logsumexp(q3)) 
  return nll

# cost for relu activation - two layers with l2 regularization
def cost_relu_l2(x, y, params,lmbd):
  Nt = x.shape[0]
  v,w,u = params
  l2reg_cost = lmbd*(np.mean(np.square(v))+np.mean(np.square(w))+np.mean(np.square(v)))/(2) #used mean of each rather then total dimension value because with the biases v,w, and u aren't all the same size
  cost_reg = cost_relu(x,y,params) + l2reg_cost
  return cost_reg

# cost for relu activation - two layers with l2 regularization
def cost_relu_l1(x, y, params,lmbd):
  Nt = x.shape[1]
  v,w,u = params
  l1reg_cost = lmbd*(np.mean(np.abs(v))+np.mean(np.abs(w))+np.mean(np.abs(v)))/(2) #used mean of each rather then total dimension value because with the biases v,w, and u aren't all the same size
  cost_reg = cost_relu(x,y,params) + l1reg_cost
  return cost_reg

# Accuracy
def evaluate_acc(y, yh):
  y_pred = np.argmax(yh,axis=1)
  accuracy = np.count_nonzero(y_pred == np.argmax(y,axis=1))/y.shape[0]
  return accuracy

### 2.3 Implement the optimizer

We will use a mini-batch gradient-descent algorithm.

In [13]:
def create_mini_batch(x, y, batch_size): 
    D = x.shape[1]
    data = np.hstack((x, y))
    np.random.shuffle(data)
    mini = data[:batch_size,:]                                                    
    x_mini = mini[:,:D]
    y_mini = mini[:,D:]
    return x_mini, y_mini

In [14]:
class GradientDescent:
    
    def __init__(self, learning_rate=.001, epsilon=1e-8, batch_size=100, iters=600, epochs=50):
        self.learning_rate = learning_rate
        self.iters = iters
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.epochs = epochs
        
    def run(self, gradient_fn, x_train, y_train, x_test, y_test, params, lmbd, reg):
        epoch = 1
        train_losses = []
        test_losses = []
        for epoch in range(self.epochs):
          train_epoch_loss = []
          test_epoch_loss = []
          for t in range(self.iters):
            x_mini, y_mini = create_mini_batch(x_train, y_train, self.batch_size)

            if reg == 'l1':
              train_loss = cost_relu_l1(x_mini, y_mini, params,lmbd)
              test_loss = cost_relu_l1(x_test, y_test, params,lmbd)  
            elif reg == 'l2':
              train_loss = cost_relu_l2(x_mini, y_mini, params,lmbd)
              test_loss = cost_relu_l2(x_test, y_test, params,lmbd)
            else:
              lmbd = 0
            grad = gradient_fn(x_mini, y_mini, params,lmbd)
            for p in range(len(params)):
                params[p] -= self.learning_rate * grad[p]
            if t % self.iters == 0:
              print(f"Epoch: {epoch+1}, Train error: {train_loss:.4f}, Test error: {test_loss:.4f}")
              epoch += 1
            train_epoch_loss.append(train_loss)
            test_epoch_loss.append(test_loss)
          train_losses.append(np.mean(train_epoch_loss))
          test_losses.append(np.mean(test_epoch_loss))
        return params, train_losses, test_losses


## Task 3. Run the experiments

Model with 2 hidden layers and ReLu activation and L1 regularization

In [15]:
model = MLP2layer_relu_l1(lambd=0.005)
optimizer = GradientDescent(learning_rate=0.1, epochs=20,batch_size=100)

model.fit(x_train, y_train, x_test, y_test, optimizer)

plt.plot(np.arange(len(model.train_loss)), model.train_loss, '-', label='train')
plt.plot(np.arange(len(model.test_loss)), model.test_loss, '-', label='test')
plt.legend()
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()

Epoch: 1, Train error: 2.3035, Test error: 2.3028
Epoch: 2, Train error: 2.3025, Test error: 2.3025
Epoch: 3, Train error: 2.3006, Test error: 2.3023
Epoch: 4, Train error: 2.3039, Test error: 2.3022


KeyboardInterrupt: ignored

In [None]:
yh = model.predict(x_test) 
accuracy = evaluate_acc(y_test, yh)
print(f'Accuracy is {accuracy*100:.1f}.')


In [26]:
lamb_list = [0.004,0.005,0.007,0.01,0.05]
optimizer = GradientDescent(learning_rate=0.1, epochs=10, iters=1000)
acc_list = []
for i,l in enumerate(lamb_list):
  model = MLP2layer_relu_l2(lambd=l)
  model.fit(x_train, y_train, x_test, y_test, optimizer)
  yh = model.predict(x_test) 
  accs = evaluate_acc(y_test, yh)
  acc_list.append(accs)
  print(f'Accuracy is {accs*100:.1f}.')
  print('for lamda='+str(l))
plt.plot(lamb_list, acc_list)
#plt.legend()
plt.xlabel('Lambda')
plt.ylabel('Accuracy')
plt.savefig('cv_reg_changelam.png')
plt.show()


Epoch: 1, Train error: 2.3014, Test error: 2.3029
Epoch: 2, Train error: 0.0608, Test error: 0.3883
Epoch: 3, Train error: 0.0715, Test error: 0.3506


  ret = umr_sum(arr, axis, dtype, out, keepdims)
  after removing the cwd from sys.path.
  import sys


Epoch: 4, Train error: nan, Test error: nan
Epoch: 5, Train error: nan, Test error: nan


KeyboardInterrupt: ignored