# COMS 4995_002 Deep Learning Assignment 1
Due on Thursday, Feb 8, 11:59pm

This assignment can be done in groups of at most 2 students. Everyone must submit on Courseworks individually.

Write down the UNIs of your group (if applicable)

Member 1: Samuel Cohen, slc2206

Member 2: Jason Zhao, jsz2107

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

In [170]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0, norm=2):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        :param norm: type of norm in regularization.
        """
        
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        self.norm = norm
        self.batch_iter = 0
        
        # Init parameters
        self.parameters = {}
        self.parameters["weights"] = {}
        self.parameters["biases"] = {}
        
        # Xavier random initialization
        N = 2 / (layer_dimensions[0] + layer_dimensions[-1])
        sigma = N ** 0.5

        for i in range(self.num_layers - 1):
            
            # Set weight matrix of layer i to layer i + 1
            self.parameters["weights"][i] = np.matrix(sigma * np.random.randn(layer_dimensions[i], layer_dimensions[i + 1]))
            
            # Set bias vector biases from layer i to layer i + 1
            self.parameters["biases"][i] = np.matrix(sigma * np.random.randn(layer_dimensions[i + 1]))


    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        
        affine_prod = np.dot(W.T, A) + b.T
        
        cache = { 
            "input": A, 
            "weights": W,
            "affine_prod": affine_prod 
        }
        
        return affine_prod, cache


    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        
        return self.relu(A)


    def relu(self, X):
        """ 
        The ReLU function to calculate activations.
        """
        
        return np.maximum(0, X)

            
    def dropout(self, A, prob):
        """
        :param A: 
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        
        # TODO
        pass
    
        return A, M

    
    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        
        cache = []
        prev = X

        for i in range(self.num_layers - 2):
            Z, _cache = self.affineForward(prev, self.parameters["weights"][i], self.parameters["biases"][i])
            A = self.activationForward(Z)
            
            _cache["output"] = A
            cache.append(_cache)
            
            prev = A
        
        # Propagate through last layer, no activation required
        index = self.num_layers - 2
        Z, _cache = self.affineForward(prev, self.parameters["weights"][index], self.parameters["biases"][index])
        cache.append(_cache)
        
        return Z, cache
    
    
    def softmax(self, zL):
        """
        The softmax is used for multi-class classification and computes a probability 
        in (0, 1) for each class c = 1...C, which sums to 1.
        """
        return np.exp(zL) / np.sum(np.exp(zL), axis = 0)
    
    
    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """
        
        m = y.shape[0]
        AL_softmax = self.softmax(AL)       
        correct_label_probs = AL_softmax[y, range(m)]
        cost = - np.sum(np.log(correct_label_probs)) / m 
        
        if self.reg_lambda > 0:
            # TODO: Regularization
            pass
        
        # Create a 1-hot encoded matrix
        one_hot = np.zeros((AL.shape[0], m))
        one_hot[y, range(m)] = 1
        dAL = (AL_softmax - one_hot) # TODO: Should we be dividing by m?
        
        return cost, dAL

    
    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the **affine layer**. NOTE: assuming dA has already propagated through
        non-linear activation (ie. activationBackward() has already been called).
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        
        dW = np.dot(cache["input"], dA_prev.T)
        db = np.sum(dA_prev, axis=1) / dA_prev.shape[1]
        dA = np.dot(cache["weights"], dA_prev)
        
        return dA, dW, db

    
    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        
        # TODO: Is this right???
        return self.relu_derivative(dA, cache["affine_prod"])
        
        
    def relu_derivative(self, dx, cached_x):
        """
        The ReLU activation function turns the input off (to zero) when the input is negative.
        So when we are backpropagating, we need to know which gradients should be turned off. 
        (This is the reason for two arguments to this function)        
        """
        
        mask = cached_x > 0
        return np.multiply(dx, mask)

        
    def dropout_backward(self, dA, cache):

        return dA

    
    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        
        gradients = {}
        
        # Do gradient for softmax layer first
        index = self.num_layers - 2
        dA, dW, db = self.affineBackward(dAL, cache[index])
        gradients[index] = (dW, db)
        dAL = dA
        
        # Go through layers backwards where weights are indexed by the preceeding layer
        # and we already did the "last" layer because of the different activation func
        for i in range(self.num_layers - 2 - 1, -1, -1):
            dZL = self.activationBackward(dAL, cache[i])
            dA, dW, db = self.affineBackward(dZL, cache[i])
            gradients[i] = (dW, db)
            dAL = dA
            
            if self.drop_prob > 0:
                # TODO: call dropout_backward
                pass      
            
        if self.reg_lambda > 0:
            # TODO: add gradients from L2 regularization to each dW
            pass
        
        return gradients


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        
        for i in range(self.num_layers - 1):
            self.parameters["weights"][i] -= alpha * gradients[i][0]
            self.parameters["biases"][i] -= alpha * gradients[i][1].T
        
        
    def train(self, X, y, iters=1000, alpha=0.01, batch_size=100, print_every=100, holdout=0.2):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        
        sys.stdout.write("Shuffling training data... ")
        self.shuffle(X, y)
        sys.stdout.write("Done.\n")
        
        # Create validation test set
        if holdout:
            test_size = int(holdout * len(y))
            test_x = X[:, 0:test_size]
            test_y = y[0:test_size]
            X = X[:, test_size:]
            y = y[test_size:]
        
        # Update training set and metadata
        self.parameters["X"] = X
        self.parameters["y"] = y
        self.max_iters = int(len(y) / batch_size)
        
        # Train...
        for i in range(0, iters):
            # get minibatch
            train_x, train_y = self.get_batch(batch_size)

            # forward prop
            AL, cache = self.forwardPropagation(train_x)
            
            # compute loss
            loss, dAL = self.costFunction(AL, train_y)

            # compute gradients
            grads = self.backPropagation(dAL, train_y, cache)

            # update weights and biases based on gradient
            self.updateParameters(grads, alpha)

            if i % print_every == 0:
                # print batch loss, training error, and test error
                sys.stdout.write("iter={0:d}   batch_loss={1:.6f}   ".format(i, loss))
                
                y_pred = self.predict(X)
                misclassified = np.sum(y_pred != y)
                err = (misclassified / len(y)) * 100.
                sys.stdout.write("train_err={0:.3f}%   ".format(err))
                
                if holdout:
                    y_pred = self.predict(test_x)
                    misclassified = np.sum(y_pred != test_y)
                    err = (misclassified / len(test_y)) * 100.
                    sys.stdout.write("test_err={0:.3f}%   ".format(err))

                sys.stdout.write("\n")
                    
        # Print final test error
        if holdout:
            y_pred = self.predict(test_x)
            misclassified = np.sum(y_pred != test_y)
            err = (misclassified / len(test_y)) * 100.
            print("Test Error Rate = {0:.1f}%   Test Accuracy = {1:.1f}%".format(err, 100. - err))
            
            # Save predictions to numpy file
            save_predictions("ans1-uni.npy", y_pred)
                
                
    def predict(self, X):
        """
        Make predictions for each sample
        """
        
        AL, _ = self.forwardPropagation(X)
        probs = self.softmax(AL)
        y_pred = np.argmax(probs, axis=0)
        
        return y_pred

    
    def shuffle(self, X=None, y=None):
        """
        Shuffles X and y "in unison"
        """
        
        # Default for X and y
        if X is None: X = self.parameters["X"]
        if y is None: y = self.parameters["y"]
        
        p = np.random.permutation(X.shape[1])
        self.parameters["X"] = X[:, p]
        self.parameters["y"] = y[p]
    
    
    def get_batch(self, batch_size):
        """
        Return minibatch of samples and labels
        
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        
        # Prevent overflows
        if self.batch_iter >= self.max_iters:
            self.shuffle() # Randomize training data
            self.batch_iter = 0
        
        start = self.batch_iter * batch_size
        end = self.batch_iter * batch_size + batch_size
        batch_X = self.parameters["X"][:, start:end]
        batch_y = self.parameters["y"][start:end]
        
        self.batch_iter += 1
        
        return batch_X, batch_y

In [12]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given the path of an image, returns the array that scipy.misc.imread
    produces for it.

    NOTE(review): scipy.misc.imread was deprecated in SciPy 1.0 and removed in
    SciPy 1.2, so this helper only works with an older SciPy - confirm the
    environment before relying on it.
    """
    image = scipy.misc.imread(path)
    return image

def get_files(folder):
    """
    Given path to folder, returns the sorted list of paths matching the glob
    pattern folder + '*/*'.
    """
    pattern = folder + '*/*'
    return sorted(glob.glob(pattern))

def get_label(filepath, label2id):
    """
    Returns the numeric id of the label encoded in a file name.

    Files are assumed to be labeled as: /path/to/file/999_frog.png
    The label is the text between the first and second underscore of the file
    name (or up to the end of the name), with the last 4 characters (the
    extension) removed.
    Exits the program via sys.exit if the label is not a key of label2id.
    """
    filename = filepath.rsplit('/', 1)[-1]
    label = filename.split('_')[1][:-4]
    if label not in label2id:
        sys.exit("Invalid label: " + label)
    return label2id[label]

In [13]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns a numpy vector with the label id of every file found by
    get_files(folder), in the order get_files returns them.
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    label_ids = [get_label(path, label2id) for path in get_files(folder)]
    return np.array(label_ids)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding.

    :param y: 1-D numpy array of integer class indices, each in [0, num_classes)
    :param num_classes: number of classes, i.e. length of each one-hot vector
    :returns: array of shape (num_classes, len(y)); column j has a single 1 in row y[j]
    """
    num_samples = y.shape[0]
    y_one_hot = np.zeros((num_samples, num_classes))
    # Pair every sample row with its own label column; indexing with y alone
    # would set whole rows (selected by label value) to 1
    y_one_hot[np.arange(num_samples), y] = 1
    return y_one_hot.T

def get_label_mapping(label_file):
    """
    Returns mappings of index to label (a list) and label to index (a dict).
    The input file has list of labels, each on a separate line; surrounding
    whitespace is stripped and the line number (starting at 0) is the index.
    """
    with open(label_file, 'r') as f:
        id2label = [line.strip() for line in f.readlines()]
    label2id = {label: index for index, label in enumerate(id2label)}
    return id2label, label2id

def get_images(folder):
    """
    Returns a numpy array of all samples in folder.
    Each image is read with get_img_array, flattened and divided by 255.0;
    each column of the result is one such sample. Progress is printed every
    10000 files.
    """
    files = get_files(folder)
    total = len(files)
    columns = []

    for position, path in enumerate(files, start=1):
        if position % 10000 == 0:
            print("Loaded {}/{}".format(position, total))
        columns.append(get_img_array(path).flatten() / 255.0)

    return np.column_stack(columns)

def get_train_data(data_root_path):
    """
    Return X and y for the training set.

    The label mapping is read from data_root_path + 'labels.txt' (and
    printed); the samples and labels come from data_root_path + 'train'.
    """
    labels_path = data_root_path+'labels.txt'
    train_data_path = data_root_path + 'train'

    _, label2id = get_label_mapping(labels_path)
    print(label2id)

    samples = get_images(train_data_path)
    labels = get_labels(train_data_path, label2id)
    return samples, labels

def save_predictions(filename, y):
    """
    Writes the array y to `filename` with np.save (.npy format).
    """
    target, predictions = filename, y
    np.save(target, predictions)

In [7]:
# Load the training samples/labels and the test samples
data_root_path = 'cifar10-hw1/'
test_data_path = data_root_path + 'test'

X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
X_test = get_images(test_data_path)
print('Data loading done')

{'airplane': 0, 'automobile': 1, 'bird': 2, 'cat': 3, 'deer': 4, 'dog': 5, 'frog': 6, 'horse': 7, 'ship': 8, 'truck': 9}
Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


## Part 1

#### Simple fully-connected deep neural network

In [171]:
# Layer sizes, including the input and output layers
input_dim = X_train.shape[0]
layer_dimensions = [input_dim, 500, 50, 100, 10]

NN = NeuralNetwork(layer_dimensions)
NN.train(
    X_train,
    y_train,
    iters=10000,
    alpha=0.0005,
    batch_size=256,
    print_every=100,
    holdout=0.2,
)

Shuffling training data... Done.
iter=0   batch_loss=2.303439   train_err=89.250%   test_err=89.470%   
iter=100   batch_loss=2.282413   train_err=89.350%   test_err=89.850%   
iter=200   batch_loss=2.212386   train_err=83.653%   test_err=85.750%   
iter=300   batch_loss=2.130917   train_err=76.845%   test_err=80.750%   
iter=400   batch_loss=2.081946   train_err=75.767%   test_err=80.290%   
iter=500   batch_loss=1.994891   train_err=71.508%   test_err=76.820%   
iter=600   batch_loss=1.850364   train_err=69.725%   test_err=75.780%   
iter=700   batch_loss=1.837930   train_err=66.850%   test_err=73.860%   
iter=800   batch_loss=1.844302   train_err=66.617%   test_err=73.350%   
iter=900   batch_loss=1.865391   train_err=68.793%   test_err=74.600%   
iter=1000   batch_loss=1.871299   train_err=72.942%   test_err=77.660%   
iter=1100   batch_loss=1.808302   train_err=63.177%   test_err=70.890%   
iter=1200   batch_loss=1.810153   train_err=63.497%   test_err=71.180%   
iter=1300   batch

In [None]:
# Predict the test set with the part 1 network and store the predictions
y_predicted = NN.predict(X_test)
save_predictions(filename='ans1-uni', y=y_predicted)

In [46]:
# test if your numpy file has been saved correctly
# Read the saved predictions back from disk and inspect them
loaded_y = np.load('ans1-uni.npy')
print(loaded_y.shape)
# Last expression of the cell, so the notebook displays its value
loaded_y[:10]

(1, 10000)


array([[6, 4, 5, ..., 5, 8, 6]])

## Part 2: Improving the performance

In [None]:
# Part 2 network: same layer sizes, dropout and regularization both set to 0
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0, reg_lambda=0)
NN2.train(
    X_train,
    y_train,
    iters=1000,
    alpha=0.00001,
    batch_size=1000,
    print_every=10,
)

In [None]:
# Predict the test set with the part 2 network. `X` is not defined at notebook
# level; the test samples live in X_test.
y_predicted2 = NN2.predict(X_test)
# save_predictions takes (filename, y): pass the file name first and save the
# part 2 predictions, not the part 1 ones
save_predictions('ans2-uni', y_predicted2)

Write down results for Part 2 here:
...