# COMS 4995_002 Deep Learning Assignment 1
Due on Thursday, Feb 8, 11:59pm

This assignment can be done in groups of at most 2 students. Everyone must submit on Courseworks individually.

Write down the UNIs of your group (if applicable)

Member 1: Samuel Cohen, slc2206

Member 2: Jason Zhao, jsz2107

In [234]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

In [237]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        np.random.seed(1)
        
        self.parameters = {}
        self.num_layers = len(layer_dimensions)
        self.drop_prob = 0
        self.reg_lambda = 0
        
        #Init parameters (Key:)
        self.parameters["weights"] = {}
        self.parameters["biases"] = {}
        
        #Should be using xavier random init!!!

        #If there are N layers then there are N-1 sets off weights so (self.num_layers-1)
        for i in range(self.num_layers-1):
            #Set weight matrix of layer i to layer i+1
            self.parameters["weights"][i] = 0.01*np.random.randn(layer_dimensions[i+1], layer_dimensions[i])
            #Set bias vector biases from layer i to layer i+1 (bias vector length = num neurons in layer i+!)
            self.parameters["biases"][i] = 0.01*np.random.randn(layer_dimensions[i+1]).reshape(layer_dimensions[i+1],1)

    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        
        """
        The values we return from each call of this function are stored at an layer index in a larger cache
        In backprop we use these values at each layer to calculate the gradients.
        """
        affine_prod = np.dot(W, A) + b

        cache = {
            "input":A,
            "weights":W,
            "biases":b,
            "affine_prod":affine_prod
        }

        #This cache of layer values will be stored in a larger cache of the whole network
        return affine_prod, cache


    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        return self.relu(A)


    def relu(self, X):
        """ The ReLU function to calculate activations."""
        A = np.maximum(0, X)
        return A

            
    def dropout(self, A, prob):
        """
        :param A: 
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        return A, M

    
    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        #Each index of cache corresponds to a sub-cache holding values for one layer 
        cache = []
        last_layer_index = self.num_layers - 2
        
        prev_activations = X

        #Dont do dropout in the last layer as we need all 10 activations for predictions obviously
        
        
        #Propagate using ReLU up until last layer which needs softmax
        for i in range(last_layer_index):
            
            #Push the inputs through the layer to get new outputs
            affineProds, layer_cache = self.affineForward(prev_activations, self.parameters["weights"][i], self.parameters["biases"][i])
            activations = self.activationForward(affineProds)
            
            if self.drop_prob > 0:
                activations, mask = dropout(activations, prob)
                layer_cache["mask"] = mask
            
            #Add the activation output of the layer to the local cache
            layer_cache["output"] = activations
            
            cache.append(layer_cache)
            
            prev_activations = activations
            
            
        #Propagate last layer (L) using softmax
        zL, layer_cache = self.affineForward(prev_activations, self.parameters["weights"][last_layer_index], self.parameters["biases"][last_layer_index])
        AL = self.softmax(zL)
        
        layer_cache["output"] = AL
        
        cache.append(layer_cache)
        
        
        return AL, cache
 
        
    
    
    def softmax(self, zL):
        """
        The softmax is used for multi-class classification and computes a probability 
        in (0, 1) for each class c = 1...C, which sums to 1.
        """
        #The term zL represents the affine product of the last layer (L)
        return np.exp(zL) / np.sum(np.exp(zL), axis = 0)
    
    
    def costFunction(self, AL, y):
        #TODO: Does this function need to be changed?
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """

        #Get the number of training examples
        m = y.shape[0] 
        
        #Create a vector that only holds the computed probabilities corresponding to correct labels
        correct_label_probs = AL[y, range(m)]
        
        #Why do we have to divide by m? Like we do we even need an "average" of loss as opposed to a overall loss?
        cost = - np.sum(np.log(correct_label_probs)) / m 
        
        #TODO
        if self.reg_lambda > 0:
            # add regularization
            pass
       
        #Create a 1-hot encoded matrix where the row corresponds the class and the column corresponds to the example
        num_classes = layer_dimensions[-1]

        #TODO: Is there one_hot function working? Check piazza, and worth using?
        one_hot = np.zeros((num_classes, m))
        one_hot[y, range(m)] = 1
        
        # TODO: Should we be dividing by m?
        dAL = (AL - one_hot)
        return cost, dAL 

    
    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward (i.e single layer values)
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        
        dA = np.dot(cache["weights"].T, dA_prev) 
        
        dW = np.dot(cache["input"], dA_prev.T)
        
        db = np.sum(dA_prev, axis = 1)/dA_prev.shape[1]
        
        return dA, dW, db
        
    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """

        return self.relu_derivative(dA, cache["affine_prod"])
        
        
    def relu_derivative(self, dx, cached_x):
        """
        cached_x: num_neurons X num examples
        
        The ReLU activation function turns the input off (to zero) when the input is negative.
        So when we are backpropagating, we need to know which gradients should be turned off. 
        (This is the reason for two arguments to this function)        
        """
        
        mask = cached_x > 0
        relu_deriv = np.multiply(dx, mask)
        
        return relu_deriv


        
    def dropout_backward(self, dA, cache):

        return dA

    
    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        #Build a dictionary to hold gradients on the weights and biases of each layer
        gradients = {}
        last_layer_index = self.num_layers - 2
        
        #Find the dW and db for the last layer which differs in that its softmax
        dA, dW, db = self.affineBackward(dAL, cache[last_layer_index])
        #print("db.shape2: " + str(db.shape))
        #Store as a tuple at the given index
        gradients[last_layer_index] = (dW, db)
        
        #Iterate backwards propagating through the remaining layers
        for i in range(last_layer_index - 1, -1, -1):
            
            dAdZ = self.activationBackward(dA, cache[i])
            
            dA, dW, db = self.affineBackward(dAdZ, cache[i])
            
            #print("db.shape: " + str(db.shape))
            gradients[i] = (dW, db)
            
            if self.drop_prob > 0:
                #call dropout_backward
                pass      
            
        if self.reg_lambda > 0:
            # add gradients from L2 regularization to each dW
            pass
          
        return gradients
        

            
            
            
  


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        
        for i in range(self.num_layers-1):
            
#             print("self.parameters[weights][i]" + str(self.parameters["weights"][i].shape))
#             print("gradient:" + str(gradients[i][0].shape))
#             print("alpha shape:" +str(alpha))
            
            
            self.parameters["weights"][i] -= alpha * gradients[i][0].T
            
            #print("gradient b :" + str(gradients[i][1].reshape((gradients[i][1].shape[0],1)).shape))
            #print("self.parameters[b] :" + str(self.parameters["biases"][i].shape))
            
#             print("shape this: " + str(gradients[i][1].reshape(1,gradients[i][1].shape[0]).T.shape))
#             print("shape this2: " + str(self.parameters["biases"][i].shape))

            #print("shape this0: " + str(self.parameters["biases"][i].shape))
    
    
            size_grad = gradients[i][1].shape[0]
            grad_reshape = (gradients[i][1].reshape(size_grad,1))
    
            #print("shape this1: " + str(grad_reshape.shape))
            #print("shape this2: " + str((self.parameters["biases"][i] - grad_reshape).shape))
            self.parameters["biases"][i] -= grad_reshape#gradients[i][1].T

            
    def shuffle(self, X, y):
        size = len(X)
        for i in range(10 * size):
            r = np.random.randint(0, size)
            tempX = X[0]
            tempy = y[0]
            X[0] = X[r]
            y[0] = y[r]
            X[r] = tempX
            y[r] = tempy
        
        
    def train(self, X, y, iters=1000, alpha=0.0001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        #X.shape: 3072 X 50000 (i.e each training feature is a row and each example is a column)
        # Each image is 32X32 pixels = 32*32 = 1024 * 3 colors = 3072 and 50000 images total
        
        
        #Can get minibatch one of two ways, we choose to sample w/o replacement. Try with later.
        print("Shuffling...---TURNED OFF!!!! TURN BACK ON!!!!")
        #self.shuffle(X, y)
        print("Done.")
        
        #print("official shape of X: " + str(X.shape))
        for i in range(0, iters):
            # get minibatch
            train_x, train_y = self.get_batch(X, y, batch_size, i)
            
            #print("y.shape[0] must equal X.shape[1] == "+ str(y.shape[0]) + ", "+str(X.shape[1]))
            
            # forward prop
            AL, cache = self.forwardPropagation(train_x)
            
            # compute loss
            loss, dAL = self.costFunction(AL, train_y)

            # compute gradients
            gradients = self.backPropagation(dAL, train_y, cache)

            # update weights and biases based on gradient
            self.updateParameters(gradients, alpha)

            if i % print_every == 0:
                # print batch loss, training error, and test error
                sys.stdout.write("iter={0:d}   batch_loss={1:.6f}   ".format(i, loss))
                
                y_pred = self.predict(X)
                misclassified = np.sum(y_pred != y)
                err = (misclassified / len(y)) * 100.
                sys.stdout.write("train_err={0:.3f}%   ".format(err))
                
                
                
    def predict(self, X):
        """
        Make predictions for each sample
        """
        
        if dropo
        
        #         AL, cache = self.forwardPropagation(X)


        #         return AL.argmax(axis = 0)
        AL, _ = self.forwardPropagation(X)
        probs = self.softmax(AL)
        y_pred = np.argmax(probs, axis=0)
        
        return y_pred

    
    def get_batch(self, X, y, batch_size, current_iter):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        #What happens if batch_size and iterations dont perfectly align with number of samples? The last batch could be very small?? 
        
        X_batch = X[:,current_iter*batch_size : current_iter*batch_size+batch_size]
        y_batch = y[current_iter*batch_size : current_iter*batch_size+batch_size]
        
        return X_batch, y_batch

SyntaxError: invalid syntax (<ipython-input-237-ae6e387d079e>, line 368)

In [235]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [157]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    """
    y_one_hot = np.zeros((y.shape[0], num_classes))
    y_one_hot[y] = 1
    return y_one_hot.T

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0
    
    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [236]:
# Load the data
data_root_path = 'cifar10-hw1/'
X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
X_test = get_images(data_root_path + 'test')
print('Data loading done')

{'airplane': 0, 'automobile': 1, 'bird': 2, 'cat': 3, 'deer': 4, 'dog': 5, 'frog': 6, 'horse': 7, 'ship': 8, 'truck': 9}
Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


## Part 1

#### Simple fully-connected deep neural network

In [232]:
print(type(X_train))
layer_dimensions = [X_train.shape[0], 40, 10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train)

<class 'numpy.ndarray'>
Shuffling...---TURNED OFF!!!! TURN BACK ON!!!!
Done.
iter=0   batch_loss=2.304126   train_err=91.392%   iter=100   batch_loss=2.293697   train_err=90.000%   iter=200   batch_loss=2.300693   train_err=84.600%   iter=300   batch_loss=2.220945   train_err=81.046%   iter=400   batch_loss=2.150720   train_err=80.302%   



iter=500   batch_loss=nan   train_err=90.000%   iter=600   batch_loss=nan   train_err=90.000%   iter=700   batch_loss=nan   train_err=90.000%   iter=800   batch_loss=nan   train_err=90.000%   iter=900   batch_loss=nan   train_err=90.000%   

In [199]:
y_predicted = NN.predict(X_test)
save_predictions('ans1-uni', y_predicted)

In [200]:
# test if your numpy file has been saved correctly
loaded_y = np.load('ans1-uni.npy')
print(loaded_y.shape)
loaded_y[:10]

(10000,)


array([3, 8, 0, 3, 3, 0, 8, 3, 8, 3])

## Part 2: Improving the performance

In [None]:
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0, reg_lambda=0)
NN2.train(X_train, y_train, iters=1000, alpha=0.00001, batch_size=1000, print_every=10)

In [None]:
y_predicted2 = NN2.predict(X)
save_predictions(y_predicted, 'ans2-uni')

Write down results for Part 2 here:
...