# COMS 4995_002 Deep Learning Assignment 1
Due on Monday, Oct 9, 11:59pm

This assignment can be done in groups of at most 3 students. Everyone must submit on Courseworks individually.

Write down the UNIs of your group (if applicable)

Member 1: Junyi Wang, jw3564

Member 2: Kun Chen, kc3143

Member 3: Runzhou Cao, rc3121

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

In [3]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        np.random.seed(1)
        
        self.parameters = {}
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        
        # init parameters
        for i in range(1, self.num_layers):
            d = layer_dimensions[i]
            self.parameters["W"+str(i)] = np.random.normal(0, 1./d**0.5, size = (d, layer_dimensions[i-1]))
            self.parameters["b"+str(i)] = np.zeros(d).reshape(d,1)

    # W has the shape (n(L), n(L-1)), A has the shape(n(L-1), S), b has the shape (n(L),1), A here is A(l-1)
    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        Z = np.matmul(W, A) + b
        cache = (Z, A, W, b)
        # WA+b
        return Z, cache

    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        # g*(WA+b)
        return self.relu(A)


    def relu(self, X):
        A = np.maximum(0,X)
        return A
            
    def dropout(self, A, prob):
        """
        :param A: 
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        # The mask should have the same size as A
        M = np.random.rand(A.shape[0], A.shape[1])
        # change the entry into int type, with p probability of being 0 and 1-p probability
        # of being /(1-p)
        M = 1* (M>=prob)
        A = A*M / (1-prob)
        
        return A, M

    def forwardPropagation(self, X, dropout= True):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        cache = []
        AL = X
        for index in range(1, self.num_layers):
            AL, temp_cache = self.affineForward(AL, self.parameters["W"+str(index)], self.parameters["b"+str(index)])
            if index != self.num_layers-1: # do not apply relu at the last layer
                AL = self.activationForward(AL)
                if self.drop_prob > 0 and dropout: # do not apply drop out to the last layer
                    AL, M = self.dropout(AL,self.drop_prob)
                    a,b,c,d = temp_cache
                    # add M in to cache
                    temp_cache = (a,b,c,d,M)
            cache.append(temp_cache)
        return AL, cache
    
    
    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """

        dAL = AL
        dAL = np.exp(dAL) # change every element i into e^i
        denominator = np.sum(dAL, axis = 0) # get an row array containing the sum e^i of the col
        dAL = np.divide(dAL, denominator) # matrix after softmax
        # change y into one-hot format
        y_one_hot = np.zeros((len(AL),len(AL[0])))
        
        # y has format [0, 2, 4, 5 ...], change corresponding cell into 1
        y_one_hot[y, range(len(y))] = 1
        
        # cost = sum -log p
        cost = 0.0
        # only look at the right class, which is denoted by dAL[y, range(len(y))]
        cost = np.sum(-np.log(dAL[y,range(len(y))]))/len(y)
        
        # Add regularization
        if self.reg_lambda > 0:
            for i in range(1, self.num_layers - 1):
                W = self.parameters['W' + str(i)]
                # 1/2m  lambda * sum W^2
                cost += (self.reg_lambda * np.sum(np.multiply(W, W))) / (2 * len(y))

        dAL = dAL - y_one_hot
        
        
        return cost, dAL

    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        
        cache = (Z,A,W,B)
        dA = dAl-1
        dA_prve = dAl
        """
        if (self.drop_prob == 0):
            Z, A, W, B = cache
        else:
            Z, A, W, B, M = cache
            dA_prev = self.dropout_backward(dA_prev, cache)
        
        dZ = self.activationBackward(dA_prev, cache)
        dA = np.matmul(W.transpose(),dZ)
        dW = (1.0 / len(Z[0]))*(np.matmul(dZ, A.transpose()))
        dB = (1.0 / len(Z[0]))*(np.sum(dZ, axis=1).reshape(len(dZ),1))
        return dA, dW, dB

    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        if (self.drop_prob == 0):
            Z, A, W, B = cache
        else:
            Z, _, _, _, _ = cache
        dZ = self.relu_derivative(dA, Z)
        return dZ

        
    def relu_derivative(self, dx, cached_x):
        """
        Inputs:
        dx: upstream derivative dAl
        cached_x: input of relu

        Returns:
        dx: gradient with respect to cached_x
        """
        gradient = cached_x
        # g'(l)(Z(l))
        gradient = np.where(gradient > 0., 1., 0.)
        # dA(l) * g'(l)(Z(l))
        dx = dx*gradient
        return dx

    def dropout_backward(self, dA, cache):
        _, _, _,_, M = cache
        dA = np.multiply(M, dA) / (1 - self.drop_prob)
        return dA

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        gradients = {}
        
           
        dAL = dAL
        m = dAL.shape[1]
        # last layer cache only have 4 input.
        _,A,_,_ = cache[self.num_layers-2]
        W = self.parameters['W' + str(self.num_layers - 1)]
        dZ = dAL
        dW = 1 / m * np.dot(dZ, A.T)
        db = 1 / m * np.sum(dZ, axis = 1, keepdims = True)
        dAL = np.dot(W.T, dZ)
        
                
        gradients['W' + str(self.num_layers - 1)] = dW
        gradients['b' + str(self.num_layers - 1)] = db
        
        for index in reversed(range(1, self.num_layers-1)):
            dAL, dW, dB = self.affineBackward(dAL, cache[index-1])
            gradients['W'+ str(index)] = dW
            gradients['b'+ str(index)] = dB
        
        if self.reg_lambda > 0:
            # add gradients from L2 regularization to each dW
            for index in range(1, self.num_layers-1):
                dW = gradients['W'+str(index)]
                W = self.parameters['W'+str(index)]
                dW = dW + self.reg_lambda * W / m
                gradients['W'+str(index)] = dW
        return gradients


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        if self.reg_lambda == 0:
            for index in range(1, self.num_layers):
                self.parameters['W'+str(index)] -= alpha*gradients['W'+str(index)]
                self.parameters['b'+str(index)] -= alpha*gradients['b'+str(index)]
        else:
            m = self.parameters['W1'].shape[1]
            for index in range(1, self.num_layers):
                self.parameters['W'+str(index)] = self.parameters['W'+str(index)] * (1.0 - alpha * self.reg_lambda/m) - alpha*gradients['W'+str(index)]
                self.parameters['b'+str(index)] -= alpha*gradients['b'+str(index)]
        
    def train(self, X, y, X_valid, Y_valid, iters=1000, alpha=0.0001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        
        for i in range(0, iters):
            # get minibatch
            X_batch, y_batch = self.get_batch(X, y, batch_size)
            # forward prop
            AL, cache = self.forwardPropagation(X_batch)
            # compute loss
            cost, dAL = self.costFunction(AL, y_batch)
            # compute gradients
            gradients = self.backPropagation(dAL, y_batch, cache)
            # update weights and biases based on gradient
            self.updateParameters(gradients, alpha)
            if i % print_every == 0:
                predict_train_y, _ = self.predict(X_batch)
                train_acc = np.sum((predict_train_y == y_batch) * 1.0) / len(y_batch)
                # print cost, train and validation set accuracies
                print('Iteration:%d. Training cost is %f:'%(i, cost))
                print('Iteration:%d. Training accuracy is %f:' %(i, train_acc))
                AL, _ = self.forwardPropagation(X_valid, dropout = False)
                valid_cost, _ = self.costFunction(AL, Y_valid)
                predict_valid_y, _ = self.predict(X_valid)
                valid_acc = np.sum((predict_valid_y == Y_valid) * 1.0) / len(Y_valid)
                print('Iteration:%d. Validation cost is %f:' %(i, valid_cost))
                print('Iteration:%d. Validation accuracy is %f:' %(i, valid_acc))

                
    def predict(self, X):
        """
        Make predictions for each sample
        """
        AL,cache = self.forwardPropagation(X, dropout = False) #10*S
        y_pred = AL.argmax(axis=0).tolist()
#         print y_pred
        return (y_pred, AL)

    def get_batch(self, X, y, batch_size):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        m = X.shape[1]
        index = np.random.randint(0, m - batch_size)
        X_batch = X[:, index:(index + batch_size)]
        y_batch = y[index:(index + batch_size)]

        return X_batch, y_batch

In [4]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [5]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    """
    y_one_hot = np.zeros((y.shape[0], num_classes))
    y_one_hot[y] = 1
    return y_one_hot.T

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0
    
    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [6]:
# Load the data
data_root_path = '/Users/mrdoggie/Desktop/Columbia/deepLearning/HW1/cifar10-hw1/'
X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
X_test = get_images(data_root_path + 'test')
print('Data loading done')

# arrange 10% of the training data to validation set
indexs = np.random.choice(50000, 5000, replace=False)
X_valid = X_train[:,indexs]
y_valid = y_train[indexs]
X_train = np.delete(X_train,indexs, axis = 1)
y_train = np.delete(y_train,indexs)

{'horse': 7, 'automobile': 1, 'deer': 4, 'dog': 5, 'frog': 6, 'cat': 3, 'truck': 9, 'ship': 8, 'airplane': 0, 'bird': 2}
Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


## Part 1

#### Simple fully-connected deep neural network

In [7]:
layer_dimensions = [X_train.shape[0], 256, 128, 64, 10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train,X_valid, y_valid, iters=30000, alpha=0.01, batch_size=100, print_every=1000)

Iteration:0. Training cost is 5.530208:
Iteration:0. Training accuracy is 0.070000:
Iteration:0. Validation cost is 6.779158:
Iteration:0. Validation accuracy is 0.093600:
Iteration:1000. Training cost is 1.833254:
Iteration:1000. Training accuracy is 0.400000:
Iteration:1000. Validation cost is 1.774309:
Iteration:1000. Validation accuracy is 0.359000:
Iteration:2000. Training cost is 1.768243:
Iteration:2000. Training accuracy is 0.440000:
Iteration:2000. Validation cost is 1.725846:
Iteration:2000. Validation accuracy is 0.388200:
Iteration:3000. Training cost is 1.766118:
Iteration:3000. Training accuracy is 0.420000:
Iteration:3000. Validation cost is 1.632933:
Iteration:3000. Validation accuracy is 0.420000:
Iteration:4000. Training cost is 1.448304:
Iteration:4000. Training accuracy is 0.490000:
Iteration:4000. Validation cost is 1.591975:
Iteration:4000. Validation accuracy is 0.443600:
Iteration:5000. Training cost is 1.617890:
Iteration:5000. Training accuracy is 0.470000:
It

In [13]:
NN.train(X_train, y_train,X_valid, y_valid, iters=10000, alpha=0.01, batch_size=100, print_every=1000)

Iteration:0. Training cost is 0.922966:
Iteration:0. Training accuracy is 0.780000:
Iteration:0. Validation cost is 1.502646:
Iteration:0. Validation accuracy is 0.507000:
Iteration:1000. Training cost is 1.030427:
Iteration:1000. Training accuracy is 0.720000:
Iteration:1000. Validation cost is 1.521713:
Iteration:1000. Validation accuracy is 0.500200:
Iteration:2000. Training cost is 0.900993:
Iteration:2000. Training accuracy is 0.810000:
Iteration:2000. Validation cost is 1.487785:
Iteration:2000. Validation accuracy is 0.509800:
Iteration:3000. Training cost is 0.948351:
Iteration:3000. Training accuracy is 0.790000:
Iteration:3000. Validation cost is 1.511309:
Iteration:3000. Validation accuracy is 0.510800:
Iteration:4000. Training cost is 0.803181:
Iteration:4000. Training accuracy is 0.810000:
Iteration:4000. Validation cost is 1.501946:
Iteration:4000. Validation accuracy is 0.517000:
Iteration:5000. Training cost is 0.730280:
Iteration:5000. Training accuracy is 0.850000:
It

In [14]:
# Want unnormalized scores from the network
y_predict, AL = NN.predict(X_test)
save_predictions('ans1-jw3564', AL)

In [15]:
# test if your numpy file has been saved correctly
loaded_y = np.load('ans1-jw3564.npy')
print(loaded_y.shape)
loaded_y[:10]

(10, 10000)


array([[  1.42988089,   4.90860308,   8.14384574, ...,  -3.49355858,
          4.49731719,   1.40285993],
       [  2.1735711 ,   8.10115565,  -1.2299805 , ...,  -5.41499105,
         -0.01416519,  -0.18027466],
       [  2.22665423,   1.6381088 ,   4.8475263 , ...,   3.27655408,
          3.74848132,   0.91275085],
       ..., 
       [ -4.8857242 ,   0.2070495 ,  -3.03800013, ...,   1.81193821,
          1.93937358,   4.55075594],
       [  2.47199964,  10.66528011,   4.65666773, ...,  -0.49885509,
          3.20934833,   1.29358813],
       [ -1.29797076,   9.2243459 ,  -3.75897839, ...,  -2.12434489,
         -5.08368989,  -0.43055831]])

## Part 2: Regularizing the neural network
#### Add dropout and L2 regularization

In [10]:
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0.2, reg_lambda=0.05)
NN2.train(X_train, y_train, X_valid, y_valid, iters=10000, alpha=0.1, batch_size=100, print_every=1000)
NN2.train(X_train, y_train, X_valid, y_valid, iters=30000, alpha=0.01, batch_size=100, print_every=1000)

Iteration:0. Training cost is 8.488541:
Iteration:0. Training accuracy is 0.160000:
Iteration:0. Validation cost is 8.980648:
Iteration:0. Validation accuracy is 0.096600:
Iteration:1000. Training cost is 2.806730:
Iteration:1000. Training accuracy is 0.410000:
Iteration:1000. Validation cost is 1.876535:
Iteration:1000. Validation accuracy is 0.332800:
Iteration:2000. Training cost is 2.789044:
Iteration:2000. Training accuracy is 0.430000:
Iteration:2000. Validation cost is 1.797303:
Iteration:2000. Validation accuracy is 0.377800:
Iteration:3000. Training cost is 2.413862:
Iteration:3000. Training accuracy is 0.500000:
Iteration:3000. Validation cost is 1.691910:
Iteration:3000. Validation accuracy is 0.409600:
Iteration:4000. Training cost is 2.344418:
Iteration:4000. Training accuracy is 0.470000:
Iteration:4000. Validation cost is 1.675622:
Iteration:4000. Validation accuracy is 0.408200:
Iteration:5000. Training cost is 2.169320:
Iteration:5000. Training accuracy is 0.460000:
It

In [16]:
y_predicted2, AL = NN2.predict(X_test)
save_predictions('ans2-jw3564', AL)

In [18]:
# test if your numpy file has been saved correctly
loaded_y2 = np.load('ans2-jw3564.npy')
print(loaded_y2.shape)
loaded_y2[:10]

(10, 10000)


array([[ 0.36457303,  1.44720562,  1.84649152, ..., -1.56399365,
        -0.44056646, -0.21420727],
       [ 0.44799358,  2.54524213, -1.68243275, ..., -2.06944759,
        -1.06286683, -0.88680986],
       [-0.7423423 , -1.16140796,  0.55103346, ...,  0.97944066,
         1.53817815,  0.91809987],
       ..., 
       [-1.57023787, -1.58068866, -1.17283398, ..., -0.1797116 ,
         0.23140701,  2.99212937],
       [ 2.09452944,  3.97342995,  1.89760557, ..., -1.86845484,
        -1.86227075, -1.14515179],
       [-1.66760822,  2.41973634, -2.13659698, ..., -0.98973243,
        -0.96386804, -0.45671913]])