In [1]:
import numpy as np
from scipy.optimize import minimize
from scipy.io import loadmat
from math import sqrt
import pickle

In [2]:
def initializeWeights(n_in, n_out):
    """
    # initializeWeights return the random weights for Neural Network given the
    # number of node in the input layer and output layer

    # Input:
    # n_in: number of nodes of the input layer
    # n_out: number of nodes of the output layer
       
    # Output: 
    # W: matrix of random initial weights with size (n_out x (n_in + 1))"""

    epsilon = sqrt(6) / sqrt(n_in + n_out + 1)
    W = (np.random.rand(n_out, n_in + 1) * 2 * epsilon) - epsilon
    return W

In [3]:
def sigmoid(z):
    """# Notice that z can be a scalar, a vector or a matrix
    # return the sigmoid of input z"""
    sigmoidresult = 1.0 / (1.0 + np.exp(-z))

    return sigmoidresult

In [4]:
def nnObjFunction(params, *args):
    """% nnObjFunction computes the value of objective function (negative log
    %   likelihood error function with regularization) given the parameters
    %   of Neural Networks, thetraining data, their corresponding training
    %   labels and lambda - regularization hyper-parameter.
    % Input:
    % params: vector of weights of 2 matrices w1 (weights of connections from
    %     input layer to hidden layer) and w2 (weights of connections from
    %     hidden layer to output layer) where all of the weights are contained
    %     in a single vector.
    % n_input: number of node in input layer (not include the bias node)
    % n_hidden: number of node in hidden layer (not include the bias node)
    % n_class: number of node in output layer (number of classes in
    %     classification problem
    % training_data: matrix of training data. Each row of this matrix
    %     represents the feature vector of a particular image
    % training_label: the vector of truth label of training images. Each entry
    %     in the vector represents the truth label of its corresponding image.
    % lambda: regularization hyper-parameter. This value is used for fixing the
    %     overfitting problem.
    % Output:
    % obj_val: a scalar value representing value of error function
    % obj_grad: a SINGLE vector of gradient value of error function
    % NOTE: how to compute obj_grad
    % Use backpropagation algorithm to compute the gradient of error function
    % for each weights in weight matrices.
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % reshape 'params' vector into 2 matrices of weight w1 and w2
    % w1: matrix of weights of connections from input layer to hidden layers.
    %     w1(i, j) represents the weight of connection from unit j in input
    %     layer to unit i in hidden layer.
    % w2: matrix of weights of connections from hidden layer to output layers.
    %     w2(i, j) represents the weight of connection from unit j in hidden
    %     layer to unit i in output layer."""

    n_input, n_hidden, n_class, training_data, training_label, lambdaval = args

    # Venkat: Set the kth label as 1 . set 0th label 1 for label 0 etc.
    label = np.array(training_label);
    rows = label.shape[0];
    rowsIndex =np.array([i for i in range(rows)])
    training_label = np.zeros((rows,10))
    training_label[rowsIndex,label.astype(int)]=1

    w1 = params[0:n_hidden * (n_input + 1)].reshape( (n_hidden, (n_input + 1)))
    w2 = params[(n_hidden * (n_input + 1)):].reshape((n_class, (n_hidden + 1)))

    # Venkat : Add bias and feed forward
    BiasTerm = np.ones(training_data.shape[0])
    training_data = np.column_stack((training_data,BiasTerm))
    num_samples = training_data.shape[0]
    
    # Venkat: find the output using sigmoid
    HiddenOut = sigmoid(np.dot(training_data,w1.T))
    
    # Venkat :Add new bias term
    NewBias = np.ones(HiddenOut.shape[0])
    HiddenOutput = np.column_stack((HiddenOut, NewBias))

    # Find the final output using sigmoid
    FinalOutput = sigmoid(np.dot(HiddenOutput,w2.T))
    
    # Find the error and then use the formula to find the Gradient and value.
    Delta = FinalOutput - training_label
    
    # Using the formula  shared in handout. 
    Gradient_w2 = np.dot(Delta.T,HiddenOutput)
    Gradient_w1 = np.dot(((1-HiddenOutput)*HiddenOutput* (np.dot(Delta,w2))).T,training_data)
    
    # remove zero rows hidden
    # np.delete(Gradient_w1 )
    Gradient_w1 = np.delete(Gradient_w1, n_hidden,0)
    
    # Implementing the formula from the document
    # Find the sum of elements on the axis
    logFinal = np.log(FinalOutput)
    logOneFinal  = np.log(1-FinalOutput)
    o_part_1 = (np.sum(-1*(training_label*logFinal+(1-training_label)*logOneFinal)))
    o_part_1 = o_part_1/num_samples
    sw1 = np.sum(np.square(w1)) #
    sw2 = np.sum(np.square(w2))
    o_part_2 = (lambdaval/(2*num_samples))* (sw1 +  sw2)
    obj_val = o_part_1 + o_part_2

    # concatenate
    # regularization will not impact for lambdaval 0, for others it will
    Gradient_w1 = Gradient_w1 + lambdaval * w1
    Gradient_w2 = Gradient_w2 + lambdaval * w2

    # obj_grad 
    obj_grad = np.array([])  
    obj_grad = np.concatenate((Gradient_w1.flatten(), Gradient_w2.flatten()),0)
    obj_grad = obj_grad/num_samples
    
    return (obj_val,obj_grad)


In [5]:
def preprocess():
    """ Input:
     Although this function doesn't have any input, you are required to load
     the MNIST data set from file 'mnist_all.mat'.
     Output:
     train_data: matrix of training set. Each row of train_data contains
       feature vector of a image
     train_label: vector of label corresponding to each image in the training
       set
     validation_data: matrix of training set. Each row of validation_data
       contains feature vector of a image
     validation_label: vector of label corresponding to each image in the
       training set
     test_data: matrix of training set. Each row of test_data contains
       feature vector of a image
     test_label: vector of label corresponding to each image in the testing
       set
     Some suggestions for preprocessing step:
     - feature selection"""

    mat = loadmat('mnist_all.mat')  # loads the MAT object as a Dictionary

    # Pick a reasonable size for validation data

    # ------------Initialize preprocess arrays----------------------#
    train_preprocess = np.zeros(shape=(50000, 784))
    train_data = np.zeros(shape=(50000, 784))
    validation_preprocess = np.zeros(shape=(10000, 784))
    test_preprocess = np.zeros(shape=(10000, 784))
    train_label_preprocess = np.zeros(shape=(50000,))
    validation_label_preprocess = np.zeros(shape=(10000,))
    test_label_preprocess = np.zeros(shape=(10000,))
    # ------------Initialize flag variables----------------------#
    train_len = 0
    validation_len = 0
    test_len = 0
    train_label_len = 0
    validation_label_len = 0
    # ------------Start to split the data set into 6 arrays-----------#
    for key in mat:
        # -----------when the set is training set--------------------#
        if "train" in key:
            label = key[-1]  # record the corresponding label
            tup = mat.get(key)
            sap = range(tup.shape[0])
            tup_perm = np.random.permutation(sap)
            tup_len = len(tup)  # get the length of current training set
            tag_len = tup_len - 1000  # defines the number of examples which will be added into the training set

            # ---------------------adding data to training set-------------------------#
            train_preprocess[train_len:train_len + tag_len] = tup[tup_perm[1000:], :]
            train_len += tag_len

            train_label_preprocess[train_label_len:train_label_len + tag_len] = label
            train_label_len += tag_len

            # ---------------------adding data to validation set-------------------------#
            validation_preprocess[validation_len:validation_len + 1000] = tup[tup_perm[0:1000], :]
            validation_len += 1000

            validation_label_preprocess[validation_label_len:validation_label_len + 1000] = label
            validation_label_len += 1000

            # ---------------------adding data to test set-------------------------#
        elif "test" in key:
            label = key[-1]
            tup = mat.get(key)
            sap = range(tup.shape[0])
            tup_perm = np.random.permutation(sap)
            tup_len = len(tup)
            test_label_preprocess[test_len:test_len + tup_len] = label
            test_preprocess[test_len:test_len + tup_len] = tup[tup_perm]
            test_len += tup_len
            # ---------------------Shuffle,double and normalize-------------------------#
    train_size = range(train_preprocess.shape[0])
    train_perm = np.random.permutation(train_size)
    train_data = train_preprocess[train_perm]
    train_data = np.double(train_data)
    train_data = train_data / 255.0
    train_label = train_label_preprocess[train_perm]
    
    validation_size = range(validation_preprocess.shape[0])
    vali_perm = np.random.permutation(validation_size)
    validation_data = validation_preprocess[vali_perm]
    validation_data = np.double(validation_data)
    validation_data = validation_data / 255.0
    validation_label = validation_label_preprocess[vali_perm]

    test_size = range(test_preprocess.shape[0])
    test_perm = np.random.permutation(test_size)
    test_data = test_preprocess[test_perm]
    test_data = np.double(test_data)
    test_data = test_data / 255.0
    test_label = test_label_preprocess[test_perm]

    features_to_delete = []
    selected_feature = []
    Number_of_Features = np.shape(validation_data)[1]

    # we have to ignore features that are not of importance to us.  
    # we use numpy.ptp
    # https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.ptp.html
    
    for i in range(Number_of_Features):
        # If feature is of no importance in training data
        if np.ptp(train_data[:,i]) == 0:
            # if feature is of no importance in validation data
            if np.ptp(validation_data[:,i]) == 0:
                # same check on test data
                if np.ptp(test_data[:,i]) == 0:
                    features_to_delete.append(i)

    for i in range(Number_of_Features):
        if (i not in features_to_delete):
            selected_feature.append(i);
            
#     print (selected_feature)
    
    train_data = np.delete(train_data, features_to_delete, axis=1)
    validation_data = np.delete(validation_data, features_to_delete, axis=1)
    test_data = np.delete(test_data, features_to_delete, axis=1)

    return train_data, train_label, validation_data, validation_label, test_data, test_label

In [6]:
def nnPredict(w1, w2, data):
    """% nnPredict predicts the label of data given the parameter w1, w2 of Neural
    % Network.
    % Input:
    % w1: matrix of weights of connections from input layer to hidden layers.
    %     w1(i, j) represents the weight of connection from unit i in input
    %     layer to unit j in hidden layer.
    % w2: matrix of weights of connections from hidden layer to output layers.
    %     w2(i, j) represents the weight of connection from unit i in input
    %     layer to unit j in hidden layer.
    % data: matrix of data. Each row of this matrix represents the feature
    %       vector of a particular image
    % Output:
    % label: a column vector of predicted labels"""

    """% nnPredict predicts the label of data given the parameter w1, w2 of Neural
    % Network.
    % Input:
    % w1: matrix of weights of connections from input layer to hidden layers.
    %     w1(i, j) represents the weight of connection from unit i in input
    %     layer to unit j in hidden layer.
    % w2: matrix of weights of connections from hidden layer to output layers.
    %     w2(i, j) represents the weight of connection from unit i in input
    %     layer to unit j in hidden layer.
    % data: matrix of data. Each row of this matrix represents the feature
    %       vector of a particular image
       
    % Output: 
    % label: a column vector of predicted labels""" 
    # Number of Items  
    Num_of_Items=data.shape[0]    

    # Add a bias term
    Bias = np.zeros([len(data), 1])
    DataWithBias = np.append(data, Bias ,1)
    
    hidden_input = np.dot(DataWithBias ,w1.T)
    hidden_output = sigmoid(hidden_input)
    print (hidden_output.shape)
    
    # Second layer - Adding Bias Term   
    Bias = np.zeros([len(hidden_output), 1])
    FinalDataWithBias = np.append(hidden_output, Bias, 1)
    final_input = np.dot(FinalDataWithBias, w2.T)
    final_output = sigmoid(final_input)

    #Initialize an dummy output array
    ListAns = [-1]*Num_of_Items
    for i in range(Num_of_Items):
        ListAns[i] = np.argmax(final_output[i]);
    labels = np.array(ListAns)
    
    return labels

In [7]:
"""**************Neural Network Script Starts here********************************"""

train_data, train_label, validation_data, validation_label, test_data, test_label = preprocess()

#  Train Neural Network

# set the number of nodes in input unit (not including bias unit)
n_input = train_data.shape[1]
print (n_input)

# set the number of nodes in hidden unit (not including bias unit)
n_hidden = 4

# set the number of nodes in output unit
n_class = 10

# initialize the weights into some random matrices
initial_w1 = initializeWeights(n_input, n_hidden)
initial_w2 = initializeWeights(n_hidden, n_class)

# unroll 2 weight matrices into single column vector
initialWeights = np.concatenate((initial_w1.flatten(), initial_w2.flatten()), 0)

# set the regularization hyper-parameter
lambdaval = 0

args = (n_input, n_hidden, n_class, train_data, train_label, lambdaval)

# Train Neural Network using fmin_cg or minimize from scipy,optimize module. Check documentation for a working example

opts = {'maxiter': 50}  # Preferred value.
nn_params = minimize(nnObjFunction, initialWeights, jac=True, args=args, method='CG', options=opts)
# print (nn_params)
print (len(nn_params.x))
# In Case you want to use fmin_cg, you may have to split the nnObjectFunction to two functions nnObjFunctionVal
# and nnObjGradient. Check documentation for this function before you proceed.
# nn_params, cost = fmin_cg(nnObjFunctionVal, initialWeights, nnObjGradient,args = args, maxiter = 50)

# Reshape nnParams from 1D vector into w1 and w2 matrices
w1 = nn_params.x[0:n_hidden * (n_input + 1)].reshape((n_hidden, (n_input + 1)))
w2 = nn_params.x[(n_hidden * (n_input + 1)):].reshape((n_class, (n_hidden + 1)))

# Test the computed parameters
print("Predicting on training data")
predicted_label = nnPredict(w1, w2, train_data)
print("Done training data")
# find the accuracy on Training Dataset

print('\n Training set Accuracy:' + str(100 * np.mean((predicted_label == train_label).astype(float))) + '%')

print("Predicting on validation data")
predicted_label = nnPredict(w1, w2, validation_data)
print("Done validating data")
# find the accuracy on Validation Dataset

print('\n Validation set Accuracy:' + str(100 * np.mean((predicted_label == validation_label).astype(float))) + '%')

print("Predicting on testing data")
predicted_label = nnPredict(w1, w2, test_data)
print("Done on testing data")
# find the accuracy on Validation Dataset

print('\n Test set Accuracy:' + str(100 * np.mean((predicted_label == test_label).astype(float))) + '%')


719
2930
Predicting on training data
(50000, 4)
Done training data

 Training set Accuracy:52.70399999999999%
Predicting on validation data
(10000, 4)
Done validating data

 Validation set Accuracy:53.239999999999995%
Predicting on testing data
(10000, 4)
Done on testing data

 Test set Accuracy:53.42%
