In [1]:
import pickle #import all essentials
import gzip
import numpy as np
import os
from PIL import Image
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import EarlyStopping, TensorBoard
from sklearn import svm

Using TensorFlow backend.


In [4]:
filename = 'mnist.pkl.gz'
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling;
# only load this archive if it comes from a trusted source.
# The context manager closes the gzip handle even if unpickling raises.
with gzip.open(filename, 'rb') as f: #unpack zip
    training_data, validation_data, test_data = pickle.load(f, encoding='latin1')

# Since the data loaded is in tuples, we split the data into
# data points (index 0) and target points (index 1)
training_matrix = training_data[0]
training_target = training_data[1]
validation_matrix = validation_data[0]
validation_target = validation_data[1]
test_matrix = test_data[0]
test_target = test_data[1]
print(training_matrix.shape)

(50000, 784)


In [6]:
USPSMat  = []
USPSTar  = []
curPath  = 'USPSdata/USPSdata/Numerals'
savedImg = []

for j in range(0,10): #preparing USPS data; the folder name j is used as the label
    curFolderPath = curPath + '/' + str(j)
    # os.listdir returns entries in arbitrary order; sort so the sample order is reproducible
    imgs = sorted(os.listdir(curFolderPath))
    for imgName in imgs:
        curImg = curFolderPath + '/' + imgName
        if curImg[-3:] == 'png':
            # Open inside a context manager so the file handle is released;
            # resize() returns a new image that stays usable after the file is closed.
            with Image.open(curImg,'r') as rawImg:
                img = rawImg.resize((28, 28)) #resize
            savedImg = img
            # Invert and scale pixel values: 255 maps to 0.0 and 0 maps to 1.0
            # (assumes single-channel 8-bit images -- TODO confirm)
            imgdata = (255-np.array(img.getdata()))/255 #image features
            USPSMat.append(imgdata)
            USPSTar.append(j)
USPSMat = np.asarray(USPSMat)

# Logistic Regression using Softmax

In [7]:
def loss(h, y): #loss function for Logistic
    """Return the mean binary cross-entropy between predictions and targets.

    Args:
        h: array of predicted probabilities.
        y: array of targets, broadcastable against ``h``.

    Returns:
        The mean of ``-y*log(h) - (1-y)*log(1-h)`` over all elements.
    """
    # Clip away exact 0 and 1: otherwise log(0) = -inf and 0 * -inf = NaN,
    # which would turn the whole mean into NaN for a perfectly confident prediction.
    eps = np.finfo(float).eps
    h = np.clip(h, eps, 1 - eps)
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

def softmax(z):
    """Apply softmax along axis 1: exp(a) / sum(exp(a)) for every row.

    The per-row maximum is subtracted before exponentiating, which leaves the
    result unchanged mathematically but keeps np.exp from overflowing.
    """
    as_column = lambda values: values.reshape((-1, 1))
    shifted_exp = np.exp(z - as_column(np.max(z, axis=1)))
    row_totals = as_column(np.sum(shifted_exp, axis=1))
    return shifted_exp / row_totals

def one_hot(a, num_classes):
    """One-hot encode a label array into a (samples x num_classes) matrix.

    Each label selects the matching row of an identity matrix. The result is
    passed through np.squeeze, so length-1 axes are dropped (a single label
    yields a 1-D vector of length num_classes).
    """
    class_rows = np.eye(num_classes)
    flat_labels = a.reshape(-1)
    return np.squeeze(class_rows[flat_labels])

def logistic_regression(X, y, iterations=4000, learning_rate=0.04, batch_size=100, num_classes=10):
    """Train softmax (multinomial logistic) regression with mini-batch gradient descent.

    Args:
        X: 2-D feature matrix, one sample per row.
        y: integer class labels, one per row of ``X``.
        iterations: number of full passes over ``X``.
        learning_rate: step size applied to each mini-batch gradient.
        batch_size: number of samples per mini-batch.
        num_classes: number of output classes (columns of the weight matrix).

    Returns:
        Weight matrix of shape (X.shape[1], num_classes).
    """
    target = one_hot(y, num_classes) #convert target to one hot
    # Iterate over the rows actually present. The sample count used to be
    # hard-coded to 50000, which produced empty batches (division by zero and
    # NaN weights) for smaller inputs and ignored extra rows of larger ones.
    n_samples = X.shape[0]
    weights = np.zeros([X.shape[1], num_classes]) #weights start at zero
    for _ in range(iterations):
        for start in range(0, n_samples, batch_size):
            batch_x = X[start:start + batch_size] #training batch-wise
            batch_t = target[start:start + batch_size]
            h = softmax(np.dot(batch_x, weights)) #applying softmax
            # Gradient of the cross-entropy, averaged over the samples in this batch
            gradient_des = np.dot(batch_x.T, (h - batch_t)) / batch_x.shape[0]
            weights -= learning_rate * gradient_des #update weights
    return weights

def eval_model(test_matrix, test_target, weights): #calculate accuracy
    """Score a trained weight matrix on a labelled set.

    Prints the confusion matrix (actual vs. predicted) and returns a tuple
    ``(accuracy_in_percent, predicted_labels)``.
    """
    class_scores = softmax(np.dot(test_matrix, weights)) #samples x classes
    y_cap = np.argmax(class_scores, axis=1) #most probable class per sample
    ctr = sum(1 for idx in range(y_cap.shape[0]) if y_cap[idx] == test_target[idx])
    acc = (ctr/test_matrix.shape[0]) * 100 #calculate accuracy
    print(pd.crosstab(pd.Series(test_target, name='actual'),
                      pd.Series(y_cap, name='predicted'))) #confusion matrix
    return acc, y_cap
    

# Fit the softmax-regression weight matrix on the MNIST training split.
weights = logistic_regression(training_matrix, training_target)

In [8]:
acc, LR_predicted = eval_model(test_matrix, test_target, weights)
print("Accuracy - Logistic Regression on MNIST test data = "+str(acc))
print("\n")
# eval_model returns (accuracy, predictions). Keep only the accuracy here;
# otherwise the message would print the whole tuple, prediction array included.
val_acc = eval_model(validation_matrix, validation_target, weights)[0]
print("Accuracy - Logistic Regression on MNIST validation data = "+str(val_acc))
print("\n")
usps_acc = eval_model(np.asarray(USPSMat), np.asarray(USPSTar), weights)[0]
print("Accuracy - Logistic Regression on USPS data = "+str(usps_acc))

predicted    0     1    2    3    4    5    6    7    8    9
actual                                                      
0          953     0    1    4    1    5    8    3    5    0
1            0  1115    3    3    0    1    3    2    8    0
2            5    12  914   20    7    8   11   10   43    2
3            4     2   14  928    3   20    3   10   19    7
4            2     1    7    4  918    0    7    5    9   29
5            9     4    4   43    9  759   17    8   33    6
6            9     3    6    4    5   17  910    1    3    0
7            2     7   19   12    5    1    0  945    5   32
8            7    13    7   24    7   26   10   13  856   11
9            8     7    1   11   25    8    0   25   11  913


Accuracy - Logistic Regression on MNIST test data = 92.11


predicted    0     1    2    3    4    5    6     7    8    9
actual                                                       
0          957     0    5    2    2    7    7     4    6    1
1            0  1042

# Neural Network 



In [9]:
def get_model(first_layer_nodes, second_layer_nodes, third_layer_nodes, input_size, drop_out):
    """Build and compile a fully connected classifier with two hidden layers.

    Layout: Dense -> relu -> Dropout -> Dense -> relu -> Dropout -> Dense -> softmax.
    The model summary is printed before compiling with the adam optimizer,
    categorical cross-entropy loss and the accuracy metric.
    """
    layer_stack = [
        Dense(first_layer_nodes, input_dim=input_size), #first layer nodes
        Activation('relu'),
        Dropout(drop_out),
        Dense(second_layer_nodes),
        Activation('relu'),
        Dropout(drop_out),
        Dense(third_layer_nodes),
        Activation('softmax'), #softmax output since there are multiple classes to predict
    ]
    model = Sequential()
    for layer in layer_stack:
        model.add(layer)
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [32]:
def one_hot(a, num_classes):
    """One-hot encode labels by picking rows of an identity matrix, then squeeze."""
    # NOTE(review): this repeats the one_hot already defined in the
    # logistic-regression cell; the later definition shadows the earlier one.
    encoded = np.eye(num_classes)[a.reshape(-1)]
    return np.squeeze(encoded)

def _nn(data, target, test_data, test_target, test_data1, test_target1, test_data2, test_target2):
    """Train the dense network and report its accuracy on three evaluation sets.

    Args:
        data, target: training samples and one-hot targets passed to ``model.fit``.
        test_data, test_target: reported as "MNIST test Data".
        test_data1, test_target1: reported as "USPS Data".
        test_data2, test_target2: reported as "MNIST validation Data".

    Returns:
        List with the predicted class index (argmax of the model output) for
        every row of ``test_data``.
    """
    # Network shape
    drop_out = 0.2
    first_layer_nodes = 512
    second_layer_nodes = 512
    third_layer_nodes = 10
    input_size = data.shape[1]
    model = get_model(first_layer_nodes, second_layer_nodes, third_layer_nodes, input_size, drop_out)

    # Training settings
    validation_data_split = 0.2
    num_epochs = 50 #50 num of epochs
    model_batch_size = 128
    tb_batch_size = 32
    # NOTE(review): patience (500) exceeds the epoch count (50), so early
    # stopping cannot trigger with these settings.
    early_patience = 500

    callbacks = [
        TensorBoard(log_dir='logs', batch_size=tb_batch_size, write_graph=True),
        EarlyStopping(monitor='val_loss', verbose=1, patience=early_patience, mode='min'),
    ]

    # history.history holds the per-epoch metrics if training curves need plotting
    history = model.fit(data, #training
                        target,
                        validation_split=validation_data_split,
                        epochs=num_epochs,
                        batch_size=model_batch_size,
                        callbacks=callbacks)

    yy = model.predict(np.array(test_data)) #one row of class scores per sample
    NN_predicted = [np.argmax(row) for row in yy]

    # evaluate() yields (loss, accuracy); only the accuracy is reported
    eval_sets = ((test_data, test_target), (test_data1, test_target1), (test_data2, test_target2))
    scores = []
    for samples, labels in eval_sets:
        _loss_value, accuracy = model.evaluate(samples, labels, batch_size=128)
        scores.append(accuracy)
    score1, score2, score3 = scores
    print("Accuracy for MNIST test Data = "+str(score1*100))
    print("Accuracy for MNIST validation Data = "+str(score3*100))
    print("Accuracy for USPS Data = "+str(score2*100))

    return NN_predicted

In [33]:
# Train the network; set 1 is USPS and set 2 is the MNIST validation split,
# matching the labels _nn prints for them.
NN_predicted = _nn(data=training_matrix,
                   target=one_hot(training_target, 10),
                   test_data=test_matrix,
                   test_target=one_hot(test_target, 10),
                   test_data1=np.asarray(USPSMat),
                   test_target1=one_hot(np.asarray(USPSTar), 10),
                   test_data2=validation_matrix,
                   test_target2=one_hot(validation_target, 10))
# Confusion matrix of the MNIST test labels against the network's predictions
predicted = pd.Series(NN_predicted, name='predicted')
actual = pd.Series(test_target, name='actual')
print(pd.crosstab(actual, predicted)) #confusion matrix

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_28 (Dense)             (None, 512)               401920    
_________________________________________________________________
activation_28 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_19 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_29 (Dense)             (None, 512)               262656    
_________________________________________________________________
activation_29 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_20 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 10)                5130      
__________

Epoch 50/50
Accuracy for MNIST test Data = 98.22
Accuracy for MNIST validation Data = 98.18
Accuracy for USPS Data = 48.27741387024648
predicted    0     1     2    3    4    5    6    7    8    9
actual                                                       
0          970     0     2    1    0    1    2    1    1    2
1            0  1120     3    3    0    1    1    3    4    0
2            0     0  1022    3    1    0    1    2    3    0
3            0     0     2  995    0    2    0    3    4    4
4            1     0     4    0  961    0    3    2    2    9
5            1     0     0   12    0  871    2    0    2    4
6            3     2     1    1    3    4  942    0    2    0
7            0     0    15    2    0    0    0  995    4   12
8            1     0     6    4    2    1    0    2  955    3
9            0     2     0    2    7    4    0    3    0  991


# Support Vector Machine

In [18]:
#Linear SVM
def linear_classification_default_gamma(data, target, test_matrix1, test_target1,
                                        test_matrix2, test_target2,
                                        test_matrix3, test_target3):
    """Fit a linear-kernel SVC and print accuracy and a confusion matrix for three sets.

    Args:
        data, target: training samples and labels passed to ``SVC.fit``.
        test_matrix1, test_target1: reported as "MNIST test data".
        test_matrix2, test_target2: reported as "MNIST validation data".
        test_matrix3, test_target3: reported as "USPS data".

    Returns:
        None; all results are printed.
    """
    def _percent_correct(predicted, expected):
        # Every set is scored over its own length. Set 2 used to be counted in
        # the loop bounded by len(test_target1), which is only correct when
        # sets 1 and 2 have the same number of samples.
        hits = int(np.count_nonzero(predicted == np.asarray(expected)))
        return (hits/len(expected))*100

    # gamma is a coefficient for the rbf/poly/sigmoid kernels (see scikit-learn
    # docs), so it should have no effect on the linear kernel; kept as in the original.
    classification = svm.SVC(kernel='linear', gamma='auto')
    classification.fit(data, target)
    predicted_val1 = classification.predict(test_matrix1) #predict op labels
    predicted_val2 = classification.predict(test_matrix2)
    predicted_val3 = classification.predict(test_matrix3)

    print("Accuracy for MNIST test data = "+str(_percent_correct(predicted_val1, test_target1)))
    actual1 = pd.Series(test_target1, name='tt1')
    pred1 = pd.Series(predicted_val1, name='pd1')
    print("Confusion Matrix for test data")
    print(pd.crosstab(actual1, pred1)) #confusion matrix
    print("Accuracy for MNIST validation data = "+str(_percent_correct(predicted_val2, test_target2)))
    actual2 = pd.Series(test_target2, name='tt2')
    pred2 = pd.Series(predicted_val2, name='pd2')
    print("Confusion Matrix for validation data")
    print(pd.crosstab(actual2, pred2))
    print("Accuracy for USPS data = "+str(_percent_correct(predicted_val3, test_target3)))
    actual3 = pd.Series(test_target3, name='tt3')
    pred3 = pd.Series(predicted_val3, name='pd3')
    print("Confusion Matrix for USPS data")
    print(pd.crosstab(actual3, pred3))
    
# Linear SVM: set 1 = MNIST test, set 2 = MNIST validation, set 3 = USPS
linear_classification_default_gamma(data=training_matrix, target=training_target,
                                    test_matrix1=test_matrix, test_target1=test_target,
                                    test_matrix2=validation_matrix, test_target2=validation_target,
                                    test_matrix3=np.asarray(USPSMat), test_target3=np.asarray(USPSTar))

Accuracy for MNIST test data = 93.89999999999999
Accuracy for MNIST validation data = 94.23
Accuracy for USPS data = 29.12645632281614


In [34]:
#RBF SVM with gamma default
def rbf_classification_default_gamma(data, target, test_matrix1, test_target1,
                                        test_matrix2, test_target2,
                                        test_matrix3, test_target3):
    """Fit an RBF-kernel SVC with gamma='auto' and print accuracy and a confusion matrix for three sets.

    Args:
        data, target: training samples and labels passed to ``SVC.fit``.
        test_matrix1, test_target1: reported as "MNIST test data".
        test_matrix2, test_target2: reported as "MNIST validation data".
        test_matrix3, test_target3: reported as "USPS data".

    Returns:
        The predicted labels for ``test_matrix1``.
    """
    def _percent_correct(predicted, expected):
        # Every set is scored over its own length. Set 2 used to be counted in
        # the loop bounded by len(test_target1), which is only correct when
        # sets 1 and 2 have the same number of samples.
        hits = int(np.count_nonzero(predicted == np.asarray(expected)))
        return (hits/len(expected))*100

    # gamma='auto' (not 1); scikit-learn documents 'auto' as 1 / n_features
    classification = svm.SVC(kernel='rbf', gamma='auto')
    classification.fit(data, target)
    predicted_val1 = classification.predict(test_matrix1)
    predicted_val2 = classification.predict(test_matrix2)
    predicted_val3 = classification.predict(test_matrix3)

    print("Accuracy for MNIST test data on svm rbf gamma(default) = "+str(_percent_correct(predicted_val1, test_target1)))
    actual1 = pd.Series(test_target1, name='tt1')
    pred1 = pd.Series(predicted_val1, name='pd1')
    print("Confusion Matrix for test data")
    print(pd.crosstab(actual1, pred1))
    print("Accuracy for MNIST validation data on svm rbf gamma(default) = "+str(_percent_correct(predicted_val2, test_target2)))
    actual2 = pd.Series(test_target2, name='tt2')
    pred2 = pd.Series(predicted_val2, name='pd2')
    print("Confusion Matrix for validation data")
    print(pd.crosstab(actual2, pred2))
    print("Accuracy for USPS data on svm rbf gamma(default) = "+str(_percent_correct(predicted_val3, test_target3)))
    actual3 = pd.Series(test_target3, name='tt3')
    pred3 = pd.Series(predicted_val3, name='pd3')
    print("Confusion Matrix for USPS data")
    print(pd.crosstab(actual3, pred3))
    return predicted_val1
    
# RBF SVM (gamma='auto'); the MNIST test predictions are kept for the later majority vote
SVM_predicted = rbf_classification_default_gamma(
    data=training_matrix, target=training_target,
    test_matrix1=test_matrix, test_target1=test_target,
    test_matrix2=validation_matrix, test_target2=validation_target,
    test_matrix3=np.asarray(USPSMat), test_target3=np.asarray(USPSTar))

Accuracy for MNIST test data on svm rbf gamma(default) = 94.35
Confusion Matrix for test data
pd1    0     1    2    3    4    5    6    7    8    9
tt1                                                   
0    967     0    1    0    0    5    4    1    2    0
1      0  1120    2    3    0    1    3    1    5    0
2      9     1  962    7   10    1   13   11   16    2
3      1     1   14  950    1   17    1   10   11    4
4      1     1    7    0  937    0    7    2    2   25
5      7     4    5   33    7  808   11    2   10    5
6     10     3    4    1    5   10  924    0    1    0
7      2    13   22    5    7    1    0  954    4   20
8      4     6    6   14    8   24   10    8  891    3
9     10     6    0   12   33    5    1   14    6  922
Accuracy for MNIST validation data on svm rbf gamma(default) = 94.48
Confusion Matrix for validation data
pd2    0     1    2    3    4    5    6     7    8    9
tt2                                                    
0    972     0    3    2    

In [5]:
#RBF SVM with gamma 1
# NOTE(review): despite the name, this variant uses gamma=1, and it redefines
# (shadows) the gamma='auto' function of the same name defined in another cell.
def rbf_classification_default_gamma(data, target, test_matrix1, test_target1,
                                        test_matrix2, test_target2,
                                        test_matrix3, test_target3):
    """Fit an RBF-kernel SVC with gamma=1 and print accuracy and a confusion matrix for three sets.

    Args:
        data, target: training samples and labels passed to ``SVC.fit``.
        test_matrix1, test_target1: reported as "MNIST test data".
        test_matrix2, test_target2: reported as "MNIST validation data".
        test_matrix3, test_target3: reported as "USPS data".

    Returns:
        None; all results are printed.
    """
    def _percent_correct(predicted, expected):
        # Every set is scored over its own length. Set 2 used to be counted in
        # the loop bounded by len(test_target1), which is only correct when
        # sets 1 and 2 have the same number of samples.
        hits = int(np.count_nonzero(predicted == np.asarray(expected)))
        return (hits/len(expected))*100

    # gamma set to 1 with rbf kernel; the author reports this takes a whole
    # night to run and gives poor accuracy
    classification = svm.SVC(kernel='rbf', gamma=1)
    classification.fit(data, target)
    predicted_val1 = classification.predict(test_matrix1)
    predicted_val2 = classification.predict(test_matrix2)
    predicted_val3 = classification.predict(test_matrix3)

    print("Accuracy for MNIST test data on svm rbf gamma(1) = "+str(_percent_correct(predicted_val1, test_target1)))
    actual1 = pd.Series(test_target1, name='tt1')
    pred1 = pd.Series(predicted_val1, name='pd1')
    print("Confusion Matrix for test data")
    print(pd.crosstab(actual1, pred1))
    print("Accuracy for MNIST validation data on svm rbf gamma(1) = "+str(_percent_correct(predicted_val2, test_target2)))
    actual2 = pd.Series(test_target2, name='tt2')
    pred2 = pd.Series(predicted_val2, name='pd2')
    print("Confusion Matrix for validation data")
    print(pd.crosstab(actual2, pred2))
    print("Accuracy for USPS data on svm rbf gamma(1) = "+str(_percent_correct(predicted_val3, test_target3)))
    actual3 = pd.Series(test_target3, name='tt3')
    pred3 = pd.Series(predicted_val3, name='pd3')
    print("Confusion Matrix for USPS data")
    print(pd.crosstab(actual3, pred3))
    
# RBF SVM evaluated on MNIST test (set 1), MNIST validation (set 2) and USPS (set 3)
rbf_classification_default_gamma(data=training_matrix, target=training_target,
                                 test_matrix1=test_matrix, test_target1=test_target,
                                 test_matrix2=validation_matrix, test_target2=validation_target,
                                 test_matrix3=np.asarray(USPSMat), test_target3=np.asarray(USPSTar))

Accuracy for MNIST test data on svm rbf gamma(1) = 17.59
Accuracy for MNIST validation data on svm rbf gamma(1) = 18.240000000000002
Accuracy for USPS data on svm rbf gamma(1) = 10.000500025001251


# Random Forest

In [35]:
from sklearn.ensemble import RandomForestClassifier


def rf_classification(data, target, test_matrix1, test_target1,
                      test_matrix2, test_target2,
                      test_matrix3, test_target3):
    """Fit a random forest and print accuracy and a confusion matrix for three sets.

    Args:
        data, target: training samples and labels passed to ``fit``.
        test_matrix1, test_target1: reported as "MNIST test data".
        test_matrix2, test_target2: reported as "MNIST validation data".
        test_matrix3, test_target3: reported as "USPS data".

    Returns:
        The predicted labels for ``test_matrix1``.
    """
    def _percent_correct(predicted, expected):
        # Every set is scored over its own length. Set 2 used to be counted in
        # the loop bounded by len(test_target1), which is only correct when
        # sets 1 and 2 have the same number of samples.
        hits = int(np.count_nonzero(predicted == np.asarray(expected)))
        return (hits/len(expected))*100

    #1000 decision trees with each tree of depth upto 60; fixed seed for repeatable runs
    classification = RandomForestClassifier(n_estimators=1000, max_depth=60, random_state=0)
    classification.fit(data, target)
    predicted_val1 = classification.predict(test_matrix1)
    predicted_val2 = classification.predict(test_matrix2)
    predicted_val3 = classification.predict(test_matrix3)

    print("Accuracy for MNIST test data on random forest = "+str(_percent_correct(predicted_val1, test_target1)))
    actual1 = pd.Series(test_target1, name='tt1')
    pred1 = pd.Series(predicted_val1, name='pd1')
    print("Confusion Matrix for test data")
    print(pd.crosstab(actual1, pred1))
    print("Accuracy for MNIST validation data on random forest = "+str(_percent_correct(predicted_val2, test_target2)))
    actual2 = pd.Series(test_target2, name='tt2')
    pred2 = pd.Series(predicted_val2, name='pd2')
    print("Confusion Matrix for validation data")
    print(pd.crosstab(actual2, pred2))
    # Message fixed: it previously read "svm random forest" (copy-paste from the SVM cells)
    print("Accuracy for USPS data on random forest = "+str(_percent_correct(predicted_val3, test_target3)))
    actual3 = pd.Series(test_target3, name='tt3')
    pred3 = pd.Series(predicted_val3, name='pd3')
    print("Confusion Matrix for USPS data")
    print(pd.crosstab(actual3, pred3))
    return predicted_val1
    
# Random forest; the MNIST test predictions are kept for the later majority vote
RF_predicted = rf_classification(
    data=training_matrix, target=training_target,
    test_matrix1=test_matrix, test_target1=test_target,
    test_matrix2=validation_matrix, test_target2=validation_target,
    test_matrix3=np.asarray(USPSMat), test_target3=np.asarray(USPSTar))

Accuracy for MNIST test data on random forest = 97.06
Confusion Matrix for test data
pd1    0     1     2    3    4    5    6    7    8    9
tt1                                                    
0    969     0     0    0    0    2    3    1    4    1
1      0  1123     3    3    0    2    2    0    1    1
2      6     0  1000    5    3    0    4    8    6    0
3      0     0     8  976    0    7    0    9    8    2
4      1     0     1    0  956    0    6    0    2   16
5      3     0     0   11    3  860    6    1    5    3
6      5     3     0    0    2    4  941    0    3    0
7      1     3    18    1    1    0    0  992    2   10
8      3     0     6    9    5    5    3    4  929   10
9      7     5     1   11   12    2    1    4    6  960
Accuracy for MNIST validation data on random forest = 97.5
Confusion Matrix for validation data
pd2    0     1    2     3    4    5    6     7    8    9
tt2                                                     
0    980     0    3     0    0   

In [37]:
# Combine the four classifiers' MNIST test predictions by majority vote.
majority_voted = []
for idx in range(len(LR_predicted)):
    # predicted labels from all the classifiers for this sample
    ballot = [LR_predicted[idx], NN_predicted[idx], RF_predicted[idx], SVM_predicted[idx]]
    # max() keeps the first label with the highest count, so ties are resolved
    # in ballot order (LR, NN, RF, SVM)
    winner = max(ballot, key=ballot.count)
    majority_voted.append(winner)
# Count how many voted labels agree with the true test labels
ctr = sum(1 for idx in range(len(majority_voted)) if majority_voted[idx] == test_target[idx])
print("Result of majority voting = " + str((ctr/len(majority_voted))*100) + "%")

Result of majority voting = 95.89999999999999%
