## Load MNIST on Python 3.x

In [1]:
import pickle
import gzip

from sklearn.metrics import confusion_matrix

In [2]:
filename = 'mnist.pkl.gz'
f = gzip.open(filename, 'rb')
training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
f.close()

In [5]:
training_data

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 array([5, 0, 4, ..., 8, 4, 8], dtype=int64))

## Load USPS on Python 3.x

In [None]:
from PIL import Image
import os
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.sparse

In [None]:
USPSMat  = []
USPSTar  = []
curPath  = 'USPSdata/Numerals'
savedImg = []

for j in range(0,10):
    curFolderPath = curPath + '/' + str(j)
    imgs =  os.listdir(curFolderPath)
    for img in imgs:
        curImg = curFolderPath + '/' + img
        if curImg[-3:] == 'png':
            img = Image.open(curImg,'r')
            img = img.resize((28, 28))
            savedImg = img
            imgdata = (255-np.array(img.getdata()))/255
            USPSMat.append(imgdata)
            USPSTar.append(j)

### Loading data

In [None]:
train_feat = training_data[0]
train_tar = training_data[1]
val_feat = validation_data[0]
val_tar = validation_data[1]
test_feat = test_data[0]
test_tar = test_data[1]

In [None]:
print(len(training_data[0][1,:]))#each image is of 28*28 --> so 784 features for each image(i.e., for each sample). 1 here represents the first image
print(len(training_data[1])) #training_data[1] is the ACTUAL target value for each image and this is also 50,000

# LOGISTIC REGRESSION

In [None]:
# Performing one-hot encoding on the target values (i.e., 10 classes)
def onehot(tar):
    n = tar.shape[0]
    oh = scipy.sparse.csr_matrix((np.ones(n), (tar, np.array(range(n)))))
    oh = np.array(oh.todense()).T
    return oh

def softmax(x):
    x -= np.max(x)
    softmax = (np.exp(x).T / np.sum(np.exp(x), axis = 1)).T
    return softmax

#finding probabilities and predictions given a set of input data
def probsandpreds(features):
    probabilities = softmax(np.dot(features, w)) #a vector of 10 probabilities corresponding to each class
    predictions = np.argmax(probabilities, axis = 1) #maximum probability as the class
    return probabilities, predictions

def Loss(w, feat, tar, la):
    n = feat.shape[0]
    tar_oh = onehot(tar)
    Y = np.dot(feat, w) #predicting target using linear regression 
    probs = softmax(Y)
    loss = (-1/n)  * np.sum(tar_oh * np.log(probs)) + ((la/2) * np.sum(w*w))
    gradient = (-1/n) * np.dot(feat.T, (tar_oh - probs)) + la*w
    return loss, gradient

#Accuracy
def GetAccuracy(x, y):
    probs, preds = probsandpreds(x)
    accuracy = sum(preds == y)/(float(len(y)))
    return accuracy

In [None]:
w = np.zeros([train_feat.shape[1], len(np.unique(train_tar))]) #w will be a matrix from each feature to all the output classes, so 784x10
la = 10
learningRate = 0.01
losses = []
x1 = 0
xn = 256

for i in range(x1, xn):
    loss, gradient = Loss(w, train_feat, train_tar, la)
    losses.append(loss)
    w = w - (learningRate * gradient)

In [None]:
def LR(train_feat, train_tar):
    for i in range(x1, xn):
        loss, gradient = Loss(w, train_feat, train_tar, la)
        losses.append(loss)
        w = w - (learningRate * gradient)

In [223]:
print ('---------- Logistic Regression using Stochastic Gradient Descent --------------------')
print("Lambda = " + str(la/np.subtract(xn, x1))) # lambda is La/no. of samples
print("eta = " + str(learningRate))
print("Validation Accuracy = " + str(GetAccuracy(val_feat, val_tar)*100))
print("Testing Accuracy = " + str(GetAccuracy(test_feat, test_tar)*100))
print("USPS Accuracy = " + str(GetAccuracy(USPSMat, USPSTar)*100))

---------- Logistic Regression using Stochastic Gradient Descent --------------------
Lambda = 0.0390625
eta = 0.01
Validation Accuracy = 73.99
Testing Accuracy = 0.7235
USPS Accuracy = 0.24251212560628033


In [217]:
print('########################### CONFUSION MATRICES FOR LOGISTIC REGRESSION ###########################')
probs_val, preds_val = probsandpreds(val_feat)
print("\nConfusion Matrix of Validation Data: \n\n" + str(confusion_matrix(val_tar, preds_val)))
probs_t, preds_t = probsandpreds(test_feat)
print("\nConfusion Matrix of Testing Data: \n\n" + str(confusion_matrix(test_tar, preds_t)))
probs_usps, preds_usps = probsandpreds(USPSMat)
print("\n Confusion Matrix of USPS Data: \n\n" + str(confusion_matrix(USPSTar, preds_usps)))


Confusion Matrix of Validation Data: 

[[957   0   2   5   0   0   7   2  18   0]
 [  1 954   7  16   0   0   4   1  81   0]
 [ 75  10 773  38   3   0  31   9  45   6]
 [ 49   4  17 882   0   0   3   2  65   8]
 [ 59   8   3   4 578   0  41   1  78 211]
 [400   7  23 242   5   0  28   5 178  27]
 [ 96   9  12   3   0   0 830   0  17   0]
 [ 73  21  15  10   2   0   1 877  55  36]
 [ 50  13  10  97   0   0   7   4 822   6]
 [ 74   7   8  27   7   0   0  36  76 726]]

Confusion Matrix of Testing Data: 

[[963   0   2   4   0   0   4   0   7   0]
 [  0 994   6  18   0   0   5   0 112   0]
 [ 96   5 749  70   4   0  34   6  65   3]
 [ 54   1  19 865   0   0   7   7  51   6]
 [ 54   6   9   7 534   0  58   2  93 219]
 [366   9  14 247   1   0  30   9 195  21]
 [134   4  11   4   2   0 781   0  22   0]
 [ 47  26  33   7   3   0   1 814  55  42]
 [ 65   2   6 106   1   0  16   8 761   9]
 [ 70   6  11  25  14   0   3  39  67 774]]

 Confusion Matrix of USPS Data: 

[[1392    2  259   54  107

# SUPPORT VECTOR MACHINE (SVM)

In [291]:
from sklearn import svm

C = 0.1
gamma = 0.1
clf = svm.SVC(kernel='linear', C=C, gamma = gamma, random_state = 123)
clf.fit(train_feat, train_tar)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
  max_iter=-1, probability=False, random_state=123, shrinking=True,
  tol=0.001, verbose=False)

In [292]:
from sklearn.metrics import accuracy_score

# Getting Validation dataset accuracy
val_pred_svm = clf.predict(val_feat)
acc_val_svm = accuracy_score(val_tar, val_pred_svm)

# # Getting Testing dataset accuracy
# test_pred_svm = clf.predict(test_feat)
# acc_test_svm = accuracy_score(test_tar, test_pred_svm)

# # Getting USPS dataset Accuracy
# usps_pred_svm = clf.predict(USPSMat)
# usps_acc_svm = accuracy_score(USPSTar, usps_pred_svm)

print ('---------- Support Vector Machine (SVM) --------------------')
print("Regularization Parameter/Penalty(C) = " + str(C))
print("gamma = " + str(gamma))
print("SVM Validation Accuracy: ", acc_val_svm*100)
# print("SVM Test Accuracy: ", acc_test_svm*100)
#print("SVM USPS Accuracy: ", usps_acc_svm*100)

---------- Support Vector Machine (SVM) --------------------
Regularization Parameter/Penalty(C) = 0.1
gamma = 0.1
SVM Validation Accuracy:  94.81


In [259]:
print('########################### CONFUSION MATRICES FOR SVM ###########################')
print("\nConfusion Matrix of Validation Data: \n\n" + str(confusion_matrix(val_tar, val_pred_svm)))
print("\nConfusion Matrix of Testing Data: \n\n" + str(confusion_matrix(test_tar, test_pred_svm)))
print("\n Confusion Matrix of USPS Data: \n\n" + str(confusion_matrix(USPSTar, usps_pred_svm)))


Confusion Matrix of Validation Data: 

[[ 968    0    6    0    2    4    8    0    1    2]
 [   0 1045    3    5    1    0    0    3    7    0]
 [   6   10  924   12    4    3    8    9   11    3]
 [   5    6   17  939    1   37    0    2   19    4]
 [   3    7   12    0  921    1    4    5    5   25]
 [  13    4    9   41    3  810   17    2   13    3]
 [   7    2   10    1   12   10  924    0    1    0]
 [   2    4   14   15    8    3    0 1025    1   18]
 [   6   16   14   29    2   26    4    6  897    9]
 [   4    6    5   11   31    5    1   37    8  853]]

Confusion Matrix of Testing Data: 

[[ 955    0    6    1    0    7    8    1    1    1]
 [   0 1116    5    4    0    1    2    1    6    0]
 [   6   13  957   14    4    3   10    8   15    2]
 [   5    2   18  943    3   13    2    6   14    4]
 [   2    0    9    1  940    0    5    3    2   20]
 [  17    6    5   39    6  782   10    2   20    5]
 [  10    5   14    0    6   16  904    1    2    0]
 [   1    6   20   14

# RANDOM FOREST

In [220]:
from sklearn.ensemble import RandomForestClassifier

estimators = 100
clf_rf = RandomForestClassifier(n_estimators = estimators, criterion = 'entropy') # using ENTROPY to measure the split
clf_rf.fit(train_feat, train_tar) #training the model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [224]:
from sklearn.metrics import accuracy_score

# Getting Validation dataset accuracy
val_pred_rf = clf_rf.predict(val_feat)
acc_val_rf = accuracy_score(val_tar, val_pred_rf)

# Getting Testing dataset accuracy
test_pred_rf = clf_rf.predict(test_feat)
acc_test_rf = accuracy_score(test_tar, test_pred_rf)

# Getting USPS dataset Accuracy
usps_pred_rf = clf_rf.predict(USPSMat)
usps_acc = accuracy_score(USPSTar, usps_pred_rf)

print ('------------- Random Forest --------------------')
print("Number of Trees in the forest = " + str(estimators))
print("Random Forrest Validation Accuracy: ", acc_val_rf*100)
print("Random Forrest Test Accuracy: ", acc_test_rf*100)
print("Random Forrest USPS Accuracy: ", usps_acc*100)

------------- Random Forest --------------------
Number of Trees in the forest = 100
Random Forrest Validation Accuracy:  97.11999999999999
Random Forrest Test Accuracy:  96.81
Random Forrest USPS Accuracy:  38.46192309615481


In [225]:
print('########################### CONFUSION MATRICES FOR RANDOM FOREST CLASSIFIER ###########################')
print("\nConfusion Matrix of Validation Data: \n\n" + str(confusion_matrix(val_tar, val_pred_rf)))
print("\nConfusion Matrix of Testing Data: \n\n" + str(confusion_matrix(test_tar, test_pred_rf)))
print("\n Confusion Matrix of USPS Data: \n\n" + str(confusion_matrix(USPSTar, usps_pred_rf)))


Confusion Matrix of Validation Data: 

[[ 978    0    4    0    0    0    2    0    5    2]
 [   0 1052    5    1    1    1    1    0    2    1]
 [   2    0  965    1    2    1    4    8    4    3]
 [   3    0    4  999    0    8    1    4    7    4]
 [   0    5    1    1  948    1    2    1    4   20]
 [   5    2    3   15    2  869   10    1    5    3]
 [   1    0    0    0    2    2  959    0    3    0]
 [   0    5   10    1    4    0    0 1057    0   13]
 [   1    3    6    6    2    7    5    2  970    7]
 [   5    3    2   11    6    4    0    9    6  915]]

Confusion Matrix of Testing Data: 

[[ 970    0    0    0    0    1    4    1    4    0]
 [   0 1120    3    4    0    2    3    1    2    0]
 [   5    0  999    5    2    0    4    9    7    1]
 [   1    0   13  970    0    8    0    9    7    2]
 [   1    0    2    0  952    0    4    1    2   20]
 [   2    0    0   16    3  856    6    1    5    3]
 [   7    3    1    0    3    4  938    0    2    0]
 [   2    4   19    2

# NEURAL NETWORK

In [324]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD, Adam, RMSprop
import keras.backend as K

# Converting Target Variables
train_tar_cat = keras.utils.to_categorical(train_tar, 10)
val_tar_cat = keras.utils.to_categorical(val_tar, 10)
test_tar_cat = keras.utils.to_categorical(test_tar, 10)
usps_tar_cat = keras.utils.to_categorical(USPSTar, 10)

# Converting a list of USPS arrays into single array
usps_feat = np.vstack(USPSMat)

K.clear_session()
model = Sequential()
model.add(Dense(units = 32, input_dim = 784, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(units = 32, input_dim=32, activation='relu')) #USE 32 FOR IDEAL
model.add(Dropout(0.2))
model.add(Dense(10, activation = 'softmax'))

model.compile('adam', 'categorical_crossentropy', metrics = ['accuracy'])

h = model.fit(train_feat, train_tar_cat,
             batch_size = 128,
             epochs = 20,
             verbose = 0)

In [325]:
val_pred_nn = model.predict_classes(val_feat)
test_pred_nn = model.predict_classes(test_feat)
usps_pred_nn = model.predict_classes(usps_feat)

loss_val, accuracy_val = model.evaluate(val_feat, val_tar_cat, verbose = False)
print("\nValidation CrossEntropy: ", loss_val)
print("\nValidation Accuracy: ", accuracy_val*100)

loss_test, accuracy_test = model.evaluate(test_feat, test_tar_cat, verbose = False)
print("\nTest CrossEntropy: ", loss_test)
print("\nTest Accuracy: ", (round(accuracy_test*100, 4)))

loss_usps, accuracy_usps = model.evaluate(usps_feat, usps_tar_cat, verbose = False)
print("\nUSPS CrossEntropy: ", loss_usps)
print("\nUSPS Accuracy: ", round(accuracy_usps*100, 4))


Validation CrossEntropy:  0.12830573420338331

Validation Accuracy:  96.5

Test CrossEntropy:  0.13939059649528934

Test Accuracy:  95.88

USPS CrossEntropy:  4.807576025382919

USPS Accuracy:  36.4018


In [255]:
print('########################### CONFUSION MATRICES FOR NEURAL NETWORKS ###########################')
print("\nConfusion Matrix of Validation Data: \n\n" + str(confusion_matrix(val_tar, val_pred_nn)))
print("\nConfusion Matrix of Testing Data: \n\n" + str(confusion_matrix(test_tar, test_pred_nn)))
print("\n Confusion Matrix of USPS Data: \n\n" + str(confusion_matrix(USPSTar, usps_pred_nn)))


Confusion Matrix of Validation Data: 

[[ 971    0    2    2    0    1    5    2    4    4]
 [   0 1048    1    6    1    2    1    0    4    1]
 [   4    4  939   11    3    1    2   14   12    0]
 [   1    4    6  983    1   19    0    4    9    3]
 [   1    5    4    0  932    0    5    1    4   31]
 [   7    1    3   17    2  863   12    1    5    4]
 [   3    1    3    0    3    6  950    0    1    0]
 [   6    8    4    5    2    0    0 1060    0    5]
 [   3    9    2   11    1   15    1    3  951   13]
 [   7    2    0    9   15    2    0   10    4  912]]

Confusion Matrix of Testing Data: 

[[ 964    0    1    1    0    2    7    3    1    1]
 [   0 1120    3    3    0    0    3    1    5    0]
 [   8    3  964   14    6    1    5   14   17    0]
 [   0    2    9  976    0   11    0    7    5    0]
 [   2    1    3    0  921    0    7    4    4   40]
 [   6    1    0   25    1  829   10    2   13    5]
 [   8    5    2    1    6   13  919    0    4    0]
 [   2   11   14    6


# ENSEMBLE CLASSIFIER

In [265]:
from statistics import mode

val_final_pred = np.array([])
test_final_pred = np.array([])
usps_final_pred = np.array([])

for i in range(0, len(val_feat)):
    try:
        val_final_pred = np.append(val_final_pred, mode([preds_val[i], val_pred_svm[i], val_pred_rf[i], val_pred_nn[i]]))
    except:
        val_final_pred = np.append(val_final_pred, max([preds_t[i], val_pred_svm[i], val_pred_rf[i], val_pred_nn[i]]))

for i in range(0, len(test_feat)):
    try:
        test_final_pred = np.append(test_final_pred, mode([preds_t[i], test_pred_svm[i], test_pred_rf[i], test_pred_nn[i]]))
    except:
        test_final_pred = np.append(test_final_pred, max([preds_t[i], test_pred_svm[i], test_pred_rf[i], test_pred_nn[i]]))
        
for i in range(0, len(USPSTar)):
    try:
        usps_final_pred = np.append(usps_final_pred, mode([preds_usps[i], usps_pred_svm[i], usps_pred_rf[i], usps_pred_nn[i]]))
    except:
        usps_final_pred = np.append(usps_final_pred, max([preds_usps[i], usps_pred_svm[i], usps_pred_rf[i], usps_pred_nn[i]]))
        
print ('------------- Ensemble Classifier --------------------')
print("Ensemble Classifier Validation Accuracy: ", accuracy_score(val_tar, val_final_pred)*100)
print("Ensemble Classifier Test Accuracy: ", accuracy_score(test_tar, test_final_pred)*100)
print("Ensemble Classifier USPS Accuracy: ", accuracy_score(USPSTar, usps_final_pred)*100)

------------- Ensemble Classifier --------------------
Ensemble Classifier Validation Accuracy:  95.87
Ensemble Classifier Test Accuracy:  95.71
Ensemble Classifier USPS Accuracy:  37.21186059302965


In [268]:
print('########################### CONFUSION MATRICES FOR ENSEMBLE CLASSIFIER ###########################')
print("\nConfusion Matrix of Validation Data: \n\n" + str(confusion_matrix(val_tar, val_final_pred)))
print("\nConfusion Matrix of Testing Data: \n\n" + str(confusion_matrix(test_tar, test_final_pred)))
print("\n Confusion Matrix of USPS Data: \n\n" + str(confusion_matrix(USPSTar, usps_final_pred)))

########################### CONFUSION MATRICES FOR ENSEMBLE CLASSIFIER ###########################

Confusion Matrix of Validation Data: 

[[ 975    0    3    0    0    1    1    0    6    5]
 [   0 1046    2    4    1    1    2    0    7    1]
 [   6    1  933    7    4    1    4   13   16    5]
 [   2    1    4  978    1   13    0    5   17    9]
 [   2    6    1    0  924    0    4    1    6   39]
 [   8    2    3   16    1  846   17    3   11    8]
 [   3    1    0    0    2    2  954    0    2    3]
 [   3    6    6    1    1    0    0 1055    4   14]
 [   5    4    5    8    0   12    2    3  957   13]
 [   6    2    1   12    8    2    0    8    3  919]]

Confusion Matrix of Testing Data: 

[[ 967    0    1    0    0    1    6    1    4    0]
 [   0 1114    3    5    0    0    3    1    9    0]
 [   7    0  969   14    3    0   11    8   19    1]
 [   1    1    7  975    0    3    2    8   10    3]
 [   2    0    4    0  927    0    6    1    4   38]
 [   5    1    0   21    1  