In [1]:
import pickle
import gzip
import numpy as np
from sklearn.svm import SVC,SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import cross_validate,StratifiedShuffleSplit,GridSearchCV
from sklearn.metrics import confusion_matrix
from PIL import Image
import os
from sklearn.externals import joblib
import pandas as pd
np.random.seed(666)

In [28]:
def accuracy(test_data,test_target,classifier):
    correct=0
    conf_mat = confusion_matrix(test_target,classifier.predict(test_data))
    for i in range(len(conf_mat)):
        correct += conf_mat[i][i]
    return correct/len(test_data),conf_mat

def more_metrics(conf_mat):
    true_positives = 0
    precision = []
    recall = []
    for i in range(len(conf_mat)):
        true_positives += conf_mat.iloc[i,i]
    conf_mat = np.matrix(conf_mat)
    tp_fp = np.array(np.sum(conf_mat,axis=1)).ravel()
    relevant_elements = np.array(np.sum(conf_mat,axis=0)).ravel()
    for i in range(len(conf_mat)):
        precision.append(conf_mat[i,i]/tp_fp[i])
        recall.append(conf_mat[i,i]/relevant_elements[i])
    return true_positives,precision,recall

In [4]:
filename = '../mnist.pkl.gz'
f = gzip.open(filename, 'rb')
training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
f.close()
train_data = training_data[0]
train_target = training_data[1]
val_data = validation_data[0]
val_target = validation_data[1]
test_target = test_data[1]
test_data = test_data[0]

In [4]:
'''filename = '../mnist.pkl.gz'
f = gzip.open(filename, 'rb')
training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
f.close()
train_data = np.append(training_data[0],validation_data[0],axis=0)
train_target = np.append(training_data[1],validation_data[1],axis=0)
test_target = test_data[1]
test_data = test_data[0]'''

In [5]:
scaler = StandardScaler()
scaler.fit(train_data)
processed_train_data = scaler.transform(train_data)
scaler.fit(val_data)
processed_val_data = scaler.transform(val_data)
scaler.fit(test_data)
processed_test_data = scaler.transform(test_data)

In [6]:
baseClassifier = RandomForestClassifier(n_estimators=10)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [37]:
highVarClassifier = RandomForestClassifier(n_estimators=1500)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [7]:
#joblib.dump(superhighVarClassifier,'./models/randomForestModel.joblib')
#joblib.load('./models/randomForestModel.joblib') 
superhighVarClassifier = RandomForestClassifier(n_estimators=2500)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
classifier = baseClassifier
classifier.fit(processed_train_data, train_target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Validating Models on different HyperParams

In [51]:
acc,conf_mat = accuracy(processed_val_data,val_target,classifier)
print("The Accuracy for Validation is: "+str(acc))
print("The Confusion Matrix is: ")
print(pd.DataFrame(conf_mat))
_,precision,recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
df = pd.DataFrame(np.multiply(precision,100))
df.columns = ["Precision"]
df1 = pd.DataFrame(np.multiply(recall,100))
df1.columns = ["Recall"]
print(pd.concat([df,df1],axis=1))

The Accuracy for Validation is: 0.8902
The Confusion Matrix is: 
     0    1    2    3    4    5    6    7    8    9
0  929    0   14    4    5    6   16    1   15    1
1    0  903   95    7    6   10    9    1   32    1
2    6    1  924   14    4    4    9   10   15    3
3    7    2   38  931    1   14    1    3   29    4
4    3    4   15    0  914    3    4    7    7   26
5   19    0   19  110    6  700   11    3   39    8
6    3    0   10    2   18    4  919    0   11    0
7    4    6   16   23   29    6    0  903    9   94
8    9    5   20   14    8   10    3    4  930    6
9    4    0   10   17   36   10    1   18   16  849
The Precision & Recall is: 
   Precision     Recall
0  93.743693  94.410569
1  84.868421  98.045603
2  93.333333  79.586563
3  90.388350  82.976827
4  92.980671  88.997079
5  76.502732  91.264668
6  95.036194  94.450154
7  82.844037  95.052632
8  92.170466  84.315503
9  88.345473  85.584677


# Pre Process USPS

In [9]:
USPSMat  = []
USPSTar  = []
curPath  = '../USPSdata/Numerals'
savedImg = []

for j in range(0,10):
    curFolderPath = curPath + '/' + str(j)
    imgs =  os.listdir(curFolderPath)
    for img in imgs:
        curImg = curFolderPath + '/' + img
        if curImg[-3:] == 'png':
            img = Image.open(curImg,'r')
            img = img.resize((28, 28))
            savedImg = img
            imgdata = (255-np.array(img.getdata()))/255
            USPSMat.append(imgdata)
            USPSTar.append(j)

# Processing USPS

In [10]:
scaler.fit(USPSMat)
processed_USPSDat = scaler.transform(USPSMat)

(0.14825741287064353,
 array([[ 244,    4,  273,  428,    1,   23,   88,    0,  933,    6],
        [  31,    1,  180,  690,    2,    7,    8,    0, 1077,    4],
        [ 145,    4,  448,  440,    4,   31,   18,    0,  904,    5],
        [  48,    0,  132,  639,    0,   19,    7,    0, 1154,    1],
        [  63,    1,   88,  385,   10,   23,   10,    0, 1415,    5],
        [  93,    3,  162,  434,    6,   67,   28,    0, 1206,    1],
        [ 172,    1,  208,  432,   13,   78,  157,    1,  933,    5],
        [  53,    7,  135,  604,    2,   23,   21,    0, 1154,    1],
        [  70,    2,   92,  342,    8,   45,   48,    0, 1391,    2],
        [  33,    4,  131,  579,    4,   18,    5,    2, 1216,    8]]))

# Testing the Random Forest Model

In [46]:
acc,conf_mat = accuracy(processed_test_data,test_target,classifier)
print("The Accuracy for Testing on MNIST is: "+str(acc))
print("The Confusion Matrix is: ")
print(conf_mat)
_,precision,recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
df = pd.DataFrame(np.multiply(precision,100))
df.columns = ["Precision"]
df1 = pd.DataFrame(np.multiply(recall,100))
df1.columns = ["Recall"]
print(pd.concat([df,df1],axis=1))

The Accuracy for Testing on MNIST is: 0.9284
The Confusion Matrix is: 
[[ 970    0    1    1    0    3    2    2    1    0]
 [   0 1088    2   13    2   16    5    0    8    1]
 [  13    1  956    9    5    2    6   11   27    2]
 [   8    2   19  930    0   20    1    7   19    4]
 [   3    2    6    6  927    4    8    1    5   20]
 [  15    1    5   36    3  817    5    2    6    2]
 [  15    3    3    1    8   10  909    0    9    0]
 [   5    6   30   14   14    2    1  919   13   24]
 [   9    1   11   30    8   16    6    4  878   11]
 [   6    2    7   28   46    9    2    6   13  890]]
The Precision & Recall is: 
   Precision     Recall
0  98.979592  92.911877
1  95.859031  98.372514
2  92.635659  91.923077
3  92.079208  87.078652
4  94.399185  91.510365
5  91.591928  90.878754
6  94.885177  96.190476
7  89.396887  96.533613
8  90.143737  89.683350
9  88.206145  93.291405


In [45]:
acc,conf_mat = accuracy(processed_USPSDat,USPSTar,classifier)
print("The Accuracy for Testing on USPS is: "+str(acc))
print("The Confusion Matrix is: ")
print(conf_mat)
_,precision,recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
df = pd.DataFrame(np.multiply(precision,100))
df.columns = ["Precision"]
df1 = pd.DataFrame(np.multiply(recall,100))
df1.columns = ["Recall"]
print(pd.concat([df,df1],axis=1))

The Accuracy for Testing on USPS is: 0.120006000300015
The Confusion Matrix is: 
[[ 126    0  493  161   63   28   95   13  936   85]
 [  15    0  246   96    1    9    5    2 1594   32]
 [  80    1  394  101   10   14   19    3 1360   17]
 [  66    0  263  343    3   23   15    0 1278    9]
 [  32    0  262  186   25   35   36    2 1373   49]
 [  83    0  411  166   27   51   47    0 1186   29]
 [ 214    0  361   88   78   27  117   11 1079   25]
 [  50    0  105  126   44    5   15    1 1635   19]
 [ 100    0  154  222   83   59   57    0 1298   27]
 [  59    2  206  245   18   19   14    0 1392   45]]
The Precision & Recall is: 
   Precision     Recall
0   6.300000  15.272727
1   0.000000   0.000000
2  19.709855  13.609672
3  17.150000  19.780854
4   1.250000   7.102273
5   2.550000  18.888889
6   5.850000  27.857143
7   0.050000   3.125000
8  64.900000   9.885005
9   2.250000  13.353116
