In [1]:
import pickle
import gzip
import numpy as np
from sklearn.svm import SVC,SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import cross_validate,StratifiedShuffleSplit,GridSearchCV
from sklearn.metrics import confusion_matrix
from PIL import Image
import os
from sklearn.externals import joblib
import pandas as pd
np.random.seed(666)

In [2]:
def accuracy(test_data,test_target,classifier):
    correct=0
    conf_mat = confusion_matrix(test_target,classifier.predict(test_data))
    for i in range(len(conf_mat)):
        correct += conf_mat[i][i]
    return correct/len(test_data),conf_mat

def more_metrics(conf_mat):
    true_positives = 0
    precision = []
    recall = []
    for i in range(len(conf_mat)):
        true_positives += conf_mat.iloc[i,i]
    conf_mat = np.matrix(conf_mat)
    tp_fp = np.array(np.sum(conf_mat,axis=1)).ravel()
    relevant_elements = np.array(np.sum(conf_mat,axis=0)).ravel()
    for i in range(len(conf_mat)):
        precision.append(conf_mat[i,i]/tp_fp[i])
        recall.append(conf_mat[i,i]/relevant_elements[i])
    return true_positives,precision,recall

In [3]:
filename = '../mnist.pkl.gz'
f = gzip.open(filename, 'rb')
training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
f.close()
train_data = training_data[0]
train_target = training_data[1]
val_data = validation_data[0]
val_target = validation_data[1]
test_target = test_data[1]
test_data = test_data[0]

In [4]:
scaler = StandardScaler()
scaler.fit(train_data)
processed_train_data = scaler.transform(train_data)
scaler.fit(val_data)
processed_val_data = scaler.transform(val_data)
scaler.fit(test_data)
processed_test_data = scaler.transform(test_data)

In [5]:
baseClassifier = RandomForestClassifier(n_estimators=100)
baseClassifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=True,
            warm_start=False)

In [6]:
highVarClassifier = RandomForestClassifier(n_estimators=1500)
highVarClassifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1500, n_jobs=4,
            oob_score=False, random_state=None, verbose=True,
            warm_start=True)

In [7]:
#joblib.dump(superhighVarClassifier,'./models/randomForestModel.joblib')
#joblib.load('./models/randomForestModel.joblib') 
# Warning: Working with 8 cores, if you use around <6 then decrease n_jobs 
superhighVarClassifier = RandomForestClassifier(n_estimators=2500)
superhighVarClassifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2500, n_jobs=6,
            oob_score=False, random_state=None, verbose=True,
            warm_start=True)

In [None]:
#supersuperhighVarClassifier = RandomForestClassifier(n_estimators=3000,n_jobs=6)

In [8]:
classifier = joblib.load('./models/randomForestModel.joblib') 
#classifier.fit(processed_train_data, train_target)

# Validating Models on different HyperParams

In [12]:
acc,conf_mat = accuracy(processed_val_data,val_target,classifier)
print("The Accuracy for Validation is: "+str(acc))
print("The Confusion Matrix is: ")
print(pd.DataFrame(conf_mat))
_,precision,recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
df = pd.DataFrame(np.multiply(precision,100))
df.columns = ["Recall"]
df1 = pd.DataFrame(np.multiply(recall,100))
df1.columns = ["Precision"]
print(pd.concat([df,df1],axis=1))

The Accuracy for Validation is: 0.9892
The Confusion Matrix is: 
     0     1    2     3    4    5    6     7    8    9
0  987     0    0     0    1    0    3     0    0    0
1    0  1062    1     0    0    0    0     1    0    0
2    1     0  987     0    0    0    0     1    1    0
3    0     0    3  1020    0    3    0     2    1    1
4    0     6    0     0  973    0    0     2    0    2
5    0     0    1     1    1  908    4     0    0    0
6    2     0    0     0    0    2  963     0    0    0
7    0     4    2     0    1    0    0  1082    0    1
8    2     8    2     4    0    7    0     2  981    3
9    2     1    0     5   11    4    0     9    0  929
The Precision & Recall is: 
      Recall  Precision
0  99.596367  99.295775
1  99.812030  98.242368
2  99.696970  99.096386
3  99.029126  99.029126
4  98.982706  98.581560
5  99.234973  98.268398
6  99.586350  99.278351
7  99.266055  98.453139
8  97.224975  99.796541
9  96.670135  99.252137


# Pre Process USPS

In [13]:
USPSMat  = []
USPSTar  = []
curPath  = '../USPSdata/Numerals'
savedImg = []

for j in range(0,10):
    curFolderPath = curPath + '/' + str(j)
    imgs =  os.listdir(curFolderPath)
    for img in imgs:
        curImg = curFolderPath + '/' + img
        if curImg[-3:] == 'png':
            img = Image.open(curImg,'r')
            img = img.resize((28, 28))
            savedImg = img
            imgdata = (255-np.array(img.getdata()))/255
            USPSMat.append(imgdata)
            USPSTar.append(j)

# Processing USPS

In [14]:
scaler.fit(USPSMat)
processed_USPSDat = scaler.transform(USPSMat)

# Testing the Random Forest Model

In [15]:
acc,conf_mat = accuracy(processed_test_data,test_target,classifier)
print("The Accuracy for Testing on MNIST is: "+str(acc))
print("The Confusion Matrix is: ")
print(conf_mat)
_,precision,recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
df = pd.DataFrame(np.multiply(precision,100))
df.columns = ["Recall"]
df1 = pd.DataFrame(np.multiply(recall,100))
df1.columns = ["Precision"]
print(pd.concat([df,df1],axis=1))

The Accuracy for Testing on MNIST is: 0.9668
The Confusion Matrix is: 
[[ 965    1    1    0    0    4    8    1    0    0]
 [   0 1127    2    1    0    2    2    1    0    0]
 [   8    1  997    5    2    2    2   13    2    0]
 [   0    0    6  985    0    4    0   12    2    1]
 [   1    0    2    0  960    1    6    4    1    7]
 [   2    1    0    9    0  873    3    2    2    0]
 [   9    4    1    0    4    3  937    0    0    0]
 [   0    8   16    0    0    0    0 1001    0    3]
 [   4    1    7   17    9   23    3    8  893    9]
 [   8    7    1   12   30    7    1   10    3  930]]
The Precision & Recall is: 
      Recall  Precision
0  98.469388  96.790371
1  99.295154  98.000000
2  96.608527  96.515005
3  97.524752  95.724004
4  97.759674  95.522388
5  97.869955  94.994559
6  97.807933  97.401247
7  97.373541  95.152091
8  91.683778  98.892580
9  92.170466  97.894737


In [16]:
acc,conf_mat = accuracy(processed_USPSDat,USPSTar,classifier)
print("The Accuracy for Testing on USPS is: "+str(acc))
print("The Confusion Matrix is: ")
print(conf_mat)
_,precision,recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
df = pd.DataFrame(np.multiply(precision,100))
df.columns = ["Recall"]
df1 = pd.DataFrame(np.multiply(recall,100))
df1.columns = ["Precision"]
print(pd.concat([df,df1],axis=1))

The Accuracy for Testing on USPS is: 0.4097704885244262
The Confusion Matrix is: 
[[ 650   11  267   52  450  169   62   85    2  252]
 [  44  552  117   99   49  106   28  990   14    1]
 [  97   24 1271   66   49  209   17  259    5    2]
 [  37    5   88 1303   53  314    3  177    4   16]
 [  12  192   53   22 1095  185   15  385   23   18]
 [ 140   23  132   54   19 1501   22  101    5    3]
 [ 292   42  206   18   80  364  856  130    2   10]
 [  36  320  364  230   39  270   28  698    5   10]
 [  35   40  142  193  101 1163   57   93  166   10]
 [  15  254  217  304  246  133    8  629   91  103]]
The Precision & Recall is: 
      Recall  Precision
0  32.500000  47.864507
1  27.600000  37.730690
2  63.581791  44.487224
3  65.150000  55.659974
4  54.750000  50.206327
5  75.050000  34.005437
6  42.800000  78.102190
7  34.900000  19.678602
8   8.300000  52.365931
9   5.150000  24.235294
