In [1]:
import pickle
import gzip
import numpy as np
from sklearn.svm import SVC,SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import cross_validate,StratifiedShuffleSplit,GridSearchCV
from sklearn.metrics import confusion_matrix
from PIL import Image
import os
from sklearn.externals import joblib
import pandas as pd
# Seed NumPy's global random number generator with a fixed value (666).
np.random.seed(666)

In [2]:
def accuracy(test_data,test_target,classifier):
    """Score ``classifier`` on a labelled data set.

    Predicts labels for ``test_data``, builds the confusion matrix of
    ``test_target`` against those predictions, and returns a tuple
    ``(fraction_correct, confusion_matrix)``. The fraction is the sum of the
    matrix diagonal divided by ``len(test_data)``.
    """
    predictions = classifier.predict(test_data)
    conf_mat = confusion_matrix(test_target, predictions)
    # Diagonal entries count the samples whose prediction matched the label.
    hits = sum(conf_mat[k][k] for k in range(len(conf_mat)))
    return hits / len(test_data), conf_mat

def more_metrics(conf_mat):
    """Compute total true positives and per-class precision / recall.

    The matrix is interpreted in the layout produced by
    ``sklearn.metrics.confusion_matrix`` (which is what ``accuracy`` returns):
    row ``i`` holds the samples whose true label is class ``i`` and column
    ``j`` holds the samples predicted as class ``j``.

    Parameters
    ----------
    conf_mat : pandas.DataFrame or 2-D array-like, shape (n_classes, n_classes)
        Square confusion matrix of counts.

    Returns
    -------
    true_positives : number
        Sum of the diagonal (all correctly classified samples).
    precision : list of float
        ``TP / (TP + FP)`` per class, i.e. diagonal over column sums.
    recall : list of float
        ``TP / (TP + FN)`` per class, i.e. diagonal over row sums.
        A class with a zero denominator yields ``nan``.
    """
    counts = np.asarray(conf_mat)
    diagonal = np.diag(counts)
    true_positives = diagonal.sum()
    # Column sums: everything *predicted* as the class -> TP + FP.
    predicted_per_class = counts.sum(axis=0)
    # Row sums: everything that truly *is* the class -> TP + FN.
    actual_per_class = counts.sum(axis=1)
    # 0/0 for classes that never occur / are never predicted becomes nan quietly.
    with np.errstate(divide='ignore', invalid='ignore'):
        precision = diagonal / predicted_per_class
        recall = diagonal / actual_per_class
    return true_positives, list(precision), list(recall)

In [3]:
filename = '../mnist.pkl.gz'
# NOTE: pickle.load can execute arbitrary code while unpickling; only point this
# at a trusted copy of the archive.
# The context manager closes the gzip handle even if unpickling raises.
with gzip.open(filename, 'rb') as f:
    training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
# Each split is indexed as [0] -> samples, [1] -> targets.
train_data = training_data[0]
train_target = training_data[1]
val_data = validation_data[0]
val_target = validation_data[1]
# The target is extracted first because test_data is rebound to its sample part next.
test_target = test_data[1]
test_data = test_data[0]

In [4]:
# Learn the standardization statistics from the training split only, then apply
# that same fitted transform to every split. Re-fitting on the validation/test
# data would standardize each split differently from the data the model is
# trained on.
scaler = StandardScaler()
scaler.fit(train_data)
processed_train_data = scaler.transform(train_data)
processed_val_data = scaler.transform(val_data)
processed_test_data = scaler.transform(test_data)

In [5]:
# Baseline forest: 100 trees; no other constructor argument is specified.
baseClassifier = RandomForestClassifier(
    n_estimators=100,
)
baseClassifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=True,
            warm_start=False)

In [6]:
# Larger forest: 1500 trees; no other constructor argument is specified.
highVarClassifier = RandomForestClassifier(
    n_estimators=1500,
)
highVarClassifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1500, n_jobs=4,
            oob_score=False, random_state=None, verbose=True,
            warm_start=True)

In [7]:
# Persisting / restoring this model (disabled, kept for reference):
#joblib.dump(superhighVarClassifier,'./models/randomForestModel.joblib')
#joblib.load('./models/randomForestModel.joblib')
# Warning: this was run on an 8-core machine; with fewer than about 6 cores, lower n_jobs.
# NOTE(review): the call below does not pass n_jobs — confirm whether it should.
# Largest forest configured in this notebook: 2500 trees.
superhighVarClassifier = RandomForestClassifier(
    n_estimators=2500,
)
superhighVarClassifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2500, n_jobs=6,
            oob_score=False, random_state=None, verbose=True,
            warm_start=True)

In [None]:
#supersuperhighVarClassifier = RandomForestClassifier(n_estimators=3000,n_jobs=6)

In [None]:
# Reuse the persisted forest when the artifact exists; otherwise fall back to the
# 2500-tree estimator configured earlier in this notebook so the cell still runs
# on a fresh checkout where ./models has not been populated.
model_path = './models/randomForestModel.joblib'
if os.path.exists(model_path):
    classifier = joblib.load(model_path)
else:
    print("Saved model not found at " + model_path + "; training superhighVarClassifier instead.")
    classifier = superhighVarClassifier
# NOTE(review): fit() is still called on a loaded model, exactly as before.
# Whether that retrains from scratch or keeps the stored trees presumably depends
# on how the model was saved — confirm that refitting after load is intended.
classifier.fit(processed_train_data, train_target)

# Validating Models on Different Hyperparameters

In [None]:
# Score the fitted classifier on the validation split and print its metrics.
acc, conf_mat = accuracy(processed_val_data, val_target, classifier)
print("The Accuracy for Validation is: ", acc, sep="")
print("The Confusion Matrix is: ")
print(pd.DataFrame(conf_mat))
_, precision, recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
# Both metrics are scaled by 100 and shown side by side, one row per class.
df = pd.DataFrame({"Precision": np.multiply(precision, 100)})
df1 = pd.DataFrame({"Recall": np.multiply(recall, 100)})
print(df.join(df1))

# Preprocess USPS

In [None]:
USPSMat  = []
USPSTar  = []
curPath  = '../USPSdata/Numerals'
savedImg = []

# One sub-folder per digit 0-9; the folder index is used as the sample's target.
for j in range(0,10):
    curFolderPath = curPath + '/' + str(j)
    # Sorted so the sample order does not depend on filesystem listing order.
    imgs = sorted(os.listdir(curFolderPath))
    for imgName in imgs:
        # Only files whose name ends in 'png' are loaded.
        if not imgName.endswith('png'):
            continue
        curImg = curFolderPath + '/' + imgName
        # resize() returns a new image, so the source file can be closed right away.
        with Image.open(curImg,'r') as rawImg:
            img = rawImg.resize((28, 28))
        savedImg = img  # keeps a reference to the most recently loaded image
        # Invert the pixel values (255 - value) and divide by 255.
        # NOTE(review): assumes single-channel images so each sample is a flat
        # vector of 28*28 values — confirm for this data set.
        imgdata = (255-np.array(img.getdata()))/255
        USPSMat.append(imgdata)
        USPSTar.append(j)

# Processing USPS

In [None]:
# Standardize USPS with statistics learned from the MNIST training split, so the
# USPS samples go through the same feature transform as the data the classifier
# is fitted on. A dedicated scaler is fitted here so this cell does not depend on
# (or alter) the state of the shared `scaler` object.
usps_scaler = StandardScaler().fit(train_data)
processed_USPSDat = usps_scaler.transform(USPSMat)

# Testing the Random Forest Model

In [None]:
# Score the fitted classifier on the MNIST test split and print its metrics.
acc, conf_mat = accuracy(processed_test_data, test_target, classifier)
print("The Accuracy for Testing on MNIST is: ", acc, sep="")
print("The Confusion Matrix is: ")
print(conf_mat)
_, precision, recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
# Both metrics are scaled by 100 and shown side by side, one row per class.
df = pd.DataFrame({"Precision": np.multiply(precision, 100)})
df1 = pd.DataFrame({"Recall": np.multiply(recall, 100)})
print(df.join(df1))

In [None]:
# Score the fitted classifier on the USPS samples and print its metrics.
acc, conf_mat = accuracy(processed_USPSDat, USPSTar, classifier)
print("The Accuracy for Testing on USPS is: ", acc, sep="")
print("The Confusion Matrix is: ")
print(conf_mat)
_, precision, recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
# Both metrics are scaled by 100 and shown side by side, one row per class.
df = pd.DataFrame({"Precision": np.multiply(precision, 100)})
df1 = pd.DataFrame({"Recall": np.multiply(recall, 100)})
print(df.join(df1))