In [38]:
import pickle
import gzip
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
from keras.utils import to_categorical
from PIL import Image
import os
from sklearn.ensemble import RandomForestClassifier
np.random.seed(666)

In [2]:
def softmax(weights,train_data):
    bias = np.ones((np.shape(train_data)[0],1))
    train_withBias = np.hstack((train_data,bias))
    num = np.dot(weights,train_withBias.T)
    # High value Fix
    # https://houxianxu.github.io/2015/04/23/logistic-softmax-regression/
    num = np.subtract(num,np.max(num,axis=0))
    num = np.exp(num)
    # Fix softmax when using batch size 1 the dimension of deno changes
    if(len(train_data) == 1):
        deno = np.sum(num,axis=0)
    else:
        deno = np.sum(num,axis=1)
        deno = deno.reshape((10,1))
    return np.divide(num,deno)

def accuracy(predicted,target):
    correct = 0
    confusion_mat = np.zeros((10,10))
    for i in range(len(target)):
        if(predicted[i] == target[i]):
            correct+=1
        confusion_mat[target[i]][predicted[i]] =confusion_mat[target[i]][predicted[i]] +1
    return correct/len(target),pd.DataFrame(np.matrix(confusion_mat,dtype="int32"))

def one_hot_vect(tuple_data,classes):
    one_hot_encoded=np.zeros((len(tuple_data[1]),len(classes)))
    identity = np.identity(len(classes))
    for i in range(len(tuple_data[1])):
        one_hot_encoded[i] = np.add(one_hot_encoded[i],identity[tuple_data[1][i]])
    return one_hot_encoded

def more_metrics(conf_mat):
    true_positives = 0
    precision = []
    recall = []
    for i in range(len(conf_mat)):
        true_positives += conf_mat.iloc[i,i]
    conf_mat = np.matrix(conf_mat)
    tp_fp = np.array(np.sum(conf_mat,axis=1)).ravel()
    relevant_elements = np.array(np.sum(conf_mat,axis=0)).ravel()
    for i in range(len(conf_mat)):
        precision.append(conf_mat[i,i]/tp_fp[i])
        recall.append(conf_mat[i,i]/relevant_elements[i])
    return true_positives,precision,recall

# Importing Models

In [3]:
#np.savetxt('./models/logisticModel', weights)
#np.loadtxt('./models/logisticModel')

In [4]:
filename = '../mnist.pkl.gz'
f = gzip.open(filename, 'rb')
training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
f.close()
train_data = np.append(training_data[0],validation_data[0],axis=0)
train_target = np.append(training_data[1],validation_data[1])
test_target = test_data[1]
test_data = test_data[0]

In [5]:
scaler = StandardScaler()
scaler.fit(train_data)
processed_train_data = scaler.transform(train_data)
scaler.fit(test_data)
processed_test_data = scaler.transform(test_data)

In [6]:
rf = joblib.load("./models/randomForestModel.joblib")
lr = np.loadtxt("./models/logisticModel")
nn = joblib.load("./models/DNN_lowHidden.joblib")
svm = joblib.load("./models/SVMpdfModel3.joblib")



In [10]:
# Preserve order since LR is hardcoded and methods like rf depend on predic_proba :3
classifiers = [lr,nn,rf,svm]

In [8]:
def ensemble(classifiers,processed_test_data,test_data,weights):
    num_classifiers = len(classifiers)
    print("Working on Logistic Regression")
    lr_pred = np.multiply(softmax(classifiers[0],processed_test_data),weights[0])
    print("Working on Neural Network")
    nn_pred = np.multiply(classifiers[1].predict(test_data,verbose=True),weights[1])
    print("Working on Random Forest")
    rf_pred = np.multiply(classifiers[2].predict_proba(processed_test_data),weights[2])
    sumProb = (np.transpose(lr_pred)+rf_pred+nn_pred)
    wtAvg = np.divide(sumProb,len(classifiers))
    return np.argmax(wtAvg,axis=1)

In [69]:
# Brute Force to find best weights
def findWeights(classifiers,processed_test_data,test_data,test_target,weights)
    tracker = []
    for weight1 in range(1,4):
        for weight2 in range(1,4):
            for weight3 in range(1,4):
                if(not(weight1 == weight2 == weight3)):
                    weights=[weight1,weight2,weight3]
                    predicted = ensemble(classifiers,processed_test_data,test_data,weights)
                    acc,_ = accuracy(predicted,test_target)
                    print()
                    print("For weights "+str(weights)+" the accuracy is: "+str(acc))
                    tracker.append([weights,acc])
                    print()
    max_acc = 0
    tracker = np.asarray(tracker)
    for i,sample in enumerate(tracker):
    if(max_acc<tracker[i][1]):
        max_acc=tracker[i][1]
        opt_weight = tracker[1][0]
    return opt_weight

Working on Logistic Regression
Working on Neural Network
Working on Random Forest

For weights [1, 1, 2] the accuracy is: 0.1007

Working on Logistic Regression
Working on Neural Network
Working on Random Forest

For weights [1, 1, 3] the accuracy is: 0.1008

Working on Logistic Regression
Working on Neural Network
Working on Random Forest

For weights [1, 2, 1] the accuracy is: 0.1006

Working on Logistic Regression
Working on Neural Network
Working on Random Forest

For weights [1, 2, 2] the accuracy is: 0.1006

Working on Logistic Regression
Working on Neural Network
Working on Random Forest

For weights [1, 2, 3] the accuracy is: 0.1006

Working on Logistic Regression
Working on Neural Network
Working on Random Forest

For weights [1, 3, 1] the accuracy is: 0.1006

Working on Logistic Regression
Working on Neural Network
Working on Random Forest

For weights [1, 3, 2] the accuracy is: 0.1006

Working on Logistic Regression
Working on Neural Network
Working on Random Forest

For wei

In [89]:
opt_weight = findWeights(classifiers,processed_test_data,test_data,test_target,weights)

[1, 1, 3]

In [None]:
svm_pred = 

In [104]:
predicted = ensemble(classifiers,processed_test_data,test_data,opt_weight)
acc,conf_mat = accuracy(predicted,test_target)
print("The Accuracy for MNIST is: "+str(acc))
print("The Confusion Matrix is: ")
print(pd.DataFrame(conf_mat))
_,precision,recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
df = pd.DataFrame(np.multiply(precision,100))
df.columns = ["Precision"]
df1 = pd.DataFrame(np.multiply(recall,100))
df1.columns = ["Recall"]
print(pd.concat([df,df1],axis=1))

Working on Logistic Regression
Working on Neural Network
Working on Random Forest
The Accuracy for MNIST is: 0.9832
The Confusion Matrix is: 
     0     1     2    3    4    5    6     7    8    9
0  972     1     0    1    0    0    2     1    3    0
1    0  1129     1    2    0    1    0     2    0    0
2    2     0  1011    1    1    1    1    11    3    1
3    1     0     1  981    0   13    0     8    3    3
4    0     0     2    1  962    0    3     2    1   11
5    2     0     0    3    0  881    2     1    1    2
6    4     2     0    1    2    2  943     0    4    0
7    1     1     6    2    0    0    0  1016    2    0
8    0     0     2    4    3    2    1     5  953    4
9    1     1     0    2    7    5    1     5    3  984
The Precision & Recall is: 
   Precision     Recall
0  99.183673  98.880977
1  99.471366  99.559083
2  97.965116  98.826979
3  97.128713  98.296593
4  97.963340  98.666667
5  98.766816  97.348066
6  98.434238  98.950682
7  98.832685  96.669838
8  97.843

In [18]:
USPSMat  = []
USPSTar  = []
curPath  = '../USPSdata/Numerals'
savedImg = []

for j in range(0,10):
    curFolderPath = curPath + '/' + str(j)
    imgs =  os.listdir(curFolderPath)
    for img in imgs:
        curImg = curFolderPath + '/' + img
        if curImg[-3:] == 'png':
            img = Image.open(curImg,'r')
            img = img.resize((28, 28))
            savedImg = img
            imgdata = (255-np.array(img.getdata()))/255
            USPSMat.append(imgdata)
            USPSTar.append(j)

In [19]:
#target_USPS = one_hot_vect((1,USPSTar),np.unique(USPSTar))
scaler.fit(USPSMat)
processed_USPSDat = scaler.transform(USPSMat)


In [98]:
predicted = ensemble(classifiers,processed_USPSDat,[USPSMat],opt_weight)
acc,conf_mat = accuracy(predicted,USPSTar)
print("The Accuracy for USPS is: "+str(acc))
print("The Confusion Matrix is: ")
print(pd.DataFrame(conf_mat))
_,precision,recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
df = pd.DataFrame(np.multiply(precision,100))
df.columns = ["Precision"]
df1 = pd.DataFrame(np.multiply(recall,100))
df1.columns = ["Recall"]
print(pd.concat([df,df1],axis=1))

Working on Logistic Regression
Working on Neural Network
Working on Random Forest
The Accuracy for USPS is: 0.5059252962648132
The Confusion Matrix is: 
     0    1     2     3     4     5     6    7    8    9
0  836   71    54    13    32    54   116   48  150   37
1    4  575     4     4    71     3    15  234   12  104
2   66  144  1548    71    30    58   201  215   90   54
3   93  144    86  1465    23    68    44  566  328  259
4  186  321    13     3  1002     5    38   31   69  141
5  154  131    86   347   129  1647   226   50  362   93
6  112   91    79     6    34    15  1201    6   51    6
7  150  349    57    21   320    55    40  690   87  659
8   90   92    66    61   301    82    69  148  824  317
9  309   82     6     9    58    13    50   12   27  330
The Precision & Recall is: 
   Precision     Recall
0  59.248760  41.800000
1  56.042885  28.750000
2  62.494954  77.438719
3  47.626788  73.250000
4  55.389718  50.100000
5  51.069767  82.350000
6  75.015615  60.050000


In [21]:
def ensemblePlus(classifiers,processed_test_data,test_data):
    num_classifiers = len(classifiers)
    print("Working on Logistic Regression")
    lr_pred = softmax(classifiers[0],processed_test_data)
    print("Working on Neural Network")
    nn_pred = classifiers[1].predict(test_data,verbose=True)
    print("Working on Random Forest")
    rf_pred = classifiers[2].predict_proba(processed_test_data)
    print("Working on SVM Might take a while")
    if(len(np.shape(test_data))==3):
        test_data = np.squeeze(test_data)
    svm_pred = classifiers[3].predict_proba(test_data)
    
    featureVec = np.vstack([np.argmax(nn_pred,axis=1),np.argmax(rf_pred,axis=1),np.argmax(svm_pred,axis=1),np.argmax(lr_pred,axis=0)])
    
    return np.transpose(featureVec)

In [None]:
def ensemblePlusPlus(classifiers,processed_test_data,test_data):
    num_classifiers = len(classifiers)
    print("Working on Logistic Regression")
    lr_pred = softmax(classifiers[0],processed_test_data)
    print("Working on Neural Network")
    nn_pred = classifiers[1].predict(test_data,verbose=True)
    print("Working on Random Forest")
    rf_pred = classifiers[2].predict_proba(processed_test_data)
    print("Working on SVM Might take a while")
    if(len(np.shape(test_data))==3):
        test_data = np.squeeze(test_data)
    svm_pred = classifiers[3].predict_proba(test_data)
    
    featureVec = np.vstack([np.argmax(nn_pred,axis=1),np.argmax(rf_pred,axis=1),np.argmax(svm_pred,axis=1),np.argmax(lr_pred,axis=0)])
    
    return np.transpose(featureVec)

In [11]:
#train_ensembleOutput = ensemblePlus(classifiers,processed_train_data,train_data)

Working on Logistic Regression
Working on Neural Network
Working on Random Forest
Working on SVM Might take a while


In [53]:
superRF = RandomForestClassifier(n_estimators=5000,verbose=True)
superRF


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=None,
            oob_score=False, random_state=None, verbose=True,
            warm_start=False)

In [54]:
superRF.fit(train_ensembleOutput,train_target)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:  1.1min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=None,
            oob_score=False, random_state=None, verbose=True,
            warm_start=False)

In [14]:
#test_ensembleOutput =  ensemblePlus(classifiers,processed_test_data,test_data)

Working on Logistic Regression
Working on Neural Network
Working on Random Forest
Working on SVM Might take a while


In [39]:
#test_ensembleOutput_USPS = ensemblePlus(classifiers,processed_USPSDat,[USPSMat])


Working on Logistic Regression
Working on Neural Network
Working on Random Forest
Working on SVM Might take a while


In [55]:
accuracy(superRF.predict(test_ensembleOutput),test_target)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:    4.7s finished


(0.9812,      0     1     2    3    4    5    6     7    8    9
 0  972     1     0    1    0    0    2     1    3    0
 1    0  1129     2    2    0    1    0     1    0    0
 2    5     1  1007    6    1    3    0     4    4    1
 3    1     0     1  988    0    5    0     6    4    5
 4    1     0     4    1  957    0    3     3    1   12
 5    2     0     0    9    0  874    1     1    2    3
 6    5     2     0    1    1    3  941     1    4    0
 7    1     7    11    2    0    0    0  1001    4    2
 8    0     0     2    5    2    2    1     5  954    3
 9    0     0     0    1    8    4    1     1    5  989)

In [56]:
accuracy(superRF.predict(test_ensembleOutput_USPS),USPSTar)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5000 out of 5000 | elapsed:   10.6s finished


(0.49537476873843694,      0    1     2     3    4     5     6    7    8    9
 0  848    6    72    95  171   159   107  116   93  333
 1   71  577   146   163  299   119    90  356   95   84
 2   83    4  1469    85    9   166    62   48   67    6
 3   20    4    67  1500    1   296     7   18   65   22
 4   33   83    33    24  988   133    33  309  305   59
 5   84   10    52    83    1  1594    13   44   88   31
 6  152   17   156    37   31   323  1126   34   70   54
 7   55  265   216   542   22    76    10  640  160   14
 8  156   16    79   304   29   426    49   79  833   29
 9   38  140    61   276  123    71     7  568  384  332)