In [1]:
import pickle
import gzip
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import cross_validate,StratifiedShuffleSplit,GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from PIL import Image
import os
np.random.seed(666)

In [3]:
def accuracy(test_data,test_target,classifier):
    """Score a classifier against labelled samples.

    Parameters
    ----------
    test_data : sized collection of samples
        Handed to ``classifier.predict``; its ``len`` is the denominator of
        the returned accuracy.
    test_target : sequence of labels
        True labels, passed as the first argument of ``confusion_matrix``.
    classifier : object with a ``predict`` method

    Returns
    -------
    tuple
        ``(fraction_correct, conf_mat)`` where ``conf_mat`` is whatever
        ``confusion_matrix(test_target, predictions)`` returned.
    """
    predictions = classifier.predict(test_data)
    conf_mat = confusion_matrix(test_target, predictions)
    # Correct predictions are the entries on the main diagonal.
    hits = np.trace(conf_mat)
    return hits / len(test_data), conf_mat

def more_metrics(conf_mat):
    """Derive per-class ratios from a square confusion matrix.

    Parameters
    ----------
    conf_mat : pandas.DataFrame or 2-D array-like
        Square matrix of counts. Anything ``np.asarray`` can turn into a
        2-D array is accepted (the original implementation required a
        DataFrame because it used ``.iloc``).

    Returns
    -------
    tuple
        ``(true_positives, precision, recall)`` where

        * ``true_positives`` is the sum of the diagonal,
        * ``precision`` is a list of ``diagonal[i] / row_sum[i]``,
        * ``recall`` is a list of ``diagonal[i] / column_sum[i]``.

    Raises
    ------
    ValueError
        If the input is not a square 2-D matrix.

    Notes
    -----
    NOTE(review): for a matrix laid out the way ``accuracy`` builds it
    (``confusion_matrix(y_true, y_pred)``: rows are true labels, columns are
    predictions), the row-normalised list is per-class *recall* and the
    column-normalised list is per-class *precision* -- i.e. the two names are
    swapped. The return order is kept as-is because the evaluation cells in
    this notebook compensate by labelling the first list "Recall" and the
    second "Precision".

    A class whose row or column sums to zero yields ``nan``/``inf`` (with a
    NumPy RuntimeWarning), matching the element-wise division it replaces.
    """
    counts = np.asarray(conf_mat)
    if counts.ndim != 2 or counts.shape[0] != counts.shape[1]:
        raise ValueError("conf_mat must be a square 2-D matrix, got shape %r"
                         % (counts.shape,))

    diagonal = np.diag(counts)
    true_positives = diagonal.sum()

    row_totals = counts.sum(axis=1)       # one total per row
    column_totals = counts.sum(axis=0)    # one total per column

    precision = list(diagonal / row_totals)
    recall = list(diagonal / column_totals)
    return true_positives,precision,recall

In [4]:
# Run-mode switch for the notebook; the string literal below lists the two
# intended values and what each is meant to do.
'''
experimentation = RUN in Notebook gives LDA and PCA
model = Display accuracy for the best model
'''
# Only the value 'experimentation' is checked by a later cell; with any other
# value that cell's body is skipped.
mode = 'model'

In [5]:
# Load the gzipped MNIST pickle and build the train / test arrays.
filename = '../mnist.pkl.gz'
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# open archives that come from a trusted source.
# The `with` block guarantees the handle is closed even if unpickling fails.
with gzip.open(filename, 'rb') as f:
    training_data, validation_data, test_data = pickle.load(f, encoding='latin1')

# Each split is indexed [0] for the samples and [1] for the targets.
# The validation split is folded into the training set.
train_data = np.append(training_data[0],validation_data[0],axis=0)
train_target = np.append(training_data[1],validation_data[1])

# Targets must be extracted first: `test_data` is rebound from the
# (samples, targets) pair to the samples alone on the next line.
test_target = test_data[1]
test_data = test_data[0]

In [6]:
# Dimensionality-reduction plots and SVC grid search; runs only when the
# notebook is switched to 'experimentation' mode.
if mode == 'experimentation':
    # Fit the scaler on the training data only and reuse it for the test
    # data; re-fitting on the test set would scale the two sets differently.
    scaler = MinMaxScaler(feature_range=[0, 1])
    processed_train_data = scaler.fit_transform(train_data)
    processed_test_data = scaler.transform(test_data)

    pca = PCA().fit(train_data)
    lda = LinearDiscriminantAnalysis().fit(processed_train_data,train_target)

    # Cumulative explained-variance curve for LDA.
    plt.figure(figsize=(10,10))
    plt.plot(np.cumsum(lda.explained_variance_ratio_))
    plt.xlabel('Number of Components for LDA')
    plt.ylabel('Variance (%)') #for each component
    plt.title('Variance')
    plt.grid()
    plt.show()

    # Cumulative explained-variance curve for PCA.
    plt.figure(figsize=(15,15))
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of Components for PCA')
    plt.ylabel('Variance (%)') #for each component
    plt.title('Variance')
    plt.grid()
    plt.show()

    # Project the test set with the components learned from the training
    # set. Calling fit_transform on the test set would learn a second,
    # unrelated basis and make the two projections incomparable.
    pca = PCA(n_components=500)
    pca_processed_train_data = pca.fit_transform(train_data)
    pca_processed_test_data = pca.transform(test_data)

    lda = LinearDiscriminantAnalysis(n_components=8)
    lda_processed_train_data = lda.fit(train_data, train_target).transform(train_data)
    lda_processed_test_data = lda.transform(test_data)

    # Set the parameters by cross-validation
    # Ref https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
    # `gamma` is a kernel coefficient for rbf/poly/sigmoid only; the linear
    # kernel ignores it, so gridding it there just repeats identical fits.
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-1,1e-2,1e-3,1e-4],'C': [1,2,3,4,5]},
                        {'kernel': ['linear'], 'C': [1,2,3,4,5]},
                        {'kernel': ['poly'],'gamma': [1e-1,1e-2,1e-3,1e-4],'C': [1,2,3,4,5]},
                        {'kernel': ['sigmoid'],'gamma': [1e-1,1e-2,1e-3,1e-4],'C': [1,2,3,4,5]}]
    scores = ['precision', 'recall']

    # `clf` is rebound on every iteration, so after the loop it refers to the
    # search run for the last entry in `scores`.
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(SVC(verbose=True,cache_size=7000), tuned_parameters, cv=3,
                           scoring='%s_macro' % score)
        clf.fit(lda_processed_train_data, train_target)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = test_target, clf.predict(lda_processed_test_data)
        print(classification_report(y_true, y_pred))
        print()

    # NOTE(review): the load below replaces the search object fitted above
    # with one stored on disk; the dump call is commented out, so the file
    # must already exist.
    #joblib.dump(clf,'./models/svmGridSearch.joblib')
    clf = joblib.load("models/svmGridSearch.joblib")

    # Bare expressions inside an `if` body are not echoed by the notebook,
    # so print the results explicitly.
    print(clf.best_params_)
    print(clf.best_estimator_)

In [7]:
# Candidate 1: linear-kernel SVC with probability estimates enabled.
baseline = SVC(
    kernel='linear',
    cache_size=7024,
    verbose=True,
    probability=True,
)
baseline

SVC(C=1.0, cache_size=7024, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=True)

In [8]:
# Candidate 2: RBF-kernel SVC with gamma fixed at 1.
baseline2 = SVC(
    kernel='rbf',
    gamma=1,
    cache_size=7000,
    verbose=True,
    probability=True,
)
baseline2

SVC(C=1.0, cache_size=7000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [9]:
# Candidate 3: RBF-kernel SVC that leaves gamma at the library default.
baseline3 = SVC(
    kernel='rbf',
    cache_size=7000,
    verbose=True,
    probability=True,
)
baseline3

SVC(C=1.0, cache_size=7000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=True)

In [10]:
# Candidate 4: RBF-kernel SVC with C=2 and gamma=0.05.
model4 = SVC(
    kernel='rbf',
    C=2,
    gamma=0.05,
    cache_size=7000,
    verbose=True,
    probability=True,
)
model4

In [11]:
#classifier.fit(train_data,train_target)
#model5 = joblib.load("./models/SVMSlideModel.joblib")
#model5



SVC(C=2, cache_size=7000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.05, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [12]:
#model6 = joblib.load("./models/SVMpdfModel2.joblib")
#model6

SVC(C=1.0, cache_size=7024, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [14]:
# Restore a stored estimator from disk instead of training in the notebook
# (the recorded output shows an RBF-kernel SVC).
# NOTE: joblib files are pickle-based, so loading one can execute arbitrary
# code -- only load model files from a trusted source.
model7 = joblib.load("./models/SVMpdfModel3.joblib")
model7

SVC(C=1.0, cache_size=7000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [6]:
#model8= SVC(kernel='rbf', C=2,cache_size=7000, max_iter=2000,verbose=True,probability=True)
#model8

SVC(C=2, cache_size=7000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=2000, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=True)

In [16]:
# Select the model that the evaluation cells below will score; change this
# single assignment to evaluate a different candidate.
classifier = model7
classifier

SVC(C=1.0, cache_size=7000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [17]:
# Score the selected classifier on the MNIST test split and print the
# accuracy, the confusion matrix and the per-class ratios.
acc, conf_mat = accuracy(test_data, test_target, classifier)
print("The Accuracy for Testing on MNIST is: ", acc, sep="")
print("The Confusion Matrix is: ")
print(conf_mat)

_, precision, recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
# The labels look crossed on purpose: more_metrics divides the diagonal by
# row sums for its first list and by column sums for its second, and
# `accuracy` builds the matrix with true labels on the rows, so the first
# list is per-class recall and the second is per-class precision.
df = pd.DataFrame(np.multiply(precision, 100), columns=["Recall"])
df1 = pd.DataFrame(np.multiply(recall, 100), columns=["Precision"])
print(pd.concat([df, df1], axis=1))

The Accuracy for Testing on MNIST is: 0.9435
The Confusion Matrix is: 
[[ 967    0    1    0    0    5    4    1    2    0]
 [   0 1120    2    3    0    1    3    1    5    0]
 [   9    1  962    7   10    1   13   11   16    2]
 [   1    1   14  950    1   17    1   10   11    4]
 [   1    1    7    0  937    0    7    2    2   25]
 [   7    4    5   33    7  808   11    2   10    5]
 [  10    3    4    1    5   10  924    0    1    0]
 [   2   13   22    5    7    1    0  954    4   20]
 [   4    6    6   14    8   24   10    8  891    3]
 [  10    6    0   12   33    5    1   14    6  922]]
The Precision & Recall is: 
      Recall  Precision
0  98.673469  95.647873
1  98.678414  96.969697
2  93.217054  94.037146
3  94.059406  92.682927
4  95.417515  92.956349
5  90.582960  92.660550
6  96.450939  94.866530
7  92.801556  95.114656
8  91.478439  93.987342
9  91.377602  93.985729


In [22]:
# Build the USPS evaluation set: one flattened 28x28 image per PNG found in
# the per-digit folders 0..9, with the folder number as the label.
USPSMat  = []
USPSTar  = []
curPath  = '../USPSdata/Numerals'
savedImg = []

for j in range(0,10):
    curFolderPath = curPath + '/' + str(j)
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and the final `savedImg`) reproducible between runs.
    for imgName in sorted(os.listdir(curFolderPath)):
        if not imgName.endswith('png'):
            continue
        curImg = curFolderPath + '/' + imgName
        # Close the underlying file as soon as the resized copy exists;
        # resize() returns a new in-memory image.
        with Image.open(curImg,'r') as rawImg:
            img = rawImg.resize((28, 28))
        savedImg = img
        # Invert and rescale: (255 - pixel) / 255.
        # NOTE(review): assumes 8-bit single-channel images -- confirm.
        imgdata = (255-np.array(img.getdata()))/255
        USPSMat.append(imgdata)
        USPSTar.append(j)

In [23]:
# Score the selected classifier on the USPS images loaded above and print
# the accuracy, the confusion matrix and the per-class ratios.
acc,conf_mat = accuracy(USPSMat,USPSTar,classifier)
# This cell evaluates the USPS set; the message previously said "MNIST"
# because it was copied from the MNIST evaluation cell.
print("The Accuracy for Testing on USPS is: "+str(acc))
print("The Confusion Matrix is: ")
print(conf_mat)
_,precision,recall = more_metrics(pd.DataFrame(conf_mat))
print("The Precision & Recall is: ")
# The labels look crossed on purpose: more_metrics divides the diagonal by
# row sums for its first list and by column sums for its second, and
# `accuracy` builds the matrix with true labels on the rows, so the first
# list is per-class recall and the second is per-class precision.
df = pd.DataFrame(np.multiply(precision,100))
df.columns = ["Recall"]
df1 = pd.DataFrame(np.multiply(recall,100))
df1.columns = ["Precision"]
print(pd.concat([df,df1],axis=1))

The Accuracy for Testing on MNIST is: 0.38541927096354817
The Confusion Matrix is: 
[[ 573    2  428   19  285  248   73   44    6  322]
 [ 110  429  285  137  273  180   46  501   22   17]
 [ 128   18 1402   59   39  198   61   57   23   14]
 [  76    3  186 1123   11  483    5   70   27   16]
 [  18   67   91   14 1167  267   22  194   69   91]
 [ 108   17  257  102   25 1367   60   43   15    6]
 [ 197    7  489   24   98  394  748   13    7   23]
 [  50  225  457  265   57  416   15  452   41   22]
 [  73   25  209  193   87 1006   95   41  244   27]
 [  26  166  228  278  213  165    8  499  214  203]]
The Precision & Recall is: 
      Recall  Precision
0  28.650000  42.163355
1  21.450000  44.734098
2  70.135068  34.771825
3  56.150000  50.722674
4  58.350000  51.751663
5  68.350000  28.937341
6  37.400000  66.019417
7  22.600000  23.615465
8  12.200000  36.526946
9  10.150000  27.395412
