In [1]:
import pickle
import gzip
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import cross_validate,StratifiedShuffleSplit,GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from PIL import Image
import os
# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
np.random.seed(666)

In [2]:
def accuracy(test_data,test_target,classifier):
    """Score ``classifier`` on a labelled sample set.

    Parameters
    ----------
    test_data : sized collection of samples
        Passed to ``classifier.predict``; its length is the denominator of
        the returned fraction.
    test_target : sequence of labels
        True labels, given to ``confusion_matrix`` as the first argument.
    classifier : object with a ``predict`` method
        Estimator whose predictions are evaluated.

    Returns
    -------
    tuple
        ``(fraction_correct, conf_mat)`` where ``fraction_correct`` is the sum
        of the confusion-matrix diagonal divided by ``len(test_data)`` and
        ``conf_mat`` is the confusion matrix itself.
    """
    predictions = classifier.predict(test_data)
    conf_mat = confusion_matrix(test_target, predictions)
    # Diagonal entries count samples whose predicted label equals the true label.
    hits = sum(conf_mat[k][k] for k in range(len(conf_mat)))
    return hits/len(test_data), conf_mat

In [3]:
# Run-mode switch for the notebook. A later cell runs its PCA/LDA plots and SVM
# grid search only when mode == 'experimentation'; the string below describes
# the two intended values.
'''
experimentation = RUN in Notebook gives LDA and PCA
model = Display accuracy for the best model
'''
mode = 'model'

In [4]:
# Load the pickled MNIST archive, which unpacks into (training, validation, test)
# splits; each split is indexed below as [0] = samples and [1] = labels.
# NOTE: pickle.load can execute arbitrary code -- only open archives from a trusted source.
filename = '../mnist.pkl.gz'
# The context manager closes the gzip handle even if unpickling raises.
with gzip.open(filename, 'rb') as f:
    training_data, validation_data, test_data = pickle.load(f, encoding='latin1')

# Fold the validation split into the training split so both are used for fitting.
train_data = np.append(training_data[0],validation_data[0],axis=0)
train_target = np.append(training_data[1],validation_data[1])
# Order matters: read the labels before `test_data` is rebound to the samples only.
test_target = test_data[1]
test_data = test_data[0]

In [5]:
# Exploratory run (skipped unless mode == 'experimentation'): plots cumulative
# explained variance for LDA and PCA, builds reduced feature sets, then
# grid-searches SVC hyper-parameters on the LDA projection.
if mode == 'experimentation':
    # Learn the min/max scaling from the training split only and apply that same
    # mapping to the test split; re-fitting on the test split would scale the
    # two splits differently.
    scaler = MinMaxScaler(feature_range=[0, 1])
    processed_train_data = scaler.fit_transform(train_data)
    processed_test_data = scaler.transform(test_data)

    pca = PCA().fit(train_data)
    lda = LinearDiscriminantAnalysis().fit(processed_train_data,train_target)

    # One cumulative explained-variance curve per fitted reducer.
    for fig_size, reducer, reducer_name in (((10,10), lda, 'LDA'), ((15,15), pca, 'PCA')):
        plt.figure(figsize=fig_size)
        plt.plot(np.cumsum(reducer.explained_variance_ratio_))
        plt.xlabel('Number of Components for ' + reducer_name)
        plt.ylabel('Variance (%)') #for each component
        plt.title('Variance')
        plt.grid()
        plt.show()

    # The PCA basis is learned from the training split and then only applied to
    # the test split, so both splits live in the same component space.
    pca = PCA(n_components=500)
    pca_processed_train_data = pca.fit_transform(train_data)
    pca_processed_test_data = pca.transform(test_data)

    lda = LinearDiscriminantAnalysis(n_components=8)
    lda_processed_train_data = lda.fit(train_data, train_target).transform(train_data)
    lda_processed_test_data = lda.transform(test_data)

    # Set the parameters by cross-validation
    # Ref https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py
    gamma_grid = [1e-1,1e-2,1e-3,1e-4]
    c_grid = [1,2,3,4,5]
    # gamma has no effect on the linear kernel, so it is not swept there
    # (sweeping it would repeat every linear fit once per gamma value).
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': gamma_grid, 'C': c_grid},
                        {'kernel': ['linear'], 'C': c_grid},
                        {'kernel': ['poly'], 'gamma': gamma_grid, 'C': c_grid},
                        {'kernel': ['sigmoid'], 'gamma': gamma_grid, 'C': c_grid}]
    scores = ['precision', 'recall']

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(SVC(verbose=True,cache_size=7000), tuned_parameters, cv=3,
                           scoring='%s_macro' % score)
        clf.fit(lda_processed_train_data, train_target)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = test_target, clf.predict(lda_processed_test_data)
        print(classification_report(y_true, y_pred))
        print()

    #joblib.dump(clf,'./models/svmGridSearch.joblib')
    # Use a previously persisted search when one is on disk; otherwise keep the
    # search fitted in the loop above instead of failing on a missing file.
    saved_search_path = "models/svmGridSearch.joblib"
    if os.path.exists(saved_search_path):
        clf = joblib.load(saved_search_path)

    # Bare expressions inside an `if` block are not echoed by the notebook,
    # so the selected configuration is printed explicitly.
    print(clf.best_params_)
    print(clf.best_estimator_)

In [6]:
# Candidate 1: linear-kernel SVC with the remaining hyper-parameters left at
# their defaults; the trailing bare name echoes the estimator's repr.
baseline = SVC(
    kernel='linear',
    cache_size=7024,
    verbose=True,
    probability=True,
)
baseline

SVC(C=1.0, cache_size=7024, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=True)

In [7]:
# Candidate 2: RBF-kernel SVC with gamma fixed at 1; the trailing bare name
# echoes the estimator's repr.
baseline2 = SVC(
    kernel='rbf',
    gamma=1,
    cache_size=7000,
    verbose=True,
    probability=True,
)
baseline2

SVC(C=1.0, cache_size=7000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [8]:
# Candidate 3: RBF-kernel SVC with gamma left at the library default; the
# trailing bare name echoes the estimator's repr.
baseline3 = SVC(
    kernel='rbf',
    cache_size=7000,
    verbose=True,
    probability=True,
)
baseline3

SVC(C=1.0, cache_size=7000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=True)

In [10]:
# Candidate 4: RBF-kernel SVC with explicit C and gamma values.
model4 = SVC(
    kernel='rbf',
    C=2,
    gamma=0.05,
    cache_size=7000,
    verbose=True,
    probability=True,
)

In [11]:
# Choose which candidate estimator the following cells fit and evaluate;
# the trailing bare name echoes its repr.
classifier = model4
classifier

SVC(C=2, cache_size=7000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.05, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [None]:
# Fit the selected estimator on the merged training + validation samples and labels.
classifier.fit(train_data,train_target)

[LibSVM]

In [16]:
# Accuracy and confusion matrix of `classifier` on the MNIST test split.
# NOTE(review): this cell was originally labelled "baseline", yet `classifier` is
# assigned model4 in an earlier cell -- confirm which estimator produced the saved output.
accuracy(test_data,test_target,classifier)

(0.9403, array([[ 957,    0,    4,    1,    1,    6,    9,    1,    0,    1],
        [   0, 1122,    3,    2,    0,    1,    2,    1,    4,    0],
        [   8,    6,  967,   11,    3,    3,    7,    8,   17,    2],
        [   4,    3,   16,  947,    1,   15,    0,    9,   13,    2],
        [   1,    1,   10,    1,  942,    2,    4,    2,    3,   16],
        [  10,    4,    3,   36,    6,  803,   13,    1,   14,    2],
        [   9,    2,   13,    1,    6,   16,  909,    1,    1,    0],
        [   1,    8,   21,   10,    8,    1,    0,  957,    3,   19],
        [   8,    4,    6,   25,    7,   26,    6,    7,  877,    8],
        [   7,    7,    2,   11,   33,    4,    0,   18,    5,  922]]))

In [18]:
# Build the USPS evaluation set: every file ending in 'png' under
# <curPath>/<digit>/ is resized to 28x28, its pixel values are inverted
# (255 - v) and divided by 255, and the result is stored with the digit
# folder index as its label.
USPSMat  = []
USPSTar  = []
curPath  = '../USPSdata/Numerals'
savedImg = []

for j in range(0,10):
    curFolderPath = os.path.join(curPath, str(j))
    # os.listdir returns entries in arbitrary order; sort so the sample order
    # is the same on every run and every machine.
    imgs = sorted(os.listdir(curFolderPath))
    for imgName in imgs:
        if not imgName.endswith('png'):
            continue
        curImg = os.path.join(curFolderPath, imgName)
        # The context manager releases the file handle once the resized copy exists;
        # resize() returns a new image that stays valid after the source is closed.
        with Image.open(curImg,'r') as rawImg:
            img = rawImg.resize((28, 28))
        savedImg = img
        imgdata = (255-np.array(img.getdata()))/255
        USPSMat.append(imgdata)
        USPSTar.append(j)

In [34]:
# `lda` is fitted only inside the 'experimentation' branch, so project the USPS
# samples only in that mode; in 'model' mode the name would be undefined on a
# fresh kernel.
if mode == 'experimentation':
    lda_USPS_test_data = lda.transform(USPSMat)

In [19]:
# Baseline
# Accuracy and confusion matrix of `classifier` on the USPS samples and labels
# collected in USPSMat / USPSTar.
accuracy(USPSMat,USPSTar,classifier)

(0.29236461823091153,
 array([[ 368,    0,  503,  149,  161,  345,   67,  200,    9,  198],
        [  39,  300,  534,  214,  242,  212,   25,  371,   41,   22],
        [ 113,   62, 1288,   97,   35,  263,   67,   42,   21,   11],
        [  47,   60,  346,  832,   13,  610,    6,   48,   25,   13],
        [  21,   19,  242,   81,  769,  231,   16,  475,   74,   72],
        [  34,   13,  656,  215,   30,  949,   24,   35,   30,   14],
        [ 143,   17,  857,   50,   64,  341,  455,   32,    1,   40],
        [  26,   74,  220,  623,   37,  304,   11,  583,   95,   27],
        [ 110,   10,  344,  434,   84,  708,   68,   60,  152,   30],
        [  11,   34,  218,  576,  130,  115,    5,  617,  143,  151]]))

In [None]:
# Same USPS evaluation call as the earlier USPS accuracy cell, re-run against
# whatever estimator `classifier` currently refers to.
accuracy(USPSMat,USPSTar,classifier)