In [1]:
import os
import pathlib
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datetime import datetime
from timeit import default_timer as timer

from LoadData import LoadData

import os
import sys
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics, svm
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.tree._tree import TREE_LEAF

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.utils import parallel_backend


# Run-configuration constants for this notebook.
# NOTE(review): in the cells visible here only NORMALIZE_DATA (passed to
# LoadData) and DataSetName (used in output file names and plot titles) are
# read. The remaining flags are not referenced in the visible cells —
# presumably shared with sibling notebooks/scripts; confirm before removing.
TESTING = True
DECISION_TREE = False
SUPPORT_VECTOR = True
NEURAL_NET = False
K_NEAREST = False
BOOSTING = False
# Forwarded to LoadData(normalize=...). The data-loading cell applies a
# StandardScaler regardless of this flag.
NORMALIZE_DATA = False
USE_PCA = True
# Suffix embedded in the result CSV names and figure titles/file names.
DataSetName = "MNIST"


In [2]:
# Resolve the CSV locations relative to the directory the kernel runs in.
cwd = pathlib.Path().absolute()
training_data_path = f"{cwd}/mnist-train-data.csv"
testing_data_path = f"{cwd}/mnist-test-data.csv"

# LoadData returns (labels, data, <third value that is unused here>).
with parallel_backend('threading'):
    training_labels, training_data, _ = LoadData(
        training_data_path, normalize=NORMALIZE_DATA
    )
    testing_labels, testing_data, _ = LoadData(
        testing_data_path, normalize=NORMALIZE_DATA
    )

# Fit the scaler on the training split only, then apply the same transform to
# both splits so the test data is scaled with training-set statistics.
Scaler = StandardScaler()
Scaler.fit(training_data)
training_data, testing_data = (
    Scaler.transform(split) for split in (training_data, testing_data)
)


Attempting to load: mnist-train-data.csv

Loading Complete
Data Statistics: 
   Number of Entries: 60000 
   Shape of Entry: (785,)

Attempting to load: mnist-test-data.csv

Loading Complete
Data Statistics: 
   Number of Entries: 10000 
   Shape of Entry: (785,)



In [3]:
def FindCorrectPredictions(prediction_array, actual_array):
    """Tabulate the samples whose prediction differs from the true label.

    Despite the name, this returns the *incorrect* predictions: one row per
    position where ``actual_array != prediction_array``.

    Parameters
    ----------
    prediction_array : array-like
        Predicted labels. Lists, tuples, pandas Series and ``(n, 1)`` column
        vectors are accepted; the input is flattened to one dimension.
    actual_array : array-like
        Ground-truth labels, with the same number of elements as
        ``prediction_array``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Index Of Image"``, ``"Predicted Image"`` and
        ``"Actual Image"``. The frame is empty (0 rows, 3 columns) when every
        prediction matches.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Coerce to flat ndarrays so element-wise comparison and fancy indexing
    # work for any array-like input, not only 1-D ndarrays.
    predicted_all = np.asarray(prediction_array).ravel()
    actual_all = np.asarray(actual_array).ravel()
    if predicted_all.shape != actual_all.shape:
        raise ValueError(
            "prediction_array and actual_array must have the same number of "
            "elements, got {} and {}".format(predicted_all.size, actual_all.size)
        )

    mismatch_idx = np.flatnonzero(actual_all != predicted_all)

    # column_stack yields a (k, 3) array with one common dtype, matching the
    # layout the original hstack-based implementation produced.
    table = np.column_stack(
        (mismatch_idx, predicted_all[mismatch_idx], actual_all[mismatch_idx])
    )
    return pd.DataFrame(
        table, columns=["Index Of Image", "Predicted Image", "Actual Image"]
    )

In [4]:
def prune_index11(clf, index, threshold):
    """Turn under-populated nodes of a fitted decision tree into leaves, in place.

    Starting at node ``index``, every reachable node whose
    ``clf.tree_.value[node][0].sum()`` is below ``threshold`` has both child
    links set to ``TREE_LEAF``, which detaches its whole subtree. Nodes below
    a pruned node are not visited.

    Adapted from
    https://stackoverflow.com/questions/49428469/pruning-decision-trees
    Typical call: ``prune_index11(dt, 0, 3)``.

    Parameters
    ----------
    clf : fitted tree estimator exposing ``tree_`` with ``value``,
        ``children_left`` and ``children_right`` arrays.
    index : int
        Node id to start from (0 is the root).
    threshold : number
        Minimum value-sum a node needs to keep its children.
        NOTE(review): assumes ``tree_.value`` holds per-class sample counts;
        confirm for the installed scikit-learn version.

    Returns
    -------
    None
        ``clf.tree_`` is modified in place. Errors are reported on stdout
        rather than raised, so a failed prune does not stop the notebook.
    """
    try:
        tree = clf.tree_
        # Explicit stack instead of recursion: the previous recursive calls
        # targeted an undefined name and dropped the ``clf`` argument, so no
        # descendant was ever pruned. A stack also avoids the recursion limit
        # on deep trees.
        pending = [index]
        while pending:
            node = pending.pop()
            if tree.value[node][0].sum() < threshold:
                # Turn the node into a leaf by "unlinking" its children.
                tree.children_left[node] = TREE_LEAF
                tree.children_right[node] = TREE_LEAF
            # Only descend through links that still point at a real node.
            for child in (tree.children_left[node], tree.children_right[node]):
                if child != TREE_LEAF:
                    pending.append(child)

    except Exception as err:
        print("Exception occurred while pruning Decision Tree. \n", err)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)

In [None]:
"""
TRAINING TIME

"""

In [None]:
# Disabled: restrict both splits to the first `limit` rows for quick runs.
# limit = 4000
# testing_subset = testing_data[:limit, :]
# training_subset = training_data[:limit, :]
# testing_sublabels = testing_labels[:limit]
# training_sublabels = training_labels[:limit ,]

# Display names of the SVM kernels to compare; the training cell lower-cases
# each name before passing it to svm.SVC(kernel=...).
solvers = ["Linear", "rbf"]

# Per-kernel result buffers. Each starts with a 0.0 entry standing for the
# "0% of the training set" point, so that after the ten training runs every
# list has 11 values to match the 11 x positions used by the plotting cell.
svm_linear_runtime = [0.0]
svm_linear_accuracy = [0.0]

svm_rbf_runtime = [0.0]
svm_rbf_accuracy = [0.0]

In [None]:
# Start from fresh one-element lists every time this cell runs. Without this,
# re-running the cell either keeps appending to the previous run's results
# (giving more than 11 points) or fails outright once a later cell has
# converted these names to ndarrays, which have no .append().
# The leading 0.0 is the "0% of the training set" data point.
svm_linear_runtime, svm_linear_accuracy = [0.0], [0.0]
svm_rbf_runtime, svm_rbf_accuracy = [0.0], [0.0]

# Derive the subset sizes from the data instead of a hard-coded 60000.
n_training_rows = len(training_data)

with parallel_backend('threading'):
    for solver in solvers:
        # max_iter caps the solver so every run finishes in bounded time; the
        # estimator is refit from scratch on each call to fit().
        clf = svm.SVC(kernel=solver.lower(), verbose=3, max_iter=1000)
        for i in range(1, 11, 1):
            # Integer arithmetic avoids float rounding in the slice bound.
            subset_size = n_training_rows * i // 10
            print("{} - Training Size: {}%".format(solver, (i * 10)))

            # Time only the fit itself.
            start_time = timer()
            clf.fit(training_data[:subset_size, :], training_labels[:subset_size])
            end_time = timer()
            elapsed_time = end_time - start_time
            print(elapsed_time)

            # Accuracy is always measured on the full test split.
            accuracy = clf.score(testing_data, testing_labels)
            if solver == "Linear":
                svm_linear_accuracy.append(accuracy)
                svm_linear_runtime.append(elapsed_time)
            else:
                svm_rbf_accuracy.append(accuracy)
                svm_rbf_runtime.append(elapsed_time)

In [None]:
# Rebind the four result buffers as ndarrays so they can be written with
# ndarray.tofile() and plotted.
(
    svm_linear_accuracy,
    svm_linear_runtime,
    svm_rbf_accuracy,
    svm_rbf_runtime,
) = map(
    np.asarray,
    (svm_linear_accuracy, svm_linear_runtime, svm_rbf_accuracy, svm_rbf_runtime),
)

In [None]:
# Persist each result series as one line of comma-separated text with three
# decimals. The file names are relative, so the files are written to the
# current working directory.
svm_linear_accuracy.tofile(
    f'svm_linear_accuracy_{DataSetName}.csv',
    sep=',',
    format='%.3f',
)
svm_linear_runtime.tofile(
    f'svm_linear_runtime_{DataSetName}.csv',
    sep=',',
    format='%.3f',
)
svm_rbf_accuracy.tofile(
    f'svm_rbf_accuracy_{DataSetName}.csv',
    sep=',',
    format='%.3f',
)
svm_rbf_runtime.tofile(
    f'svm_rbf_runtime_{DataSetName}.csv',
    sep=',',
    format='%.3f',
)

In [None]:
"""
Results

"""

In [None]:
colors = ["tab:orange", "tab:blue", "tab:green", "tab:red"]
solvers = ["Linear", "rbf"]

run = [svm_linear_runtime, svm_rbf_runtime]
acc = [svm_linear_accuracy, svm_rbf_accuracy]

# Shared upper bound for the training-time axis so both figures use the same
# scale. This previously read adam_runtime / sgd_runtime, which are never
# defined in this notebook and raised NameError on a fresh kernel.
runtime_axis_top = max(max(series) for series in run) + 10

# x positions in percent (0, 10, ..., 100) to match the x-axis label; every
# result series holds 11 points (the 0% seed entry plus ten training runs).
training_percentages = [10 * step for step in range(11)]

# One figure per kernel: accuracy on the left axis, training time on the right.
for solver in range(len(solvers)):
    with plt.style.context('ggplot'):
        fig0, ax0 = plt.subplots()
        ax0.set_xlabel("Percent of Training Set")
        # NOTE(review): the plotted accuracy is the 0-1 fraction returned by
        # clf.score (see the 0-1.1 y-limit), not a percentage — confirm
        # whether the "(%)" in this label should stay.
        ax0.set_ylabel("Accuracy (%)", color='tab:orange')
        ax0.set_title("Accuracy vs Training Set Size vs Training Time {} \n {}".format(solvers[solver], DataSetName))
        ax0.tick_params(axis='y', labelcolor="black")
        ax0.set_ylim(0, 1.1)

        # Secondary y axis sharing the same x axis, for the training time.
        ax3 = ax0.twinx()
        ax3.set_ylabel("Training Time (s)", color="tab:blue")
        ax3.set_ylim(0, runtime_axis_top)
        ax3.tick_params(axis='y', labelcolor="black")

        # Line colours match the colours of the corresponding axis labels.
        ax0.plot(training_percentages, acc[solver], color=colors[0], marker='o', label=solvers[solver])
        ax3.plot(training_percentages, run[solver], color=colors[1], marker="1", label="{} training-time".format(solvers[solver]))

        fig0.tight_layout()
        # Named figure_path rather than `dir` to avoid shadowing the builtin.
        figure_path = "{}/Training_{}_{}_Set_Size_Impact_vs_Training_Time.png".format(cwd, solvers[solver], DataSetName)
        fig0.savefig(figure_path)
        # Figures are deliberately left open (the original plt.close("all")
        # was disabled) — presumably so the notebook still renders them inline.