In [1]:
import pathlib
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.offline as py
from matplotlib.patches import Ellipse
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

from LoadData import LoadData

# Seed NumPy's legacy global random state.
np.random.seed(0)

# Plot appearance (matplotlib) and notebook rendering (plotly offline mode).
plt.style.use('ggplot')
py.init_notebook_mode(connected=True)


# Display title of each reduced dataset -> CSV file holding it.
# Every file name is the title with its spaces removed, plus ".csv".
reduced_datasets = {
    title: "{}.csv".format(title.replace(" ", ""))
    for title in (
        "Principal Component Reduced_MNIST",
        "Random Forest Reduced_MNIST",
        "Independent Component Reduced_MNIST",
        "Random Projection Reduced_MNIST",
    )
}

# Shared result accumulators appended to by the training cells.
accuracy, run_time = [], []

In [2]:

def draw_ellipse(position, covariance, ax=None, **kwargs):
    """Draw 1-, 2- and 3-sigma ellipses for a 2-D Gaussian component.

    Parameters
    ----------
    position : sequence of two floats
        Centre of the ellipses.
    covariance : array-like
        A full ``(2, 2)`` covariance matrix, a length-2 vector of per-axis
        variances, or a single scalar variance shared by both axes.
    ax : matplotlib Axes, optional
        Axes to draw on; ``plt.gca()`` is used when omitted.
    **kwargs
        Forwarded unchanged to ``matplotlib.patches.Ellipse``.

    Raises
    ------
    ValueError
        If ``covariance`` is neither ``(2, 2)`` nor broadcastable to two
        per-axis variances.
    """
    if ax is None:
        ax = plt.gca()

    covariance = np.asarray(covariance)

    # Convert covariance to principal axes
    if covariance.shape == (2, 2):
        # Singular values are the variances along the principal axes; the
        # first left-singular vector gives the orientation of the major axis.
        U, s, _ = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        # Axis-aligned case: a scalar variance is broadcast to both axes.
        angle = 0
        width, height = np.broadcast_to(2 * np.sqrt(covariance), (2,))

    # Draw the Ellipse
    for nsig in range(1, 4):
        # `angle` must be passed by keyword: recent matplotlib releases no
        # longer accept it positionally.
        ax.add_patch(Ellipse(position, nsig * width, nsig * height,
                             angle=angle, **kwargs))


def plot_scatter(gmm, X, label=True, ax=None, title=None, EM=False, method_name=None):
    """Fit a clustering model, scatter-plot the first two columns of X and save a PNG.

    Parameters
    ----------
    gmm : estimator
        Object whose ``fit(X)`` returns something with ``predict(X)``. When
        ``EM`` is true it must also expose ``means_``, ``covariances_`` and
        ``weights_`` after fitting.
    X : ndarray
        2-D array; columns 0 and 1 are plotted.
    label : bool, default True
        Colour the points by predicted cluster when true.
    ax : matplotlib Axes, optional
        Axes to draw on. A new 8x6 figure is created when omitted.
    title : str, optional
        Axes title. Its first line (stripped, spaces replaced by ``_``) is the
        stem of the output file name; ``"scatter"`` is used when not given.
    EM : bool, default False
        Overlay the mixture components as ellipses when true.
    method_name : str, optional
        Appended (spaces removed) to the file name as ``<stem>_<method>.png``;
        left out of the name when not given.
    """
    # Honour a caller-supplied axes instead of always opening a new figure.
    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 6))
    else:
        fig = ax.figure

    labels = gmm.fit(X).predict(X)
    if label:
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='tab10', zorder=2,
                   edgecolors="white")
    else:
        ax.scatter(X[:, 0], X[:, 1], s=40, zorder=2)

    if EM:
        # Scale transparency so the heaviest component gets alpha 0.2.
        w_factor = 0.2 / gmm.weights_.max()
        for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
            # Draw on this plot's axes rather than whatever is current.
            draw_ellipse(pos, covar, ax=ax, alpha=w * w_factor)

    ax.set_xlabel("First Principal Component")
    ax.set_ylabel("Second Principal Component")
    if title is not None:
        ax.set_title(title)

    ax.grid(True)
    fig.tight_layout()

    # Build "<first title line>_<method>.png", tolerating missing pieces.
    if title is not None:
        fname = title.split("\n")[0].strip().replace(" ", "_")
    else:
        fname = "scatter"
    name_parts = [fname]
    if method_name is not None:
        name_parts.append(method_name.replace(" ", ""))

    fig.savefig("{}.png".format("_".join(name_parts)))

In [3]:

# For every reduced dataset: project it to 2 principal components, derive
# KMeans and Gaussian-mixture (EM) feature matrices from that projection, and
# write each matrix (plus a copy of the labels) to CSV.
for key, val in reduced_datasets.items():
    # Space-free form of the dataset name. Used for every file name so the
    # written CSVs match the "<prefix><dataset>.csv" names read back later.
    file_key = key.replace(" ", "")

    mnist_data = pd.read_csv(val)
    labels = np.loadtxt("{}_Labels.csv".format(file_key), delimiter=",")

    temp_text = key.split("_")
    n_components = mnist_data.shape[1] - 1
    dataset_name = temp_text[1].replace(" ", "")
    method_name = temp_text[0].replace(" ", "")

    data_sample = mnist_data

    data_y = labels  # dependent variable
    data_x = mnist_data  # independent variable

    # Standardising the values
    # NOTE(review): this standardized copy is not consumed below; PCA is fit on
    # the unscaled `data_x`. Confirm whether PCA was meant to use `pca_std`.
    pca_std = StandardScaler().fit_transform(data_x.values)

    # Call the PCA method with 2 components.
    pca = PCA(n_components=2)
    x_2 = pca.fit(data_x).transform(data_x)

    # KMEANS

    # Set a KMeans clustering with 10 components cuz there are 10 class labels
    n_clusters = 10
    # fit_transform returns the cluster-distance representation of each sample,
    # so `kmeans` is a feature matrix, not the fitted estimator.
    kmeans = KMeans(n_clusters=n_clusters).fit_transform(x_2)

    np.savetxt("KMeansDimensionalityReductionDataset_{}.csv".format(file_key), kmeans, delimiter=",", fmt="%.6f")
    np.savetxt("KMeansDimensionalityReductionDataset_{}_Labels.csv".format(file_key), X=labels, delimiter=",", fmt="%.0f")

    # Expectation Maximization
    em_components = 10
    gmm = GaussianMixture(n_components=em_components, random_state=0)
    # Per-component membership probabilities are the EM counterpart of the
    # KMeans features above. The mixture was previously never fitted and the
    # KMeans matrix was written under the EM file name.
    em_features = gmm.fit(x_2).predict_proba(x_2)

    np.savetxt("ExpectationMaximizationReductionDataset_{}.csv".format(file_key), em_features, delimiter=",", fmt="%.6f")
    np.savetxt("ExpectationMaximizationReductionDataset_{}_Labels.csv".format(file_key), X=labels, delimiter=",", fmt="%.0f")
print()




In [None]:
# Rebinds `reduced_datasets` with space-free keys: each key is the CSV file
# name without its ".csv" extension.
reduced_datasets = {
    stem: stem + ".csv"
    for stem in (
        "PrincipalComponentReduced_MNIST",
        "RandomForestReduced_MNIST",
        "IndependentComponentReduced_MNIST",
        "RandomProjectionReduced_MNIST",
    )
}

In [None]:
# Baseline: train an MLP on the MNIST training file loaded via LoadData and
# record its run time and test accuracy as entry 0 of `run_time` / `accuracy`.
cwd = pathlib.Path().absolute()

# NOTE(review): `size` is not defined in any cell visible here; it must already
# exist in the kernel or this line raises NameError on a fresh run. Confirm the
# intended value and define it alongside the other configuration.
training_labels, training_data, _ = LoadData("{}/mnist-train-data.csv".format(cwd), normalize=True, size=size)

X_train, X_test, y_train, y_test = train_test_split(training_data, training_labels, test_size=0.33, random_state=0)
print(training_data.shape)

start_time = time.time()

nn_estimator = MLPClassifier(hidden_layer_sizes=(100,), verbose=3, n_iter_no_change=30)
nn_estimator.fit(X_train, y_train)

end_time = time.time()

elapsed_time = round((end_time - start_time), 2)
run_time.append(elapsed_time)
# `score` returns a 0-1 fraction; it is stored as a fraction and converted to a
# percentage only where it is displayed with a "%" sign.
testing_acc = round(nn_estimator.score(X_test, y_test), 2)
accuracy.append(testing_acc)

df = pd.DataFrame(nn_estimator.loss_curve_, columns=["Loss"])
fig, ax = plt.subplots()
df.plot(ax=ax)
ax.set_title("{} Loss \n Testing Accuracy={:.0f}%\nElapsed Time={}sec".format("Baseline", testing_acc * 100, elapsed_time))
ax.set_xlabel("Iterations")
ax.set_xlim(0, 105)
ax.set_ylim(0, 1.05)
ax.set_ylabel("Loss")
fig.tight_layout()
fig.savefig("{}_Loss.png".format("Baseline"))

print("Elapsed Time: {}".format(elapsed_time))
print("Training set score: %f" % nn_estimator.score(X_train, y_train))
print("Test set score: %f" % testing_acc)

In [None]:

# Train an MLP on every KMeans / EM feature dataset, save one loss curve per
# (method, dataset) pair, then chart accuracy and run time against the baseline.

# Method tag used in titles and file names -> prefix of the feature CSV files.
feature_file_prefixes = {
    "Kmeans": "KMeansDimensionalityReductionDataset_",
    "Expect": "ExpectationMaximizationReductionDataset_",
}

# Dataset name (spaces removed) -> short label shown on the bar charts.
bar_labels = {
    "PrincipalComponentReduced_MNIST": "PCA",
    "RandomForestReduced_MNIST": "RandomForest",
    "IndependentComponentReduced_MNIST": "ICA",
    "RandomProjectionReduced_MNIST": "RandomProjection",
}

# Entry 0 of the shared lists is expected to be the baseline result, so the
# baseline cell must have been run before this one.
baseline_acc, baseline_time = accuracy[0], run_time[0]

for n, temp_text in feature_file_prefixes.items():
    # Per-method results. The shared lists keep growing across methods, so the
    # charts must not index them by fixed position.
    method_acc = {"Baseline": baseline_acc}
    method_time = {"Baseline": baseline_time}

    for key, val in reduced_datasets.items():
        print(key)

        training_data = np.loadtxt(temp_text + val, delimiter=",")
        # Labels live next to the reduced dataset as "<stem>_Labels.<ext>".
        stem, ext = val.rsplit(".", 1)
        training_labels = np.loadtxt("{}_Labels.{}".format(stem, ext), delimiter=",")

        X_train, X_test, y_train, y_test = train_test_split(training_data, training_labels, test_size=0.33, random_state=0)
        print(training_data.shape)
        start_time = time.time()
        # The ICA dataset used to get learning_rate_init=0.001 explicitly; that
        # is MLPClassifier's default, so one construction covers every dataset.
        nn_estimator = MLPClassifier(hidden_layer_sizes=(100,), verbose=3)
        nn_estimator.fit(X_train, y_train)

        end_time = time.time()
        elapsed_time = round((end_time - start_time), 3)

        run_time.append(elapsed_time)
        # Stored as a 0-1 fraction; converted to percent only for display.
        testing_acc = round(nn_estimator.score(X_test, y_test), 2)
        accuracy.append(testing_acc)

        short_label = bar_labels.get(key.replace(" ", ""), key)
        method_acc[short_label] = testing_acc
        method_time[short_label] = elapsed_time

        df = pd.DataFrame(nn_estimator.loss_curve_, columns=["Loss"])
        fig, ax = plt.subplots()
        df.plot(ax=ax)
        ax.set_title("{} {} Loss \n Testing Accuracy={:.0f}%\nElapsed Time={}sec".format(n, key, testing_acc * 100, elapsed_time))
        ax.set_xlabel("Iterations")
        ax.set_xlim(0, 105)
        ax.set_ylim(0, 1.05)
        ax.set_ylabel("Loss")
        fig.tight_layout()
        # The method tag keeps the second method from overwriting the first.
        fig.savefig("{}_{}_Loss.png".format(n, key))

        print("Elapsed Time: {}".format(elapsed_time))
        print("Training set score: %f" % nn_estimator.score(X_train, y_train))
        print("Test set score: %f" % testing_acc)

    plt.close('all')

    # One bar per configuration, labelled on the x axis. Explicit axes are used
    # because Series.plot would otherwise draw onto the current axes.
    fig, ax = plt.subplots()
    (pd.Series(method_acc) * 100).plot(kind="bar", ax=ax, title="Testing Accuracy on {} Dimensionality Reduced Dataset".format(n))
    ax.set_ylabel("Accuracy(%)")
    fig.tight_layout()
    fig.savefig("TestingAccuracy_{}_DimensionReduced.png".format(n))

    fig, ax = plt.subplots()
    pd.Series(method_time).plot(kind="bar", ax=ax, title="Run Time on {} Dimensionality Reduced Dataset".format(n))
    ax.set_ylabel("Time(sec)")
    fig.tight_layout()
    fig.savefig("RunTime_{}_DimensionReduced.png".format(n))
    print()