In [1]:
import os
import sys

import pandas as pd
import numpy as np
import pickle
import unsupervised_learning_util as utl
import clustering_utl as cl_utl
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA, FastICA, KernelPCA
from sklearn.manifold import TSNE, LocallyLinearEmbedding, MDS, Isomap, SpectralEmbedding
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples, homogeneity_completeness_v_measure
from sklearn.metrics import homogeneity_score, calinski_harabasz_score, davies_bouldin_score
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from warnings import simplefilter

plt.tight_layout()
plt.style.use("ggplot")
mpl.rcParams['figure.figsize'] = [8, 6]
mpl.rcParams['figure.dpi'] = 200
mpl.rcParams['savefig.dpi'] = 500


NJOBS = 32
VERBOSE = 0

%matplotlib inline

utl.check_folder("Part4")

C:\Users\joshu\OneDrive - Georgia Institute of Technology\Georgia-Tech\CS 7641 - Machine Learning\Assignments\Unsupervised Learning and Dimensionality Reduction\Part4 folder already exists.




In [None]:
# Fetch both raw datasets through the project helper.
gathered_data = utl.setup(["MNIST"])
gathered_data_fashion = utl.setup(["Fashion-MNIST"])

mnist = {}
fashion_mnist = {}

# split_data yields six pieces, stored here under train/valid/test keys.
# (Variants split with scale=False were tried previously and are not used.)
(mnist['train_X'], mnist['train_y'],
 mnist['valid_X'], mnist['valid_y'],
 mnist['test_X'], mnist['test_y']) = utl.split_data(
    gathered_data["MNIST"]["X"],
    gathered_data["MNIST"]["y"],
    minMax=True,
)

(fashion_mnist['train_X'], fashion_mnist['train_y'],
 fashion_mnist['valid_X'], fashion_mnist['valid_y'],
 fashion_mnist['test_X'], fashion_mnist['test_y']) = utl.split_data(
    gathered_data_fashion["Fashion-MNIST"]["X"],
    gathered_data_fashion["Fashion-MNIST"]["y"],
    minMax=True,
)

# Load Reduced Datasets

In [None]:
# Load the PCA-reduced datasets. The with-block closes each file on exit.
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
with open(f"{os.getcwd()}/DimensionalityReduction/PCA_MNIST_Reduced_Dataset.pkl", "rb") as input_file:
    pca_mnist_reduced_data = pickle.load(input_file)
with open(f"{os.getcwd()}/DimensionalityReduction/PCA_Fashion_Reduced_Dataset.pkl", "rb") as input_file:
    pca_fashion_reduced_data = pickle.load(input_file)

print(f"PCA MNIST: {pca_mnist_reduced_data.shape}")
print(f"PCA Fashion: {pca_fashion_reduced_data.shape}")

In [None]:
# Load the ICA-reduced datasets. The with-block closes each file on exit.
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
with open(f"{os.getcwd()}/DimensionalityReduction/ICA_MNIST_Reduced_Dataset.pkl", "rb") as input_file:
    ica_mnist_reduced_data = pickle.load(input_file)
with open(f"{os.getcwd()}/DimensionalityReduction/ICA_Fashion_Reduced_Dataset.pkl", "rb") as input_file:
    ica_fashion_reduced_data = pickle.load(input_file)

print(f"ICA MNIST: {ica_mnist_reduced_data.shape}")
print(f"ICA Fashion: {ica_fashion_reduced_data.shape}")

In [None]:
# Load the RF-reduced datasets. The with-block closes each file on exit.
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
with open(f"{os.getcwd()}/DimensionalityReduction/RF_MNIST_Reduced_Dataset.pkl", "rb") as input_file:
    rf_mnist_reduced_data = pickle.load(input_file)
with open(f"{os.getcwd()}/DimensionalityReduction/RF_Fashion_Reduced_Dataset.pkl", "rb") as input_file:
    rf_fashion_reduced_data = pickle.load(input_file)

print(f"RF MNIST: {rf_mnist_reduced_data.shape}")
print(f"RF Fashion: {rf_fashion_reduced_data.shape}")

In [None]:
# Load the RP-reduced datasets. The with-block closes each file on exit.
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
with open(f"{os.getcwd()}/DimensionalityReduction/RP_MNIST_Reduced_Dataset.pkl", "rb") as input_file:
    rp_mnist_reduced_data = pickle.load(input_file)
with open(f"{os.getcwd()}/DimensionalityReduction/RP_Fashion_Reduced_Dataset.pkl", "rb") as input_file:
    rp_fashion_reduced_data = pickle.load(input_file)

print(f"RP MNIST: {rp_mnist_reduced_data.shape}")
print(f"RP Fashion: {rp_fashion_reduced_data.shape}")

# K-Means Clustering

### PCA

In [None]:
# All columns but the last are passed as data_X; the last column as data_y.
X = pca_mnist_reduced_data.iloc[:, :-1]
y = pca_mnist_reduced_data.iloc[:, -1]
pca_kmeans_mnist_results = cl_utl.run_kmeans(
    data_X=X,
    data_y=y,
    max_clusters=40,
    dataset_name="PCA_KMeans_MNIST",
    verbose=1,
)
# Persist the results; the with-block closes the file on exit.
with open(f"{os.getcwd()}/Part4/KMeans_PCA_MNIST_Results.pkl", "wb") as output_file:
    pickle.dump(pca_kmeans_mnist_results, output_file)

In [None]:
# All columns but the last are passed as data_X; the last column as data_y.
X = pca_fashion_reduced_data.iloc[:, :-1]
y = pca_fashion_reduced_data.iloc[:, -1]
pca_kmeans_fashion_results = cl_utl.run_kmeans(
    data_X=X,
    data_y=y,
    max_clusters=40,
    dataset_name="PCA_KMeans_Fashion_MNIST",
    verbose=1,
)
# Persist the results; the with-block closes the file on exit.
with open(f"{os.getcwd()}/Part4/KMeans_PCA_Fashion_MNIST_Results.pkl", "wb") as output_file:
    pickle.dump(pca_kmeans_fashion_results, output_file)

In [None]:
# Restore the saved PCA k-means results so the expensive runs can be skipped.
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
with open(f"{os.getcwd()}/Part4/KMeans_PCA_MNIST_Results.pkl", "rb") as input_file:
    pca_kmeans_mnist_results = pickle.load(input_file)
with open(f"{os.getcwd()}/Part4/KMeans_PCA_Fashion_MNIST_Results.pkl", "rb") as input_file:
    pca_kmeans_fashion_results = pickle.load(input_file)

In [None]:
# Combined k-means plot for both PCA-reduced datasets (last column excluded).
cl_utl.plot_combined_kmeans(
    mnist_X=pca_mnist_reduced_data.iloc[:, :-1],
    fashion_X=pca_fashion_reduced_data.iloc[:, :-1],
    max_clusters=40,
    save_name="PCA_",
)

### ICA

In [None]:
# All columns but the last are passed as data_X; the last column as data_y.
X = ica_mnist_reduced_data.iloc[:, :-1]
y = ica_mnist_reduced_data.iloc[:, -1]
ica_kmeans_mnist_results = cl_utl.run_kmeans(
    data_X=X,
    data_y=y,
    max_clusters=40,
    dataset_name="ICA_KMeans_MNIST",
    verbose=1,
)
# Persist the results; the with-block closes the file on exit.
with open(f"{os.getcwd()}/Part4/KMeans_ICA_MNIST_Results.pkl", "wb") as output_file:
    pickle.dump(ica_kmeans_mnist_results, output_file)

In [None]:
# All columns but the last are passed as data_X; the last column as data_y.
X = ica_fashion_reduced_data.iloc[:, :-1]
y = ica_fashion_reduced_data.iloc[:, -1]
ica_kmeans_fashion_results = cl_utl.run_kmeans(
    data_X=X,
    data_y=y,
    max_clusters=40,
    dataset_name="ICA_KMeans_Fashion_MNIST",
    verbose=1,
)
# Persist the results; the with-block closes the file on exit.
with open(f"{os.getcwd()}/Part4/KMeans_ICA_Fashion_MNIST_Results.pkl", "wb") as output_file:
    pickle.dump(ica_kmeans_fashion_results, output_file)

In [None]:
# Restore the saved ICA k-means results so the expensive runs can be skipped.
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
with open(f"{os.getcwd()}/Part4/KMeans_ICA_MNIST_Results.pkl", "rb") as input_file:
    ica_kmeans_mnist_results = pickle.load(input_file)
with open(f"{os.getcwd()}/Part4/KMeans_ICA_Fashion_MNIST_Results.pkl", "rb") as input_file:
    ica_kmeans_fashion_results = pickle.load(input_file)

In [None]:
# Combined k-means plot for both ICA-reduced datasets (last column excluded).
cl_utl.plot_combined_kmeans(
    mnist_X=ica_mnist_reduced_data.iloc[:, :-1],
    fashion_X=ica_fashion_reduced_data.iloc[:, :-1],
    max_clusters=40,
    save_name="ICA_",
)

### Randomized Projections

In [None]:
# All columns but the last are passed as data_X; the last column as data_y.
X = rp_mnist_reduced_data.iloc[:, :-1]
y = rp_mnist_reduced_data.iloc[:, -1]
rp_kmeans_mnist_results = cl_utl.run_kmeans(
    data_X=X,
    data_y=y,
    max_clusters=40,
    dataset_name="RP_KMeans_MNIST",
    verbose=1,
)
# Persist the results; the with-block closes the file on exit.
with open(f"{os.getcwd()}/Part4/KMeans_RP_MNIST_Results.pkl", "wb") as output_file:
    pickle.dump(rp_kmeans_mnist_results, output_file)

In [None]:
# All columns but the last are passed as data_X; the last column as data_y.
X = rp_fashion_reduced_data.iloc[:, :-1]
y = rp_fashion_reduced_data.iloc[:, -1]
rp_kmeans_fashion_results = cl_utl.run_kmeans(
    data_X=X,
    data_y=y,
    max_clusters=40,
    dataset_name="RP_KMeans_Fashion_MNIST",
    verbose=1,
)
# Persist the results; the with-block closes the file on exit.
with open(f"{os.getcwd()}/Part4/KMeans_RP_Fashion_MNIST_Results.pkl", "wb") as output_file:
    pickle.dump(rp_kmeans_fashion_results, output_file)

In [None]:
# Restore the saved RP k-means results so the expensive runs can be skipped.
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
with open(f"{os.getcwd()}/Part4/KMeans_RP_MNIST_Results.pkl", "rb") as input_file:
    rp_kmeans_mnist_results = pickle.load(input_file)
# Bind to rp_kmeans_fashion_results (the name the run cell produces); the
# previous "prp_" spelling left that variable unset after a fresh-kernel reload.
with open(f"{os.getcwd()}/Part4/KMeans_RP_Fashion_MNIST_Results.pkl", "rb") as input_file:
    rp_kmeans_fashion_results = pickle.load(input_file)

In [None]:
# Combined k-means plot for both RP-reduced datasets (last column excluded).
# The comma between max_clusters and save_name is required; without it the
# cell fails with a SyntaxError.
cl_utl.plot_combined_kmeans(
    mnist_X=rp_mnist_reduced_data.iloc[:, :-1],
    fashion_X=rp_fashion_reduced_data.iloc[:, :-1],
    max_clusters=40,
    save_name="RP_",
)

### Random Forest

In [None]:
# All columns but the last are passed as data_X; the last column as data_y.
X = rf_mnist_reduced_data.iloc[:, :-1]
y = rf_mnist_reduced_data.iloc[:, -1]
rf_kmeans_mnist_results = cl_utl.run_kmeans(
    data_X=X,
    data_y=y,
    max_clusters=40,
    dataset_name="RF_KMeans_MNIST",
    verbose=1,
)
# Persist the results; the with-block closes the file on exit.
with open(f"{os.getcwd()}/Part4/KMeans_RF_MNIST_Results.pkl", "wb") as output_file:
    pickle.dump(rf_kmeans_mnist_results, output_file)

In [None]:
# All columns but the last are passed as data_X; the last column as data_y.
X = rf_fashion_reduced_data.iloc[:, :-1]
y = rf_fashion_reduced_data.iloc[:, -1]
rf_kmeans_fashion_results = cl_utl.run_kmeans(
    data_X=X,
    data_y=y,
    max_clusters=40,
    dataset_name="RF_KMeans_Fashion_MNIST",
    verbose=1,
)
# Persist the results; the with-block closes the file on exit.
with open(f"{os.getcwd()}/Part4/KMeans_RF_Fashion_MNIST_Results.pkl", "wb") as output_file:
    pickle.dump(rf_kmeans_fashion_results, output_file)

In [None]:
# Restore the saved RF k-means results so the expensive runs can be skipped.
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
with open(f"{os.getcwd()}/Part4/KMeans_RF_MNIST_Results.pkl", "rb") as input_file:
    rf_kmeans_mnist_results = pickle.load(input_file)
with open(f"{os.getcwd()}/Part4/KMeans_RF_Fashion_MNIST_Results.pkl", "rb") as input_file:
    rf_kmeans_fashion_results = pickle.load(input_file)

In [None]:
# Combined k-means plot for both RF-reduced datasets (last column excluded).
cl_utl.plot_combined_kmeans(
    mnist_X=rf_mnist_reduced_data.iloc[:, :-1],
    fashion_X=rf_fashion_reduced_data.iloc[:, :-1],
    max_clusters=40,
    save_name="RF_",
)

# Expectation Maximization 

In [None]:
# Row cap applied when slicing the reduced datasets for the EM runs below.
em_lim = 2_000

### PCA

In [None]:
# First em_lim rows only; the last column is passed as data_y, the rest as data_X.
X = pca_mnist_reduced_data.iloc[:em_lim, :-1]
y = pca_mnist_reduced_data.iloc[:em_lim, -1]
pca_em_mnist_results = cl_utl.run_em(
    data_X=X,
    data_y=y,
    max_components=30,
    dataset_name="PCA_EM_MNIST",
    standalone=True,
)

In [None]:
# First em_lim rows only; the last column is passed as data_y, the rest as data_X.
X = pca_fashion_reduced_data.iloc[:em_lim, :-1]
y = pca_fashion_reduced_data.iloc[:em_lim, -1]
pca_em_fashion_results = cl_utl.run_em(
    data_X=X,
    data_y=y,
    max_components=30,
    dataset_name="PCA_EM_Fashion-MNIST",
    standalone=True,
)

In [None]:
# Plot the PCA EM results for both datasets together.
cl_utl.plot_em(
    pca_em_mnist_results,
    pca_em_fashion_results,
    extra_name="PCA_EM",
)

### ICA

In [None]:
# First em_lim rows only; the last column is passed as data_y, the rest as data_X.
X = ica_mnist_reduced_data.iloc[:em_lim, :-1]
y = ica_mnist_reduced_data.iloc[:em_lim, -1]
ica_em_mnist_results = cl_utl.run_em(
    data_X=X,
    data_y=y,
    max_components=30,
    dataset_name="ICA_EM_MNIST",
    standalone=True,
)

In [None]:
# First em_lim rows only; the last column is passed as data_y, the rest as data_X.
X = ica_fashion_reduced_data.iloc[:em_lim, :-1]
y = ica_fashion_reduced_data.iloc[:em_lim, -1]
ica_em_fashion_results = cl_utl.run_em(
    data_X=X,
    data_y=y,
    max_components=30,
    dataset_name="ICA_EM_Fashion-MNIST",
    standalone=True,
)

In [None]:
# Plot the ICA EM results for both datasets together.
cl_utl.plot_em(
    ica_em_mnist_results,
    ica_em_fashion_results,
    extra_name="ICA_EM",
)

### Randomized Projections

In [None]:
# First em_lim rows only; the last column is passed as data_y, the rest as data_X.
X = rp_mnist_reduced_data.iloc[:em_lim, :-1]
y = rp_mnist_reduced_data.iloc[:em_lim, -1]
rp_em_mnist_results = cl_utl.run_em(
    data_X=X,
    data_y=y,
    max_components=30,
    dataset_name="RP_EM_MNIST",
    standalone=True,
)

In [None]:
# First em_lim rows only; the last column is passed as data_y, the rest as data_X.
X = rp_fashion_reduced_data.iloc[:em_lim, :-1]
y = rp_fashion_reduced_data.iloc[:em_lim, -1]
rp_em_fashion_results = cl_utl.run_em(
    data_X=X,
    data_y=y,
    max_components=30,
    dataset_name="RP_EM_Fashion-MNIST",
    standalone=True,
)

In [None]:
# Plot the RP EM results for both datasets together.
cl_utl.plot_em(
    rp_em_mnist_results,
    rp_em_fashion_results,
    extra_name="RP_EM",
)

### Random Forest

In [None]:
# First em_lim rows only; the last column is passed as data_y, the rest as data_X.
X = rf_mnist_reduced_data.iloc[:em_lim, :-1]
y = rf_mnist_reduced_data.iloc[:em_lim, -1]
rf_em_mnist_results = cl_utl.run_em(
    data_X=X,
    data_y=y,
    max_components=30,
    dataset_name="RF_EM_MNIST",
    standalone=True,
)

In [None]:
# Cap the rows at em_lim like every other EM run in this notebook, so the
# RF Fashion-MNIST result is fitted on the same number of samples as the rest.
# The last column is passed as data_y, the remaining columns as data_X.
X = rf_fashion_reduced_data.iloc[:em_lim, :-1]
y = rf_fashion_reduced_data.iloc[:em_lim, -1]
rf_em_fashion_results = cl_utl.run_em(
    data_X=X,
    data_y=y,
    max_components=30,
    dataset_name="RF_EM_Fashion-MNIST",
    standalone=True,
)

In [None]:
# Plot the RF EM results for both datasets together.
cl_utl.plot_em(
    rf_em_mnist_results,
    rf_em_fashion_results,
    extra_name="RF_EM",
)

# Neural Network on Dimensionality Reduced Datasets

## Fashion-MNIST Dataset

In [None]:
# Reload the four reduced Fashion-MNIST datasets so this cell does not depend
# on the earlier load cells having been run. The with-block closes each file.
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
with open(f"{os.getcwd()}/DimensionalityReduction/PCA_Fashion_Reduced_Dataset.pkl", "rb") as input_file:
    pca_fashion_reduced_data = pickle.load(input_file)
print(pca_fashion_reduced_data.shape)

with open(f"{os.getcwd()}/DimensionalityReduction/ICA_Fashion_Reduced_Dataset.pkl", "rb") as input_file:
    ica_fashion_reduced_data = pickle.load(input_file)
print(ica_fashion_reduced_data.shape)

with open(f"{os.getcwd()}/DimensionalityReduction/RP_Fashion_Reduced_Dataset.pkl", "rb") as input_file:
    rp_fashion_reduced_data = pickle.load(input_file)
print(rp_fashion_reduced_data.shape)

with open(f"{os.getcwd()}/DimensionalityReduction/RF_Fashion_Reduced_Dataset.pkl", "rb") as input_file:
    rf_fashion_reduced_data = pickle.load(input_file)
print(rf_fashion_reduced_data.shape)

# The baseline slice previously used `limit`, which is not defined anywhere in
# the visible notebook and raises NameError on a fresh kernel. Cap the baseline
# at the row count of the reduced datasets instead.
# Assumes the reduced datasets were built from the leading rows of the training
# split, so the baseline sees the same samples -- TODO confirm.
baseline_limit = len(pca_fashion_reduced_data)

cl_utl.run_nn(
    pca_data_X=pca_fashion_reduced_data.iloc[:, :-1],
    pca_data_y=pca_fashion_reduced_data.iloc[:, -1],
    ica_data_X=ica_fashion_reduced_data.iloc[:, :-1],
    ica_data_y=ica_fashion_reduced_data.iloc[:, -1],
    rand_proj_data_X=rp_fashion_reduced_data.iloc[:, :-1],
    rand_proj_data_y=rp_fashion_reduced_data.iloc[:, -1],
    rand_forest_data_X=rf_fashion_reduced_data.iloc[:, :-1],
    rand_forest_data_y=rf_fashion_reduced_data.iloc[:, -1],
    base_line_data_X=fashion_mnist["train_X"].iloc[:baseline_limit, :],
    base_line_data_y=fashion_mnist["train_y"].iloc[:baseline_limit],
)