In [None]:
import os
import sys
import itertools

import pandas as pd
import numpy as np
import pickle
import unsupervised_learning_util as utl
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, FastICA, KernelPCA
from sklearn.manifold import TSNE, LocallyLinearEmbedding, MDS, Isomap, SpectralEmbedding
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, silhouette_samples, homogeneity_completeness_v_measure
from sklearn.metrics import homogeneity_score, calinski_harabasz_score, davies_bouldin_score
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from warnings import simplefilter
from scipy import linalg

plt.tight_layout()
plt.style.use("ggplot")
mpl.rcParams['figure.figsize'] = [8, 6]
mpl.rcParams['figure.dpi'] = 200
mpl.rcParams['savefig.dpi'] = 500


NJOBS = 32
VERBOSE = 0

%matplotlib inline

In [None]:
# Load MNIST once, then split it twice: once with scale=True and once with
# scale=False. utl.split_data yields six values, stored under the
# train/valid/test X and y keys of each dictionary.
gathered_data = utl.setup(["MNIST"])

mnist_scaled = {}
mnist_not_scaled = {}
for _splits, _scale_flag in ((mnist_scaled, True), (mnist_not_scaled, False)):
    (_splits['train_X'], _splits['train_y'],
     _splits['valid_X'], _splits['valid_y'],
     _splits['test_X'], _splits['test_y']) = utl.split_data(gathered_data["MNIST"]["X"],
                                                            gathered_data["MNIST"]["y"],
                                                            scale=_scale_flag)


# Determine Scaling

In [None]:
# Compare K-Means on scaled vs. unscaled MNIST over a small sweep of cluster
# counts, recording inertia, the average silhouette and the per-sample
# silhouette values for each variant.
temp_folder = "Clustering/" + "KMeans/"
utl.check_folder(temp_folder)
# os.path.join supplies the separator that plain string concatenation with
# os.getcwd() left out.
save_dir = os.path.join(os.getcwd(), temp_folder)
limit = 5000                        # number of training rows used per fit
idx = [i for i in range(2, 15, 2)]  # cluster counts to evaluate
cols = [limit]                      # only its length (1) is used, to size the tables


# One single-column table per metric and variant, indexed by cluster count.
inertia_results_scaled = pd.DataFrame(columns=["Scaled"],
                               index=idx,
                               data=np.zeros(shape=(len(idx), len(cols))))
inertia_results_not_scaled = pd.DataFrame(columns=["Not Scaled"],
                               index=idx,
                               data=np.zeros(shape=(len(idx), len(cols))))

silhouette_average_results_scaled = pd.DataFrame(columns=["Scaled"], index=idx,
                                          data=np.zeros(shape=(len(idx), len(cols))))
silhouette_average_results_not_scaled = pd.DataFrame(columns=["Not Scaled"], index=idx,
                                          data=np.zeros(shape=(len(idx), len(cols))))
# Per-sample silhouette arrays keyed by "NumClusters_<k> DataSize_<limit>".
silhouette_sample_results_scaled = {}
silhouette_sample_results_not_scaled = {}

# Running bests. Lower inertia is better, so it starts at +inf and improves
# downwards; the silhouette lies in [-1, 1], so it starts at -inf.
best_inertia = float("inf")
best_inertia_num_cluster = 0
best_silhouette = float("-inf")
best_silhouette_num_cluster = 0
print("Starting K-Means Clustering")

# (variant name / column name, data splits, inertia table, silhouette table, per-sample store)
scaling_runs = (
    ("Scaled", mnist_scaled, inertia_results_scaled,
     silhouette_average_results_scaled, silhouette_sample_results_scaled),
    ("Not Scaled", mnist_not_scaled, inertia_results_not_scaled,
     silhouette_average_results_not_scaled, silhouette_sample_results_not_scaled),
)

for _df, dataset, inertia_table, silhouette_table, sample_store in scaling_runs:
    temp_train_X = dataset["train_X"].iloc[:limit, :]
    # Inertia depends on the feature scale, so the bests are tracked per variant
    # instead of being compared across scaled and unscaled data.
    best_inertia = float("inf")
    best_inertia_num_cluster = 0
    best_silhouette = float("-inf")
    best_silhouette_num_cluster = 0
    for k in idx:
        print(f"\t\t{_df} Number of Clusters: {k}")
        k_means = KMeans(n_clusters=k, verbose=VERBOSE, random_state=42).fit(temp_train_X)
        inertia = k_means.inertia_
        inertia_table.loc[k, _df] = inertia
        # The average silhouette is the mean of the per-sample values, so the
        # pairwise-distance computation is done once instead of twice.
        temp_silhouette_sample_results = silhouette_samples(temp_train_X, k_means.labels_)
        silhouette_average = np.mean(temp_silhouette_sample_results)
        silhouette_table.loc[k, _df] = silhouette_average
        sample_store[f"NumClusters_{k} DataSize_{limit}"] = temp_silhouette_sample_results
        print(f"\t{_df} Current Intertia: {inertia}, Silhouette: {silhouette_average}")
        if inertia < best_inertia:
            best_inertia = inertia
            best_inertia_num_cluster = k
            print(f"\t{_df} New Best Inertia: {best_inertia}")
            print(f"\t\t{_df} Inertia Best Number of Clusters: {best_inertia_num_cluster}")
        if silhouette_average > best_silhouette:
            best_silhouette = silhouette_average
            best_silhouette_num_cluster = k
            print(f"\t{_df} New Best Silhouette: {best_silhouette}")
            print(f"\t\t{_df} Silhouette Best Number of Clusters: {best_silhouette_num_cluster}")


In [None]:
# Side-by-side comparison of the scaled and unscaled K-Means sweeps:
# inertia on the left axis, average silhouette on the right axis.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# (axis, scaled table, label passed with it, unscaled table, title, y-axis label)
panels = (
    (ax1, inertia_results_scaled, "Scaled", inertia_results_not_scaled,
     "K Means Clustering\nInertia", "Inertia"),
    (ax2, silhouette_average_results_scaled, "Scale", silhouette_average_results_not_scaled,
     "K Means Clusters\nSilhouette", "Average Silhouette Score"),
)

for axis, scaled_table, scaled_label, unscaled_table, panel_title, y_label in panels:
    scaled_table.plot(ax=axis, label=scaled_label)
    unscaled_table.plot(ax=axis, label="Not Scaled")
    axis.set_title(panel_title, fontsize=15, weight='bold')
    axis.grid(which='major', linestyle='-', linewidth='0.5', color='white')
    axis.set_xlabel("K Clusters", fontsize=15, weight='heavy')
    axis.set_ylabel(y_label, fontsize=15, weight='heavy')
    axis.legend(loc="best", markerscale=1.1, frameon=True,
                edgecolor="black", fancybox=True, shadow=True)
    plt.tight_layout()

# Write the combined figure to disk, then close every open figure.
plt.savefig(f"{os.getcwd()}/{temp_folder}KMEans_Combined_Scaled_Vs_NotScaled.png", bbox_inches='tight')
plt.close("all")

# K-Means Clustering

In [None]:
# Load both datasets and split each one with scale=True. utl.split_data yields
# six values, stored under the train/valid/test X and y keys of each dictionary.
gathered_data = utl.setup(["MNIST"])
gathered_data_fashion = utl.setup(["Fashion-MNIST"])
mnist, fashion_mnist = {}, {}

_raw_sources = (
    (mnist, gathered_data["MNIST"]),
    (fashion_mnist, gathered_data_fashion["Fashion-MNIST"]),
)
for _splits, _raw in _raw_sources:
    (_splits['train_X'], _splits['train_y'],
     _splits['valid_X'], _splits['valid_y'],
     _splits['test_X'], _splits['test_y']) = utl.split_data(_raw["X"], _raw["y"], scale=True)

In [None]:
# Configuration for the K-Means metric sweep.
temp_folder = "Clustering/" + "KMeans/"
utl.check_folder(temp_folder)
# os.path.join supplies the separator that plain string concatenation with
# os.getcwd() left out.
save_dir = os.path.join(os.getcwd(), temp_folder)
limit = 5000                # number of training rows used per fit
idx = list(range(2, 31))    # cluster counts to evaluate: 2 through 30
# Metric columns recorded for every cluster count.
cols = ["Inertia", "Silhouette", "Homogeneity", "Completeness", "Harmonic_Mean", "Calinski_Harabasz", "Davies_Bouldin"]

In [None]:
# Sweep the cluster count for both datasets and record internal metrics
# (inertia, silhouette, Calinski-Harabasz, Davies-Bouldin) together with
# label-based metrics (homogeneity, completeness, V-measure).
mnist_results = pd.DataFrame(columns=cols, index=idx,
                             data=np.zeros(shape=(len(idx), len(cols))))

fashion_results = pd.DataFrame(columns=cols, index=idx,
                               data=np.zeros(shape=(len(idx), len(cols))))

print("Starting K-Means Clustering")
# (dataset name used in the log output, data splits, table receiving the metrics)
sweep_targets = (
    ("MNIST", mnist, mnist_results),
    ("Fashion-MNIST", fashion_mnist, fashion_results),
)
for _df, dataset, results in sweep_targets:
    temp_train_X = dataset["train_X"].iloc[:limit, :]
    temp_train_y = dataset["train_y"].iloc[:limit]
    for k in idx:
        # A fixed seed keeps the sweep reproducible across kernel restarts and
        # matches the seeded KMeans fits elsewhere in this notebook.
        k_means = KMeans(n_clusters=k, verbose=VERBOSE, random_state=42).fit(temp_train_X)
        inertia = k_means.inertia_
        silhouette_average = silhouette_score(temp_train_X, k_means.labels_,
                                              sample_size=limit, random_state=42)
        homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(temp_train_y, k_means.labels_)
        results.loc[k, "Inertia"] = inertia
        results.loc[k, "Silhouette"] = silhouette_average
        results.loc[k, "Calinski_Harabasz"] = calinski_harabasz_score(temp_train_X, k_means.labels_)
        results.loc[k, "Davies_Bouldin"] = davies_bouldin_score(temp_train_X, k_means.labels_)
        results.loc[k, "Homogeneity"] = homogeneity
        results.loc[k, "Completeness"] = completeness
        # The "Harmonic_Mean" column stores the V-measure.
        results.loc[k, "Harmonic_Mean"] = v_measure
        print(f"\n\t{_df} - k={k} \n{results.loc[k]}")


In [None]:
# Reload previously saved MNIST K-Means results from disk.
# NOTE(review): this cell sits before the cell that writes MNIST_Results.pkl, so
# a top-to-bottom run replaces the results computed above with whatever is on
# disk (and fails if the file does not exist yet) -- confirm that is intended.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(os.path.join(os.getcwd(), temp_folder, "MNIST_Results.pkl"), "rb") as in_file:
    # The with-statement closes the file; no explicit close() is needed.
    mnist_results = pickle.load(in_file)

In [None]:
# Reload previously saved Fashion-MNIST K-Means results from disk.
# NOTE(review): this cell sits before the cell that writes Fashion_Results.pkl,
# so a top-to-bottom run replaces the results computed above with whatever is
# on disk (and fails if the file does not exist yet) -- confirm that is intended.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(os.path.join(os.getcwd(), temp_folder, "Fashion_Results.pkl"), "rb") as in_file:
    # The with-statement closes the file; no explicit close() is needed.
    fashion_results = pickle.load(in_file)

In [None]:
# Display the dimensions of the MNIST results table as a quick sanity check
# (expected to be (len(idx), len(cols)) when it was built by the sweep above).
mnist_results.shape

In [None]:
# Express every metric relative to its value in the first row of the table,
# so metrics with very different magnitudes can share one plot.
temp = fashion_results.div(fashion_results.iloc[0])
temp2 = mnist_results.div(mnist_results.iloc[0])

In [None]:
# Fashion-MNIST: silhouette next to the label-based metrics, per cluster count.
fashion_results[["Silhouette", "Homogeneity", "Completeness", "Harmonic_Mean"]].plot()

In [None]:
# MNIST: silhouette next to the label-based metrics, per cluster count.
mnist_results[["Silhouette", "Homogeneity", "Completeness", "Harmonic_Mean"]].plot()

In [None]:
# Fashion-MNIST, relative to the first row: silhouette, completeness and Davies-Bouldin.
temp[["Silhouette", "Completeness", "Davies_Bouldin"]].plot()

In [None]:
# MNIST, relative to the first row: silhouette, completeness and Davies-Bouldin.
temp2[["Silhouette", "Completeness", "Davies_Bouldin"]].plot()

In [None]:
# Fashion-MNIST, relative to the first row: silhouette, Calinski-Harabasz and inertia.
temp[["Silhouette", "Calinski_Harabasz", "Inertia"]].plot()

In [None]:
# MNIST, relative to the first row: silhouette, Calinski-Harabasz and inertia.
temp2[["Silhouette", "Calinski_Harabasz", "Inertia"]].plot()

In [None]:
# Persist the MNIST K-Means results so later sessions can reload them.
with open(os.path.join(os.getcwd(), temp_folder, "MNIST_Results.pkl"), "wb") as out_file:
    # The with-statement closes the file; no explicit close() is needed.
    pickle.dump(mnist_results, out_file)

In [None]:
# Persist the Fashion-MNIST K-Means results so later sessions can reload them.
with open(os.path.join(os.getcwd(), temp_folder, "Fashion_Results.pkl"), "wb") as out_file:
    # The with-statement closes the file; no explicit close() is needed.
    pickle.dump(fashion_results, out_file)

## Elbow Method

In [None]:
# Number of training rows passed to the visualizers and mixture models below.
limit = 5000

In [None]:
# Elbow plots (distortion vs. number of clusters) for both datasets, side by side.
plt.close("all")
end = 31  # upper bound handed to the visualizer; presumably exclusive -- TODO confirm
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Seeded estimators keep the elbow curves reproducible across kernel restarts
# and match the seeded KMeans fits elsewhere in this notebook.
mnist_model = KMeans(random_state=42)
mnist_visualizer = KElbowVisualizer(mnist_model, k=(2, end), ax=ax1, timings=False)
mnist_visualizer.fit(mnist["train_X"].iloc[:limit, :])

fashion_model = KMeans(random_state=42)
fashion_visualizer = KElbowVisualizer(fashion_model, k=(2, end), ax=ax2, timings=False)
fashion_visualizer.fit(fashion_mnist["train_X"].iloc[:limit, :])

# Apply identical titles, axis labels and legend styling to both panels.
for axis, panel_title in ((ax1, "K Means Clustering\nDistortion MNIST"),
                          (ax2, "K Means Clustering\nDistortion Fashion MNIST")):
    axis.set_title(panel_title, fontsize=15, weight='bold')
    axis.set_xlabel("K Clusters", fontsize=15, weight='heavy')
    axis.set_ylabel("Distortion", fontsize=15, weight='heavy')
    axis.legend(loc="best", markerscale=1.1, frameon=True,
                edgecolor="black", fancybox=True, shadow=True)

plt.savefig(f"{os.getcwd()}/{temp_folder}KMEans_Elbow_Method_Combined.png", bbox_inches='tight')

## Silhouette Method

In [None]:
# Silhouette plots for one fixed cluster count per dataset, side by side.
plt.close("all")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
mnist_cluster_count, fashion_cluster_count = 10, 9

mnist_model = KMeans(n_clusters=mnist_cluster_count, random_state=42)
fashion_model = KMeans(n_clusters=fashion_cluster_count, random_state=42)

# Each name is bound to whatever the chained finalize() call returns.
mnist_silhouette = SilhouetteVisualizer(mnist_model, ax=ax1, colors='yellowbrick')
mnist_vis = mnist_silhouette.fit(mnist["train_X"].iloc[:limit, :]).finalize()
fashion_silhouette = SilhouetteVisualizer(fashion_model, ax=ax2, colors='yellowbrick')
fashion_mnist_vis = fashion_silhouette.fit(fashion_mnist["train_X"].iloc[:limit, :]).finalize()

# Replace the titles and labels on both panels with a shared style.
for axis, dataset_name, cluster_count in ((ax1, "MNIST", mnist_cluster_count),
                                          (ax2, "Fashion MNIST", fashion_cluster_count)):
    axis.set_title(f"Silhouette Plot of KMEans Clustering\non {dataset_name} with {cluster_count} Clusters",
                   fontsize=15, weight='bold')
    axis.legend(loc="best", markerscale=1.1, frameon=True,
                edgecolor="black", fancybox=True, shadow=True)
    axis.set_xlabel("Silhouette Coefficient Values", fontsize=15, weight='heavy')
    axis.set_ylabel("Cluster Label", fontsize=15, weight='heavy')

plt.tight_layout()

plt.savefig(f"{os.getcwd()}/{temp_folder}KMEans_Silhouette_Combined.png", bbox_inches='tight')

# Expectation Maximization 

In [None]:
# Result tables for the Gaussian-mixture sweep: one row per component count
# (1 through 9), one AIC and one BIC column per covariance type.
# The builtin int replaces the np.int alias, which NumPy 1.24 removed.
index = np.arange(1, 10, 1).astype(int)
types = ["Full"]                    # covariance types to evaluate (lower-cased when passed on)
columns = ["AIC_Full", "BIC_Full"]  # must match the f"AIC_{type}" / f"BIC_{type}" keys written later
em_mnist_results = pd.DataFrame(columns=columns, index=index, data=np.zeros(shape=(index.shape[0], len(columns))))
em_fashion_results = pd.DataFrame(columns=columns, index=index, data=np.zeros(shape=(index.shape[0], len(columns))))

In [None]:
# Fit a Gaussian mixture on MNIST for every component count and record AIC / BIC.
X = mnist["train_X"].iloc[:limit, :]
# The loop variable is deliberately not called `idx`: that name holds the list
# of cluster counts used by the K-Means cells and must not be overwritten.
for n_components in index:
    print(f"N_Components: {n_components}")
    for _type in types:
        # A fixed seed makes the information criteria reproducible across runs.
        temp_gmm = GaussianMixture(n_components=n_components, n_init=10,
                                   covariance_type=_type.lower(), warm_start=True,
                                   max_iter=500, random_state=42).fit(X)
        em_mnist_results.loc[n_components, f"AIC_{_type}"] = temp_gmm.aic(X)
        em_mnist_results.loc[n_components, f"BIC_{_type}"] = temp_gmm.bic(X)

In [None]:
# Persist the MNIST mixture-model results (written under temp_folder).
with open(os.path.join(os.getcwd(), temp_folder, "MNIST_EM_Results.pkl"), "wb") as out_file:
    # The with-statement closes the file; no explicit close() is needed.
    pickle.dump(em_mnist_results, out_file)

In [None]:
# Fit a Gaussian mixture on Fashion-MNIST for every component count and record AIC / BIC.
X = fashion_mnist["train_X"].iloc[:limit, :]
# The loop variable is deliberately not called `idx`: that name holds the list
# of cluster counts used by the K-Means cells and must not be overwritten.
for n_components in index:
    print(f"N_Components: {n_components}")
    for _type in types:
        # A fixed seed makes the information criteria reproducible across runs.
        temp_gmm = GaussianMixture(n_components=n_components, n_init=10,
                                   covariance_type=_type.lower(), warm_start=True,
                                   max_iter=500, random_state=42).fit(X)
        em_fashion_results.loc[n_components, f"AIC_{_type}"] = temp_gmm.aic(X)
        em_fashion_results.loc[n_components, f"BIC_{_type}"] = temp_gmm.bic(X)


In [None]:
# Persist the Fashion-MNIST mixture-model results (written under temp_folder).
with open(os.path.join(os.getcwd(), temp_folder, "Fashion_EM_Results.pkl"), "wb") as out_file:
    # The with-statement closes the file; no explicit close() is needed.
    pickle.dump(em_fashion_results, out_file)

In [None]:
# Reload the saved MNIST mixture-model results.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(os.path.join(os.getcwd(), temp_folder, "MNIST_EM_Results.pkl"), "rb") as in_file:
    # The with-statement closes the file; no explicit close() is needed.
    em_mnist_results = pickle.load(in_file)

In [None]:
# Reload the saved Fashion-MNIST mixture-model results.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(os.path.join(os.getcwd(), temp_folder, "Fashion_EM_Results.pkl"), "rb") as in_file:
    # The with-statement closes the file; no explicit close() is needed.
    em_fashion_results = pickle.load(in_file)

In [None]:
# AIC / BIC against the number of mixture components, one panel per dataset.
plt.close("all")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# (axis, results table, panel title)
criterion_panels = (
    (ax1, em_mnist_results, "AIC / BIC Comparison\n MNIST"),
    (ax2, em_fashion_results, "AIC / BIC Comparison\n Fashion-MNIST"),
)
for axis, em_results, panel_title in criterion_panels:
    em_results[["AIC_Full", "BIC_Full"]].plot(ax=axis)
    axis.set_title(panel_title, fontsize=15, weight='bold')
    axis.legend(loc="best", markerscale=1.1, frameon=True,
                edgecolor="black", fancybox=True, shadow=True)
    axis.set_xlabel("N Components", fontsize=15, weight='heavy')
    axis.set_ylabel("Information Criterion", fontsize=15, weight='heavy')

plt.tight_layout()

plt.savefig(f"{os.getcwd()}/{temp_folder}EM_AicBic_Combined.png", bbox_inches='tight')

In [None]:
# Quick standalone view of the MNIST AIC / BIC curves.
em_mnist_results[["AIC_Full", "BIC_Full"]].plot()

In [None]:
# Repeat the Fashion-MNIST mixture sweep on the first 2000 training rows.
# NOTE(review): this overwrites the values already stored in em_fashion_results
# by the earlier sweep / reload -- confirm that is intended.
X = fashion_mnist["train_X"].iloc[:2000, :]
# The loop variable is deliberately not called `idx`: that name holds the list
# of cluster counts used by the K-Means cells and must not be overwritten.
for n_components in index:
    print(f"N_Components: {n_components}")
    for _type in types:
        # A fixed seed makes the information criteria reproducible across runs.
        temp_gmm = GaussianMixture(n_components=n_components, n_init=10,
                                   covariance_type=_type.lower(), random_state=42).fit(X)
        em_fashion_results.loc[n_components, f"AIC_{_type}"] = temp_gmm.aic(X)
        em_fashion_results.loc[n_components, f"BIC_{_type}"] = temp_gmm.bic(X)
