In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket "ignore" filter also hides library warnings that may be
# relevant to this analysis (e.g. estimator convergence or deprecation warnings);
# consider narrowing it to specific categories once the notebook is stable.
import warnings
warnings.filterwarnings("ignore")

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.metrics import silhouette_score 
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import homogeneity_score
from sklearn.metrics import completeness_score

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import time

In [None]:
from data_helpers.wine_quality_data_helper import load_wine_quality_data
from data_helpers.mnist_data_helper import load_mnist_data

In [None]:
class GaussianMixtureWrapper:
    """Minimal adapter around ``GaussianMixture`` with a uniform clustering API.

    The wrapper exposes ``fit``/``predict`` plus the information criteria
    (``bic``/``aic``) so it can be swapped with other wrappers via ``NAME``.

    Parameters
    ----------
    n_components : int
        Number of mixture components passed to the underlying model.
    random_state : int, RandomState instance or None, optional
        Forwarded to the underlying model. The default (``None``) keeps the
        previous, unseeded behaviour; pass an int for reproducible fits.
    """

    # Short identifier used by callers to branch on the method and to name outputs.
    NAME = 'GMM'

    def __init__(self, n_components, random_state=None):
        self.n_components = n_components
        self.random_state = random_state
        self.model = GaussianMixture(
            n_components=n_components,
            random_state=random_state,
        )

    def fit(self, X):
        """Fit the underlying mixture model on ``X``; returns ``None``."""
        self.model.fit(X)

    def predict(self, X):
        """Return the component label the fitted model assigns to each row of ``X``."""
        return self.model.predict(X)

    def bic(self, X):
        """Return the underlying model's BIC evaluated on ``X``."""
        return self.model.bic(X)

    def aic(self, X):
        """Return the underlying model's AIC evaluated on ``X``."""
        return self.model.aic(X)

class KMeansWrapper:
    """Minimal adapter around ``KMeans`` with a uniform clustering API.

    The wrapper exposes ``fit``/``predict`` plus ``inertia`` so it can be
    swapped with other wrappers via ``NAME``.

    Parameters
    ----------
    n_clusters : int
        Number of clusters passed to the underlying model.
    random_state : int, RandomState instance or None, optional
        Forwarded to the underlying model. The default (``None``) keeps the
        previous, unseeded behaviour; pass an int for reproducible fits.
    """

    # Short identifier used by callers to branch on the method and to name outputs.
    NAME = 'KMeans'

    def __init__(self, n_clusters, random_state=None):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.model = KMeans(
            n_clusters=n_clusters,
            random_state=random_state,
        )

    def fit(self, X):
        """Fit the underlying k-means model on ``X``; returns ``None``."""
        self.model.fit(X)

    def predict(self, X):
        """Return the cluster label the fitted model assigns to each row of ``X``."""
        return self.model.predict(X)

    def inertia(self):
        """Return the fitted model's ``inertia_`` attribute (only set after ``fit``)."""
        return self.model.inertia_

In [None]:
# --- Experiment configuration -------------------------------------------------

# Clustering wrapper to benchmark; set to KMeansWrapper to evaluate k-means instead.
METHOD = GaussianMixtureWrapper

# Dataset key; the supported values are 'mnist' and 'wine_quality'.
DATASET_NAME = 'mnist'

# Number of cross-validation folds, and whether the metrics table is written to disk.
K_FOLDS, SAVE = 5, True

In [None]:
# Supported datasets: config key -> (human-readable name, loader callable).
_DATASETS = {
    'wine_quality': ('Wine Quality', load_wine_quality_data),
    'mnist': ('MNIST', load_mnist_data),
}
if DATASET_NAME not in _DATASETS:
    raise ValueError(f'Invalid dataset name {DATASET_NAME}')

DATASET_STR, _load_dataset = _DATASETS[DATASET_NAME]
# Each loader returns four values; only the first two (kept as X and y) are used here.
X, y, _, _ = _load_dataset()

In [None]:
# Location of the per-dataset, per-method metrics table.
df_path = f'results/{DATASET_NAME}/{METHOD.NAME}_metrics.csv'

if not os.path.exists(df_path):
    # No earlier results on disk: start with an empty table.
    df = pd.DataFrame()
else:
    # Resume from earlier results: index the rows by component count, then
    # re-attach that count as a trailing regular column.
    df = (
        pd.read_csv(df_path)
        .set_index('num_components')
        .assign(num_components=lambda frame: frame.index)
    )

In [None]:
# Report how many instances carry each distinct label in y.
# return_counts tallies all labels in one call instead of re-scanning y per label.
labels, counts = np.unique(y, return_counts=True)
for label, count in zip(labels, counts):
    print(f'Label {label}: {count} instances')

In [None]:
# Fold size. Folds are contiguous, unshuffled slices of n rows: fold i covers rows
# [i * n, (i + 1) * n). Remainder rows (X.shape[0] % K_FOLDS) are never held out;
# they only ever appear in the training portion.
n = X.shape[0] // K_FOLDS

# Metric name -> scoring function. Insertion order fixes the output column order.
metrics = {
    'calinski_harabasz_score': calinski_harabasz_score,
    'davies_bouldin_score': davies_bouldin_score,
    'adjusted_rand_score': adjusted_rand_score,
    'adjusted_mutual_info_score': adjusted_mutual_info_score,
    'homogeneity_score': homogeneity_score,
    'completeness_score': completeness_score,
    'silhouette_score': silhouette_score,
}
# Metrics called as metric(y_true, y_pred); every other metric is called as
# metric(X, y_pred).
label_based_metrics = {
    'adjusted_rand_score',
    'adjusted_mutual_info_score',
    'homogeneity_score',
    'completeness_score',
}

# Component / cluster counts to sweep for the selected dataset.
if DATASET_NAME == 'wine_quality':
    range_ = range(2, 21)
elif DATASET_NAME == 'mnist':
    range_ = [25, 30, 40, 50, 100, 150, 195]
else:
    raise ValueError(f'Invalid dataset name {DATASET_NAME}')

for components in tqdm(range_):
    # Rows already present in df (e.g. loaded from a previous run) are not recomputed.
    if components in df.index:
        continue
    print(components)

    # Per-fold measurements for this component count.
    metric_lists = {name: [] for name in metrics}
    training_time_list = []
    evaluation_time_list = []
    bic_list = []
    aic_list = []
    inertia_list = []

    for i in range(K_FOLDS):
        fold_start, fold_stop = i * n, (i + 1) * n
        # Clustering is unsupervised, so only the features are needed for fitting.
        X_train = np.concatenate([X[:fold_start], X[fold_stop:]])
        X_test = X[fold_start:fold_stop]
        y_test = y[fold_start:fold_stop]

        model = METHOD(components)
        t0 = time.perf_counter()
        model.fit(X_train)
        t1 = time.perf_counter()
        training_time_list.append(t1 - t0)

        t0 = time.perf_counter()
        y_test_pred = model.predict(X_test)
        t1 = time.perf_counter()
        evaluation_time_list.append(t1 - t0)

        # Method-specific diagnostics: inertia comes from the fitted model itself,
        # while AIC/BIC are evaluated on the held-out fold.
        if METHOD.NAME == 'KMeans':
            inertia_list.append(model.inertia())
        elif METHOD.NAME == 'GMM':
            aic_list.append(model.aic(X_test))
            bic_list.append(model.bic(X_test))

        for metric_str, metric in metrics.items():
            if metric_str in label_based_metrics:
                metric_lists[metric_str].append(metric(y_test, y_test_pred))
            else:
                metric_lists[metric_str].append(metric(X_test, y_test_pred))

    # Aggregate the fold measurements into one row, preserving the column order.
    row = {
        'num_components': components,
        'training_time_mean': np.mean(training_time_list),
        'training_time_std': np.std(training_time_list),
        'evaluation_time_mean': np.mean(evaluation_time_list),
        'evaluation_time_std': np.std(evaluation_time_list),
    }
    for metric_str, metric_list in metric_lists.items():
        row[f'{metric_str}_mean'] = np.mean(metric_list)
        row[f'{metric_str}_std'] = np.std(metric_list)

    if METHOD.NAME == 'KMeans':
        # inertia_mean must be the mean across folds (it previously stored the std).
        # Rows skipped above because they were already in df keep their old value.
        row['inertia_mean'] = np.mean(inertia_list)
        row['inertia_std'] = np.std(inertia_list)
    elif METHOD.NAME == 'GMM':
        row['aic_mean'] = np.mean(aic_list)
        row['aic_std'] = np.std(aic_list)
        row['bic_mean'] = np.mean(bic_list)
        row['bic_std'] = np.std(bic_list)

    for column, value in row.items():
        df.loc[components, column] = value

# Keep the regular column in sync with the index for every row, old and new.
df['num_components'] = df.index

In [None]:
# Persist the metrics table only when saving is enabled in the configuration.
if SAVE:
    output_dir = os.path.dirname(df_path)
    # Create the destination folder if needed; an existing folder is not an error.
    os.makedirs(output_dir, exist_ok=True)
    # The index itself is not written to the file.
    df.to_csv(df_path, index=False)