In [10]:
# Suppress every warning for the remainder of the kernel session.
# NOTE(review): the blanket "ignore" filter also hides warnings raised by the
# libraries imported below (presumably convergence / deprecation notices) --
# confirm that silencing all of them is intended.
import warnings
warnings.filterwarnings("ignore")

In [11]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

from sklearn.metrics import silhouette_score 
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import homogeneity_score
from sklearn.metrics import completeness_score

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import time

In [12]:
class GaussianMixtureWrapper:
    """Thin adapter around ``sklearn.mixture.GaussianMixture``.

    Exposes the small common interface used by the evaluation loop
    (``fit`` / ``predict``) plus the GMM-specific information criteria.

    Parameters
    ----------
    n_components : int
        Number of mixture components passed to ``GaussianMixture``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``GaussianMixture`` so that runs can be made
        reproducible. ``None`` keeps the previous (unseeded) behaviour.
    """

    # Short identifier used to build result file names and to pick
    # method-specific metrics.
    NAME = 'GMM'

    def __init__(self, n_components, random_state=None):
        self.n_components = n_components
        self.random_state = random_state
        self.model = GaussianMixture(
            n_components=n_components,
            random_state=random_state,
        )

    def fit(self, X):
        """Fit the underlying mixture model on ``X`` and return ``self``."""
        self.model.fit(X)
        # Returning self mirrors the scikit-learn estimator convention and
        # allows ``wrapper.fit(X).predict(X)``.
        return self

    def predict(self, X):
        """Return the component index predicted by the model for each row of ``X``."""
        return self.model.predict(X)

    def bic(self, X):
        """Return the Bayesian information criterion of the fitted model on ``X``."""
        return self.model.bic(X)

    def aic(self, X):
        """Return the Akaike information criterion of the fitted model on ``X``."""
        return self.model.aic(X)

class KMeansWrapper:
    """Thin adapter around ``sklearn.cluster.KMeans``.

    Exposes the small common interface used by the evaluation loop
    (``fit`` / ``predict``) plus the KMeans-specific inertia.

    Parameters
    ----------
    n_clusters : int
        Number of clusters passed to ``KMeans``.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``KMeans`` so that centroid initialisation can be made
        reproducible. ``None`` keeps the previous (unseeded) behaviour.
    """

    # Short identifier used to build result file names and to pick
    # method-specific metrics.
    NAME = 'KMeans'

    def __init__(self, n_clusters, random_state=None):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.model = KMeans(n_clusters=n_clusters, random_state=random_state)

    def fit(self, X):
        """Fit the underlying KMeans model on ``X`` and return ``self``."""
        self.model.fit(X)
        # Returning self mirrors the scikit-learn estimator convention and
        # allows ``wrapper.fit(X).predict(X)``.
        return self

    def predict(self, X):
        """Return the cluster index predicted by the model for each row of ``X``."""
        return self.model.predict(X)

    def inertia(self):
        """Return the fitted model's ``inertia_`` attribute."""
        return self.model.inertia_

In [13]:
# Dataset to evaluate; must be one of the names handled by the data-loading
# cell ('wine_quality' or 'mnist'). It is also part of the input/output paths.
DATASET_NAME = 'mnist'
# DATASET_NAME = 'wine_quality'

# Dimensionality-reduction method whose transformed data is loaded from
# transformed_data/<DATASET_NAME>/<DIM_RED_METHOD>/.
# DIM_RED_METHOD = 'LLE'
# DIM_RED_METHOD = 'PCA'
DIM_RED_METHOD = 'ICA'

# Clustering wrapper class to evaluate; it is instantiated once per fold with
# the number of components/clusters as its only argument.
# METHOD = GaussianMixtureWrapper
METHOD = KMeansWrapper
# Number of contiguous folds the training set is split into.
K_FOLDS = 5
# When True, the final cell writes the metrics table to CSV.
SAVE = True

In [14]:
def _load_pickle(path):
    """Load a single pickled object from ``path``, closing the file afterwards.

    NOTE: ``pickle.load`` can execute arbitrary code; only use it on files
    produced by this project's own preprocessing step.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Human-readable dataset titles, keyed by the DATASET_NAME config value.
_DATASET_DISPLAY_NAMES = {
    'wine_quality': 'Wine Quality',
    'mnist': 'MNIST',
}

# Fail before touching the filesystem if the configuration is invalid.
if DATASET_NAME not in _DATASET_DISPLAY_NAMES:
    raise ValueError(f'Invalid dataset name {DATASET_NAME}')

DATASET_STR = _DATASET_DISPLAY_NAMES[DATASET_NAME]

# Both datasets share the same on-disk layout, so a single code path loads them.
_data_dir = f'transformed_data/{DATASET_NAME}/{DIM_RED_METHOD}'
X_train = _load_pickle(f'{_data_dir}/X_train.pkl')
y_train = _load_pickle(f'{_data_dir}/y_train.pkl')
X_test = _load_pickle(f'{_data_dir}/X_test.pkl')
y_test = _load_pickle(f'{_data_dir}/y_test.pkl')

In [15]:
# Location of the cached metrics table for this dataset / method / reduction combo.
df_path = f'results/step_3/{DATASET_NAME}/{METHOD.NAME}_{DIM_RED_METHOD}_metrics.csv'

if not os.path.exists(df_path):
    # Nothing cached yet: start from an empty table.
    df = pd.DataFrame()
else:
    # Resume from earlier results, indexed by the number of components so that
    # already-evaluated settings can be detected with `in df.index`.
    df = pd.read_csv(df_path).set_index('num_components')
    # Keep the component count available as a regular column as well.
    df['num_components'] = df.index

In [16]:
# Report how many training instances carry each distinct label in y_train.
for label in np.unique(y_train):
    label_count = np.sum(y_train == label)
    print(f'Label {label}: {label_count} instances')

Label 0: 5923 instances
Label 1: 6742 instances
Label 2: 5958 instances


In [17]:
# Size of each contiguous validation fold; any remainder rows at the end of
# X_train are never used for validation and always stay in the training part.
n = X_train.shape[0] // K_FOLDS

# Clustering quality metrics evaluated on every validation fold.
metrics = {
    'calinski_harabasz_score': calinski_harabasz_score,
    'davies_bouldin_score': davies_bouldin_score,
    'adjusted_rand_score': adjusted_rand_score,
    'adjusted_mutual_info_score': adjusted_mutual_info_score,
    'homogeneity_score': homogeneity_score,
    'completeness_score': completeness_score,
    'silhouette_score': silhouette_score,
}

# Metrics called as metric(true_labels, predicted_clusters); every other
# metric is called as metric(features, predicted_clusters).
label_based_metrics = {
    'adjusted_rand_score',
    'adjusted_mutual_info_score',
    'homogeneity_score',
    'completeness_score',
}

# Both supported datasets currently sweep the same component counts.
if DATASET_NAME in ('wine_quality', 'mnist'):
    range_ = list(range(2, 21, 2))
    range_ += [25, 50, 75, 100, 195]
else:
    raise ValueError(f'Invalid dataset name {DATASET_NAME}')

for components in tqdm(range_):
    # Resume support: skip settings already present in the loaded results.
    if components in df.index:
        continue
    print(components)
    metric_lists = {k: [] for k in metrics}
    training_time_list = []
    evaluation_time_list = []
    inertia_list = []
    aic_list = []
    bic_list = []
    for i in range(K_FOLDS):
        fold_start = i * n
        fold_end = (i + 1) * n

        # Fold i is held out for validation; everything else is used for fitting.
        X_train_k = np.concatenate([X_train[:fold_start], X_train[fold_end:]])
        y_train_k = np.concatenate([y_train[:fold_start], y_train[fold_end:]])
        X_test_k = X_train[fold_start:fold_end]
        y_test_k = y_train[fold_start:fold_end]

        model = METHOD(components)
        t0 = time.perf_counter()
        model.fit(X_train_k)
        t1 = time.perf_counter()
        training_time_list.append(t1 - t0)

        t0 = time.perf_counter()
        y_test_pred = model.predict(X_test_k)
        t1 = time.perf_counter()
        evaluation_time_list.append(t1 - t0)

        if METHOD.NAME == 'KMeans':
            inertia_list.append(model.inertia())
        elif METHOD.NAME == 'GMM':
            # Score on the held-out fold, like every other per-fold metric.
            # Using the global X_test here would leak the final test set into
            # the choice of the number of components.
            aic_list.append(model.aic(X_test_k))
            bic_list.append(model.bic(X_test_k))

        for metric_str, metric in metrics.items():
            if metric_str in label_based_metrics:
                metric_lists[metric_str].append(metric(y_test_k, y_test_pred))
            else:
                metric_lists[metric_str].append(metric(X_test_k, y_test_pred))

    # Aggregate the per-fold values into mean / std columns for this setting.
    if METHOD.NAME == 'KMeans':
        df.loc[components, 'inertia_mean'] = np.mean(inertia_list)
        df.loc[components, 'inertia_std'] = np.std(inertia_list)
    elif METHOD.NAME == 'GMM':
        df.loc[components, 'aic_mean'] = np.mean(aic_list)
        df.loc[components, 'aic_std'] = np.std(aic_list)
        df.loc[components, 'bic_mean'] = np.mean(bic_list)
        df.loc[components, 'bic_std'] = np.std(bic_list)

    df.loc[components, 'num_components'] = components
    df.loc[components, 'training_time_mean'] = np.mean(training_time_list)
    df.loc[components, 'training_time_std'] = np.std(training_time_list)
    df.loc[components, 'evaluation_time_mean'] = np.mean(evaluation_time_list)
    df.loc[components, 'evaluation_time_std'] = np.std(evaluation_time_list)
    for metric_str, metric_list in metric_lists.items():
        df.loc[components, f'{metric_str}_mean'] = np.mean(metric_list)
        df.loc[components, f'{metric_str}_std'] = np.std(metric_list)

# Re-sync the column with the index so that it survives `to_csv(index=False)`.
df['num_components'] = df.index

  0%|          | 0/15 [00:00<?, ?it/s]

2


  7%|▋         | 1/15 [00:02<00:38,  2.77s/it]

4


 13%|█▎        | 2/15 [00:07<00:51,  4.00s/it]

6


 20%|██        | 3/15 [00:13<00:59,  4.97s/it]

8


 27%|██▋       | 4/15 [00:22<01:12,  6.60s/it]

10


 33%|███▎      | 5/15 [00:32<01:16,  7.63s/it]

12


 40%|████      | 6/15 [00:42<01:17,  8.65s/it]

14


 47%|████▋     | 7/15 [00:50<01:07,  8.45s/it]

16


 53%|█████▎    | 8/15 [00:58<00:56,  8.07s/it]

18


 60%|██████    | 9/15 [01:03<00:42,  7.11s/it]

20


 67%|██████▋   | 10/15 [01:12<00:38,  7.73s/it]

25


 73%|███████▎  | 11/15 [01:27<00:40, 10.09s/it]

50


 80%|████████  | 12/15 [01:45<00:37, 12.38s/it]

75


 87%|████████▋ | 13/15 [02:25<00:41, 20.67s/it]

100


 93%|█████████▎| 14/15 [03:04<00:26, 26.20s/it]

195


100%|██████████| 15/15 [04:18<00:00, 17.25s/it]


In [18]:
# Persist the metrics table when saving is enabled in the configuration.
if SAVE:
    output_dir = os.path.dirname(df_path)
    # Create the results directory on first use; no error if it already exists.
    os.makedirs(output_dir, exist_ok=True)
    # The index is dropped on write; the table carries it as the
    # 'num_components' column instead.
    df.to_csv(df_path, index=False)