In [9]:
# suppress all warnings
import warnings
warnings.filterwarnings("ignore")

In [10]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

from sklearn.metrics import silhouette_score 
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import homogeneity_score
from sklearn.metrics import completeness_score

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import time

In [11]:
class GaussianMixtureWrapper:
    NAME = 'GMM'
    def __init__(self, n_components):
        self.n_components = n_components
        self.model = GaussianMixture(n_components=n_components)

    def fit(self, X):
        self.model.fit(X)

    def predict(self, X):
        return self.model.predict(X)

class KMeansWrapper:
    NAME = 'KMeans'
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.model = KMeans(n_clusters=n_clusters)

    def fit(self, X):
        self.model.fit(X)

    def predict(self, X):
        return self.model.predict(X)

In [12]:
# DATASET_NAME = 'mnist'
DATASET_NAME = 'wine_quality'

# DIM_RED_METHOD = 'LLE'
DIM_RED_METHOD = 'PCA'
# DIM_RED_METHOD = 'ICA'

METHOD = GaussianMixtureWrapper
# METHOD = KMeansWrapper
K_FOLDS = 5
SAVE = True

In [13]:
if DATASET_NAME == 'wine_quality':
    DATASET_STR = 'Wine Quality'
    X_train = pickle.load(open(f'transformed_data/{DATASET_NAME}/{DIM_RED_METHOD}/X_train.pkl', 'rb'))
    y_train = pickle.load(open(f'transformed_data/{DATASET_NAME}/{DIM_RED_METHOD}/y_train.pkl', 'rb'))
    X_test = pickle.load(open(f'transformed_data/{DATASET_NAME}/{DIM_RED_METHOD}/X_test.pkl', 'rb'))
    y_test = pickle.load(open(f'transformed_data/{DATASET_NAME}/{DIM_RED_METHOD}/y_test.pkl', 'rb'))
elif DATASET_NAME == 'mnist':
    DATASET_STR = 'MNIST'
    X_train = pickle.load(open(f'transformed_data/{DATASET_NAME}/{DIM_RED_METHOD}/X_train.pkl', 'rb'))
    y_train = pickle.load(open(f'transformed_data/{DATASET_NAME}/{DIM_RED_METHOD}/y_train.pkl', 'rb'))
    X_test = pickle.load(open(f'transformed_data/{DATASET_NAME}/{DIM_RED_METHOD}/X_test.pkl', 'rb'))
    y_test = pickle.load(open(f'transformed_data/{DATASET_NAME}/{DIM_RED_METHOD}/y_test.pkl', 'rb'))
else:
    raise ValueError(f'Invalid dataset name {DATASET_NAME}')

In [14]:
df_path = f'results/step_3/{DATASET_NAME}/{METHOD.NAME}_{DIM_RED_METHOD}_metrics.csv'
if os.path.exists(df_path):
    df = pd.read_csv(df_path)
    df.set_index('num_components', inplace=True)
    df['num_components'] = df.index
else:
    df = pd.DataFrame()

In [15]:
# get the number of instances of each label in y
for i in np.unique(y_train):
    print(f'Label {i}: {np.sum(y_train == i)} instances')

Label 3: 15 instances
Label 4: 130 instances
Label 5: 1152 instances
Label 6: 1758 instances
Label 7: 719 instances
Label 8: 143 instances
Label 9: 1 instances


In [16]:
n = X_train.shape[0] // K_FOLDS
metrics = {
    'calinski_harabasz_score': calinski_harabasz_score,
    'davies_bouldin_score': davies_bouldin_score,
    'adjusted_rand_score': adjusted_rand_score,
    'adjusted_mutual_info_score': adjusted_mutual_info_score,
    'homogeneity_score': homogeneity_score,
    'completeness_score': completeness_score,
    'silhouette_score': silhouette_score,
}

if DATASET_NAME == 'wine_quality':
    range_ = range(2, 21)
elif DATASET_NAME == 'mnist':
    range_ = range(2, 21)
else:
    raise ValueError(f'Invalid dataset name {DATASET_NAME}')

for components in tqdm(range_):
    if components in df.index:
        continue
    print(components)
    metric_lists = {k: [] for k in metrics.keys()}
    training_time_list = []
    evaluation_time_list = []
    for i in range(K_FOLDS):
        X_train_k = np.concatenate([X_train[:i * n], X_train[(i + 1) * n:]])
        y_train_k = np.concatenate([y_train[:i * n], y_train[(i + 1) * n:]])
        X_test_k = X_train[i * n:(i + 1) * n]
        y_test_k = y_train[i * n:(i + 1) * n]

        model = METHOD(components)
        t0 = time.perf_counter()
        model.fit(X_train_k)
        t1 = time.perf_counter()
        training_time_list.append(t1 - t0)

        t0 = time.perf_counter()
        y_test_pred = model.predict(X_test_k) 
        t1 = time.perf_counter()
        evaluation_time_list.append(t1 - t0)
  
        for metric_str, metric in metrics.items():
            if metric_str in ['adjusted_rand_score', 'adjusted_mutual_info_score', 'homogeneity_score', 'completeness_score']:
                metric_lists[metric_str].append(metric(y_test_k, y_test_pred))
            else:
                metric_lists[metric_str].append(metric(X_test_k, y_test_pred))

    df.loc[components, 'num_components'] = components
    df.loc[components, 'training_time_mean'] = np.mean(training_time_list)
    df.loc[components, 'training_time_std'] = np.std(training_time_list)
    df.loc[components, 'evaluation_time_mean'] = np.mean(evaluation_time_list)
    df.loc[components, 'evaluation_time_std'] = np.std(evaluation_time_list)
    for metric_str, metric_list in metric_lists.items():
        df.loc[components, f'{metric_str}_mean'] = np.mean(metric_list)
        df.loc[components, f'{metric_str}_std'] = np.std(metric_list)
df['num_components'] = df.index

  0%|          | 0/19 [00:00<?, ?it/s]

2


  5%|▌         | 1/19 [00:01<00:20,  1.13s/it]

3


 11%|█         | 2/19 [00:02<00:25,  1.52s/it]

4


 16%|█▌        | 3/19 [00:04<00:22,  1.43s/it]

5


 21%|██        | 4/19 [00:06<00:23,  1.59s/it]

6


 26%|██▋       | 5/19 [00:08<00:25,  1.80s/it]

7


 32%|███▏      | 6/19 [00:10<00:24,  1.86s/it]

8


 37%|███▋      | 7/19 [00:12<00:24,  2.05s/it]

9


 42%|████▏     | 8/19 [00:15<00:24,  2.26s/it]

10


 47%|████▋     | 9/19 [00:18<00:24,  2.40s/it]

11


 53%|█████▎    | 10/19 [00:21<00:24,  2.71s/it]

12


 58%|█████▊    | 11/19 [00:24<00:21,  2.68s/it]

13


 63%|██████▎   | 12/19 [00:26<00:19,  2.74s/it]

14


 68%|██████▊   | 13/19 [00:30<00:17,  2.88s/it]

15


 74%|███████▎  | 14/19 [00:34<00:16,  3.24s/it]

16


 79%|███████▉  | 15/19 [00:39<00:15,  3.90s/it]

17


 84%|████████▍ | 16/19 [00:44<00:12,  4.24s/it]

18


 89%|████████▉ | 17/19 [00:50<00:09,  4.68s/it]

19


 95%|█████████▍| 18/19 [00:55<00:04,  4.79s/it]

20


100%|██████████| 19/19 [01:00<00:00,  3.20s/it]


In [17]:
if SAVE:
    os.makedirs(os.path.dirname(df_path), exist_ok=True)
    df.to_csv(df_path, index=False)