In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.metrics import silhouette_score 
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import homogeneity_score
from sklearn.metrics import completeness_score

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import time

In [2]:
from data_helpers.wine_quality_data_helper import load_wine_quality_data
from data_helpers.mnist_data_helper import load_mnist_data



In [3]:
class GaussianMixtureWrapper:
    """Adapter exposing scikit-learn's ``GaussianMixture`` through a minimal
    ``fit`` / ``predict`` interface.

    The number of mixture components is the first positional argument, so the
    class can be instantiated as ``Wrapper(k)``.

    Parameters
    ----------
    n_components : int
        Number of mixture components, forwarded to ``GaussianMixture``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``GaussianMixture``. The default ``None`` keeps the
        estimator's own default behaviour; pass an integer to make the fit
        reproducible.
    """

    # Label used to identify this method, e.g. in result file names.
    NAME = 'GaussianMixture'

    def __init__(self, n_components, random_state=None):
        self.n_components = n_components
        self.random_state = random_state
        self.model = GaussianMixture(n_components=n_components,
                                     random_state=random_state)

    def fit(self, X):
        """Fit the underlying mixture model on ``X``; returns ``None``."""
        self.model.fit(X)

    def predict(self, X):
        """Return the underlying model's predictions for ``X``."""
        return self.model.predict(X)

class KMeansWrapper:
    """Adapter exposing scikit-learn's ``KMeans`` through a minimal
    ``fit`` / ``predict`` interface.

    The number of clusters is the first positional argument, so the class can
    be instantiated as ``Wrapper(k)``.

    Parameters
    ----------
    n_clusters : int
        Number of clusters, forwarded to ``KMeans``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the estimator's
        own default behaviour; pass an integer to make the fit reproducible.
    """

    # Label used to identify this method, e.g. in result file names.
    NAME = 'KMeans'

    def __init__(self, n_clusters, random_state=None):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.model = KMeans(n_clusters=n_clusters, random_state=random_state)

    def fit(self, X):
        """Fit the underlying k-means model on ``X``; returns ``None``."""
        self.model.fit(X)

    def predict(self, X):
        """Return the underlying model's predictions for ``X``."""
        return self.model.predict(X)

In [4]:
# --- Experiment configuration -------------------------------------------------
DATASET_NAME = 'wine_quality'

# The loader returns four values; only the first two are kept
# (we only want to use the training data).
X, y, _, _ = load_wine_quality_data()

# Optional cap on the number of samples, handy for quick experiments.
# None keeps every sample returned by the loader.
MAX_SAMPLES = None
if MAX_SAMPLES is not None:
    X = X[:MAX_SAMPLES]
    y = y[:MAX_SAMPLES]

CLUSTER_METHOD = GaussianMixtureWrapper  # wrapper class, instantiated as CLUSTER_METHOD(k)
K_FOLDS = 5
SAVE = True  # write the metrics table to disk when True

# scikit-learn estimators created with random_state=None draw from NumPy's
# global random state, so seeding it here makes a Restart & Run All
# reproducible.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

In [5]:
# Cross-validated evaluation of CLUSTER_METHOD for 2..10 components.
#
# For every component count the model is fitted on K_FOLDS - 1 folds and its
# cluster assignments on the held-out fold are scored with each metric. The
# mean and standard deviation over folds are collected in ``data``.

n_samples = X.shape[0]
if n_samples < K_FOLDS:
    raise ValueError(
        f'Need at least K_FOLDS={K_FOLDS} samples for cross-validation, got {n_samples}.'
    )

# Contiguous, unshuffled folds. ``n`` is the base fold size; the first
# ``n_samples % K_FOLDS`` folds take one extra sample so that every sample
# lands in exactly one test fold even when the sample count is not a multiple
# of K_FOLDS.
n = n_samples // K_FOLDS
fold_sizes = np.full(K_FOLDS, n, dtype=int)
fold_sizes[:n_samples % K_FOLDS] += 1
fold_stops = np.cumsum(fold_sizes)
fold_starts = fold_stops - fold_sizes

metrics = {
    'calinski_harabasz_score': calinski_harabasz_score,
    'davies_bouldin_score': davies_bouldin_score,
    'adjusted_rand_score': adjusted_rand_score,
    'adjusted_mutual_info_score': adjusted_mutual_info_score,
    'homogeneity_score': homogeneity_score,
    'completeness_score': completeness_score,
    'silhouette_score': silhouette_score,
}
# Metrics called as metric(true_labels, predicted_labels); every other metric
# is called as metric(features, predicted_labels).
label_based_metrics = {
    'adjusted_rand_score',
    'adjusted_mutual_info_score',
    'homogeneity_score',
    'completeness_score',
}

# Column-oriented results table: one list per column, one entry per component count.
data = {'num_components': [], 'training_time_mean': [], 'training_time_std': [], 'evaluation_time_mean': [],
        'evaluation_time_std': []}
for metric_str in metrics.keys():
    data[f'{metric_str}_mean'] = []
    data[f'{metric_str}_std'] = []

progress = tqdm(range(2, 11))
for components in progress:
    # Show the current component count on the progress bar itself.
    progress.set_description(f'{components} components')
    metric_lists = {k: [] for k in metrics.keys()}
    training_time_list = []
    evaluation_time_list = []
    for start, stop in zip(fold_starts, fold_stops):
        # Slices (rather than index arrays) keep row selection positional.
        X_train = np.concatenate([X[:start], X[stop:]])
        X_test = X[start:stop]
        y_test = y[start:stop]

        model = CLUSTER_METHOD(components)
        t0 = time.perf_counter()
        model.fit(X_train)
        t1 = time.perf_counter()
        training_time_list.append(t1 - t0)

        t0 = time.perf_counter()
        y_test_pred = model.predict(X_test)
        t1 = time.perf_counter()
        evaluation_time_list.append(t1 - t0)

        for metric_str, metric in metrics.items():
            reference = y_test if metric_str in label_based_metrics else X_test
            metric_lists[metric_str].append(metric(reference, y_test_pred))

    data['num_components'].append(components)
    data['training_time_mean'].append(np.mean(training_time_list))
    data['training_time_std'].append(np.std(training_time_list))
    data['evaluation_time_mean'].append(np.mean(evaluation_time_list))
    data['evaluation_time_std'].append(np.std(evaluation_time_list))
    for metric_str, metric_list in metric_lists.items():
        data[f'{metric_str}_mean'].append(np.mean(metric_list))
        data[f'{metric_str}_std'].append(np.std(metric_list))

  0%|          | 0/9 [00:00<?, ?it/s]

2


 11%|█         | 1/9 [01:18<10:28, 78.61s/it]

3


 22%|██▏       | 2/9 [04:38<17:31, 150.15s/it]

4


 33%|███▎      | 3/9 [07:52<16:59, 169.97s/it]

5


 44%|████▍     | 4/9 [11:15<15:16, 183.23s/it]

6


 56%|█████▌    | 5/9 [15:39<14:09, 212.31s/it]

7


 67%|██████▋   | 6/9 [20:57<12:24, 248.02s/it]

8


 78%|███████▊  | 7/9 [27:51<10:04, 302.39s/it]

9


 89%|████████▉ | 8/9 [34:55<05:41, 341.10s/it]

10


100%|██████████| 9/9 [42:26<00:00, 283.00s/it]


In [6]:
if SAVE:
    # Turn the column-oriented ``data`` dict into a table.
    df = pd.DataFrame(data)
    df_path = f'results/{DATASET_NAME}/{CLUSTER_METHOD.NAME}_metrics.csv'
    # Make sure the target directory exists; an existing one is not an error.
    os.makedirs(os.path.split(df_path)[0], exist_ok=True)
    # Write without the positional index column.
    df.to_csv(df_path, index=False)