In [5]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import Birch
from sklearn.mixture import GaussianMixture
from tqdm import tqdm
from keras.datasets import mnist

In [6]:
(raw_train_X, raw_train_y), (raw_test_X, raw_test_y) = mnist.load_data()
train_X = raw_train_X / 255
test_X = raw_test_X / 255
train_X = train_X.reshape(train_X.shape[0], 28*28)
test_X = test_X.reshape(test_X.shape[0], 28*28)
perm = np.random.permutation(train_X.shape[0])
train_X = train_X[perm]
train_y = raw_train_y[perm]
perm = np.random.permutation(test_X.shape[0])
test_X = test_X[perm]
test_y = raw_test_y[perm]
# combine the train and test
# X = np.concatenate([train_X, test_X])
# y = np.concatenate([train_y, test_y])
X = test_X
y = test_y

In [4]:
k_folds = 2
n = X.shape[0] // k_folds
data = {'num_components': [], 'bic_mean': [], 'aic_mean': [], 'bic_std': [], 'aic_std': []}
# for components in tqdm(range(1, 21)):
for components in tqdm(range(1, 6)):
    print(components)
    aic_list = []
    bic_list = []
    for i in range(k_folds):
        X_train = np.concatenate([X[:i * n], X[(i + 1) * n:]])
        y_train = np.concatenate([y[:i * n], y[(i + 1) * n:]])
        X_test = X[i * n:(i + 1) * n]
        y_test = y[i * n:(i + 1) * n]

        # gm = Birch(n_clusters=components).fit(X_train)
        gm = GaussianMixture(n_components=components, covariance_type='full').fit(X_train)
        # get the inertia
        inertian = gm.score(X_test)

        aic_list.append(gm.aic(X_test))
        bic_list.append(gm.bic(X_test))

    data['num_components'].append(components)
    data['aic_mean'].append(np.mean(aic_list))
    data['bic_mean'].append(np.mean(bic_list))
    data['aic_std'].append(np.std(aic_list))
    data['bic_std'].append(np.std(bic_list))

  0%|          | 0/5 [00:00<?, ?it/s]

1


  0%|          | 0/5 [00:05<?, ?it/s]


AttributeError: 'Birch' object has no attribute 'aic'

In [None]:
df = pd.DataFrame(data)
df_path = 'results/wine_quality/birch_metrics.csv'
os.makedirs(os.path.dirname(df_path), exist_ok=True)
df.to_csv(df_path, index=False)

In [None]:
metric = 'aic'
plt.plot(data['num_components'], data[f'{metric}_mean'], label=metric.upper())
plt.fill_between(
    np.array(data['num_components']), 
    np.array(data[f'{metric}_mean'])-np.array(data[f'{metric}_std']), 
    np.array(data[f'{metric}_mean'])+np.array(data[f'{metric}_std']), 
    alpha=0.15)

In [None]:
metric = 'bic'
plt.plot(data['num_components'], data[f'{metric}_mean'], label=metric.upper())
plt.fill_between(
    np.array(data['num_components']), 
    np.array(data[f'{metric}_mean'])-np.array(data[f'{metric}_std']), 
    np.array(data[f'{metric}_mean'])+np.array(data[f'{metric}_std']), 
    alpha=0.15)