In [57]:
# Prerequisite (run once if the package is missing): %pip install fastcluster

In [58]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from ensemble_clustering import Ensemble
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score
from sklearn import metrics

In [59]:
# Metrics to evaluate for each clustering algorithm, keyed by algorithm name.
algo_metrics = dict(
    MiniBatchKMeans=['aic', 'inertia'],
    linkage_vector=['inertia', 'max_diff'],
)

In [60]:
# Fixed, algorithm-level keyword arguments (not part of the hyperparameter search).
# Per the original note, the 'looping' parameter is auto-detected when it is set to None.
# NOTE(review): n_clusters=None presumably marks n_clusters as that parameter for
# MiniBatchKMeans -- confirm against the ensemble_clustering documentation.
algo_params = dict(
    MiniBatchKMeans=dict(n_clusters=None, compute_labels=True),
    linkage_vector=dict(),
)

In [61]:
# Hyperparameter search space: for every algorithm, each option maps to the
# list (or array) of candidate values to try.
h_params = dict(
    linkage_vector=dict(
        method=['centroid', 'median'],
        metric=['euclidean'],
    ),
    MiniBatchKMeans=dict(
        init=['k-means++', 'random'],
        # Two geometrically spaced candidates: the endpoints 1e-4 and 0.5.
        reassignment_ratio=np.geomspace(1e-4, 0.5, 2),
    ),
)

In [62]:
# Ensemble (matrix) settings passed to the ensemble call.
# The metrics report printed by that call lists one line per build/vote pair
# (e.g. "mode-row", "raw-full").
# ground_truth=3 equals the number of blob centres generated for this dataset.
e_params = dict(
    build=['mode', 'raw'],
    vote=['row', 'col', 'full'],
    ground_truth=3,
)

In [63]:
# Synthetic benchmark data: 30,000 two-feature samples drawn around 3 blob
# centres, with the centres placed inside the box (-5, 5).
# random_state is fixed so the same data is generated on every run.
X, y = make_blobs(
    n_samples=30000,
    n_features=2,
    centers=3,
    center_box=(-5, 5),
    random_state=1,
)

In [64]:
# Inspect the generated feature matrix (NumPy abbreviates the middle rows).
X

array([[-5.76625805, -1.60287464],
       [-5.0903303 , -2.08131793],
       [-5.49699114, -0.53030276],
       ...,
       [-2.28029643,  1.88394361],
       [-5.1421503 , -0.15208666],
       [-0.71844269,  2.18372238]])

In [65]:
# Distinct ground-truth labels produced by make_blobs (one per blob centre).
np.unique(y)

array([0, 1, 2])

In [66]:
# Sanity check: (n_samples, n_features).
X.shape

(30000, 2)

In [67]:
# Build the ensemble from the per-algorithm metrics, fixed parameters and
# hyperparameter grids, then run it on the raw (unscaled) X.
# NOTE(review): (2, 7) is presumably the range of cluster counts to sweep -- the
# cluster counts reported in `res` all fall within it; confirm against the
# ensemble_clustering documentation.
# The call returns two objects: per-algorithm results (`res`) and the
# ensemble-level cluster-count results (`nc_res`).
my_ensemble = Ensemble(algo_metrics, algo_params, h_params)
res, nc_res = my_ensemble(X, (2, 7), e_params)

Algorithms:   0%|                                                             | 0/2 [00:00<?, ?it/s]




[A



[A
[A




[A
[A



Algorithms:  50%|██████████████████████████▌                          | 1/2 [00:02<00:02,  2.40s/it]
[A
[A
[A
[A
[A
Algorithms: 100%|█████████████████████████████████████████████████████| 2/2 [00:14<00:00,  7.20s/it]

Ensemble Clustering metrics:
----------------------------
mode-row --- Accuracy: 0.00%, Average Error: 1.00.
mode-col --- Accuracy: 0.00%, Average Error: 1.00.
mode-full --- Accuracy: 0.00%, Average Error: 1.00.
raw-row --- Accuracy: 0.00%, Average Error: 1.00.
raw-col --- Accuracy: 0.00%, Average Error: 1.00.
raw-full --- Accuracy: 100.00%, Average Error: nan.





In [68]:
# Per-algorithm results: a list of dicts per algorithm, each mapping a metric name to an integer.
# NOTE(review): each dict appears to correspond to one hyperparameter combination and each
# value to the cluster count chosen by that metric -- confirm against ensemble_clustering.
res

{'MiniBatchKMeans': [{'aic': 3, 'inertia': 3},
  {'aic': 3, 'inertia': 3},
  {'aic': 3, 'inertia': 3},
  {'aic': 4, 'inertia': 4}],
 'linkage_vector': [{'inertia': 6, 'max_diff': 2},
  {'inertia': 5, 'max_diff': 2}]}

In [69]:
# Ensemble-level results: 'num_clusters' for every build -> vote combination,
# plus the 'best_algo' / 'best_algo_global' entries listing (algorithm, hyperparameters) pairs.
nc_res

{'mode': {'row': {'num_clusters': [2]},
  'col': {'num_clusters': [2]},
  'full': {'num_clusters': [2]}},
 'raw': {'row': {'num_clusters': [2]},
  'col': {'num_clusters': [2]},
  'full': {'num_clusters': [3]}},
 'best_algo': [[('MiniBatchKMeans',
    {'init': 'k-means++', 'reassignment_ratio': 0.0001}),
   ('MiniBatchKMeans', {'init': 'k-means++', 'reassignment_ratio': 0.5}),
   ('MiniBatchKMeans', {'init': 'random', 'reassignment_ratio': 0.0001})]],
 'best_algo_global': [('MiniBatchKMeans',
   {'init': 'k-means++', 'reassignment_ratio': 0.0001}),
  ('MiniBatchKMeans', {'init': 'k-means++', 'reassignment_ratio': 0.5}),
  ('MiniBatchKMeans', {'init': 'random', 'reassignment_ratio': 0.0001})]}

In [70]:
# Scaler used to standardize the features of X.
sc_X = StandardScaler()

In [71]:
# Fit the scaler on X and store the standardized copy.
# NOTE(review): the result is bound to lowercase `x`; the later cells shown in this
# notebook keep clustering and scoring the unscaled `X`, so `x` is never used there.
x = sc_X.fit_transform(X)

In [72]:
# NOTE(review): this displays the original, unscaled X (identical to the earlier
# display); the standardized copy lives in lowercase `x`.
X

array([[-5.76625805, -1.60287464],
       [-5.0903303 , -2.08131793],
       [-5.49699114, -0.53030276],
       ...,
       [-2.28029643,  1.88394361],
       [-5.1421503 , -0.15208666],
       [-0.71844269,  2.18372238]])

In [73]:
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin

# Fit mini-batch K-means on the raw (unscaled) feature matrix X.
# random_state pins the initialisation so the cell is reproducible.
mbk = MiniBatchKMeans(init='k-means++', n_clusters=2,
                      random_state=0,
                      max_iter=10)
mbk.fit(X)

# Put the centroids in a deterministic order so label ids are stable.
# The previous np.sort(..., axis=0) sorted every feature column independently,
# which can pair the x-coordinate of one centroid with the y-coordinate of
# another and yield points that are not centroids at all. Reorder whole rows
# instead: lexsort uses its LAST key as the primary one, so reversing the
# transposed columns sorts by feature 0 first, then feature 1, and so on.
centroid_order = np.lexsort(mbk.cluster_centers_.T[::-1])
mbk_means_cluster_centers = mbk.cluster_centers_[centroid_order]

# Assign every sample to its nearest (re-ordered) centroid.
mbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers)

# print the labels of each data point
print(mbk_means_labels)

[0 0 0 ... 1 0 1]




In [74]:
# Internal validity scores for the MiniBatchKMeans labelling (mbk_means_labels) on X.
# Both scorers are taken from the sklearn.metrics namespace.
db_index_ensemble = metrics.davies_bouldin_score(X, mbk_means_labels)
calinski_harabasz_score_ensemble = metrics.calinski_harabasz_score(X, mbk_means_labels)

In [75]:
# Baseline: plain KMeans with 3 clusters fitted on the unscaled X
# (random_state fixed for reproducibility).
kmeans = KMeans(n_clusters=3, random_state=1).fit(X)

In [76]:
# Keep the per-sample cluster labels assigned by the fitted KMeans baseline.
labels = kmeans.labels_

In [77]:
# Same two validity scores for the KMeans baseline labelling.
# NOTE(review): the result name `calinski_harabasz_score` matches the sklearn
# function name; this works only because the function is reached via `metrics.`.
db_index = metrics.davies_bouldin_score(X, labels)
calinski_harabasz_score = metrics.calinski_harabasz_score(X, labels)

In [78]:
# Side-by-side comparison table: one row per metric, one column per clustering run.
# 'without_ensemble' holds the KMeans (3 clusters) scores and 'with_ensemble'
# holds the MiniBatchKMeans (2 clusters) scores.
data = dict(
    metrics=['calinski_harabasz_score', 'davies_bouldin_score'],
    without_ensemble=[calinski_harabasz_score, db_index],
    with_ensemble=[calinski_harabasz_score_ensemble, db_index_ensemble],
)

df = pd.DataFrame(data)

# Last expression of the cell -> rendered as the cell output.
df

Unnamed: 0,metrics,without_ensemble,with_ensemble
0,calinski_harabasz_score,81810.329685,84912.052741
1,davies_bouldin_score,0.710529,0.474434
