In [2]:
import pandas as pd
import numpy as np

from memory_profiler import memory_usage
from sklearn.datasets import make_blobs
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

%load_ext memory_profiler

In [3]:
def create_exp1_data(n_samples_array: list, n_features: int = 3, centers: int = 5, random_state: int = 17):
    """Build one synthetic blob dataset per requested sample count.

    Every dataset is drawn with ``make_blobs`` using the same ``n_features``,
    ``centers`` and ``random_state``; only the number of samples varies.
    A standardized copy is produced with a ``StandardScaler`` fitted on that
    dataset alone.

    Args:
        n_samples_array: sample counts, one dataset is generated for each.
        n_features: forwarded to ``make_blobs``.
        centers: forwarded to ``make_blobs``.
        random_state: forwarded to ``make_blobs``.

    Returns:
        A dict keyed by sample count. Each value is a dict with the keys
        ``'X'`` (raw points), ``'X_scaled'`` (standardized points) and
        ``'labels_true'`` (labels returned by ``make_blobs``).
    """

    def _build_entry(size):
        # Draw the raw points and their generating labels for this size.
        points, truth = make_blobs(
            n_samples=size,
            n_features=n_features,
            centers=centers,
            random_state=random_state,
        )
        return {
            'X': points,
            'X_scaled': StandardScaler().fit_transform(points),
            'labels_true': truth,
        }

    return {size: _build_entry(size) for size in n_samples_array}

In [4]:
# Experiment 1 settings. These are the single source of truth: the data
# generation below and the later `n_clusters=exp1_centers` /
# `random_state=exp1_random_state` estimator arguments all read them.
exp1_n_features = 3
exp1_centers = 5
exp1_random_state = 17
exp1_n_samples_array: list = [100, 1_000, 10_000, 100_000, 1_000_000, 2_000_000]

# Pass every setting explicitly instead of relying on create_exp1_data's
# defaults happening to equal the constants above; otherwise editing a
# constant would silently leave the generated data unchanged.
exp1_data: dict = create_exp1_data(
    n_samples_array=exp1_n_samples_array,
    n_features=exp1_n_features,
    centers=exp1_centers,
    random_state=exp1_random_state,
)

### DBSCAN

In [4]:
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from matplotlib import pyplot as plt

def apply_dbscan(exp_data: dict, n_samples_array: list, eps_array: list):
    """Fit DBSCAN on each dataset, report memory use and store quality scores.

    For every sample count in ``n_samples_array`` the matching entry of
    ``exp_data`` is clustered with ``DBSCAN(eps=<eps>, min_samples=10)`` on its
    ``'X_scaled'`` array. The peak memory and its increment over a baseline are
    printed, and the Rand and homogeneity scores against ``'labels_true'`` are
    written back into the entry under ``'rand_score'`` and
    ``'homogeneity_score'``.

    Memory is measured with ``memory_profiler.memory_usage`` rather than the
    ``%memit`` line magic: the magic executes its statement in the IPython user
    namespace, where this function's locals are not visible and where the
    fitted estimator would not become a local variable.

    Args:
        exp_data: dict keyed by sample count; each value must provide
            ``'X_scaled'`` and ``'labels_true'``. Mutated in place.
        n_samples_array: sample counts (keys of ``exp_data``) to process.
        eps_array: DBSCAN ``eps`` values, positionally matched to
            ``n_samples_array``.

    Raises:
        ValueError: if ``eps_array`` has fewer entries than ``n_samples_array``.
    """
    if len(eps_array) < len(n_samples_array):
        raise ValueError(
            f'eps_array has {len(eps_array)} entries but '
            f'{len(n_samples_array)} sample sizes were requested'
        )

    for n_sample, eps in zip(n_samples_array, eps_array):
        entry = exp_data[n_sample]
        model = DBSCAN(eps=eps, min_samples=10)

        # Single reading of the current process, used as the reference point
        # for the reported increment.
        baseline = memory_usage()[0]
        # NOTE(review): memory_usage may call fit() more than once for very
        # short runs to gather enough samples; the estimator is simply refit.
        usage_samples = memory_usage((model.fit, (entry['X_scaled'],)))
        peak = max(usage_samples)
        print(f'peak memory: {peak:.2f} MiB, increment: {peak - baseline:.2f} MiB')

        labels_pred = model.labels_
        labels_true = entry['labels_true']
        entry['rand_score'] = metrics.rand_score(labels_true, labels_pred)
        entry['homogeneity_score'] = metrics.homogeneity_score(labels_true, labels_pred)

In [5]:
# DBSCAN eps values, positionally matched to the dataset sizes: the DBSCAN
# cells below pair index i with the i-th sample count.
exp1_eps_array: list = [
    0.40,  # 100 samples
    0.30,  # 1_000 samples
    0.20,  # 10_000 samples
    0.10,  # 100_000 samples
    0.05,  # 1_000_000 samples
    0.05,  # 2_000_000 samples
]

In [9]:
%%memit
# Memory profile of fitting DBSCAN (min_samples=10, eps=exp1_eps_array[0])
# on the standardized 100-sample dataset.
dbscan = DBSCAN(eps=exp1_eps_array[0], min_samples=10).fit(exp1_data[100]['X_scaled'])

peak memory: 381.20 MiB, increment: 0.27 MiB


In [10]:
%%memit
# Memory profile of fitting DBSCAN (min_samples=10, eps=exp1_eps_array[1])
# on the standardized 1_000-sample dataset.
dbscan = DBSCAN(eps=exp1_eps_array[1], min_samples=10).fit(exp1_data[1_000]['X_scaled'])

peak memory: 381.55 MiB, increment: 0.33 MiB


In [11]:
%%memit
# Memory profile of fitting DBSCAN (min_samples=10, eps=exp1_eps_array[2])
# on the standardized 10_000-sample dataset.
dbscan = DBSCAN(eps=exp1_eps_array[2], min_samples=10).fit(exp1_data[10_000]['X_scaled'])

peak memory: 403.36 MiB, increment: 21.81 MiB


In [12]:
%%memit
# Memory profile of fitting DBSCAN (min_samples=10, eps=exp1_eps_array[3])
# on the standardized 100_000-sample dataset.
dbscan = DBSCAN(eps=exp1_eps_array[3], min_samples=10).fit(exp1_data[100_000]['X_scaled'])

peak memory: 584.23 MiB, increment: 180.88 MiB


In [13]:
%%memit
# Memory profile of fitting DBSCAN (min_samples=10, eps=exp1_eps_array[4])
# on the standardized 1_000_000-sample dataset.
dbscan = DBSCAN(eps=exp1_eps_array[4], min_samples=10).fit(exp1_data[1_000_000]['X_scaled'])

peak memory: 2201.16 MiB, increment: 1758.53 MiB


In [6]:
%%memit
# Memory profile of fitting DBSCAN (min_samples=10, eps=exp1_eps_array[5])
# on the standardized 2_000_000-sample dataset.
# NOTE(review): the execution count of this cell is lower than the previous
# DBSCAN cell's, so it appears to come from a separate kernel session — confirm.
dbscan = DBSCAN(eps=exp1_eps_array[5], min_samples=10).fit(exp1_data[2_000_000]['X_scaled'])

peak memory: 3946.09 MiB, increment: 3542.77 MiB


### HDBSCAN

In [7]:
import hdbscan

In [8]:
%%memit
# Memory profile of fitting HDBSCAN (min_cluster_size=10, euclidean metric)
# on the standardized 100-sample dataset.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean').fit(exp1_data[100]['X_scaled'])

peak memory: 373.16 MiB, increment: 0.81 MiB


In [9]:
%%memit
# Memory profile of fitting HDBSCAN (min_cluster_size=10, euclidean metric)
# on the standardized 1_000-sample dataset.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean').fit(exp1_data[1_000]['X_scaled'])

peak memory: 373.94 MiB, increment: 0.77 MiB


In [10]:
%%memit
# Memory profile of fitting HDBSCAN (min_cluster_size=10, euclidean metric)
# on the standardized 10_000-sample dataset.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean').fit(exp1_data[10_000]['X_scaled'])

peak memory: 379.12 MiB, increment: 5.19 MiB


In [11]:
%%memit
# Memory profile of fitting HDBSCAN (min_cluster_size=10, euclidean metric)
# on the standardized 100_000-sample dataset.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean').fit(exp1_data[100_000]['X_scaled'])

peak memory: 484.38 MiB, increment: 105.23 MiB


In [12]:
%%memit
# Memory profile of fitting HDBSCAN (min_cluster_size=10, euclidean metric)
# on the standardized 1_000_000-sample dataset.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean').fit(exp1_data[1_000_000]['X_scaled'])

peak memory: 1241.03 MiB, increment: 762.06 MiB


In [13]:
%%memit
# Memory profile of fitting HDBSCAN (min_cluster_size=10, euclidean metric)
# on the standardized 2_000_000-sample dataset.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean').fit(exp1_data[2_000_000]['X_scaled'])

peak memory: 1329.27 MiB, increment: 356.02 MiB


### K-means

In [14]:
from sklearn.cluster import KMeans

In [16]:
%%memit
# Memory profile of fitting KMeans on the standardized 100-sample dataset,
# with n_clusters=exp1_centers and a fixed random_state.
kmeans = KMeans(
    n_clusters=exp1_centers, 
    random_state=exp1_random_state,
).fit(exp1_data[100]['X_scaled'])

peak memory: 758.08 MiB, increment: 0.39 MiB


In [17]:
%%memit
# Memory profile of fitting KMeans on the standardized 1_000-sample dataset,
# with n_clusters=exp1_centers and a fixed random_state.
kmeans = KMeans(
    n_clusters=exp1_centers, 
    random_state=exp1_random_state,
).fit(exp1_data[1_000]['X_scaled'])

peak memory: 758.53 MiB, increment: 0.44 MiB


In [18]:
%%memit
# Memory profile of fitting KMeans on the standardized 10_000-sample dataset,
# with n_clusters=exp1_centers and a fixed random_state.
kmeans = KMeans(
    n_clusters=exp1_centers, 
    random_state=exp1_random_state,
).fit(exp1_data[10_000]['X_scaled'])

peak memory: 734.30 MiB, increment: -24.25 MiB


In [19]:
%%memit
# Memory profile of fitting KMeans on the standardized 100_000-sample dataset,
# with n_clusters=exp1_centers and a fixed random_state.
kmeans = KMeans(
    n_clusters=exp1_centers, 
    random_state=exp1_random_state,
).fit(exp1_data[100_000]['X_scaled'])

peak memory: 765.05 MiB, increment: 30.75 MiB


In [20]:
%%memit
# Memory profile of fitting KMeans on the standardized 1_000_000-sample dataset,
# with n_clusters=exp1_centers and a fixed random_state.
kmeans = KMeans(
    n_clusters=exp1_centers, 
    random_state=exp1_random_state,
).fit(exp1_data[1_000_000]['X_scaled'])

peak memory: 835.55 MiB, increment: 93.23 MiB


In [21]:
%%memit
# Memory profile of fitting KMeans on the standardized 2_000_000-sample dataset,
# with n_clusters=exp1_centers and a fixed random_state.
kmeans = KMeans(
    n_clusters=exp1_centers, 
    random_state=exp1_random_state,
).fit(exp1_data[2_000_000]['X_scaled'])

peak memory: 948.55 MiB, increment: 120.12 MiB


### BIRCH

In [22]:
from sklearn.cluster import Birch

In [23]:
%%memit
# Memory profile of fitting Birch (threshold=0.5, branching_factor=50,
# n_clusters=exp1_centers) on the standardized 100-sample dataset.
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=exp1_centers).fit(exp1_data[100]['X_scaled'])

peak memory: 949.30 MiB, increment: 0.59 MiB


In [24]:
%%memit
# Memory profile of fitting Birch (threshold=0.5, branching_factor=50,
# n_clusters=exp1_centers) on the standardized 1_000-sample dataset.
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=exp1_centers).fit(exp1_data[1_000]['X_scaled'])

peak memory: 949.33 MiB, increment: 0.02 MiB


In [25]:
%%memit
# Memory profile of fitting Birch (threshold=0.5, branching_factor=50,
# n_clusters=exp1_centers) on the standardized 10_000-sample dataset.
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=exp1_centers).fit(exp1_data[10_000]['X_scaled'])

peak memory: 949.36 MiB, increment: 0.03 MiB


In [26]:
%%memit
# Memory profile of fitting Birch (threshold=0.5, branching_factor=50,
# n_clusters=exp1_centers) on the standardized 100_000-sample dataset.
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=exp1_centers).fit(exp1_data[100_000]['X_scaled'])

peak memory: 951.42 MiB, increment: 2.06 MiB


In [27]:
%%memit
# Memory profile of fitting Birch (threshold=0.5, branching_factor=50,
# n_clusters=exp1_centers) on the standardized 1_000_000-sample dataset.
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=exp1_centers).fit(exp1_data[1_000_000]['X_scaled'])

peak memory: 967.42 MiB, increment: 16.00 MiB


In [28]:
%%memit
# Memory profile of fitting Birch (threshold=0.5, branching_factor=50,
# n_clusters=exp1_centers) on the standardized 2_000_000-sample dataset.
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=exp1_centers).fit(exp1_data[2_000_000]['X_scaled'])

peak memory: 967.42 MiB, increment: 0.00 MiB


### Agglomerative clustering

In [29]:
from sklearn.cluster import AgglomerativeClustering

In [30]:
%%memit
# Memory profile of fitting AgglomerativeClustering (n_clusters=exp1_centers)
# on the standardized 100-sample dataset.
agglomerative = AgglomerativeClustering(n_clusters=exp1_centers).fit(exp1_data[100]['X_scaled'])

peak memory: 967.47 MiB, increment: -0.02 MiB


In [31]:
%%memit
# Memory profile of fitting AgglomerativeClustering (n_clusters=exp1_centers)
# on the standardized 1_000-sample dataset.
agglomerative = AgglomerativeClustering(n_clusters=exp1_centers).fit(exp1_data[1_000]['X_scaled'])

peak memory: 967.64 MiB, increment: 0.14 MiB


In [32]:
%%memit
# Memory profile of fitting AgglomerativeClustering (n_clusters=exp1_centers)
# on the standardized 10_000-sample dataset.
agglomerative = AgglomerativeClustering(n_clusters=exp1_centers).fit(exp1_data[10_000]['X_scaled'])

peak memory: 1732.44 MiB, increment: 764.80 MiB


In [None]:
%%memit
# AgglomerativeClustering (n_clusters=exp1_centers) on the standardized
# 100_000-sample dataset.
# NOTE: this run died, so no measurement was recorded for it (the cell has no
# execution count or output). Presumably it ran out of memory, given that the
# 10_000-sample fit already added roughly 765 MiB — TODO confirm the cause.
agglomerative = AgglomerativeClustering(n_clusters=exp1_centers).fit(exp1_data[100_000]['X_scaled']) # died

### Spectral clustering

In [5]:
from sklearn.cluster import SpectralClustering

In [6]:
%%memit
# Memory profile of fitting SpectralClustering on the standardized 100-sample
# dataset, with n_clusters=exp1_centers, k-means label assignment and a fixed
# random_state.
spectral = SpectralClustering(
    n_clusters=exp1_centers, 
    assign_labels='kmeans', 
    random_state=exp1_random_state,
).fit(exp1_data[100]['X_scaled'])

peak memory: 393.25 MiB, increment: 2.67 MiB


In [7]:
%%memit
# Memory profile of fitting SpectralClustering on the standardized 1_000-sample
# dataset, with n_clusters=exp1_centers, k-means label assignment and a fixed
# random_state.
spectral = SpectralClustering(
    n_clusters=exp1_centers, 
    assign_labels='kmeans', 
    random_state=exp1_random_state,
).fit(exp1_data[1_000]['X_scaled'])

peak memory: 464.19 MiB, increment: 70.91 MiB


In [8]:
%%memit
# Memory profile of fitting SpectralClustering on the standardized 10_000-sample
# dataset, with n_clusters=exp1_centers, k-means label assignment and a fixed
# random_state.
spectral = SpectralClustering(
    n_clusters=exp1_centers, 
    assign_labels='kmeans', 
    random_state=exp1_random_state,
).fit(exp1_data[10_000]['X_scaled'])

peak memory: 2979.70 MiB, increment: 2515.52 MiB


In [None]:
%%memit
# SpectralClustering on the standardized 100_000-sample dataset, same settings
# as the smaller runs.
# NOTE(review): this cell has no execution count and no recorded output, so no
# measurement exists for this size. The 10_000-sample run peaked near 3 GiB;
# presumably this size is not feasible on the same machine — TODO confirm.
spectral = SpectralClustering(
    n_clusters=exp1_centers, 
    assign_labels='kmeans', 
    random_state=exp1_random_state,
).fit(exp1_data[100_000]['X_scaled'])