# HDBSCAN
### Imports, RAM check, and data generation

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_blobs
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

!pip install memory-profiler
%load_ext memory_profiler

!pip install hdbscan
import hdbscan

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting memory-profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.61.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hdbscan
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 34.6 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: hdbscan
  Building wheel for hdbscan (PEP 517) ... [?25l[?25hdone
  Created wheel for hdbscan: filename=hdbscan-0.8.29-cp37-cp37m-linux_x86_64.whl size=2340721 sha256=e4d5496efee18feb7a66b1f8efa38067cf30c1a180602d8902820ea4470a79a2
  Stored in directory: /root/.cache/pip/wheels/93/78/

In [2]:
from psutil import virtual_memory

# psutil reports the machine's total memory in bytes; convert to decimal gigabytes.
# NOTE(review): the message below says "available" but the value read is `.total`.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to label a runtime as "high-RAM".
print(
  'You are using a high-RAM runtime!'
  if ram_gb >= 20
  else 'Not using a high-RAM runtime'
)

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [28]:
def create_exp1_data(
    n_samples_array: list, 
    n_features: int = 3, 
    centers: int = 5, 
    random_state: int = 17,
    ):
    """Generate one synthetic dataset per requested sample count.

    For every entry of ``n_samples_array`` a dataset is drawn with
    ``make_blobs`` (using the given ``n_features``, ``centers`` and
    ``random_state``) and additionally passed through
    ``StandardScaler().fit_transform``.

    Args:
        n_samples_array: Sample counts to generate datasets for.
        n_features: Forwarded to ``make_blobs`` as ``n_features``.
        centers: Forwarded to ``make_blobs`` as ``centers``.
        random_state: Forwarded to ``make_blobs`` as ``random_state``.

    Returns:
        A dict keyed by sample count. Each value is a dict with the keys
        ``'X'`` (raw ``make_blobs`` output), ``'X_scaled'`` (the scaled copy)
        and ``'labels_true'`` (the labels returned by ``make_blobs``).
        A sample count that appears more than once keeps only its last entry.
    """
    datasets: dict = {}
    for size in n_samples_array:
        raw_points, true_labels = make_blobs(
            n_samples=size,
            n_features=n_features,
            centers=centers,
            random_state=random_state,
        )
        # Raw and scaled arrays are both kept so callers can pick either one.
        datasets[size] = {
            'X': raw_points,
            'X_scaled': StandardScaler().fit_transform(raw_points),
            'labels_true': true_labels,
        }
    return datasets

# Experiment 1 configuration: generator settings and the dataset sizes to benchmark.
exp1_n_features = 3
exp1_centers = 5
exp1_random_state = 17
exp1_n_samples_array: list = [
    100,
    500, 
    1_000,
    3_000, 
    5_000,
    7_500, 
    10_000,
    15_000, 
    20_000, 
    50_000,
    75_000, 
    100_000,
    500_000, 
    1_000_000, 
    2_000_000, 
    3_000_000,
]

# Pass the configuration explicitly. Previously these constants were defined but
# never forwarded, so changing them had no effect on the generated data.
exp1_data: dict = create_exp1_data(
    n_samples_array=exp1_n_samples_array,
    n_features=exp1_n_features,
    centers=exp1_centers,
    random_state=exp1_random_state,
)

## Algorithm run
#### n = 100

In [4]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=100 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[100]['X_scaled'])

peak memory: 630.31 MiB, increment: 0.38 MiB


In [5]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=100 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[100]['X_scaled'])

2.47 ms ± 33.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### n = 500

In [6]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=500 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[500]['X_scaled'])

peak memory: 641.76 MiB, increment: 0.09 MiB


In [7]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=500 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[500]['X_scaled'])

9.32 ms ± 187 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### n = 1_000

In [8]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=1_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[1_000]['X_scaled'])

peak memory: 641.98 MiB, increment: 0.11 MiB


In [9]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=1_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[1_000]['X_scaled'])

18 ms ± 43.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### n = 3_000

In [10]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=3_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[3_000]['X_scaled'])

peak memory: 642.89 MiB, increment: 0.69 MiB


In [11]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=3_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[3_000]['X_scaled'])

58.7 ms ± 270 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### n = 5_000

In [12]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=5_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[5_000]['X_scaled'])

peak memory: 643.43 MiB, increment: 0.48 MiB


In [13]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=5_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[5_000]['X_scaled'])

110 ms ± 202 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### n = 7_500

In [14]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=7_500 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[7_500]['X_scaled'])

peak memory: 644.09 MiB, increment: 0.62 MiB


In [15]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=7_500 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[7_500]['X_scaled'])

182 ms ± 1.69 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### n = 10_000

In [16]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=10_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[10_000]['X_scaled'])

peak memory: 644.75 MiB, increment: 0.52 MiB


In [17]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=10_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[10_000]['X_scaled'])

263 ms ± 2.79 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 15_000

In [18]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=15_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[15_000]['X_scaled'])

peak memory: 645.43 MiB, increment: 0.45 MiB


In [19]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=15_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[15_000]['X_scaled'])

434 ms ± 4.39 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 20_000

In [20]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=20_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[20_000]['X_scaled'])

peak memory: 646.28 MiB, increment: 0.55 MiB


In [21]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=20_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[20_000]['X_scaled'])

531 ms ± 7.68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 50_000

In [22]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=50_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[50_000]['X_scaled'])

peak memory: 654.41 MiB, increment: 8.05 MiB


In [23]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=50_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[50_000]['X_scaled'])

1.57 s ± 13.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 75_000

In [24]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=75_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[75_000]['X_scaled'])

peak memory: 661.91 MiB, increment: 14.29 MiB


In [25]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=75_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[75_000]['X_scaled'])

2.63 s ± 25.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 100_000

In [26]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=100_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[100_000]['X_scaled'])

peak memory: 691.69 MiB, increment: 25.52 MiB


In [27]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=100_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[100_000]['X_scaled'])

3.85 s ± 88.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 500_000

In [29]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=500_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[500_000]['X_scaled'])

peak memory: 1216.85 MiB, increment: 126.87 MiB


In [30]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=500_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
# Cost note: the recorded output shows roughly 33 s per run over 7 runs, so on a
# comparable runtime this cell takes several minutes.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[500_000]['X_scaled'])

32.7 s ± 223 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 1_000_000

In [31]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=1_000_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[1_000_000]['X_scaled'])

peak memory: 1347.02 MiB, increment: 250.66 MiB


In [32]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=1_000_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
# Cost note: the recorded output shows roughly 1 min 41 s per run over 7 runs, so on
# a comparable runtime this cell takes more than ten minutes.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[1_000_000]['X_scaled'])

1min 41s ± 488 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 2_000_000

In [33]:
# Memory profile (%memit) of fitting HDBSCAN on the standardized n=2_000_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
%memit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[2_000_000]['X_scaled'])

peak memory: 2021.38 MiB, increment: 500.34 MiB


In [34]:
# Timing benchmark (%timeit) of fitting HDBSCAN on the standardized n=2_000_000 dataset,
# using min_cluster_size=5 and the Euclidean metric.
# Cost note: the recorded output shows roughly 5 min 27 s per run over 7 runs, so on
# a comparable runtime this cell takes well over half an hour.
%timeit hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(exp1_data[2_000_000]['X_scaled'])

5min 27s ± 1.75 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
