# K-means
## Imports, RAM check, and data generation

In [34]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_blobs
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

!pip install memory-profiler
%load_ext memory_profiler

from sklearn.cluster import KMeans

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [35]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
# Note: this reads `.total`, even though the message below says "available".
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to call a runtime "high-RAM".
print(
    'You are using a high-RAM runtime!'
    if ram_gb >= 20
    else 'Not using a high-RAM runtime'
)

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
def create_exp1_data(
    n_samples_array: list,
    n_features: int = 3,
    centers: int = 5,
    random_state: int = 17,
) -> dict:
    """Build one synthetic blob dataset per requested sample count.

    Every dataset is produced by ``make_blobs`` with the same ``n_features``,
    ``centers`` and ``random_state``; only the number of samples varies.

    Args:
        n_samples_array: Sample counts to generate; each becomes a key of the
            returned mapping.
        n_features: Number of features passed to ``make_blobs``.
        centers: Number of blob centers passed to ``make_blobs``.
        random_state: Seed passed to ``make_blobs``.

    Returns:
        A dict keyed by sample count. Each value is a dict with the keys
        ``'X'`` (generated points), ``'X_scaled'`` (the same points after
        ``StandardScaler().fit_transform``) and ``'labels_true'`` (the labels
        returned by ``make_blobs``).
    """
    datasets: dict = {}
    for size in n_samples_array:
        points, true_labels = make_blobs(
            n_samples=size,
            n_features=n_features,
            centers=centers,
            random_state=random_state,
        )
        # A fresh scaler per dataset, fitted on that dataset only.
        datasets[size] = {
            'X': points,
            'X_scaled': StandardScaler().fit_transform(points),
            'labels_true': true_labels,
        }
    return datasets

# Experiment 1 configuration. These values drive both the data generation
# below and the KMeans runs later in the notebook (n_clusters / random_state).
exp1_n_features = 3
exp1_centers = 5
exp1_random_state = 17

# Dataset sizes to benchmark; each size becomes a key of `exp1_data`.
exp1_n_samples_array: list = [
    100,
    500, 
    1_000,
    3_000, 
    5_000,
    7_500, 
    10_000,
    15_000, 
    20_000, 
    50_000,
    75_000, 
    100_000,
    500_000, 
    1_000_000, 
    2_000_000, 
    3_000_000,
]

# Pass the configuration explicitly instead of relying on the function's
# defaults happening to match: otherwise editing exp1_centers (or the other
# constants) would change the KMeans calls but not the generated data.
exp1_data: dict = create_exp1_data(
    n_samples_array=exp1_n_samples_array,
    n_features=exp1_n_features,
    centers=exp1_centers,
    random_state=exp1_random_state,
)

## Algorithm run
#### n = 100

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 100-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[100]['X_scaled'])

peak memory: 625.36 MiB, increment: 0.94 MiB


In [None]:
# Wall-clock timing of the same fit on the 100-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[100]['X_scaled'])

18.3 ms ± 980 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### n = 500

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 500-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[500]['X_scaled'])

peak memory: 625.50 MiB, increment: 0.00 MiB


In [None]:
# Wall-clock timing of the same fit on the 500-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[500]['X_scaled'])

27.7 ms ± 5.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 1_000

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 1,000-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[1_000]['X_scaled'])

peak memory: 636.46 MiB, increment: 0.00 MiB


In [None]:
# Wall-clock timing of the same fit on the 1,000-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[1_000]['X_scaled'])

25 ms ± 644 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### n = 3_000

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 3,000-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[3_000]['X_scaled'])

peak memory: 636.61 MiB, increment: 0.07 MiB


In [None]:
# Wall-clock timing of the same fit on the 3,000-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[3_000]['X_scaled'])

258 ms ± 41.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 5_000

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 5,000-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[5_000]['X_scaled'])

peak memory: 636.72 MiB, increment: 0.09 MiB


In [None]:
# Wall-clock timing of the same fit on the 5,000-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[5_000]['X_scaled'])

1.56 s ± 217 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 7_500

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 7,500-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[7_500]['X_scaled'])

peak memory: 636.80 MiB, increment: 0.01 MiB


In [None]:
# Wall-clock timing of the same fit on the 7,500-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[7_500]['X_scaled'])

1.48 s ± 54.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 10_000

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 10,000-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[10_000]['X_scaled'])

peak memory: 636.80 MiB, increment: 0.00 MiB


In [None]:
# Wall-clock timing of the same fit on the 10,000-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[10_000]['X_scaled'])

1.48 s ± 41.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 15_000

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 15,000-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[15_000]['X_scaled'])

peak memory: 636.86 MiB, increment: 0.05 MiB


In [None]:
# Wall-clock timing of the same fit on the 15,000-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[15_000]['X_scaled'])

1.52 s ± 66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 20_000

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 20,000-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[20_000]['X_scaled'])

peak memory: 636.87 MiB, increment: 0.01 MiB


In [None]:
# Wall-clock timing of the same fit on the 20,000-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[20_000]['X_scaled'])

1.72 s ± 314 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 50_000

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 50,000-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[50_000]['X_scaled'])

peak memory: 636.90 MiB, increment: 0.04 MiB


In [None]:
# Wall-clock timing of the same fit on the 50,000-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[50_000]['X_scaled'])

1.93 s ± 263 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 75_000

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 75,000-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[75_000]['X_scaled'])

peak memory: 637.26 MiB, increment: 0.11 MiB


In [None]:
# Wall-clock timing of the same fit on the 75,000-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[75_000]['X_scaled'])

1.82 s ± 106 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 100_000

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 100,000-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[100_000]['X_scaled'])

peak memory: 637.27 MiB, increment: 0.00 MiB


In [None]:
# Wall-clock timing of the same fit on the 100,000-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[100_000]['X_scaled'])

1.83 s ± 53.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 500_000

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 500,000-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[500_000]['X_scaled'])

peak memory: 637.29 MiB, increment: 0.02 MiB


In [None]:
# Wall-clock timing of the same fit on the 500,000-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[500_000]['X_scaled'])

3.18 s ± 193 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 1_000_000

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 1,000,000-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[1_000_000]['X_scaled'])

peak memory: 690.25 MiB, increment: 52.96 MiB


In [None]:
# Wall-clock timing of the same fit on the 1,000,000-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[1_000_000]['X_scaled'])

4.4 s ± 324 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### n = 2_000_000

In [None]:
# Peak-memory profile of one KMeans fit (k = exp1_centers) on the standardized 2,000,000-sample dataset.
%memit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[2_000_000]['X_scaled'])

peak memory: 789.77 MiB, increment: 99.45 MiB


In [None]:
# Wall-clock timing of the same fit on the 2,000,000-sample dataset; no -n/-r given, so %timeit picks the loop count.
%timeit kmeans = KMeans(n_clusters=exp1_centers, random_state=exp1_random_state).fit(exp1_data[2_000_000]['X_scaled'])

7.23 s ± 184 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
