# Algorithm tryouts
## General imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_blobs
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

%load_ext memory_profiler

## Data generation

In [8]:
# Experiment 1 sweeps the number of samples; these are the fixed settings.
exp1_n_features = 3
exp1_centers = 5
exp1_random_state = 17
def create_exp1_data(
    n_samples_array: list,
    n_features: int = exp1_n_features,
    centers: int = exp1_centers,
    random_state: int = exp1_random_state,
) -> dict:
    """Build one ``make_blobs`` dataset per requested sample count.

    The defaults are taken from the ``exp1_*`` constants defined just above
    (evaluated once, when this ``def`` runs) instead of repeating the same
    literals, so the constants are the single place to tune experiment 1.

    Args:
        n_samples_array: Sample counts to generate; each value becomes a key
            of the returned dict.
        n_features: Forwarded to ``make_blobs(n_features=...)``.
        centers: Forwarded to ``make_blobs(centers=...)``.
        random_state: Forwarded to ``make_blobs(random_state=...)``.

    Returns:
        ``{n_sample: {'X': ..., 'X_scaled': ..., 'labels_true': ...}}`` where
        ``X`` and ``labels_true`` are the two values returned by
        ``make_blobs`` and ``X_scaled`` is
        ``StandardScaler().fit_transform(X)``. An empty input yields ``{}``.
    """
    global_storage: dict = {}
    for n_sample in n_samples_array:
        X, labels_true = make_blobs(
            n_samples=n_sample,
            n_features=n_features,
            centers=centers,
            random_state=random_state,
        )
        # A fresh scaler per dataset: nothing is shared between sample counts.
        global_storage[n_sample] = {
            'X': X,
            'X_scaled': StandardScaler().fit_transform(X),
            'labels_true': labels_true,
        }
    return global_storage

# Experiment 2 sweeps the number of features; these are the fixed settings.
exp2_n_samples = 1_000
exp2_centers = 5
exp2_random_state = 17
def create_exp2_data(
    n_features_array: list,
    n_samples: int = exp2_n_samples,
    centers: int = exp2_centers,
    random_state: int = exp2_random_state,
) -> dict:
    """Build one ``make_blobs`` dataset per requested feature count.

    The defaults are taken from the ``exp2_*`` constants defined just above
    (evaluated once, when this ``def`` runs) instead of repeating the same
    literals, so the constants are the single place to tune experiment 2.

    Args:
        n_features_array: Feature counts to generate; each value becomes a
            key of the returned dict.
        n_samples: Forwarded to ``make_blobs(n_samples=...)``.
        centers: Forwarded to ``make_blobs(centers=...)``.
        random_state: Forwarded to ``make_blobs(random_state=...)``.

    Returns:
        ``{n_feature: {'X': ..., 'X_scaled': ..., 'labels_true': ...}}`` where
        ``X`` and ``labels_true`` are the two values returned by
        ``make_blobs`` and ``X_scaled`` is
        ``StandardScaler().fit_transform(X)``. An empty input yields ``{}``.
    """
    global_storage: dict = {}
    for n_feature in n_features_array:
        X, labels_true = make_blobs(
            n_samples=n_samples,
            n_features=n_feature,
            centers=centers,
            random_state=random_state,
        )
        # A fresh scaler per dataset: nothing is shared between feature counts.
        global_storage[n_feature] = {
            'X': X,
            'X_scaled': StandardScaler().fit_transform(X),
            'labels_true': labels_true,
        }
    return global_storage

In [30]:
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from matplotlib import pyplot as plt

def apply_dbscan(exp_data: dict, n_samples_array: list, eps_array: list, min_samples: int = 10) -> None:
    """Run DBSCAN on each selected dataset and store its scores in place.

    For every key in ``n_samples_array`` the entry ``exp_data[key]`` is read
    (``'X_scaled'`` and ``'labels_true'``) and gains two new items,
    ``'rand_score'`` and ``'homogeneity_score'``. Nothing is returned.

    Args:
        exp_data: Mapping from dataset key to a dict holding at least
            ``'X_scaled'`` and ``'labels_true'``; mutated in place.
        n_samples_array: Keys of ``exp_data`` to process, in order.
        eps_array: DBSCAN ``eps`` values, matched to ``n_samples_array`` by
            position. Extra trailing values are ignored.
        min_samples: DBSCAN ``min_samples``; defaults to the previously
            hard-coded value of 10.

    Raises:
        IndexError: If ``eps_array`` has fewer entries than
            ``n_samples_array``. Raised before any clustering starts so that
            ``exp_data`` is never left half-updated after expensive fits.
        KeyError: If a key is missing from ``exp_data`` or an entry lacks
            ``'X_scaled'`` / ``'labels_true'``.
    """
    dataset_keys = list(n_samples_array)
    if len(eps_array) < len(dataset_keys):
        raise IndexError(
            f"eps_array has {len(eps_array)} value(s) but "
            f"{len(dataset_keys)} dataset key(s) were given"
        )

    for n_sample, eps in zip(dataset_keys, eps_array):
        dataset = exp_data[n_sample]
        dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(dataset['X_scaled'])
        dbscan_labels_pred = dbscan.labels_

        # Compute both scores before storing either, so a scoring failure
        # does not leave the entry with only one of the two metrics.
        rand_score: float = metrics.rand_score(dataset['labels_true'], dbscan_labels_pred)
        homogeneity_score: float = metrics.homogeneity_score(dataset['labels_true'], dbscan_labels_pred)

        dataset['rand_score'] = rand_score
        dataset['homogeneity_score'] = homogeneity_score
        
# for n_sample in exp1_n_samples_array:
#     neighbors = NearestNeighbors(n_neighbors=exp1_n_features*2).fit(exp1_data[n_sample]['X_scaled'])
#     distances, indices = neighbors.kneighbors(exp1_data[n_sample]['X_scaled'])

#     distances = np.sort(distances, axis=0)
#     distances = distances[:,1]
#     plt.plot(distances)
#     plt.show()

In [3]:
# Values swept by each experiment: sample counts for experiment 1 and
# feature counts for experiment 2. They also serve as the keys of the
# dicts built below.
# NOTE(review): this cell carries execution count In [3] while the cell that
# defines create_exp1_data / create_exp2_data carries In [8], so the stored
# data was presumably built by an earlier revision of those functions —
# confirm with Restart & Run All.
exp1_n_samples_array: list = [100, 1_000, 10_000, 100_000, 1_000_000]
exp2_n_features_array: list = [3, 10, 100, 500, 1_000]

# One dataset per swept value; all other settings use the function defaults.
exp1_data: dict = create_exp1_data(n_samples_array=exp1_n_samples_array)
exp2_data: dict = create_exp2_data(n_features_array=exp2_n_features_array)

## DBSCAN

In [31]:
# One DBSCAN radius per entry of exp1_n_samples_array, matched by position.
exp1_eps_array: list = [0.40, 0.30, 0.20, 0.10, 0.05]
apply_dbscan(
    exp_data=exp1_data,
    n_samples_array=exp1_n_samples_array,
    eps_array=exp1_eps_array,
)

# Report the two scores stored by apply_dbscan for each sample count,
# followed by a separator line.
for i in exp1_n_samples_array:
    exp1_scores = exp1_data[i]
    print(exp1_scores['rand_score'])
    print(exp1_scores['homogeneity_score'])
    print('-----------------')

NameError: name 'eps_array' is not defined

## BIRCH

In [7]:
%%timeit
%%memit
# NOTE(review): `centers` and `X_scaled` are not assigned at notebook level by
# any cell visible here (they only occur as a parameter / local inside the
# create_exp*_data functions), so this cell presumably relies on leftover
# kernel state — confirm it survives Restart & Run All.
from sklearn.cluster import Birch
# Fit BIRCH, then keep the fitted estimator's labels for the scoring cell.
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=centers).fit(X_scaled)
birch_labels_pred = birch.labels_

peak memory: 1260.95 MiB, increment: 842.17 MiB
peak memory: 1301.30 MiB, increment: 803.03 MiB
peak memory: 1309.78 MiB, increment: 770.50 MiB
peak memory: 1340.16 MiB, increment: 792.61 MiB
peak memory: 1348.56 MiB, increment: 770.47 MiB
peak memory: 1348.91 MiB, increment: 762.80 MiB
peak memory: 1349.77 MiB, increment: 763.12 MiB
peak memory: 1378.83 MiB, increment: 791.83 MiB
2.89 s ± 13.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Compare the BIRCH labels with `labels_true`: Rand score, then homogeneity.
# NOTE(review): `labels_true` is not assigned at notebook level in the
# visible cells — confirm where it comes from.
print(metrics.rand_score(labels_true, birch_labels_pred))
print(metrics.homogeneity_score(labels_true, birch_labels_pred))

1.0
1.0


## Spectral clustering

In [9]:
%%timeit
%%memit
# NOTE(review): `centers`, `random_state` and `X_scaled` are not assigned at
# notebook level by any cell visible here, so this cell presumably relies on
# leftover kernel state — confirm it survives Restart & Run All.
from sklearn.cluster import SpectralClustering
# Fit spectral clustering, then keep the fitted labels for the scoring cell.
spectral = SpectralClustering(
    n_clusters=centers, 
    assign_labels='kmeans', 
    random_state=random_state,
).fit(X_scaled)
spectral_labels_pred = spectral.labels_

peak memory: 3125.44 MiB, increment: 2508.66 MiB
peak memory: 3378.06 MiB, increment: 2283.33 MiB
peak memory: 3166.78 MiB, increment: 2958.25 MiB
peak memory: 3144.28 MiB, increment: 2826.38 MiB
peak memory: 3332.25 MiB, increment: 3134.72 MiB
peak memory: 4034.67 MiB, increment: 3034.47 MiB
peak memory: 3771.92 MiB, increment: 2965.27 MiB
peak memory: 3187.27 MiB, increment: 2776.31 MiB
11 s ± 309 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Compare the spectral labels with `labels_true`: Rand score, then homogeneity.
# NOTE(review): `labels_true` is not assigned at notebook level in the
# visible cells — confirm where it comes from.
print(metrics.rand_score(labels_true, spectral_labels_pred))
print(metrics.homogeneity_score(labels_true, spectral_labels_pred))

1.0
1.0


## K-means

In [11]:
%%timeit
%%memit
# NOTE(review): `centers`, `random_state` and `X_scaled` are not assigned at
# notebook level by any cell visible here, so this cell presumably relies on
# leftover kernel state — confirm it survives Restart & Run All.
from sklearn.cluster import KMeans
# Fit K-means, then keep the fitted labels for the scoring cell.
kmeans = KMeans(
    n_clusters=centers, 
    random_state=random_state
).fit(X_scaled)
kmeans_labels_pred = kmeans.labels_

peak memory: 237.52 MiB, increment: 28.03 MiB
peak memory: 239.53 MiB, increment: 2.02 MiB
peak memory: 246.25 MiB, increment: 6.72 MiB
peak memory: 246.42 MiB, increment: 0.17 MiB
peak memory: 246.58 MiB, increment: 0.16 MiB
peak memory: 246.72 MiB, increment: 0.14 MiB
peak memory: 246.83 MiB, increment: 0.11 MiB
peak memory: 247.00 MiB, increment: 0.17 MiB
833 ms ± 155 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# Compare the K-means labels with `labels_true`: Rand score, then homogeneity.
# NOTE(review): `labels_true` is not assigned at notebook level in the
# visible cells — confirm where it comes from.
print(metrics.rand_score(labels_true, kmeans_labels_pred))
print(metrics.homogeneity_score(labels_true, kmeans_labels_pred))

1.0
1.0


## Agglomerative clustering

In [13]:
%%timeit
%%memit
# NOTE(review): `centers` and `X_scaled` are not assigned at notebook level by
# any cell visible here, so this cell presumably relies on leftover kernel
# state — confirm it survives Restart & Run All.
from sklearn.cluster import AgglomerativeClustering
# Fit agglomerative clustering, then keep the fitted labels for scoring.
agglomerative = AgglomerativeClustering(n_clusters=centers).fit(X_scaled)
agglomerative_labels_pred = agglomerative.labels_

peak memory: 1057.98 MiB, increment: 810.92 MiB
peak memory: 1058.61 MiB, increment: 762.89 MiB
peak memory: 1058.86 MiB, increment: 762.88 MiB
peak memory: 1058.86 MiB, increment: 762.88 MiB
peak memory: 1058.86 MiB, increment: 762.88 MiB
peak memory: 1058.88 MiB, increment: 762.89 MiB
peak memory: 1058.88 MiB, increment: 762.88 MiB
peak memory: 1058.89 MiB, increment: 762.88 MiB
2.98 s ± 3.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Compare the agglomerative labels with `labels_true`: Rand score, then
# homogeneity.
# NOTE(review): `labels_true` is not assigned at notebook level in the
# visible cells — confirm where it comes from.
print(metrics.rand_score(labels_true, agglomerative_labels_pred))
print(metrics.homogeneity_score(labels_true, agglomerative_labels_pred))

1.0
1.0


## HDBSCAN

In [15]:
%%timeit
%%memit
# NOTE(review): `X_scaled` is not assigned at notebook level by any cell
# visible here, so this cell presumably relies on leftover kernel state —
# confirm it survives Restart & Run All.
import hdbscan
# Fit HDBSCAN, then keep the fitted labels for the scoring cell.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean').fit(X_scaled)
hdbscan_labels_pred = hdbscan_clusterer.labels_

peak memory: 302.11 MiB, increment: 6.05 MiB
peak memory: 302.12 MiB, increment: 0.02 MiB
peak memory: 303.09 MiB, increment: 0.97 MiB
peak memory: 305.91 MiB, increment: 2.81 MiB
peak memory: 305.91 MiB, increment: 0.00 MiB
peak memory: 306.34 MiB, increment: 0.44 MiB
peak memory: 306.53 MiB, increment: 0.19 MiB
peak memory: 306.52 MiB, increment: 0.06 MiB
6.06 s ± 23.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# Compare the HDBSCAN labels with `labels_true`: Rand score, then homogeneity.
# NOTE(review): `labels_true` is not assigned at notebook level in the
# visible cells — confirm where it comes from.
print(metrics.rand_score(labels_true, hdbscan_labels_pred))
print(metrics.homogeneity_score(labels_true, hdbscan_labels_pred))

1.0
1.0
