In [1]:
import numpy as np
import pandas as pd
import hdbscan

from collections import Counter
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import silhouette_score

In [2]:
df = pd.read_pickle('../../data/processed/state0/right_ios_df.pkl')
print(df.shape)

(9603, 8)


In [3]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(X_scaled, columns=df.columns)

In [4]:
def calculate_gini_score(labels):
    counts = Counter(labels)
    if -1 in counts:
        del counts[-1]
    if len(counts) < 2:
        return 0.0 
    y = np.array(sorted(counts.values()))
    n = len(y)
    numerator = sum((2 * (i + 1) - n - 1) * y[i] for i in range(n))
    denominator = n * sum(y)

    gini = numerator / denominator
    
    return gini

In [5]:
results_list = []

In [6]:
# k means model

# possible k values
for k in [2, 3, 4, 5]:
    kmeans_final = KMeans(n_clusters=k, random_state=42)
    kmeans_final.fit(scaled_df)
    final_labels = kmeans_final.labels_

    sil_score = silhouette_score(scaled_df, final_labels)
    noise_percentage = 0.0
    gini_score = calculate_gini_score(final_labels)

    results_list.append({
        'model': 'kmeans',
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': k
    })    


In [7]:
N_total = X_scaled.shape[0]

In [8]:
# DBSCAN modeling
for eps in np.linspace(0.1, 2.0, 20):
    for min_pts in [2, 15]:
        db = DBSCAN(eps=eps, min_samples=min_pts, n_jobs=-1).fit(X_scaled)
        labels = db.labels_
        
        non_noise_mask = labels != -1
        data_non_noise = X_scaled[non_noise_mask]
        labels_non_noise = labels[non_noise_mask]
        n_clusters = len(np.unique(labels_non_noise))
        n_noise = np.sum(labels == -1)
    
        if n_clusters >= 2 and len(labels_non_noise) >= 2:
            sil_score = silhouette_score(data_non_noise, labels_non_noise)
        else:
            sil_score = np.nan 
            
        noise_percentage = (n_noise / N_total) * 100
        gini_score = calculate_gini_score(labels) 

        results_list.append({
            'model': 'dbscan',
            'HP (eps)': eps, 
            'HP (min_samp)': min_pts,
            'sil. #': sil_score,
            'Gini': gini_score,
            'noise %': noise_percentage,
            'clust. #': n_clusters
        })

In [9]:
# HDBSCAN modeling
for min_samp in [2, 15]:
    clusterer = hdbscan.HDBSCAN(min_samples=min_samp)
    clusterer.fit(X_scaled)
    labels = clusterer.labels_
    
    non_noise_mask = labels != -1
    data_non_noise = X_scaled[non_noise_mask]
    labels_non_noise = labels[non_noise_mask]
    n_clusters = len(np.unique(labels_non_noise))
    n_noise = np.sum(labels == -1)

    if n_clusters >= 2 and len(labels_non_noise) >= 2:
        sil_score = silhouette_score(data_non_noise, labels_non_noise)
    else:
        sil_score = np.nan 
        
    noise_percentage = (n_noise / N_total) * 100
    gini_score = calculate_gini_score(labels) 

    results_list.append({
        'model': 'hdbscan',
        'HP (min_samp)': min_samp,
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': n_clusters
    })




In [10]:
results_df = pd.DataFrame(results_list)
results_df.head

<bound method NDFrame.head of       model    sil. #      Gini     noise %  clust. #  HP (eps)  HP (min_samp)
0    kmeans  0.319784  0.219254    0.000000         2       NaN            NaN
1    kmeans  0.300638  0.164462    0.000000         3       NaN            NaN
2    kmeans  0.313956  0.274940    0.000000         4       NaN            NaN
3    kmeans  0.317130  0.418911    0.000000         5       NaN            NaN
4    dbscan  0.440146  0.264117   90.711236       304       0.1            2.0
5    dbscan       NaN  0.000000  100.000000         0       0.1           15.0
6    dbscan -0.165920  0.688766   66.260544       477       0.2            2.0
7    dbscan  0.005197  0.705971   91.367281         4       0.2           15.0
8    dbscan -0.179078  0.784829   44.985942       502       0.3            2.0
9    dbscan  0.413097  0.490288   75.518067         3       0.3           15.0
10   dbscan -0.467090  0.868974   29.334583       398       0.4            2.0
11   dbscan  0.272749 

In [11]:
results_df = pd.DataFrame(results_list).sort_values(by=['clust. #', 'Gini', 'noise %'], ascending=[False, True,True])
filtered_results = results_df[results_df['noise %'] <= 15.0]
print(filtered_results.head(20))

      model    sil. #      Gini    noise %  clust. #  HP (eps)  HP (min_samp)
14   dbscan -0.434927  0.952589  11.298553       158       0.6            2.0
16   dbscan -0.394015  0.966565   7.185255        95       0.7            2.0
18   dbscan -0.249662  0.969131   4.665209        61       0.8            2.0
20   dbscan -0.210034  0.967182   3.228158        44       0.9            2.0
22   dbscan -0.204562  0.963668   2.165990        35       1.0            2.0
24   dbscan  0.109661  0.949783   1.614079        22       1.1            2.0
26   dbscan  0.139458  0.930183   1.270436        15       1.2            2.0
28   dbscan  0.276025  0.897726   1.082995        10       1.3            2.0
30   dbscan  0.273182  0.898046   0.874727        10       1.4            2.0
32   dbscan  0.273617  0.898049   0.697699        10       1.5            2.0
34   dbscan  0.409542  0.855556   0.603978         7       1.6            2.0
36   dbscan  0.432305  0.831903   0.510257         6       1.7  