In [1]:
import numpy as np
import pandas as pd
import hdbscan

from collections import Counter
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import silhouette_score

In [2]:
df = pd.read_pickle('../../data/processed/state2/left_ios_df.pkl')
print(df.shape)

(13467, 8)


In [3]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(X_scaled, columns=df.columns)

In [4]:
def calculate_gini_score(labels):
    counts = Counter(labels)
    if -1 in counts:
        del counts[-1]
    if len(counts) < 2:
        return 0.0 
    y = np.array(sorted(counts.values()))
    n = len(y)
    numerator = sum((2 * (i + 1) - n - 1) * y[i] for i in range(n))
    denominator = n * sum(y)

    gini = numerator / denominator
    
    return gini

In [5]:
results_list = []

In [6]:
# k means model

# possible k values
for k in [2, 3, 4, 5]:
    kmeans_final = KMeans(n_clusters=k, random_state=42)
    kmeans_final.fit(scaled_df)
    final_labels = kmeans_final.labels_

    sil_score = silhouette_score(scaled_df, final_labels)
    noise_percentage = 0.0
    gini_score = calculate_gini_score(final_labels)

    results_list.append({
        'model': 'kmeans',
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': k
    })    


In [7]:
N_total = X_scaled.shape[0]

In [8]:
# DBSCAN modeling
for eps in np.linspace(0.1, 2.0, 20):
    for min_pts in [2, 15]:
        db = DBSCAN(eps=eps, min_samples=min_pts, n_jobs=-1).fit(X_scaled)
        labels = db.labels_
        
        non_noise_mask = labels != -1
        data_non_noise = X_scaled[non_noise_mask]
        labels_non_noise = labels[non_noise_mask]
        n_clusters = len(np.unique(labels_non_noise))
        n_noise = np.sum(labels == -1)
    
        if n_clusters >= 2 and len(labels_non_noise) >= 2:
            sil_score = silhouette_score(data_non_noise, labels_non_noise)
        else:
            sil_score = np.nan 
            
        noise_percentage = (n_noise / N_total) * 100
        gini_score = calculate_gini_score(labels) 

        results_list.append({
            'model': 'dbscan',
            'HP (eps)': eps, 
            'HP (min_samp)': min_pts,
            'sil. #': sil_score,
            'Gini': gini_score,
            'noise %': noise_percentage,
            'clust. #': n_clusters
        })

In [9]:
# HDBSCAN modeling
for min_samp in [2, 15]:
    clusterer = hdbscan.HDBSCAN(min_samples=min_samp)
    clusterer.fit(X_scaled)
    labels = clusterer.labels_
    
    non_noise_mask = labels != -1
    data_non_noise = X_scaled[non_noise_mask]
    labels_non_noise = labels[non_noise_mask]
    n_clusters = len(np.unique(labels_non_noise))
    n_noise = np.sum(labels == -1)

    if n_clusters >= 2 and len(labels_non_noise) >= 2:
        sil_score = silhouette_score(data_non_noise, labels_non_noise)
    else:
        sil_score = np.nan 
        
    noise_percentage = (n_noise / N_total) * 100
    gini_score = calculate_gini_score(labels) 

    results_list.append({
        'model': 'hdbscan',
        'HP (min_samp)': min_samp,
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': n_clusters
    })




In [10]:
results_df = pd.DataFrame(results_list)
results_df.head

<bound method NDFrame.head of       model    sil. #      Gini     noise %  clust. #  HP (eps)  HP (min_samp)
0    kmeans  0.283982  0.221913    0.000000         2       NaN            NaN
1    kmeans  0.272998  0.339398    0.000000         3       NaN            NaN
2    kmeans  0.219891  0.271646    0.000000         4       NaN            NaN
3    kmeans  0.199498  0.276973    0.000000         5       NaN            NaN
4    dbscan  0.908833  0.000000   99.806935        13       0.1            2.0
5    dbscan       NaN  0.000000  100.000000         0       0.1           15.0
6    dbscan  0.510290  0.145445   92.648697       409       0.2            2.0
7    dbscan       NaN  0.000000  100.000000         0       0.2           15.0
8    dbscan -0.045284  0.556197   67.706245       901       0.3            2.0
9    dbscan  0.267328  0.406349   99.220316         3       0.3           15.0
10   dbscan -0.330404  0.773448   41.093042       814       0.4            2.0
11   dbscan -0.130853 

In [11]:
results_df = pd.DataFrame(results_list).sort_values(by=['clust. #', 'Gini', 'noise %'], ascending=[False, True,True])
filtered_results = results_df[results_df['noise %'] <= 15.0]
print(filtered_results.head(20))

     model    sil. #      Gini    noise %  clust. #  HP (eps)  HP (min_samp)
14  dbscan -0.433832  0.940337  13.744709       300       0.6            2.0
16  dbscan -0.357853  0.959788   8.056731       201       0.7            2.0
18  dbscan -0.277772  0.973128   5.101359       107       0.8            2.0
20  dbscan -0.186671  0.974205   3.230118        84       0.9            2.0
22  dbscan -0.090585  0.969489   2.071731        42       1.0            2.0
24  dbscan  0.063126  0.963053   1.396005        31       1.1            2.0
26  dbscan  0.165508  0.954670   1.017302        24       1.2            2.0
28  dbscan  0.245301  0.944630   0.683151        19       1.3            2.0
30  dbscan  0.280852  0.921216   0.534640        13       1.4            2.0
32  dbscan  0.271662  0.898569   0.386129        10       1.5            2.0
34  dbscan  0.272937  0.887598   0.304448         9       1.6            2.0
38  dbscan  0.307460  0.873773   0.155937         8       1.8            2.0