In [1]:
import numpy as np
import pandas as pd
import hdbscan

from collections import Counter
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import silhouette_score

In [2]:
df = pd.read_pickle('../../data/processed/state2/right_ios_df.pkl')
print(df.shape)

(13533, 8)


In [3]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(X_scaled, columns=df.columns)

In [4]:
def calculate_gini_score(labels):
    counts = Counter(labels)
    if -1 in counts:
        del counts[-1]
    if len(counts) < 2:
        return 0.0 
    y = np.array(sorted(counts.values()))
    n = len(y)
    numerator = sum((2 * (i + 1) - n - 1) * y[i] for i in range(n))
    denominator = n * sum(y)

    gini = numerator / denominator
    
    return gini

In [5]:
results_list = []

In [6]:
# k means model

# possible k values
for k in [2, 3, 4, 5]:
    kmeans_final = KMeans(n_clusters=k, random_state=42)
    kmeans_final.fit(scaled_df)
    final_labels = kmeans_final.labels_

    sil_score = silhouette_score(scaled_df, final_labels)
    noise_percentage = 0.0
    gini_score = calculate_gini_score(final_labels)

    results_list.append({
        'model': 'kmeans',
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': k
    })    


In [7]:
N_total = X_scaled.shape[0]

In [8]:
# DBSCAN modeling
for eps in np.linspace(0.1, 2.0, 20):
    for min_pts in [2, 15]:
        db = DBSCAN(eps=eps, min_samples=min_pts, n_jobs=-1).fit(X_scaled)
        labels = db.labels_
        
        non_noise_mask = labels != -1
        data_non_noise = X_scaled[non_noise_mask]
        labels_non_noise = labels[non_noise_mask]
        n_clusters = len(np.unique(labels_non_noise))
        n_noise = np.sum(labels == -1)
    
        if n_clusters >= 2 and len(labels_non_noise) >= 2:
            sil_score = silhouette_score(data_non_noise, labels_non_noise)
        else:
            sil_score = np.nan 
            
        noise_percentage = (n_noise / N_total) * 100
        gini_score = calculate_gini_score(labels) 

        results_list.append({
            'model': 'dbscan',
            'HP (eps)': eps, 
            'HP (min_samp)': min_pts,
            'sil. #': sil_score,
            'Gini': gini_score,
            'noise %': noise_percentage,
            'clust. #': n_clusters
        })

In [9]:
# HDBSCAN modeling
for min_samp in [2, 15]:
    clusterer = hdbscan.HDBSCAN(min_samples=min_samp)
    clusterer.fit(X_scaled)
    labels = clusterer.labels_
    
    non_noise_mask = labels != -1
    data_non_noise = X_scaled[non_noise_mask]
    labels_non_noise = labels[non_noise_mask]
    n_clusters = len(np.unique(labels_non_noise))
    n_noise = np.sum(labels == -1)

    if n_clusters >= 2 and len(labels_non_noise) >= 2:
        sil_score = silhouette_score(data_non_noise, labels_non_noise)
    else:
        sil_score = np.nan 
        
    noise_percentage = (n_noise / N_total) * 100
    gini_score = calculate_gini_score(labels) 

    results_list.append({
        'model': 'hdbscan',
        'HP (min_samp)': min_samp,
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': n_clusters
    })




In [10]:
results_df = pd.DataFrame(results_list)
results_df.head

<bound method NDFrame.head of       model    sil. #      Gini     noise %  clust. #  HP (eps)  HP (min_samp)
0    kmeans  0.297341  0.157430    0.000000         2       NaN            NaN
1    kmeans  0.306596  0.373556    0.000000         3       NaN            NaN
2    kmeans  0.229823  0.246084    0.000000         4       NaN            NaN
3    kmeans  0.208175  0.239415    0.000000         5       NaN            NaN
4    dbscan  0.824743  0.026891   99.741373        17       0.1            2.0
5    dbscan       NaN  0.000000  100.000000         0       0.1           15.0
6    dbscan  0.372308  0.262685   89.063770       527       0.2            2.0
7    dbscan       NaN  0.000000  100.000000         0       0.2           15.0
8    dbscan -0.122812  0.624287   62.883322       871       0.3            2.0
9    dbscan  0.267175  0.635783   95.374270         4       0.3           15.0
10   dbscan -0.403392  0.822416   38.143797       682       0.4            2.0
11   dbscan  0.158808 

In [11]:
results_df = pd.DataFrame(results_list).sort_values(by=['clust. #', 'Gini', 'noise %'], ascending=[False, True,True])
filtered_results = results_df[results_df['noise %'] <= 15.0]
print(filtered_results.head(20))

      model    sil. #      Gini    noise %  clust. #  HP (eps)  HP (min_samp)
14   dbscan -0.436114  0.945081  12.066800       285       0.6            2.0
16   dbscan -0.369079  0.963294   7.293283       174       0.7            2.0
18   dbscan -0.237524  0.971245   4.367103       112       0.8            2.0
20   dbscan -0.178962  0.973425   2.674943        65       0.9            2.0
22   dbscan -0.077553  0.970786   1.596098        47       1.0            2.0
24   dbscan  0.017925  0.959427   1.041898        28       1.1            2.0
26   dbscan  0.146233  0.920740   0.761102        13       1.2            2.0
28   dbscan  0.289946  0.887369   0.576369         9       1.3            2.0
30   dbscan  0.315837  0.873850   0.413803         8       1.4            2.0
32   dbscan  0.340028  0.856190   0.317742         7       1.5            2.0
3    kmeans  0.208175  0.239415   0.000000         5       NaN            NaN
2    kmeans  0.229823  0.246084   0.000000         4       NaN  