In [1]:
import numpy as np
import pandas as pd
import hdbscan

from collections import Counter
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import silhouette_score

In [2]:
df = pd.read_pickle('../../data/processed/state0/right_android_df.pkl')
print(df.shape)

(153, 7)


In [3]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(X_scaled, columns=df.columns)

In [4]:
def calculate_gini_score(labels):
    counts = Counter(labels)
    if -1 in counts:
        del counts[-1]
    if len(counts) < 2:
        return 0.0 
    y = np.array(sorted(counts.values()))
    n = len(y)
    numerator = sum((2 * (i + 1) - n - 1) * y[i] for i in range(n))
    denominator = n * sum(y)

    gini = numerator / denominator
    
    return gini

In [5]:
results_list = []

In [7]:
# k means model

# possible k values
for k in [2, 3, 4, 5]:
    kmeans_final = KMeans(n_clusters=k, random_state=42)
    kmeans_final.fit(scaled_df)
    final_labels = kmeans_final.labels_

    sil_score = silhouette_score(scaled_df, final_labels)
    noise_percentage = 0.0
    gini_score = calculate_gini_score(final_labels)

    results_list.append({
        'model': 'kmeans',
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': k
    })    


In [8]:
N_total = X_scaled.shape[0]

In [10]:
# DBSCAN modeling
for eps in np.linspace(0.1, 2.0, 20):
    for min_pts in [2, 15]:
        db = DBSCAN(eps=eps, min_samples=min_pts, n_jobs=-1).fit(X_scaled)
        labels = db.labels_
        
        non_noise_mask = labels != -1
        data_non_noise = X_scaled[non_noise_mask]
        labels_non_noise = labels[non_noise_mask]
        n_clusters = len(np.unique(labels_non_noise))
        n_noise = np.sum(labels == -1)
    
        if n_clusters >= 2 and len(labels_non_noise) >= 2:
            sil_score = silhouette_score(data_non_noise, labels_non_noise)
        else:
            sil_score = np.nan 
            
        noise_percentage = (n_noise / N_total) * 100
        gini_score = calculate_gini_score(labels) 

        results_list.append({
            'model': 'dbscan',
            'HP (eps)': eps, 
            'HP (min_samp)': min_pts,
            'sil. #': sil_score,
            'Gini': gini_score,
            'noise %': noise_percentage,
            'clust. #': n_clusters
        })

In [11]:
# HDBSCAN modeling
for min_samp in [2, 15]:
    clusterer = hdbscan.HDBSCAN(min_samples=min_samp)
    clusterer.fit(X_scaled)
    labels = clusterer.labels_
    
    non_noise_mask = labels != -1
    data_non_noise = X_scaled[non_noise_mask]
    labels_non_noise = labels[non_noise_mask]
    n_clusters = len(np.unique(labels_non_noise))
    n_noise = np.sum(labels == -1)

    if n_clusters >= 2 and len(labels_non_noise) >= 2:
        sil_score = silhouette_score(data_non_noise, labels_non_noise)
    else:
        sil_score = np.nan 
        
    noise_percentage = (n_noise / N_total) * 100
    gini_score = calculate_gini_score(labels) 

    results_list.append({
        'model': 'hdbscan',
        'HP (min_samp)': min_samp,
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': n_clusters
    })




In [12]:
results_df = pd.DataFrame(results_list)
results_df.head

<bound method NDFrame.head of       model    sil. #      Gini     noise %  clust. #  HP (eps)  HP (min_samp)
0    kmeans  0.397719  0.356209    0.000000         2       NaN            NaN
1    kmeans  0.163444  0.152505    0.000000         3       NaN            NaN
2    kmeans  0.246736  0.197712    0.000000         4       NaN            NaN
3    kmeans  0.245855  0.177778    0.000000         5       NaN            NaN
4    kmeans  0.397719  0.356209    0.000000         2       NaN            NaN
..      ...       ...       ...         ...       ...       ...            ...
85   dbscan       NaN  0.000000   20.261438         1       1.9           15.0
86   dbscan  0.312366  0.472603    4.575163         2       2.0            2.0
87   dbscan       NaN  0.000000   16.993464         1       2.0           15.0
88  hdbscan  0.502256  0.449153   22.875817         2       NaN            2.0
89  hdbscan       NaN  0.000000  100.000000         0       NaN           15.0

[90 rows x 7 columns]

In [13]:
results_df = pd.DataFrame(results_list).sort_values(by=['clust. #', 'Gini', 'noise %'], ascending=[False, True,True])
filtered_results = results_df[results_df['noise %'] <= 15.0]
print(filtered_results.head(20))

     model    sil. #      Gini    noise %  clust. #  HP (eps)  HP (min_samp)
3   kmeans  0.245855  0.177778   0.000000         5       NaN            NaN
7   kmeans  0.245855  0.177778   0.000000         5       NaN            NaN
36  dbscan -0.042470  0.741176  11.111111         5       1.5            2.0
76  dbscan -0.042470  0.741176  11.111111         5       1.5            2.0
2   kmeans  0.246736  0.197712   0.000000         4       NaN            NaN
6   kmeans  0.246736  0.197712   0.000000         4       NaN            NaN
1   kmeans  0.163444  0.152505   0.000000         3       NaN            NaN
5   kmeans  0.163444  0.152505   0.000000         3       NaN            NaN
44  dbscan  0.202851  0.624709   6.535948         3       1.9            2.0
84  dbscan  0.202851  0.624709   6.535948         3       1.9            2.0
42  dbscan  0.160920  0.632850   9.803922         3       1.8            2.0
82  dbscan  0.160920  0.632850   9.803922         3       1.8            2.0