In [1]:
import numpy as np
import pandas as pd
import hdbscan

from collections import Counter
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import silhouette_score

In [2]:
df = pd.read_pickle('../../data/processed/state2/left_android_df.pkl')
print(df.shape)

(260, 7)


In [3]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(X_scaled, columns=df.columns)

In [4]:
def calculate_gini_score(labels):
    counts = Counter(labels)
    if -1 in counts:
        del counts[-1]
    if len(counts) < 2:
        return 0.0 
    y = np.array(sorted(counts.values()))
    n = len(y)
    numerator = sum((2 * (i + 1) - n - 1) * y[i] for i in range(n))
    denominator = n * sum(y)

    gini = numerator / denominator
    
    return gini

In [5]:
results_list = []

In [6]:
# k means model

# possible k values
for k in [2, 3, 4, 5]:
    kmeans_final = KMeans(n_clusters=k, random_state=42)
    kmeans_final.fit(scaled_df)
    final_labels = kmeans_final.labels_

    sil_score = silhouette_score(scaled_df, final_labels)
    noise_percentage = 0.0
    gini_score = calculate_gini_score(final_labels)

    results_list.append({
        'model': 'kmeans',
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': k
    })    


In [7]:
N_total = X_scaled.shape[0]

In [8]:
# DBSCAN modeling
for eps in np.linspace(0.1, 2.0, 20):
    for min_pts in [2, 15]:
        db = DBSCAN(eps=eps, min_samples=min_pts, n_jobs=-1).fit(X_scaled)
        labels = db.labels_
        
        non_noise_mask = labels != -1
        data_non_noise = X_scaled[non_noise_mask]
        labels_non_noise = labels[non_noise_mask]
        n_clusters = len(np.unique(labels_non_noise))
        n_noise = np.sum(labels == -1)
    
        if n_clusters >= 2 and len(labels_non_noise) >= 2:
            sil_score = silhouette_score(data_non_noise, labels_non_noise)
        else:
            sil_score = np.nan 
            
        noise_percentage = (n_noise / N_total) * 100
        gini_score = calculate_gini_score(labels) 

        results_list.append({
            'model': 'dbscan',
            'HP (eps)': eps, 
            'HP (min_samp)': min_pts,
            'sil. #': sil_score,
            'Gini': gini_score,
            'noise %': noise_percentage,
            'clust. #': n_clusters
        })

In [9]:
# HDBSCAN modeling
for min_samp in [2, 15]:
    clusterer = hdbscan.HDBSCAN(min_samples=min_samp)
    clusterer.fit(X_scaled)
    labels = clusterer.labels_
    
    non_noise_mask = labels != -1
    data_non_noise = X_scaled[non_noise_mask]
    labels_non_noise = labels[non_noise_mask]
    n_clusters = len(np.unique(labels_non_noise))
    n_noise = np.sum(labels == -1)

    if n_clusters >= 2 and len(labels_non_noise) >= 2:
        sil_score = silhouette_score(data_non_noise, labels_non_noise)
    else:
        sil_score = np.nan 
        
    noise_percentage = (n_noise / N_total) * 100
    gini_score = calculate_gini_score(labels) 

    results_list.append({
        'model': 'hdbscan',
        'HP (min_samp)': min_samp,
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': n_clusters
    })




In [10]:
results_df = pd.DataFrame(results_list)
results_df.head

<bound method NDFrame.head of       model    sil. #      Gini     noise %  clust. #  HP (eps)  HP (min_samp)
0    kmeans  0.498742  0.369231    0.000000         2       NaN            NaN
1    kmeans  0.371387  0.387179    0.000000         3       NaN            NaN
2    kmeans  0.295863  0.234615    0.000000         4       NaN            NaN
3    kmeans  0.258112  0.293846    0.000000         5       NaN            NaN
4    dbscan       NaN  0.000000  100.000000         0       0.1            2.0
5    dbscan       NaN  0.000000  100.000000         0       0.1           15.0
6    dbscan  0.871396  0.000000   98.461538         2       0.2            2.0
7    dbscan       NaN  0.000000  100.000000         0       0.2           15.0
8    dbscan  0.703565  0.036667   90.384615        12       0.3            2.0
9    dbscan       NaN  0.000000  100.000000         0       0.3           15.0
10   dbscan  0.438138  0.166105   74.615385        27       0.4            2.0
11   dbscan       NaN 

In [11]:
results_df = pd.DataFrame(results_list).sort_values(by=['clust. #', 'Gini', 'noise %'], ascending=[False, True,True])
filtered_results = results_df[results_df['noise %'] <= 15.0]
print(filtered_results.head(20))

     model    sil. #      Gini    noise %  clust. #  HP (eps)  HP (min_samp)
24  dbscan  0.183477  0.797555   8.461538        11       1.1            2.0
26  dbscan  0.143130  0.798387   4.615385        11       1.2            2.0
22  dbscan  0.172483  0.786344  12.692308        10       1.0            2.0
28  dbscan  0.211353  0.782371   3.461538         8       1.3            2.0
30  dbscan  0.220544  0.749012   2.692308         6       1.4            2.0
3   kmeans  0.258112  0.293846   0.000000         5       NaN            NaN
2   kmeans  0.295863  0.234615   0.000000         4       NaN            NaN
1   kmeans  0.371387  0.387179   0.000000         3       NaN            NaN
32  dbscan  0.274918  0.640523   1.923077         3       1.5            2.0
0   kmeans  0.498742  0.369231   0.000000         2       NaN            NaN
34  dbscan  0.298809  0.488235   1.923077         2       1.6            2.0
36  dbscan  0.298809  0.488235   1.923077         2       1.7            2.0