In [106]:
import numpy as np
import pandas as pd
import os 
import matplotlib.pyplot as plt
import hdbscan

from collections import Counter
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import silhouette_score

In [107]:
df = pd.read_pickle('../../data/processed/state0/left_android_df.pkl')
print(df.shape)

(162, 7)


In [108]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(X_scaled, columns=df.columns)

In [109]:
def calculate_gini_score(labels):
    counts = Counter(labels)
    if -1 in counts:
        del counts[-1]
    if len(counts) < 2:
        return 0.0 
    y = np.array(sorted(counts.values()))
    n = len(y)
    numerator = sum((2 * (i + 1) - n - 1) * y[i] for i in range(n))
    denominator = n * sum(y)

    gini = numerator / denominator
    
    return gini

In [110]:
results_list = []

In [111]:
# k means model

# possible k values
for k in [2, 3, 4, 5]:
    kmeans_final = KMeans(n_clusters=k, random_state=42)
    kmeans_final.fit(scaled_df)
    final_labels = kmeans_final.labels_

    sil_score = silhouette_score(scaled_df, final_labels)
    noise_percentage = 0.0
    gini_score = calculate_gini_score(final_labels)

    results_list.append({
        'model': 'kmeans',
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': k
    })    


In [112]:
N_total = X_scaled.shape[0]

In [113]:
# DBSCAN modeling
for eps in np.linspace(0.1, 2.0, 20):
    for min_pts in [2, 15]:
        db = DBSCAN(eps=eps, min_samples=min_pts, n_jobs=-1).fit(X_scaled)
        labels = db.labels_
        
        non_noise_mask = labels != -1
        data_non_noise = X_scaled[non_noise_mask]
        labels_non_noise = labels[non_noise_mask]
        n_clusters = len(np.unique(labels_non_noise))
        n_noise = np.sum(labels == -1)
    
        if n_clusters >= 2 and len(labels_non_noise) >= 2:
            sil_score = silhouette_score(data_non_noise, labels_non_noise)
        else:
            sil_score = np.nan 
            
        noise_percentage = (n_noise / N_total) * 100
        gini_score = calculate_gini_score(labels) 

        results_list.append({
            'model': 'dbscan',
            'HP (eps)': eps, 
            'HP (min_samp)': min_pts,
            'sil. #': sil_score,
            'Gini': gini_score,
            'noise %': noise_percentage,
            'clust. #': n_clusters
        })

In [114]:
# HDBSCAN modeling
for min_samp in [2, 15]:
    clusterer = hdbscan.HDBSCAN(min_samples=min_samp)
    clusterer.fit(X_scaled)
    labels = clusterer.labels_
    
    non_noise_mask = labels != -1
    data_non_noise = X_scaled[non_noise_mask]
    labels_non_noise = labels[non_noise_mask]
    n_clusters = len(np.unique(labels_non_noise))
    n_noise = np.sum(labels == -1)

    if n_clusters >= 2 and len(labels_non_noise) >= 2:
        sil_score = silhouette_score(data_non_noise, labels_non_noise)
    else:
        sil_score = np.nan 
        
    noise_percentage = (n_noise / N_total) * 100
    gini_score = calculate_gini_score(labels) 

    results_list.append({
        'model': 'hdbscan',
        'HP (min_samp)': min_samp,
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': n_clusters
    })




In [115]:
results_df = pd.DataFrame(results_list)
results_df.head

<bound method NDFrame.head of       model    sil. #      Gini     noise %  clust. #  HP (eps)  HP (min_samp)
0    kmeans  0.231866  0.129630    0.000000         2       NaN            NaN
1    kmeans  0.176586  0.131687    0.000000         3       NaN            NaN
2    kmeans  0.246910  0.376543    0.000000         4       NaN            NaN
3    kmeans  0.244905  0.424691    0.000000         5       NaN            NaN
4    dbscan       NaN  0.000000  100.000000         0       0.1            2.0
5    dbscan       NaN  0.000000  100.000000         0       0.1           15.0
6    dbscan       NaN  0.000000  100.000000         0       0.2            2.0
7    dbscan       NaN  0.000000  100.000000         0       0.2           15.0
8    dbscan  0.699525  0.000000   93.827160         5       0.3            2.0
9    dbscan       NaN  0.000000  100.000000         0       0.3           15.0
10   dbscan  0.644678  0.051471   89.506173         8       0.4            2.0
11   dbscan       NaN 

In [118]:
results_df = pd.DataFrame(results_list).sort_values(by=['clust. #', 'Gini', 'noise %'], ascending=[False, True,True])
filtered_results = results_df[results_df['noise %'] <= 15.0]
print(filtered_results.head(20))

     model    sil. #      Gini    noise %  clust. #  HP (eps)  HP (min_samp)
3   kmeans  0.244905  0.424691   0.000000         5       NaN            NaN
2   kmeans  0.246910  0.376543   0.000000         4       NaN            NaN
1   kmeans  0.176586  0.131687   0.000000         3       NaN            NaN
0   kmeans  0.231866  0.129630   0.000000         2       NaN            NaN
32  dbscan  0.336254  0.478723  12.962963         2       1.5            2.0
36  dbscan  0.436814  0.486486   8.641975         2       1.7            2.0
38  dbscan  0.429255  0.486667   7.407407         2       1.8            2.0
40  dbscan  0.422531  0.486842   6.172840         2       1.9            2.0
42  dbscan       NaN  0.000000   5.555556         1       2.0            2.0
34  dbscan       NaN  0.000000  11.111111         1       1.6            2.0
43  dbscan       NaN  0.000000  11.111111         1       2.0           15.0
41  dbscan       NaN  0.000000  13.580247         1       1.9           15.0