In [13]:
import numpy as np
import pandas as pd
import hdbscan

from collections import Counter
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import silhouette_score

In [14]:
df = pd.read_pickle('../../data/processed/state2/left_android_df.pkl')
print(df.shape)

(260, 7)


In [15]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(X_scaled, columns=df.columns)

In [16]:
def calculate_gini_score(labels):
    counts = Counter(labels)
    if -1 in counts:
        del counts[-1]
    if len(counts) < 2:
        return 0.0 
    y = np.array(sorted(counts.values()))
    n = len(y)
    numerator = sum((2 * (i + 1) - n - 1) * y[i] for i in range(n))
    denominator = n * sum(y)

    gini = numerator / denominator
    
    return gini

In [17]:
results_list = []

In [18]:
# k means model

# possible k values
for k in [2, 3, 4, 5]:
    kmeans_final = KMeans(n_clusters=k, random_state=42)
    kmeans_final.fit(scaled_df)
    final_labels = kmeans_final.labels_

    sil_score = silhouette_score(scaled_df, final_labels)
    noise_percentage = 0.0
    gini_score = calculate_gini_score(final_labels)

    results_list.append({
        'model': 'kmeans',
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': k
    })    


In [19]:
N_total = X_scaled.shape[0]

In [20]:
# DBSCAN modeling
for eps in np.linspace(0.1, 2.0, 20):
    for min_pts in range(2, 15):
        db = DBSCAN(eps=eps, min_samples=min_pts, n_jobs=-1).fit(X_scaled)
        labels = db.labels_
        
        non_noise_mask = labels != -1
        data_non_noise = X_scaled[non_noise_mask]
        labels_non_noise = labels[non_noise_mask]
        n_clusters = len(np.unique(labels_non_noise))
        n_noise = np.sum(labels == -1)
    
        if n_clusters >= 2 and len(labels_non_noise) >= 2:
            sil_score = silhouette_score(data_non_noise, labels_non_noise)
        else:
            sil_score = np.nan 
            
        noise_percentage = (n_noise / N_total) * 100
        gini_score = calculate_gini_score(labels) 

        results_list.append({
            'model': 'dbscan',
            'HP (eps)': eps, 
            'HP (min_samp)': min_pts,
            'sil. #': sil_score,
            'Gini': gini_score,
            'noise %': noise_percentage,
            'clust. #': n_clusters
        })

In [21]:
# HDBSCAN modeling
for min_samp in range(2, 15):
    for min_cluster_size in range(2, 40):
        clusterer = hdbscan.HDBSCAN(min_samples=min_samp, min_cluster_size=min_cluster_size)
        clusterer.fit(X_scaled)
        labels = clusterer.labels_
        
        non_noise_mask = labels != -1
        data_non_noise = X_scaled[non_noise_mask]
        labels_non_noise = labels[non_noise_mask]
        n_clusters = len(np.unique(labels_non_noise))
        n_noise = np.sum(labels == -1)
        noise_percentage = (n_noise / N_total) * 100

        if n_clusters >= 2 and len(labels_non_noise) >= 2 and noise_percentage < 15:
            sil_score = silhouette_score(data_non_noise, labels_non_noise)
        else:
            sil_score = np.nan 
            
        gini_score = calculate_gini_score(labels) 

        results_list.append({
            'model': 'hdbscan',
            'HP (min_samp)': min_samp,
            'HP (min_clust)': min_cluster_size,
            'sil. #': sil_score,
            'Gini': gini_score,
            'noise %': noise_percentage,
            'clust. #': n_clusters
        })




In [22]:
results_df = pd.DataFrame(results_list)
results_df.head

<bound method NDFrame.head of        model    sil. #      Gini  noise %  clust. #  HP (eps)  HP (min_samp)  \
0     kmeans  0.498742  0.369231      0.0         2       NaN            NaN   
1     kmeans  0.371387  0.387179      0.0         3       NaN            NaN   
2     kmeans  0.295863  0.234615      0.0         4       NaN            NaN   
3     kmeans  0.258112  0.293846      0.0         5       NaN            NaN   
4     dbscan       NaN  0.000000    100.0         0       0.1            2.0   
..       ...       ...       ...      ...       ...       ...            ...   
753  hdbscan       NaN  0.000000    100.0         0       NaN           14.0   
754  hdbscan       NaN  0.000000    100.0         0       NaN           14.0   
755  hdbscan       NaN  0.000000    100.0         0       NaN           14.0   
756  hdbscan       NaN  0.000000    100.0         0       NaN           14.0   
757  hdbscan       NaN  0.000000    100.0         0       NaN           14.0   

     HP (

In [23]:
results_df = pd.DataFrame(results_list).sort_values(by=[ 'Gini', 'noise %', 'sil. #', 'clust. #',], ascending=[True,True, False, False])
filtered_results = results_df[results_df['noise %'] <= 15.0]
filtered_results = filtered_results[filtered_results['clust. #'] >= 2]
filtered_results = filtered_results[filtered_results['Gini'] <= 0.7]
print(filtered_results.head(20))

       model    sil. #      Gini    noise %  clust. #  HP (eps)  \
2     kmeans  0.295863  0.234615   0.000000         4       NaN   
3     kmeans  0.258112  0.293846   0.000000         5       NaN   
0     kmeans  0.498742  0.369231   0.000000         2       NaN   
1     kmeans  0.371387  0.387179   0.000000         3       NaN   
205   dbscan  0.533009  0.414634   5.384615         2       1.6   
190   dbscan  0.526360  0.418033   6.153846         2       1.5   
189   dbscan  0.521486  0.418367   5.769231         2       1.5   
246   dbscan  0.556205  0.428000   3.846154         2       1.9   
233   dbscan  0.559042  0.430894   5.384615         2       1.8   
248   dbscan  0.562541  0.435223   5.000000         2       1.9   
262   dbscan  0.560227  0.435484   4.615385         2       2.0   
247   dbscan  0.558019  0.435484   4.615385         2       1.9   
261   dbscan  0.555731  0.435743   4.230769         2       2.0   
219   dbscan  0.572070  0.441909   7.307692         2       1.