In [12]:
import numpy as np
import pandas as pd
import hdbscan

from collections import Counter
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import silhouette_score

In [13]:
df = pd.read_pickle('../../data/processed/state2/right_android_df.pkl')
print(df.shape)

(331, 7)


In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(X_scaled, columns=df.columns)

In [15]:
def calculate_gini_score(labels):
    counts = Counter(labels)
    if -1 in counts:
        del counts[-1]
    if len(counts) < 2:
        return 0.0 
    y = np.array(sorted(counts.values()))
    n = len(y)
    numerator = sum((2 * (i + 1) - n - 1) * y[i] for i in range(n))
    denominator = n * sum(y)

    gini = numerator / denominator
    
    return gini

In [16]:
results_list = []

In [17]:
# k means model

# possible k values
for k in [2, 3, 4, 5]:
    kmeans_final = KMeans(n_clusters=k, random_state=42)
    kmeans_final.fit(scaled_df)
    final_labels = kmeans_final.labels_

    sil_score = silhouette_score(scaled_df, final_labels)
    noise_percentage = 0.0
    gini_score = calculate_gini_score(final_labels)

    results_list.append({
        'model': 'kmeans',
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': k
    })    


In [18]:
N_total = X_scaled.shape[0]

In [19]:
# DBSCAN modeling
for eps in np.linspace(0.1, 2.0, 20):
    for min_pts in [2, 15]:
        db = DBSCAN(eps=eps, min_samples=min_pts, n_jobs=-1).fit(X_scaled)
        labels = db.labels_
        
        non_noise_mask = labels != -1
        data_non_noise = X_scaled[non_noise_mask]
        labels_non_noise = labels[non_noise_mask]
        n_clusters = len(np.unique(labels_non_noise))
        n_noise = np.sum(labels == -1)
    
        if n_clusters >= 2 and len(labels_non_noise) >= 2:
            sil_score = silhouette_score(data_non_noise, labels_non_noise)
        else:
            sil_score = np.nan 
            
        noise_percentage = (n_noise / N_total) * 100
        gini_score = calculate_gini_score(labels) 

        results_list.append({
            'model': 'dbscan',
            'HP (eps)': eps, 
            'HP (min_samp)': min_pts,
            'sil. #': sil_score,
            'Gini': gini_score,
            'noise %': noise_percentage,
            'clust. #': n_clusters
        })

In [20]:
# HDBSCAN modeling
for min_samp in [2, 15]:
    for min_cluster_size in [2, 40]:
        clusterer = hdbscan.HDBSCAN(min_samples=min_samp, min_cluster_size=min_cluster_size)
        clusterer.fit(X_scaled)
        labels = clusterer.labels_
        
        non_noise_mask = labels != -1
        data_non_noise = X_scaled[non_noise_mask]
        labels_non_noise = labels[non_noise_mask]
        n_clusters = len(np.unique(labels_non_noise))
        n_noise = np.sum(labels == -1)
        noise_percentage = (n_noise / N_total) * 100

        if n_clusters >= 2 and len(labels_non_noise) >= 2 and noise_percentage < 15:
            sil_score = silhouette_score(data_non_noise, labels_non_noise)
        else:
            sil_score = np.nan 
            
        gini_score = calculate_gini_score(labels) 

        results_list.append({
            'model': 'hdbscan',
            'HP (min_samp)': min_samp,
            'HP (min_clust)': min_cluster_size,
            'sil. #': sil_score,
            'Gini': gini_score,
            'noise %': noise_percentage,
            'clust. #': n_clusters
        })




In [21]:
results_df = pd.DataFrame(results_list)
results_df.head

<bound method NDFrame.head of       model    sil. #      Gini     noise %  clust. #  HP (eps)  \
0    kmeans  0.362608  0.216012    0.000000         2       NaN   
1    kmeans  0.273219  0.151057    0.000000         3       NaN   
2    kmeans  0.287258  0.366314    0.000000         4       NaN   
3    kmeans  0.315372  0.414502    0.000000         5       NaN   
4    dbscan       NaN  0.000000  100.000000         0       0.1   
5    dbscan       NaN  0.000000  100.000000         0       0.1   
6    dbscan       NaN  0.000000   99.395770         1       0.2   
7    dbscan       NaN  0.000000  100.000000         0       0.2   
8    dbscan  0.391703  0.261006   83.987915        18       0.3   
9    dbscan       NaN  0.000000  100.000000         0       0.3   
10   dbscan  0.347663  0.425493   64.954683        28       0.4   
11   dbscan       NaN  0.000000  100.000000         0       0.4   
12   dbscan  0.291317  0.586347   49.546828        25       0.5   
13   dbscan       NaN  0.000000 

In [23]:
results_df = pd.DataFrame(results_list).sort_values(by=[ 'Gini', 'noise %', 'sil. #', 'clust. #',], ascending=[True,True, False, False])
filtered_results = results_df[results_df['noise %'] <= 15.0]
filtered_results = filtered_results[filtered_results['clust. #'] >= 2]
filtered_results = filtered_results[filtered_results['Gini'] <= 0.7]
print(filtered_results.head(20))

     model    sil. #      Gini   noise %  clust. #  HP (eps)  HP (min_samp)  \
1   kmeans  0.273219  0.151057  0.000000         3       NaN            NaN   
0   kmeans  0.362608  0.216012  0.000000         2       NaN            NaN   
2   kmeans  0.287258  0.366314  0.000000         4       NaN            NaN   
3   kmeans  0.315372  0.414502  0.000000         5       NaN            NaN   
42  dbscan  0.297492  0.654397  1.510574         3       2.0            2.0   

    HP (min_clust)  
1              NaN  
0              NaN  
2              NaN  
3              NaN  
42             NaN  
