In [11]:
from pathlib import Path
import numpy as np
import pandas as pd
import hdbscan

from collections import Counter
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import silhouette_score

In [12]:
here = Path.cwd()
root = next(p for p in [here, *here.parents] if (p / "data" / "processed").exists())
df = pd.read_csv(root / "data" / "processed" / "state1" / "ios_df.csv", header=0)
df = df[df['eventType'] == 5]
df = df.drop(columns=['is_ios', 'eventType', 'eventSampleSpeed', 'eventGPSSignalStrength'])
print(df.shape)
df.columns.tolist()

(3457, 8)


['eventDurationSeconds',
 'eventMilesDriven',
 'eventStartSpeed',
 'eventEndSpeed',
 'memsMedianHorizontalNorm',
 'gyro_angular_change_per_second',
 'gyroAngleChange',
 'mems_radius']

In [13]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(X_scaled, columns=df.columns)

In [14]:
def calculate_gini_score(labels):
    counts = Counter(labels)
    if -1 in counts:
        del counts[-1]
    if len(counts) < 2:
        return 0.0 
    y = np.array(sorted(counts.values()))
    n = len(y)
    numerator = sum((2 * (i + 1) - n - 1) * y[i] for i in range(n))
    denominator = n * sum(y)

    gini = numerator / denominator
    
    return gini

In [15]:
# k means model
results_list = []
# possible k values
for k in [2, 3, 4, 5]:
    kmeans_final = KMeans(n_clusters=k, random_state=42)
    kmeans_final.fit(scaled_df)
    final_labels = kmeans_final.labels_

    sil_score = silhouette_score(scaled_df, final_labels)
    noise_percentage = 0.0
    gini_score = calculate_gini_score(final_labels)

    results_list.append({
        'model': 'kmeans',
        'sil. #': sil_score,
        'Gini': gini_score,
        'noise %': noise_percentage,
        'clust. #': k
    })    


In [16]:
N_total = X_scaled.shape[0]

In [21]:
# DBSCAN modeling
for eps in np.linspace(0.1, 2.0, 20):
    for min_pts in range(2, 40):
        db = DBSCAN(eps=eps, min_samples=min_pts, n_jobs=-1).fit(X_scaled)
        labels = db.labels_
        
        non_noise_mask = labels != -1
        data_non_noise = X_scaled[non_noise_mask]
        labels_non_noise = labels[non_noise_mask]
        n_clusters = len(np.unique(labels_non_noise))
        n_noise = np.sum(labels == -1)
    
        if n_clusters >= 2 and len(labels_non_noise) >= 2:
            sil_score = silhouette_score(data_non_noise, labels_non_noise)
        else:
            sil_score = np.nan 
            
        noise_percentage = (n_noise / N_total) * 100
        gini_score = calculate_gini_score(labels) 

        results_list.append({
            'model': 'dbscan',
            'HP (eps)': eps, 
            'HP (min_samp)': min_pts,
            'sil. #': sil_score,
            'Gini': gini_score,
            'noise %': noise_percentage,
            'clust. #': n_clusters
        })

In [22]:
# HDBSCAN modeling
for min_samp in range(2, 60):
    for min_cluster_size in range(2, 40):
        clusterer = hdbscan.HDBSCAN(min_samples=min_samp, min_cluster_size=min_cluster_size)
        clusterer.fit(X_scaled)
        labels = clusterer.labels_
        
        non_noise_mask = labels != -1
        data_non_noise = X_scaled[non_noise_mask]
        labels_non_noise = labels[non_noise_mask]
        n_clusters = len(np.unique(labels_non_noise))
        n_noise = np.sum(labels == -1)
        noise_percentage = (n_noise / N_total) * 100

        if n_clusters >= 2 and len(labels_non_noise) >= 2 and noise_percentage < 15:
            sil_score = silhouette_score(data_non_noise, labels_non_noise)
        else:
            sil_score = np.nan 
            
        gini_score = calculate_gini_score(labels) 

        results_list.append({
            'model': 'hdbscan',
            'HP (min_samp)': min_samp,
            'HP (min_clust)': min_cluster_size,
            'sil. #': sil_score,
            'Gini': gini_score,
            'noise %': noise_percentage,
            'clust. #': n_clusters
        })




In [23]:
results_df = pd.DataFrame(results_list)
results_df.head

<bound method NDFrame.head of         model    sil. #      Gini    noise %  clust. #  HP (eps)  \
0      kmeans  0.319344  0.214492   0.000000         2       NaN   
1      kmeans  0.275353  0.362164   0.000000         3       NaN   
2      kmeans  0.188867  0.262439   0.000000         4       NaN   
3      kmeans  0.197036  0.325022   0.000000         5       NaN   
4      dbscan       NaN  0.000000  99.942146         1       0.1   
...       ...       ...       ...        ...       ...       ...   
3007  hdbscan       NaN  0.452381  62.337287         2       NaN   
3008  hdbscan       NaN  0.452381  62.337287         2       NaN   
3009  hdbscan       NaN  0.452381  62.337287         2       NaN   
3010  hdbscan       NaN  0.452381  62.337287         2       NaN   
3011  hdbscan       NaN  0.452381  62.337287         2       NaN   

      HP (min_samp)  HP (min_clust)  
0               NaN             NaN  
1               NaN             NaN  
2               NaN             NaN  
3

In [24]:
results_df = pd.DataFrame(results_list).sort_values(by=[ 'Gini', 'noise %', 'sil. #', 'clust. #',], ascending=[True,True, False, False])
filtered_results = results_df[results_df['noise %'] <= 15.0]
filtered_results = filtered_results[filtered_results['clust. #'] >= 2]
filtered_results = filtered_results[filtered_results['Gini'] <= 0.7]
print(filtered_results.head(20))

       model    sil. #      Gini    noise %  clust. #  HP (eps)  \
0     kmeans  0.319344  0.214492   0.000000         2       NaN   
2     kmeans  0.188867  0.262439   0.000000         4       NaN   
3     kmeans  0.197036  0.325022   0.000000         5       NaN   
1     kmeans  0.275353  0.362164   0.000000         3       NaN   
819  hdbscan  0.351956  0.492174  14.984090         2       NaN   
820  hdbscan  0.351956  0.492174  14.984090         2       NaN   
821  hdbscan  0.351956  0.492174  14.984090         2       NaN   
822  hdbscan  0.351956  0.492174  14.984090         2       NaN   
823  hdbscan  0.351956  0.492174  14.984090         2       NaN   
824  hdbscan  0.351956  0.492174  14.984090         2       NaN   
825  hdbscan  0.351956  0.492174  14.984090         2       NaN   
826  hdbscan  0.351956  0.492174  14.984090         2       NaN   
828  hdbscan  0.351956  0.492174  14.984090         2       NaN   
829  hdbscan  0.351956  0.492174  14.984090         2       Na