In [21]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.io as pio
# Select Plotly's "notebook_connected" renderer as the default used by fig.show().
# NOTE(review): the recorded fig.show() output in this notebook is
# "Mime type rendering requires nbformat>=4.2.0 but it is not installed" —
# presumably nbformat must be installed in the kernel environment for this renderer; confirm.
pio.renderers.default = "notebook_connected"



In [22]:
# Locate the project root by walking up from the working directory until a
# folder containing data/processed is found, so the notebook can be launched
# from the root or from any sub-directory below it.
here = Path.cwd()
root = next(
    (p for p in [here, *here.parents] if (p / "data" / "processed").exists()),
    None,
)
if root is None:
    # Without a default, next() would surface as an opaque StopIteration.
    raise FileNotFoundError(
        f"No 'data/processed' directory found in {here} or any of its parents"
    )

# Load the merged, cleaned event table; the first CSV row is the header.
df_v3 = pd.read_csv(root / "data" / "processed" / "merged" / "state1_clean.csv", header=0)
df_v3.head()

Unnamed: 0,is_ios,source_file,eventType,eventSampleSpeed,eventDurationSeconds,eventMilesDriven,eventSensorDetectionMthd,eventGPSSignalStrength,eventStartSpeed,eventEndSpeed,memsMedianHorizontalNorm,gyro_angular_change_per_second,gyroAngleChange,mems_radius
0,0,android_turns.csv,4.0,10.394929,7.244125,0.016991,2.0,1.480796,17.323687,6.789408,1.046972,9.521707,69.3196,22.941639
1,0,android_turns.csv,4.0,9.40191,7.497719,0.016103,2.0,1.693853,6.197542,10.10106,1.153392,11.245602,84.723694,20.737572
2,0,android_turns.csv,5.0,11.290954,4.244563,0.008359,2.0,21.17347,11.229438,9.081976,5.708309,24.616348,105.462135,18.996666
3,0,android_turns.csv,5.0,12.743101,3.1735,0.007654,1.0,15.306123,10.714942,13.421639,3.099853,17.929585,57.610783,23.95911
4,0,android_turns.csv,5.0,8.986107,5.990321,0.013587,0.0,19.387754,2.729067,20.937756,1.197121,2.787075,16.806042,91.71366


In [23]:
# Remove the provenance column from the loaded frame. errors='ignore' keeps this
# cell re-runnable: without it a second execution raises KeyError because
# 'source_file' has already been dropped in place.
df_v3.drop(columns=['source_file'], inplace=True, errors='ignore')

# Keep only rows with eventType == 5 (presumably right turns, going by the
# variable name — confirm), then drop the columns that are not used as
# clustering features.
non_feature_cols = ['is_ios', 'eventType', 'eventSampleSpeed', 'eventSensorDetectionMthd', 'eventGPSSignalStrength']
sub_df_right = df_v3[df_v3['eventType'] == 5]
sub_df_right = sub_df_right.drop(columns=non_feature_cols)

In [24]:
# Show the remaining feature column names as a plain Python list
list(sub_df_right.columns)

['eventDurationSeconds',
 'eventMilesDriven',
 'eventStartSpeed',
 'eventEndSpeed',
 'memsMedianHorizontalNorm',
 'gyro_angular_change_per_second',
 'gyroAngleChange',
 'mems_radius']

In [25]:
scaler = StandardScaler()

# Standardize only the feature columns. A later cell adds a 'cluster' label
# column to sub_df_right in place; dropping it here (when present) stops a
# re-run of this cell from feeding the labels back in as a feature.
X_features = sub_df_right.drop(columns=['cluster'], errors='ignore')
features = scaler.fit_transform(X_features)
scaled_df = pd.DataFrame(features, columns=X_features.columns)

# Plain ndarray of the standardized features, consumed by the KMeans and PCA cells
X = scaled_df.values

# Preview the scaled data
scaled_df.head(2)


Unnamed: 0,eventDurationSeconds,eventMilesDriven,eventStartSpeed,eventEndSpeed,memsMedianHorizontalNorm,gyro_angular_change_per_second,gyroAngleChange,mems_radius
0,-0.255254,-0.583691,-0.476287,-0.929513,3.650594,2.117809,1.087867,-0.487198
1,-0.586721,-0.612994,-0.529993,-0.406077,1.06207,0.936588,-0.108381,-0.336605


In [26]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score

# Fit K-Means for k = 2..9 on the standardized features and record, per k,
# the silhouette score and the Calinski-Harabasz score as (k, sil, ch) tuples.
results = []
for k in range(2, 10):
    model = KMeans(n_clusters=k, random_state=42)
    assignments = model.fit_predict(features)
    results.append(
        (
            k,
            silhouette_score(features, assignments),
            calinski_harabasz_score(features, assignments),
        )
    )

# One summary line per candidate k
for r in results:
    print(f"k={r[0]} | Silhouette={r[1]:.3f} | CH Score={r[2]:.0f}")

k=2 | Silhouette=0.331 | CH Score=8208
k=3 | Silhouette=0.231 | CH Score=7665
k=4 | Silhouette=0.235 | CH Score=7227
k=5 | Silhouette=0.209 | CH Score=6386
k=6 | Silhouette=0.193 | CH Score=6333
k=7 | Silhouette=0.196 | CH Score=6086
k=8 | Silhouette=0.199 | CH Score=5819
k=9 | Silhouette=0.192 | CH Score=5570


In [27]:
# Final model: 3 clusters on the standardized feature matrix X.
# random_state pins the centroid initialisation so the cluster numbering and
# every downstream table/plot are reproducible across runs (same seed as the
# k sweep in this notebook).
# NOTE(review): the recorded sweep output shows k=2 with the highest
# silhouette and CH score; k=3 is kept here as originally chosen.
kmeans = KMeans(n_clusters=3, random_state=42)

# fit_predict fits the model and returns the labels in one pass; a separate
# fit() beforehand only repeats the work and, unseeded, is discarded anyway.
sub_df_right['cluster'] = kmeans.fit_predict(X)

In [28]:
# Project the standardized features onto 3 principal components for plotting.
# random_state only takes effect when PCA uses a randomized/arpack solver, but
# pinning it keeps the projection reproducible whichever solver is selected.
pca = PCA(n_components=3, random_state=42)

reduced_X = pd.DataFrame(data=pca.fit_transform(X), columns=['PCA1', 'PCA2', 'PCA3'])

# Reduced features
reduced_X.head()

Unnamed: 0,PCA1,PCA2,PCA3
0,-1.610527,2.026714,3.088181
1,-1.368099,0.42222,1.019713
2,0.140054,-2.146991,-1.786294
3,-0.968097,0.917867,0.317053
4,-1.48662,-0.229901,-0.873554


In [32]:
# Map the K-Means centroids (fitted on the standardized features) into the
# same 3-component PCA space as the data points
cluster_centroids = kmeans.cluster_centers_
centers = pca.transform(cluster_centroids)

# Display the projected centroids
centers

array([[-0.67550299,  0.87017759,  0.23634587],
       [ 3.12003955, -0.0880411 ,  0.20572586],
       [-0.3683503 , -1.33600932, -0.46862964]])

In [31]:
# Per-cluster mean of every numeric column, transposed so that features are
# rows and clusters are columns
cluster_means = sub_df_right.groupby('cluster').mean(numeric_only=True)
cluster_means.T

cluster,0,1,2
eventDurationSeconds,4.910408,9.229402,3.409389
eventMilesDriven,0.015722,0.06209,0.014732
eventStartSpeed,12.526777,29.588537,14.615593
eventEndSpeed,13.624271,28.816862,16.263471
memsMedianHorizontalNorm,2.242225,1.886997,1.759031
gyro_angular_change_per_second,16.555748,9.306936,7.944145
gyroAngleChange,77.045803,86.758934,26.678358
mems_radius,17.971224,58.60439,51.228402


In [35]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# The renderer can be overridden here if needed:
# pio.renderers.default = "notebook_connected"   # For VS Code/Jupyter inline
# pio.renderers.default = "browser"              # To open in your web browser

# Cluster labels as strings, for a clean legend
cluster_series = pd.Series(kmeans.labels_, name="Cluster").astype(str)

# 3D scatter of the PCA-reduced points, coloured by cluster label
fig = px.scatter_3d(
    reduced_X,
    x="PCA1",
    y="PCA2",
    z="PCA3",
    color=cluster_series,
    opacity=0.75,
    title="Turn Cluster",
    labels={"color": "Cluster"},
)

# Overlay the projected cluster centers as black "x" markers
center_marker = {"size": 8, "symbol": "x", "line": {"width": 2}, "color": "black"}
center_trace = go.Scatter3d(
    x=centers[:, 0],
    y=centers[:, 1],
    z=centers[:, 2],
    mode="markers",
    name="Centers",
    marker=center_marker,
)
fig.add_trace(center_trace)

# Axis titles, legend heading and tight margins
scene_axes = {"xaxis_title": "PCA1", "yaxis_title": "PCA2", "zaxis_title": "PCA3"}
fig.update_layout(
    scene=scene_axes,
    legend_title_text="Cluster",
    margin={"l": 0, "r": 0, "t": 40, "b": 0},
)

# Show interactive figure
# NOTE(review): the recorded run of this cell stopped here with
# "Mime type rendering requires nbformat>=4.2.0 but it is not installed".
fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed