In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import plotly.express as px

from src.features import load_trips

# Apply seaborn's default theme.
# NOTE(review): every figure visible in this notebook is drawn with plotly express,
# which presumably does not pick up seaborn's theme — confirm this call is still needed.
sns.set_theme()


In [418]:
# Earlier variant with more routes and a different `sampling` value, kept for reference:
# %time tmp = load_trips(['M3_VH', 'M3_HH', 'CPH1_HH', 'CPH1_VH', 'CPH6_VH', 'CPH6_HH'], trips='all', sampling='2s')
# Load all trips for three routes; %time prints the CPU and wall time of the call.
%time tmp = load_trips(['CPH6_VH', 'CPH6_HH', 'CPH1_HH'], trips='all', sampling='5s')
# `df` is bound to the same object as `tmp`; no copy is made here.
df = tmp

CPU times: total: 1min 17s
Wall time: 1min 31s


In [419]:
# Flag rows where the `speed_limit` column is below 0.975 times the `velocity` column.
# NOTE(review): 0.975 looks like a tolerance/calibration factor — confirm where it comes from.
df['is_speeding'] = df['speed_limit'] < (0.975 * df['velocity'])

In [420]:
def remove_outliers(df, columns, lower=0.001, upper=0.999):
    """Drop rows whose value in any of ``columns`` lies outside a quantile band.

    For every listed column the ``lower`` and ``upper`` quantiles are computed
    on the frame exactly as it was passed in, and a row is kept only when each
    of its values lies inside the closed interval ``[q_lower, q_upper]`` of
    the corresponding column. Because all thresholds come from the same
    unfiltered data, the result does not depend on the order of ``columns``.

    Rows holding NaN in any listed column are dropped too, because NaN never
    compares as being inside an interval.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable
        Labels of the numeric columns to check.
    lower, upper : float, optional
        Quantile levels in ``[0, 1]`` bounding the band that is kept.
        The defaults trim the extreme 0.1 % at each end.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows, in their original order and with
        their original index labels.

    Raises
    ------
    ValueError
        If the quantile levels do not satisfy ``0 <= lower <= upper <= 1``.
    """
    if not 0 <= lower <= upper <= 1:
        raise ValueError(
            f"expected 0 <= lower <= upper <= 1, got lower={lower!r}, upper={upper!r}"
        )

    # A positional boolean mask avoids any index alignment, so frames with
    # duplicate index labels are handled as well.
    keep = np.ones(len(df), dtype=bool)
    for col in columns:
        values = df[col]
        # Thresholds always come from the unfiltered column.
        q_low, q_hi = values.quantile([lower, upper]).to_numpy()
        keep &= values.between(q_low, q_hi).to_numpy()

    return df[keep]

In [421]:
# Columns that are passed to remove_outliers for tail trimming.
outlier_columns = [
    'acceleration_z',
    'acceleration_y',
    'acceleration_x',
    'steering_position',
    'steering_acceleration',
    'traction_instant_consumption',
    'yaw_rate',
    'velocity',
]

# Trim the outliers, then put the surviving rows back in index order.
df = remove_outliers(df, outlier_columns).sort_index()

In [422]:
# Distribution of the steering position over all loaded rows.
px.histogram(
    data_frame=df,
    x=['steering_position'],
)

In [423]:
# Distribution of the steering acceleration over all loaded rows.
px.histogram(
    data_frame=df,
    x=['steering_acceleration'],
)

In [424]:
# Distribution of the yaw rate over all loaded rows.
px.histogram(
    data_frame=df,
    x=['yaw_rate'],
)

In [425]:
# Distinct trip ids present for route CPH6_VH.
df.query('route == "CPH6_VH"')['trip'].unique()

array([13201], dtype=int64)

In [426]:
# Distinct trip ids present for route CPH6_HH.
df.query('route == "CPH6_HH"')['trip'].unique()

array([13175], dtype=int64)

In [427]:
# Rows of trip 13201 with `pass` equal to 1.
df_tmp = df.query('`pass` == 1 and trip == 13201')
# The three acceleration axes in one histogram with 100 bins.
px.histogram(
    data_frame=df_tmp,
    x=['acceleration_x', 'acceleration_y', 'acceleration_z'],
    nbins=100,
)

In [428]:
# Rows of trip 13175 with `pass` equal to 1; this rebinds `df_tmp`.
df_tmp = df.query('`pass` == 1 and trip == 13175')
# The three acceleration axes in one histogram with 100 bins.
px.histogram(
    data_frame=df_tmp,
    x=['acceleration_x', 'acceleration_y', 'acceleration_z'],
    nbins=100,
)

In [429]:
# acceleration_y against the frame's index, for whichever selection `df_tmp`
# currently holds.
px.line(
    data_frame=df_tmp,
    x=df_tmp.index,
    y='acceleration_y',
)

In [None]:
# Put the current selection in index order before plotting.
df_tmp = df_tmp.sort_index()
# acceleration_x plotted against traction_instant_consumption.
px.scatter(
    data_frame=df_tmp,
    x='traction_instant_consumption',
    y='acceleration_x',
)

In [431]:
# Distribution of velocity over all loaded rows.
px.histogram(
    data_frame=df,
    x='velocity',
)

In [432]:
# Distribution of the instantaneous traction consumption over all loaded rows.
px.histogram(
    data_frame=df,
    x='traction_instant_consumption',
)

In [433]:
# List the columns available at this point (the derived `is_speeding` flag is among them).
df.columns

Index(['street_name_start', 'segment_id', 'GPS_dt', 'lat_int', 'lon_int',
       'acceleration_z', 'acceleration_y', 'acceleration_x', 'yaw_rate',
       'front_wiper_status', 'odometer', 'velocity',
       'traction_instant_consumption', 'steering_acceleration',
       'steering_position', 'driver_safety_belt_reminder', 'kW',
       'traffic_lights', 'road_signs', 'speed_limit', 'ped_walks',
       'speed_bumps', 'bike_lane', 'pass', 'trip', 'route', 'is_speeding'],
      dtype='object')

In [434]:
# Integer copy of the boolean speeding flag.
# NOTE(review): none of the cells visible here read `is_speeding_int`; the clustering
# input uses the boolean `is_speeding` column instead — confirm this is still needed.
df['is_speeding_int'] = df['is_speeding'].astype(int)

In [435]:
# Continuous signals that are standardized before clustering.
features = [
    'acceleration_x',
    'acceleration_y',
    'yaw_rate',
    'traction_instant_consumption',
    'velocity',
]

# Flag columns that are added to the clustering input as they are.
binary_features = [
    'driver_safety_belt_reminder',
    'is_speeding',
]

# Alternative feature set, kept for reference:
# features = ['acceleration_x', 'acceleration_y', 'acceleration_z', 'steering_acceleration', 'yaw_rate']

In [436]:
from sklearn.preprocessing import StandardScaler

In [437]:
# Standardize each continuous feature column with scikit-learn's StandardScaler.
# The fitted scaler stays available as `scaler`; `df_scaled` is a plain array,
# so the row labels of `df` are not carried along.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df.loc[:, features])

In [438]:
import umap

In [439]:
fit = umap.UMAP(n_components=3)

%time u = fit.fit_transform(df_scaled)
u=u

CPU times: total: 53.8 s
Wall time: 24.3 s


In [440]:
# 3-D scatter of the embedding; 0, 1 and 2 select the three columns of `u`.
px.scatter_3d(
    data_frame=u,
    x=0,
    y=1,
    z=2,
)

In [441]:
from sklearn.cluster import KMeans
import hdbscan

In [468]:
# clusterer = KMeans(n_clusters=3)
# HDBSCAN clusterer; `min_cluster_size=500` is passed as the minimum cluster size.
# The commented-out KMeans line in this cell is an alternative kept for reference.
clusterer = hdbscan.HDBSCAN(min_cluster_size=500)

In [469]:
# Wrap the standardized array in a DataFrame that reuses df's index and the feature names.
df_fit = pd.DataFrame(df_scaled, index=df.index, columns=features)
# Attach the binary columns exactly as they are in df (they do not go through the scaler).
df_fit[binary_features] = df[binary_features]

# One cluster label per row of df_fit.
clusters = clusterer.fit_predict(df_fit)
# Show which labels occur. NOTE(review): -1 is presumably HDBSCAN's "noise" label — confirm.
np.unique(clusters)

array([-1,  0,  1,  2,  3], dtype=int64)

In [470]:
import pandas as pd
import numpy as np

In [471]:
# Plotting frame: the clustering input with positional column names (0..n-1), a fresh
# RangeIndex, and the labels in a final 'cluster' column.
# The frame is built column-wise rather than through np.column_stack: stacking
# everything into one array forces a single common dtype onto float features,
# boolean flags and integer labels alike, whereas here each column keeps its own dtype.
df_cluster = df_fit.reset_index(drop=True)
df_cluster.columns = list(range(df_cluster.shape[1]))
df_cluster['cluster'] = clusters

In [472]:
# Store the labels as strings. NOTE(review): presumably so plotly express colours the
# clusters as discrete categories rather than on a continuous scale — confirm.
df_cluster['cluster'] = df_cluster['cluster'].astype(str)

In [473]:
# 3-D scatter of the columns named 0, 1 and 2 of df_cluster, coloured by cluster label.
px.scatter_3d(
    data_frame=df_cluster,
    x=0,
    y=1,
    z=2,
    color='cluster',
)

In [474]:
# Attach the labels to df by position. `clusters` was computed from df_fit, which was
# built on df's index, so this relies on df not having been re-filtered or re-sorted since.
df['cluster'] = clusters

In [475]:
# Steering position distribution, split by cluster label.
px.histogram(
    data_frame=df,
    x=['steering_position'],
    color='cluster',
)

In [476]:
# Steering acceleration distribution, split by cluster label.
px.histogram(
    data_frame=df,
    x=['steering_acceleration'],
    color='cluster',
)

In [477]:
# Yaw rate distribution with 300 bins, split by cluster label.
px.histogram(
    data_frame=df,
    x=['yaw_rate'],
    nbins=300,
    color='cluster',
)

In [478]:
# The three acceleration axes in one 300-bin histogram, split by cluster label.
px.histogram(
    data_frame=df,
    x=['acceleration_x', 'acceleration_y', 'acceleration_z'],
    nbins=300,
    color='cluster',
)

In [479]:
# Velocity distribution, split by cluster label.
px.histogram(
    data_frame=df,
    x='velocity',
    color='cluster',
)

In [480]:
# Instantaneous traction consumption distribution, split by cluster label.
px.histogram(
    data_frame=df,
    x='traction_instant_consumption',
    color='cluster',
)

In [481]:
# Counts of the speeding flag, split by cluster label.
px.histogram(
    data_frame=df,
    x='is_speeding',
    color='cluster',
)