In [1]:
import numpy as np
import pandas as pd
from scipy import stats as st
import itertools
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from joblib import dump, load

In [8]:
def sliding_windows(df, features, window_size=20):
    """Create a sliding window with a defined window size and return the calculation for each record inside the sliding window.

    The calculations made are for each time window: 
    - The mean.
    - The median.
    - The standard deviation.
    - The maximum and minimum value.
    - The trend.

    Note the process is explained in greater detail in: <TODO: reference link>

    Args:
        dataset (numpy.array): An array with the data taken on a vehicle trip, 
        composed for rows: dataset registers and columns: first data index, and subsequent dataset event features.
        window_size (int, optional): number Number of registers contained in the time window. Defaults to 10.
        event_features (str, optional): the names of dataset features. Defaults to "X".

    Returns:
        tuple: A tuple structured like this: (sliding window id, sliding window featured data, sliding window label data, features name)
    """
    features = ["id", *features,  "eventClass"]
    dataset = df[features].to_numpy()

    sld_window = np.lib.stride_tricks.sliding_window_view(dataset, window_size, axis=0) #[::1, :] Add this for define window step

    # Splitin dataset id
    id = sld_window[:,0:1,:]
    sld_window_id = np.concatenate((id[:,:,0], id[:,:,-1]), axis=1) # get the first and last id from registers in every sliding window

    # Spliting the dataset (features, label)
    separator = dataset.shape[1] - 1 # Split the last page corresponding to the eventClass
    features_data = sld_window[:, 1:separator, :] # Get the features of the data in every sliding window
    label_data = sld_window[:, separator, :] # Get the labels of the data in every sliding window

    # Processing the sliding window
    # Get the mean, median, std, max and min value
    mean = features_data.mean(axis=2)
    median = np.median(features_data, axis=2)
    std = features_data.std(axis=2)
    max_val = features_data.max(axis=2)
    min_val = features_data.min(axis=2)
    # Get tendency
    divider = np.array([mean[0], *mean[:-1]])
    tendency = mean/np.where(divider == 0, 1, divider)
    label = st.mode(label_data, axis=1)[0]

    # Concatenate processed sliding window
    sld_window_features = np.concatenate((mean, median, std, max_val, min_val, tendency), axis=1)
    # Reshape label for sklearn standard
    label = label.reshape(label.shape[0])

    # Make input algorithm df
    X = pd.DataFrame(sld_window_features)
    X[["first","last"]] = sld_window_id
    X.set_index(["first","last"], inplace=True)

    return (X, label)

In [3]:
from pykalman import KalmanFilter

def data_filter(df):
    features = ["accX", "accY", "velAngZ", "magX", "magY"]

    for var in features:
        data = df[var]
        
        # Kalman filter process
        kf = KalmanFilter(initial_state_mean = data.iloc[0], n_dim_obs=1)
        filter_data = kf.em(data).filter(data)[0].T[0]
        filter_data_s = pd.Series(np.array(filter_data), name=var)
        df[var] = filter_data_s

        # Normalized magnetometer data with min-max normalization
        if var == "magX" or var =="magY":
            normalized_data = (data-data.min())/(data.max()-data.min())
            df[var] = normalized_data

    return df

## Prueba

In [5]:
clasifiers = ["clf_sudden_braking_smartphone",
              "clf_sudden_braking_raspberry",
              "clf_sudden_acceleration_smartphone",
              "clf_sudden_acceleration_raspberry",
              "clf_chg_line_right_smartphone",
              "clf_chg_line_right_raspberry",
              "clf_chg_line_left_smartphone",
              "clf_chg_line_left_raspberry",
              "clf_agg_turn_right_smartphone",
              "clf_agg_turn_right_raspberry",
              "clf_agg_turn_left_smartphone",
              "clf_agg_turn_left_raspberry"]

data = ["smartphone_02-Feb-2022-20-01_frenada repentina_Data-Mv2EUSpYPrGpMG3qjaD","raspberry_02-Feb-2022-20-02_frenada repentina_Data-Mv2E_1BnfkXOP6gRM1J",
        "smartphone_02-Feb-2022-19-59_aceleración repentina_Data-Mv1jxE4ZYJvJ9AVKae1", "raspberry_02-Feb-2022-20-00_aceleración repentina_Data-Mv1m25ILfD2v20bDi5Q",
        "smartphone_02-Feb-2022-20-26_cambio de línea agresivo derecha_Data-Mv2MrzIVJOQsCqDJHcb", "raspberry_02-Feb-2022-20-29_cambio de línea agresivo derecha_Data-Mv2N4sqCOBh_ItfGkmE",
        "smartphone_02-Feb-2022-20-21_cambio de línea agresivo izquierda_Data-Mv2LixfwEokL4QCgN_p", "raspberry_02-Feb-2022-20-22_cambio de línea agresivo izquierda_Data-Mv2LjjP0SLKY6zw_7Y9",
        "smartphone_02-Feb-2022-20-31_giro derecho agresivo_Data-Mv2Nk_4aV4M_KufahA0", "raspberry_02-Feb-2022-20-08_giro derecho agresivo_Data-Mv2HSOB7ABwlXlcqhJ8",
        "smartphone_02-Feb-2022-20-15_giro izquierdo agresivo_Data-Mv2KZyC2nSbvYeWyEmh", "raspberry_02-Feb-2022-20-15_giro izquierdo agresivo_Data-Mv2JoflmGmLbEwyIi2a"]

features = [["speed","accY"],
            ["speed","accY"],
            ["speed","accY"],
            ["speed", "accPosition", "accY"],
            ["accX", "velAngZ"],
            ["accX", "velAngZ"],
            ["accX", "velAngZ"],
            ["accX", "velAngZ"],
            ["accX" ,"accY", "velAngZ", "magX", "magY"],
            ["speed", "accX", "accY", "velAngZ", "magX"],
            ["accX" ,"accY", "velAngZ", "magX", "magY"], 
            ["speed", "accX", "accY", "velAngZ", "magX"]]

for c, d, f in zip(clasifiers, data, features):
    print("\n\n", c,"\n\n")
    clf = load(str("./built_algorithms/"+c+".joblib"))
    df = pd.read_csv(str("./filtered_data/"+d+".csv"))
    event_features_label = ["id", *f,  "eventClass"]
    dataset = df[event_features_label].to_numpy()
    window_size = 40
    ids, X, y, features_names = sliding_windows(dataset, window_size, f)
    X_df = pd.DataFrame(X, columns=features_names)
    X_df[["first","last"]] = ids
    X_df.set_index(["first","last"], inplace=True)

    y_predict = clf.predict(X_df.values)
    y_predict_proba = clf.predict_proba(X_df.values)[:,1]
    
    print('Accuracy: {:.6f}'.format(accuracy_score(y, y_predict)))
    print('Precision: {:.6f}'.format(precision_score(y, y_predict)))
    print('Recall: {:.6f}'.format(recall_score(y, y_predict)))
    print('F1: {:.6f}'.format(f1_score(y, y_predict)))
    print('AUC: {:.6f}'.format(roc_auc_score(y, y_predict_proba)))



 clf_sudden_braking_smartphone 



Sliding windows shape:  (414, 4, 40)
Accuracy: 1.000000
Precision: 1.000000
Recall: 1.000000
F1: 1.000000
AUC: 1.000000


 clf_sudden_braking_raspberry 



Sliding windows shape:  (401, 4, 40)
Accuracy: 1.000000
Precision: 1.000000
Recall: 1.000000
F1: 1.000000
AUC: 1.000000


 clf_sudden_acceleration_smartphone 



Sliding windows shape:  (382, 4, 40)
Accuracy: 1.000000
Precision: 1.000000
Recall: 1.000000
F1: 1.000000
AUC: 1.000000


 clf_sudden_acceleration_raspberry 



Sliding windows shape:  (296, 5, 40)
Accuracy: 1.000000
Precision: 1.000000
Recall: 1.000000
F1: 1.000000
AUC: 1.000000


 clf_chg_line_right_smartphone 



Sliding windows shape:  (624, 4, 40)
Accuracy: 1.000000
Precision: 1.000000
Recall: 1.000000
F1: 1.000000
AUC: 1.000000


 clf_chg_line_right_raspberry 



Sliding windows shape:  (388, 4, 40)
Accuracy: 1.000000
Precision: 1.000000
Recall: 1.000000
F1: 1.000000
AUC: 1.000000


 clf_chg_line_left_smartphone 



Sliding windows

## Algoritmo de reconocimiento de near-crash

In [16]:
# TODO: this variables is getting by firebase in app.py
device_name = "smartphone" #"raspberry"
firebase_data = "./data/smartphone_02-Feb-2022-20-01_frenada repentina_Data-Mv2EUSpYPrGpMG3qjaD.csv" # TODO: change for data getting by firebase
df = pd.read_csv(firebase_data)


clasifiers = ["clf_sudden_braking_smartphone",
              "clf_sudden_braking_raspberry",
              "clf_sudden_acceleration_smartphone",
              "clf_sudden_acceleration_raspberry",
              "clf_chg_line_right_smartphone",
              "clf_chg_line_right_raspberry",
              "clf_chg_line_left_smartphone",
              "clf_chg_line_left_raspberry",
              "clf_agg_turn_right_smartphone",
              "clf_agg_turn_right_raspberry",
              "clf_agg_turn_left_smartphone",
              "clf_agg_turn_left_raspberry"]

features = [["speed","accY"],
            ["speed","accY"],
            ["speed","accY"],
            ["speed", "accPosition", "accY"],
            ["accX", "velAngZ"],
            ["accX", "velAngZ"],
            ["accX", "velAngZ"],
            ["accX", "velAngZ"],
            ["accX" ,"accY", "velAngZ", "magX", "magY"],
            ["speed", "accX", "accY", "velAngZ", "magX"],
            ["accX" ,"accY", "velAngZ", "magX", "magY"], 
            ["speed", "accX", "accY", "velAngZ", "magX"]]

if device_name == "smartphone":
    clasifiers = clasifiers[::2]
    features = features[::2]
else:
    clasifiers = clasifiers[1::2]
    features = features[1::2]

# TODO: before filter is need to manage the offset of the data, for this reason in the experiments we need to make a standby time
"""max_standby = 7660
var_with_offset = "accY"
offset = df.loc[df['id'] <= max_standby, var_with_offset].mean()
df[var_with_offset] = df[var_with_offset] - offset"""


# Make filter kalman for all data
df_filtered = data_filter(df)

near_crash = {}

for c, f in zip(clasifiers, features):
    #Make sliding window
    X, y = sliding_windows(df_filtered, f, 40)

    # Check near-crash
    clf = load(str("./built_algorithms/"+c+".joblib"))
    y_predict = clf.predict(X.values)
    y_predict_proba = clf.predict_proba(X.values)[:,1]

    if (len(np.where(y_predict == 1.0)[0]) != 0):
        near_crash_df = X.iloc[np.where(y_predict == 1.0)[0]]
        near_crash[c] = near_crash_df

near_crash

{'clf_sudden_braking_smartphone':                          0         1          2         3         4         5  \
 first   last                                                                    
 33026.0 33065.0  28.034598 -2.693644  26.896181 -0.101513  1.227545  3.286733   
 33027.0 33066.0  28.012121 -2.829017  26.896181 -0.287917  1.200094  3.273355   
 33028.0 33067.0  27.989644 -2.942426  26.896181 -2.395554  1.171569  3.245156   
 33029.0 33068.0  27.967167 -3.028473  26.896181 -3.768366  1.141889  3.204784   
 33030.0 33069.0  27.944690 -3.084421  26.896181 -3.768366  1.110963  3.158017   
 33031.0 33070.0  27.922213 -3.109734  26.896181 -3.768366  1.078682  3.135164   
 33032.0 33071.0  27.976214 -3.114662  27.976214 -3.768366  1.080033  3.130625   
 33033.0 33072.0  27.838813 -3.126082  27.976214 -3.768366  1.483088  3.119935   
 33034.0 33073.0  27.701412 -3.138076  27.976214 -3.768366  1.787419  3.108179   
 33035.0 33074.0  27.564012 -3.155055  27.976214 -3.768366  2.037

In [10]:
from itertools import groupby
from operator import itemgetter
data = [1,2,3,4,5,6,7,8,10,11,12,13,14,15,17,30,31,32,60,61,62,63,64,65,70,71,80,81,82,83,84,85]
for k, g in groupby(enumerate(data), lambda i_x: i_x[0] - i_x[1]):
    print(list(map(itemgetter(1), g)))

[1, 2, 3, 4, 5, 6, 7, 8]
[10, 11, 12, 13, 14, 15]
[17]
[30, 31, 32]
[60, 61, 62, 63, 64, 65]
[70, 71]
[80, 81, 82, 83, 84, 85]


In [15]:
import numpy as np
def consecutive(data, stepsize=1):
    return np.split(data, np.where(np.diff(data) != stepsize)[0]+1)

a = np.array(data)
consecutive(a)

[array([1, 2, 3, 4, 5, 6, 7, 8]),
 array([10, 11, 12, 13, 14, 15]),
 array([17]),
 array([30, 31, 32]),
 array([60, 61, 62, 63, 64, 65]),
 array([70, 71]),
 array([80, 81, 82, 83, 84, 85])]