# Model training

In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from sktime.datatypes._panel._convert import from_multi_index_to_nested
from tsfresh.utilities.dataframe_functions import roll_time_series
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import warnings

warnings.simplefilter('ignore')

SEED=42

%matplotlib inline

In [2]:
df = pd.read_csv("../data/odb-2-master-thesis/exp1_14drivers_14cars_dailyRoutes_interpolated.csv")
df.head()

Unnamed: 0,AIR_INTAKE_TEMP,ENGINE_COOLANT_TEMP,ENGINE_LOAD,ENGINE_RPM,SHORT TERM FUEL TRIM BANK 1,SPEED,THROTTLE_POS,TIMING_ADVANCE,TROUBLE_CODES,VEHICLE_ID,TIMESTAMP,TROUBLE_CODES_BINARY
0,59.0,80.0,0.333,1009.0,0.0,0.0,0.251,0.569,,car1,2017-08-16 16:55:04.267,0
1,59.0,80.0,0.325,1003.0,0.0,0.0,0.251,0.565,,car1,2017-08-16 16:55:12.283,0
2,59.0,80.0,0.329,995.0,0.0,0.0,0.251,0.573,,car1,2017-08-16 16:55:20.291,0
3,60.0,80.0,0.325,1004.0,0.0,0.0,0.251,0.565,,car1,2017-08-16 16:55:28.300,0
4,60.0,80.0,0.329,1005.0,0.0,0.0,0.251,0.569,,car1,2017-08-16 16:55:36.320,0


In [3]:
df.shape

(47514, 12)

In [10]:
selected_columns = df.columns.difference(["TROUBLE_CODES", "VEHICLE_ID", "TIMESTAMP"])
df = df[selected_columns]

TARGET = "TROUBLE_CODES_BINARY"

X = df.drop([TARGET], axis=1)
y = df[TARGET]

X.shape, y.shape

((47514, 8), (47514,))

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED, stratify=y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((38011, 8), (9503, 8), (38011,), (9503,))

In [40]:
Counter(y)

Counter({0: 35589, 1: 11925})

In [24]:
scaler = StandardScaler()
scaler.fit(X_train)

index_train = X_train.index
index_test = X_test.index

X_train = scaler.transform(X_train)
X_train = pd.DataFrame(X_train, index=index_train, columns=X.columns)

X_test = scaler.transform(X_test)
X_test = pd.DataFrame(X_test, index=index_test, columns=X.columns)

## Logistic Regression

In [65]:
log_reg = LogisticRegression(random_state=SEED)
log_reg.fit(X_train, y_train)

y_pred = lin_svc.predict(X_test)

print("Classification Repost for the LinearSVC model \n")
print(classification_report(y_test, y_pred))

Classification Repost for the LinearSVC model 

              precision    recall  f1-score   support

           0       0.75      1.00      0.85      7118
           1       0.20      0.00      0.01      2385

    accuracy                           0.75      9503
   macro avg       0.47      0.50      0.43      9503
weighted avg       0.61      0.75      0.64      9503



## Linear Support Vector Classifier

In [64]:
lin_svc = LinearSVC(random_state=SEED)
lin_svc.fit(X_train, y_train)

y_pred = lin_svc.predict(X_test)

print("Classification Repost for the LinearSVC model \n")
print(classification_report(y_test, y_pred))

Classification Repost for the LinearSVC model 

              precision    recall  f1-score   support

           0       0.75      1.00      0.85      7118
           1       0.20      0.00      0.01      2385

    accuracy                           0.75      9503
   macro avg       0.47      0.50      0.43      9503
weighted avg       0.61      0.75      0.64      9503



## Random Forest Classifier

In [63]:
# Model training
random_forest = RandomForestClassifier(random_state=SEED)
random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

print("Classification Repost for the RandomForestClassifier model \n")
print(classification_report(y_test, y_pred))

Classification Repost for the RandomForestClassifier model 

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      7118
           1       0.96      0.95      0.96      2385

    accuracy                           0.98      9503
   macro avg       0.97      0.97      0.97      9503
weighted avg       0.98      0.98      0.98      9503



## Time Series Classification

In [66]:
vehicle_ids = ['car1', 'car11', 'car12', 'car13', 'car3', 'car4', 'car6', 'car7', 'car8', 'car9']

In [None]:
path_features = "../data/odb-2-window-89/features-{}-windowsize{}.parquet.gzip"
path_target = "../data/odb-2-window-89/target-binary-{}-windowsize{}.parquet.gzip"

X_ts = None
y_ts = None

for vid in vehicle_ids:
    if df_ts is None:
        

In [6]:
df_car1 = pd.read_parquet("../data/odb-2-window-89/features-car9-windowsize89.parquet.gzip")
df_car1["INSTANCES"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88])

In [7]:
df_car1

Unnamed: 0,INSTANCES,TIMEPOINTS,AIR_INTAKE_TEMP,ENGINE_COOLANT_TEMP,ENGINE_LOAD,ENGINE_RPM,SHORT TERM FUEL TRIM BANK 1,SPEED,THROTTLE_POS,TIMING_ADVANCE
0,0,0,45.000000,76.0,0.3650,771.00,-0.031000,0.0,0.1220,0.537000
1,0,1,46.000000,76.0,0.3690,777.00,0.016000,0.0,0.1250,0.553000
2,0,2,46.000000,76.0,0.3570,918.00,-0.023000,0.0,0.1290,0.655000
3,0,3,46.000000,76.0,0.3730,786.00,0.000000,0.0,0.1250,0.529000
4,0,4,48.000000,66.0,0.3920,803.00,0.000000,0.0,0.1250,0.529000
...,...,...,...,...,...,...,...,...,...,...
527765,88,84,47.222222,81.0,0.1665,823.75,-0.181889,0.0,0.1055,0.340125
527766,88,85,47.777778,79.0,0.1450,841.00,-0.233111,0.0,0.1040,0.274500
527767,88,86,48.333333,77.0,0.1235,858.25,-0.284333,0.0,0.1025,0.208875
527768,88,87,48.888889,75.0,0.1020,875.50,-0.335556,0.0,0.1010,0.143250


In [8]:
527770 / 89

5930.0

In [95]:
from sktime.datatypes._panel._convert import from_multi_index_to_nested

In [97]:
# from_2d_array_to_nested(df_car1, index="INSTANCES", time_index=df_car1.TIMEPOINTS)
from_pd_wide_to_nested(df_car1.head(89))

Unnamed: 0,0
0,0 0.000 1 0.000 2 59.000 3 ...
1,0 0.000 1 1.000 2 59.000 3 ...
2,0 0.000 1 2.000 2 59.000 3 8...
3,0 0.000 1 3.000 2 60.000 3 ...
4,0 0.000 1 4.000 2 60.000 3 ...
...,...
84,0 0.000 1 84.000 2 56.000 3 ...
85,0 0.000 1 85.000 2 55.000 3 ...
86,0 0.000 1 86.000 2 54.000 3 ...
87,0 0.000 1 87.000 2 54.000 3 ...


In [98]:
dfcsv = pd.read_csv("../data/odb-2-master-thesis/features-car1-windowsize89.csv", index_col=[0, 1])
dfcsv.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AIR INTAKE TEMP,ENGINE COOLANT TEMP,ENGINE LOAD,ENGINE RPM,SHORT TERM FUEL TRIM BANK 1,SPEED,THROTTLE POS,TIMING ADVANCE
INSTANCES,TIMEPOINTS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,59.0,80.0,0.333,1009.0,0.0,0.0,0.251,0.569
0,1,59.0,80.0,0.325,1003.0,0.0,0.0,0.251,0.565
0,2,59.0,80.0,0.329,995.0,0.0,0.0,0.251,0.573
0,3,60.0,80.0,0.325,1004.0,0.0,0.0,0.251,0.565
0,4,60.0,80.0,0.329,1005.0,0.0,0.0,0.251,0.569


In [104]:
dfcsv.index

MultiIndex([( 0,  0),
            ( 0,  1),
            ( 0,  2),
            ( 0,  3),
            ( 0,  4),
            ( 0,  5),
            ( 0,  6),
            ( 0,  7),
            ( 0,  8),
            ( 0,  9),
            ...
            (88, 79),
            (88, 80),
            (88, 81),
            (88, 82),
            (88, 83),
            (88, 84),
            (88, 85),
            (88, 86),
            (88, 87),
            (88, 88)],
           names=['INSTANCES', 'TIMEPOINTS'], length=1204081)

In [100]:
from_multi_index_to_nested(dfcsv, instance_index="INSTANCES")

Unnamed: 0,AIR INTAKE TEMP,ENGINE COOLANT TEMP,ENGINE LOAD,ENGINE RPM,SHORT TERM FUEL TRIM BANK 1,SPEED,THROTTLE POS,TIMING ADVANCE
0,0 59.0 1 59.0 2 59.0 3 60.0 4 ...,0 80.0 1 80.0 2 80.0 3 80.0 4 ...,0 0.333 1 0.325 2 0.329 3 0.32...,0 1009.0 1 1003.0 2 995.0 3 1...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 0.251 1 0.251 2 0.251 3 0.25...,0 0.569 1 0.565 2 0.573 3 0.56...
1,1 48.0 2 49.0 3 50.0 4 51.0 5 ...,1 84.0 2 84.0 3 84.0 4 84.0 5 ...,1 0.314 2 0.310 3 0.314 4 0.31...,1 1012.0 2 1010.0 3 1003.0 4 1...,1 0.0 2 0.0 3 0.0 4 0.0 5 ...,1 0.0 2 0.0 3 0.0 4 2.0 5 ...,1 0.247 2 0.247 3 0.247 4 0.27...,1 0.573 2 0.557 3 0.592 4 0.56...
2,2 33.0 3 34.0 4 34.0 5 34.0 6 ...,2 81.0 3 81.0 4 81.0 5 81.0 6 ...,2 0.161 3 0.306 4 0.165 5 0.18...,2 1963.0 3 1701.0 4 1630.0 5 1...,2 0.0 3 0.0 4 0.0 5 0.0 6 ...,2 32.0 3 28.5 4 25.0 5 10.0 6 ...,2 0.243 3 0.290 4 0.235 5 0.22...,2 0.631 3 0.722 4 0.624 5 0.69...
3,3 34.0 4 34.0 5 34.0 6 34.0 7 ...,3 90.0 4 91.0 5 92.0 6 93.0 7 ...,3 0.184 4 0.180 5 0.184 6 0.38...,3 1250.0 4 1201.0 5 1190.0 6 ...,3 0.0 4 0.0 5 0.0 6 0.0 7 ...,3 16.0 4 15.0 5 10.0 6 13.0 7 ...,3 0.227 4 0.216 5 0.220 6 0.25...,3 0.690 4 0.631 5 0.655 6 0.55...
4,0 30.0 1 30.0 2 30.0 3 30.0 4 ...,0 85.0 1 84.0 2 84.0 3 83.0 4 ...,0 0.161 1 0.271 2 0.310 3 0.27...,0 1693.0 1 1293.0 2 1017.0 3 1...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 35.0 1 26.0 2 19.0 3 14.0 4 ...,0 0.231 1 0.259 2 0.247 3 0.24...,0 0.682 1 0.690 2 0.573 3 0.58...
...,...,...,...,...,...,...,...,...
84,84 44.702857 85 44.708571 86 44.71428...,84 70.742857 85 70.728571 86 70.71428...,84 0.308074 85 0.307957 86 0.307840 8...,84 991.444444 85 991.435897 86 991...,84 -1.0 85 -1.0 86 -1.0 87 -1.0 88 -...,84 0.0 85 0.0 86 0.0 87 0.0 88...,84 0.245377 85 0.245331 86 0.245286 8...,84 0.587406 85 0.587417 86 0.587429 8...
85,85 31.0 86 31.0 87 30.0 88 31.0 0 ...,85 79.0 86 79.0 87 79.0 88 80.0 0 ...,85 0.161 86 0.325 87 0.537 88 0.20...,85 1807.0 86 1156.0 87 1945.0 88 1...,85 -1.0 86 -1.0 87 -1.0 88 -1.0 0 -...,85 47.0 86 25.0 87 49.0 88 50.0 0 ...,85 0.239 86 0.235 87 0.341 88 0.23...,85 0.486 86 0.651 87 0.675 88 0.69...
86,86 36.0 87 38.0 88 39.0 0 37.0 1 ...,86 80.0 87 80.0 88 80.0 0 84.0 1 ...,86 0.318 87 0.408 88 0.322 0 0.25...,86 1015.0 87 963.0 88 1043.0 0 1...,86 -1.0 87 -1.0 88 -1.0 0 -1.0 1 -...,86 4.0 87 15.0 88 0.0 0 0.0 1 ...,86 0.251 87 0.302 88 0.255 0 0.23...,86 0.561 87 0.612 88 0.580 0 0.61...
87,87 38.0 88 38.0 0 34.0 1 33.0 2 ...,87 79.0 88 79.0 0 77.0 1 78.0 2 ...,87 0.263 88 0.157 0 0.208 1 0.25...,87 1923.0 88 1953.0 0 2438.0 1 2...,87 -1.0 88 -1.0 0 -1.0 1 -1.0 2 -...,87 60.0 88 60.0 0 78.0 1 76.0 2 ...,87 0.290 88 0.247 0 0.290 1 0.31...,87 0.753 88 0.490 0 0.788 1 0.74...
