# Model training

## Cross-sectional data

In [10]:
import numpy as np
import pandas as pd
from sktime.classification.kernel_based import RocketClassifier
from sktime.transformations.panel.rocket import MiniRocket, MiniRocketMultivariate
from sktime.classification.interval_based import (
    CanonicalIntervalForest,
    DrCIF,
    RandomIntervalSpectralEnsemble,
    SupervisedTimeSeriesForest,
    TimeSeriesForestClassifier,
)
from sktime.datatypes._panel._convert import from_multi_index_to_3d_numpy
from tsfresh.utilities.dataframe_functions import roll_time_series
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
from mcfly.find_architecture import find_best_architecture
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import warnings

warnings.simplefilter('ignore')

SEED=42

%matplotlib inline

In [2]:
df = pd.read_csv("../data/odb-2-master-thesis/exp1_14drivers_14cars_dailyRoutes_interpolated.csv")
df.head()

Unnamed: 0,AIR_INTAKE_TEMP,ENGINE_COOLANT_TEMP,ENGINE_LOAD,ENGINE_RPM,SHORT TERM FUEL TRIM BANK 1,SPEED,THROTTLE_POS,TIMING_ADVANCE,TROUBLE_CODES,VEHICLE_ID,TIMESTAMP,TROUBLE_CODES_BINARY
0,59.0,80.0,0.333,1009.0,0.0,0.0,0.251,0.569,,car1,2017-08-16 16:55:04.267,0
1,59.0,80.0,0.325,1003.0,0.0,0.0,0.251,0.565,,car1,2017-08-16 16:55:12.283,0
2,59.0,80.0,0.329,995.0,0.0,0.0,0.251,0.573,,car1,2017-08-16 16:55:20.291,0
3,60.0,80.0,0.325,1004.0,0.0,0.0,0.251,0.565,,car1,2017-08-16 16:55:28.300,0
4,60.0,80.0,0.329,1005.0,0.0,0.0,0.251,0.569,,car1,2017-08-16 16:55:36.320,0


In [3]:
df.shape

(47514, 12)

In [10]:
selected_columns = df.columns.difference(["TROUBLE_CODES", "VEHICLE_ID", "TIMESTAMP"])
df = df[selected_columns]

TARGET = "TROUBLE_CODES_BINARY"

X = df.drop([TARGET], axis=1)
y = df[TARGET]

X.shape, y.shape

((47514, 8), (47514,))

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED, stratify=y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((38011, 8), (9503, 8), (38011,), (9503,))

In [40]:
Counter(y)

Counter({0: 35589, 1: 11925})

In [24]:
scaler = StandardScaler()
scaler.fit(X_train)

index_train = X_train.index
index_test = X_test.index

X_train = scaler.transform(X_train)
X_train = pd.DataFrame(X_train, index=index_train, columns=X.columns)

X_test = scaler.transform(X_test)
X_test = pd.DataFrame(X_test, index=index_test, columns=X.columns)

### Logistic Regression

In [65]:
log_reg = LogisticRegression(random_state=SEED)
log_reg.fit(X_train, y_train)

y_pred = lin_svc.predict(X_test)

print("Classification Repost for the LinearSVC model \n")
print(classification_report(y_test, y_pred))

Classification Repost for the LinearSVC model 

              precision    recall  f1-score   support

           0       0.75      1.00      0.85      7118
           1       0.20      0.00      0.01      2385

    accuracy                           0.75      9503
   macro avg       0.47      0.50      0.43      9503
weighted avg       0.61      0.75      0.64      9503



### Linear Support Vector Classifier

In [64]:
lin_svc = LinearSVC(random_state=SEED)
lin_svc.fit(X_train, y_train)

y_pred = lin_svc.predict(X_test)

print("Classification Repost for the LinearSVC model \n")
print(classification_report(y_test, y_pred))

Classification Repost for the LinearSVC model 

              precision    recall  f1-score   support

           0       0.75      1.00      0.85      7118
           1       0.20      0.00      0.01      2385

    accuracy                           0.75      9503
   macro avg       0.47      0.50      0.43      9503
weighted avg       0.61      0.75      0.64      9503



### Random Forest Classifier

In [63]:
# Model training
random_forest = RandomForestClassifier(random_state=SEED)
random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

print("Classification Repost for the RandomForestClassifier model \n")
print(classification_report(y_test, y_pred))

Classification Repost for the RandomForestClassifier model 

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      7118
           1       0.96      0.95      0.96      2385

    accuracy                           0.98      9503
   macro avg       0.97      0.97      0.97      9503
weighted avg       0.98      0.98      0.98      9503



## Time Series Classification on (3D) Panel Data

In [26]:
features_all = pd.read_parquet("../data/odb-2-window-89/features-all-windowsize89.parquet.gzip")
target_all = pd.read_parquet("../data/odb-2-window-89/target-binary-all-windowsize89.parquet.gzip")

In [27]:
features_all = features_all.set_index(["INSTANCES", "TIMEPOINTS"])
features_all

Unnamed: 0_level_0,Unnamed: 1_level_0,AIR_INTAKE_TEMP,ENGINE_COOLANT_TEMP,ENGINE_LOAD,ENGINE_RPM,SHORT TERM FUEL TRIM BANK 1,SPEED,THROTTLE_POS,TIMING_ADVANCE
INSTANCES,TIMEPOINTS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,59.000000,80.0,0.3330,1009.00,0.000000,0.0,0.2510,0.569000
0,1,59.000000,80.0,0.3250,1003.00,0.000000,0.0,0.2510,0.565000
0,2,59.000000,80.0,0.3290,995.00,0.000000,0.0,0.2510,0.573000
0,3,60.000000,80.0,0.3250,1004.00,0.000000,0.0,0.2510,0.565000
0,4,60.000000,80.0,0.3290,1005.00,0.000000,0.0,0.2510,0.569000
...,...,...,...,...,...,...,...,...,...
46253,84,47.222222,81.0,0.1665,823.75,-0.181889,0.0,0.1055,0.340125
46253,85,47.777778,79.0,0.1450,841.00,-0.233111,0.0,0.1040,0.274500
46253,86,48.333333,77.0,0.1235,858.25,-0.284333,0.0,0.1025,0.208875
46253,87,48.888889,75.0,0.1020,875.50,-0.335556,0.0,0.1010,0.143250


In [28]:
# Create indices
index_train, index_test = train_test_split(features_all.index.levels[0], 
                                           test_size=0.20, 
                                           random_state=42)

In [29]:
# Split train and test
X_train = features_all.loc[index_train]
X_train = from_multi_index_to_3d_numpy(X_train)

y_train = target_all.loc[index_train]
y_train = y_train.values.flatten()

X_test = features_all.loc[index_test]
X_test = from_multi_index_to_3d_numpy(X_test)

y_test = target_all.loc[index_test]
y_test = y_test.values.flatten()

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((37003, 8, 89), (9251, 8, 89), (37003,), (9251,))

### Supervised Time Series Forest Classifier

In [None]:
%%time 

# sts_forest = SupervisedTimeSeriesForest(n_estimators=10)
# sts_forest.fit(X_train, y_train)

ValueError: Data seen by SupervisedTimeSeriesForest instance has multivariate series, but this SupervisedTimeSeriesForest instance cannot handle multivariate series. Calls with multivariate series may result in error or unreliable results.

In [None]:
# y_pred = clf.predict(X_test)

# print("Classification Repost for the SupervisedTimeSeriesForest model \n")
# print(classification_report(y_test.values.flatten(), y_pred.values.flatten()))

### Mini Rocket Multivariate Transformer + Ridge Classifier

In [6]:
%%time

minirocket_multi = MiniRocketMultivariate()
minirocket_multi.fit(X_train)

X_train_transform = minirocket_multi.transform(X_train)

CPU times: user 3min 1s, sys: 1.17 s, total: 3min 2s
Wall time: 3min 2s


In [None]:
%%time 

ridge = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
ridge.fit(X_train_transform, y_train)

In [None]:
X_test_transform = minirocket_multi.transform(X_test)
y_pred = ridge.predict(X_test_transform)

print("Classification Repost for the RidgeClassifierCV model \n")
print(classification_report(y_test, y_pred))

### Canonical Interval Forest (CIF) Classifier

In [7]:
%%time

cif = CanonicalIntervalForest(n_estimators=50, att_subsample_size=8, random_state=47)
cif.fit(X_train, y_train)


KeyboardInterrupt



In [8]:
%%time 

y_pred = cif.predict(X_test)

print("Classification Repost for the Canonical Interval Forest model \n")
print(classification_report(y_test, y_pred))


KeyboardInterrupt



### Mcfly AutoML

In [30]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((29602, 8, 89), (7401, 8, 89), (29602,), (7401,))

In [34]:
X_train = np.swapaxes(X_train, 1, 2)
X_val = np.swapaxes(X_val, 1, 2)

In [31]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
lb.fit(y_train)

LabelBinarizer()

In [35]:
y_train

array([[1],
       [0],
       [1],
       ...,
       [0],
       [0],
       [1]])

In [None]:
%%time

best_model, best_params, best_model_type, knn_acc = \
            find_best_architecture(
                X_train=X_train,
                y_train=y_train,
                X_val=X_val,
                y_val=y_val, 
                number_of_models=10,
                nr_epochs=20,
                # subset_size=5000,
            )

Set maximum kernel size for InceptionTime models to number of timesteps.
Set maximum kernel size for InceptionTime models to number of timesteps.
Set maximum kernel size for InceptionTime models to number of timesteps.
Generated models will be trained on subset of the data (subset size: 100).
Training model 0 InceptionTime
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 3: early stopping
Training model 1 ResNet
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 3: early stopping
Training model 2 CNN
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training model 3 DeepConvLSTM
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20