In [1]:
import os
import sys
import json
import mlflow
import warnings
import numpy as np
import pandas as pd
from datetime import timedelta
from sktime.datatypes import convert_to
from timeit import default_timer as timer
from sktime.datasets import load_from_tsfile
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sktime.datasets import load_from_tsfile_to_dataframe
from mcfly.find_architecture import find_best_architecture
from mcfly.find_architecture import train_models_on_samples

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Seed NumPy's global random generator.
# NOTE(review): no TensorFlow/Keras seed is set in this cell, so model
# training is presumably still non-deterministic — confirm if that matters.
np.random.seed(42)

2022-07-12 18:20:36.331670: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-12 18:20:36.332536: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Send all runs to the MLflow tracking server at 127.0.0.1:5000 (it must
# already be running) and file them under one named experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("mcfly-training-duration")

<Experiment: artifact_location='../mlflow-artifacts/1', experiment_id='1', lifecycle_stage='active', name='mcfly-training-duration', tags={}>

In [3]:
# Dataset name -> path of its *_TRAIN.ts file. The training cell iterates over
# every key in this dict, so commented-out entries are excluded from the grid.
datasets = {
    # "CharacterTrajectories": "/mnt/f/Downloads/CharacterTrajectories/CharacterTrajectories_TRAIN.ts",
    "HandMovementDirection": "/mnt/f/Downloads/HandMovementDirection_TRAIN.ts",
    "EthanolLevel": "/mnt/f/Downloads/EthanolLevel/EthanolLevel_TRAIN.ts",
    "HandOutlines": "/mnt/f/Downloads/HandOutlines/HandOutlines_TRAIN.ts",
    # "Car": "/mnt/f/Documents/Car_TRAIN.ts",
    "FordA": "/mnt/f/Downloads/FordA/FordA_TRAIN.ts"
}

In [4]:
def read_dataset(file_path):
    """Load a ``.ts`` file and return arrays ready for model training.

    Parameters
    ----------
    file_path : str
        Path passed to ``load_from_tsfile``.

    Returns
    -------
    X : numpy.ndarray
        The series converted to sktime's ``numpy3D`` type with axes 1 and 2
        swapped. The training cell reads axis 1 as the series length and
        axis 2 as the number of channels.
    y : numpy.ndarray
        Binarized labels with one column per class, including the two-class
        case (see the note in the body).
    lb : sklearn.preprocessing.LabelBinarizer
        The fitted binarizer, so predictions can be mapped back to the
        original label values.
    """
    X, y = load_from_tsfile(file_path)

    X = convert_to(X, to_type="numpy3D")
    X = np.swapaxes(X, 1, 2)

    lb = LabelBinarizer()
    y = lb.fit_transform(y)

    # LabelBinarizer collapses a two-class target into a single 0/1 column.
    # The model search takes the number of classes from y.shape[1], so a
    # single column would produce one-output models. Expand it to two one-hot
    # columns: column 0 = lb.classes_[0], column 1 = lb.classes_[1].
    if y.ndim == 2 and y.shape[1] == 1:
        y = np.hstack([1 - y, y])

    return X, y, lb


def split_data(X, y):
    """Split samples and labels into train and hold-out parts.

    20% of the samples are held out. The split is shuffled with a fixed
    random state (42) and stratified on ``y``.

    Returns the four arrays produced by ``train_test_split`` for ``(X, y)``:
    the two parts of ``X`` followed by the two parts of ``y``.
    """
    split_options = dict(
        test_size=0.20,
        random_state=42,
        shuffle=True,
        stratify=y,
    )
    return train_test_split(X, y, **split_options)

In [None]:
%%time

cnt = 0
num_models_list = [8, 15, 25, 40]
num_epochs_list = [20, 30, 42, 66]

for dataset_key in datasets.keys():
    for num_models in num_models_list:
        for num_epochs in num_epochs_list:
            X, y, label_binarizer = read_dataset(datasets[dataset_key])

            X_train, X_val, y_train_binary, y_val_binary = split_data(X, y)

            with mlflow.start_run():
                params = {
                'dataset': dataset_key,
                'num_of_models': num_models,
                'num_epochs': num_epochs,
                'num_instances': X_train.shape[0],
                'num_channels': X.shape[2],
                'series_length': X_train.shape[1],
                'num_channels_x_series_length': X.shape[2] * X.shape[1],
                }
                file_name = f"model-comparison-20220712-key-{dataset_key}-models-{params['num_of_models']}-epochs-{params['num_epochs']}.json"
                outputfile = os.path.join("../models", file_name)

                start_timer = timer()

                best_model, best_params, best_model_type, knn_acc = \
                    find_best_architecture(
                        X_train=X_train,
                        y_train=y_train_binary,
                        X_val=X_val,
                        y_val=y_val_binary,
                        nr_epochs=params['num_epochs'],
                        number_of_models=params['num_of_models'], 
                        outputpath=outputfile,
                        verbose=False, 
                    )

                mlflow.log_metric("time_elapsed", timedelta(seconds=timer()-start_timer).seconds)

                score_test = best_model.evaluate(X_val, y_val_binary, verbose=True)
                mlflow.log_metric("val_loss", score_test[0])
                mlflow.log_metric("val_accuracy", score_test[1])

                params["best_model_type"] = best_model_type
                params = {**params, **best_params}
                mlflow.log_params(params)
                
                cnt = cnt + 1
                print("Count:", cnt)

2022-07-12 18:22:19.787289: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-07-12 18:22:19.787647: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-07-12 18:22:19.787719: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (p9): /proc/driver/nvidia/version does not exist
2022-07-12 18:22:19.790648: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Generated models will be trained on subset of the data (subset size: 100).
Count: 1
Generated models will be trained on subset of the data (subset size: 100).


### Example: pandas dataframe to numpy

In [73]:
# Toy long-format frame: one row per (id, time) observation with two value
# columns, x and y. id 1 has four rows and id 2 has two.
df = pd.DataFrame({
   "id": [1, 1, 1, 1, 2, 2],
   "time": [1, 2, 3, 4, 8, 9],
   "x": [1, 2, 3, 4, 10, 11],
   "y": [5, 6, 7, 8, 12, 13],
})
df

Unnamed: 0,id,time,x,y
0,1,1,1,5
1,1,2,2,6
2,1,3,3,7
3,1,4,4,8
4,2,8,10,12
5,2,9,11,13


In [75]:
# Drop the id and time columns and return the remaining value columns (x, y)
# as a 2-D NumPy array with one row per observation.
df.drop(["id", "time"], axis=1).to_numpy()

array([[ 1,  5],
       [ 2,  6],
       [ 3,  7],
       [ 4,  8],
       [10, 12],
       [11, 13]])

### Example: label encoding using sklearn.preprocessing.LabelEncoder and keras.utils.to_categorical

In [77]:
# The "Car" entry is commented out of the `datasets` dict, so looking it up
# there raises KeyError on a fresh kernel. Load the file by its path instead,
# which also keeps this example out of the training grid.
Xtmp, Ytmp = load_from_tsfile("/mnt/f/Documents/Car_TRAIN.ts")

In [79]:
# Show the raw labels returned by load_from_tsfile.
Ytmp

array(['1', '4', '2', '1', '2', '4', '1', '4', '4', '4', '1', '2', '3',
       '1', '2', '2', '1', '1', '4', '2', '1', '4', '4', '1', '3', '3',
       '1', '4', '3', '4', '2', '2', '3', '4', '3', '3', '1', '3', '2',
       '2', '4', '1', '1', '2', '4', '2', '4', '1', '3', '2', '1', '4',
       '2', '4', '1', '3', '2', '4', '3', '2'], dtype='<U1')

In [97]:
# Sorted distinct label values present in Ytmp.
np.unique(Ytmp)

array(['1', '2', '3', '4'], dtype='<U1')

In [80]:
from tensorflow import keras

In [95]:
from sklearn.preprocessing import LabelEncoder

# Map the first 15 labels to integer class indices.
LabelEncoder().fit_transform(Ytmp[0:15])

array([0, 3, 1, 0, 1, 3, 0, 3, 3, 3, 0, 1, 2, 0, 1])

In [96]:
# Integer-encode the first 15 labels, then one-hot encode the indices: one row
# per label, one column per class index.
keras.utils.to_categorical(LabelEncoder().fit_transform(Ytmp[0:15]))

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)