In [1]:
import os
import sys
import json
import mlflow
import warnings
import numpy as np
import pandas as pd
from datetime import timedelta
from sktime.datatypes import convert_to
from timeit import default_timer as timer
from sktime.datasets import load_from_tsfile
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sktime.datasets import load_from_tsfile_to_dataframe
from mcfly.find_architecture import find_best_architecture
from mcfly.find_architecture import train_models_on_samples

warnings.filterwarnings('ignore')
np.random.seed(42)

2022-08-23 20:35:39.976074: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-23 20:35:39.976514: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("mcfly-training-duration-models-only")

<Experiment: artifact_location='../mlflow-artifacts/2', experiment_id='2', lifecycle_stage='active', name='mcfly-training-duration-models-only', tags={}>

In [93]:
datasets = {
    # "CharacterTrajectories": "/mnt/f/Downloads/CharacterTrajectories/CharacterTrajectories_TRAIN.ts",
    # "HandMovementDirection": "/mnt/f/Downloads/HandMovementDirection_TRAIN.ts",
    # "EthanolLevel": "/mnt/f/Downloads/EthanolLevel/EthanolLevel_TRAIN.ts",
    # "HandOutlines": "/mnt/f/Downloads/HandOutlines/HandOutlines_TRAIN.ts",
    "Car": "/mnt/f/Documents/Car_TRAIN.ts",
    # "Wafer": "/mnt/f/Downloads/Wafer/Wafer_TRAIN.ts",
    # "FordA": "/mnt/f/Downloads/FordA/FordA_TRAIN.ts",
    # "RacketSports": "/mnt/f/Downloads/RacketSports/RacketSports_TRAIN.ts"
}

In [4]:
def read_dataset(file_path):
    X, y = load_from_tsfile(file_path)
    
    X = convert_to(X, to_type="numpy3D")
    X = np.swapaxes(X, 1, 2)
    
    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    
    return X, y, lb


def split_data(X, y):
    return train_test_split(
        X, 
        y, 
        test_size=0.20, 
        random_state=42, 
        shuffle=True, 
        stratify=y)

In [94]:
%%time

cnt = 0
num_models_list = [5, 7, 11, 16, 22]
num_epochs_list = [30]

for dataset_key in datasets.keys():
    for num_models in num_models_list:
        for num_epochs in num_epochs_list:
            cnt = cnt + 1
            print("Run:", cnt, "| key:", dataset_key, "| num_models:", num_models, "| num_epochs:", num_epochs)
            X, y, label_binarizer = read_dataset(datasets[dataset_key])

            X_train, X_val, y_train_binary, y_val_binary = split_data(X, y)

            with mlflow.start_run():
                params = {
                'dataset': dataset_key,
                'num_of_models': num_models,
                'num_epochs': num_epochs,
                'num_instances': X_train.shape[0],
                'num_channels': X.shape[2],
                'series_length': X_train.shape[1],
                'num_channels_x_series_length': X.shape[2] * X.shape[1],
                }
                file_name = f"model-comparison-20220712-key-{dataset_key}-models-{params['num_of_models']}-epochs-{params['num_epochs']}.json"
                outputfile = os.path.join("../models", file_name)

                start_timer = timer()

                best_model, best_params, best_model_type, knn_acc = \
                    find_best_architecture(
                        X_train=X_train,
                        y_train=y_train_binary,
                        X_val=X_val,
                        y_val=y_val_binary,
                        nr_epochs=params['num_epochs'],
                        number_of_models=params['num_of_models'], 
                        outputpath=outputfile,
                        verbose=False, 
                    )

                mlflow.log_metric("time_elapsed", timedelta(seconds=timer()-start_timer).seconds)

                score_test = best_model.evaluate(X_val, y_val_binary, verbose=True)
                mlflow.log_metric("val_loss", score_test[0])
                mlflow.log_metric("val_accuracy", score_test[1])

                params["best_model_type"] = best_model_type
                params = {**params, **best_params}
                mlflow.log_params(params)

Run: 1 | key: Car | num_models: 5 | num_epochs: 30
Generated models will be trained on subset of the data (subset size: 100).
Run: 2 | key: Car | num_models: 7 | num_epochs: 30
Generated models will be trained on subset of the data (subset size: 100).
Run: 3 | key: Car | num_models: 11 | num_epochs: 30
Generated models will be trained on subset of the data (subset size: 100).
Run: 4 | key: Car | num_models: 16 | num_epochs: 30
Generated models will be trained on subset of the data (subset size: 100).
Run: 5 | key: Car | num_models: 22 | num_epochs: 30
Generated models will be trained on subset of the data (subset size: 100).
CPU times: user 3h 49min 48s, sys: 35min 29s, total: 4h 25min 17s
Wall time: 50min 3s


In [3]:
runs1 = mlflow.search_runs(experiment_ids=["1"]) 
runs2 = mlflow.search_runs(experiment_ids=["2"]) 

# mlflow.search_runs("mcfly-training-duration-models-only")

In [5]:
runs1["params.num_epochs"] = runs1["params.num_epochs"].astype('int64', copy=False)

In [6]:
runs1[runs1["params.num_epochs"]>=20].index # & runs1["params.num_epochs"] <= 30

Int64Index([0, 1, 3, 4, 6, 7, 9, 10, 13, 14, 16, 17, 19, 20, 21, 22, 23, 24,
            25],
           dtype='int64')

In [7]:
runs1[runs1["params.num_epochs"]<=30].index 

Int64Index([1, 2, 4, 5, 7, 8, 10, 11, 12, 14, 15, 17, 18, 20, 21, 24, 25], dtype='int64')

In [9]:
cols = ["params.num_epochs", "params.num_instances", "params.series_length", 
        "params.num_channels", "metrics.time_elapsed", "params.num_of_models", ]
runs1 = runs1[cols]
runs2 = runs2[cols]


In [10]:
runs = pd.concat([runs1, runs2])
runs.head()

Unnamed: 0,params.num_epochs,params.num_instances,params.series_length,params.num_channels,metrics.time_elapsed,params.num_of_models
0,40,2880,500,1,1595.0,4
1,25,2880,500,1,303.0,4
2,15,2880,500,1,422.0,4
3,40,120,30,6,254.0,15
4,25,120,30,6,253.0,15


In [13]:
runs.columns, runs.shape

(Index(['params.num_epochs', 'params.num_instances', 'params.series_length',
        'params.num_channels', 'metrics.time_elapsed', 'params.num_of_models'],
       dtype='object'),
 (59, 6))

In [12]:
runs.to_csv("../results/mcfly-model-training-duration.csv", index=False)

In [119]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

target = "params.num_of_models"

X = runs.drop([target], axis=1).to_numpy()
y = runs[target].to_numpy()
reg.fit(X, y)

LinearRegression()

In [120]:
reg.coef_

array([-0.16958759, -0.00380572, -0.00934381, -0.27648816,  0.01421024])

In [121]:
reg.intercept_

13.701875067289002

In [122]:
reg.predict([[30, 1000, 200, 10, 600]])

array([8.70102483])

In [123]:
predictors = list(map(lambda x: x.split(".")[1], cols[:-1]))
vals = list(reg.coef_)
vals.insert(0, reg.intercept_)

pd.DataFrame([vals], columns =["inception"] + predictors)

Unnamed: 0,inception,num_epochs,num_instances,series_length,num_channels,time_elapsed
0,13.701875,-0.169588,-0.003806,-0.009344,-0.276488,0.01421


### Example: pandas dataframe to numpy

In [73]:
df = pd.DataFrame({
   "id": [1, 1, 1, 1, 2, 2],
   "time": [1, 2, 3, 4, 8, 9],
   "x": [1, 2, 3, 4, 10, 11],
   "y": [5, 6, 7, 8, 12, 13],
})
df

Unnamed: 0,id,time,x,y
0,1,1,1,5
1,1,2,2,6
2,1,3,3,7
3,1,4,4,8
4,2,8,10,12
5,2,9,11,13


In [75]:
df.drop(["id", "time"], axis=1).to_numpy()

array([[ 1,  5],
       [ 2,  6],
       [ 3,  7],
       [ 4,  8],
       [10, 12],
       [11, 13]])

### Example: label encoding using sklearn.preprocessing.LabelEncoder and keras.utils.to_categorial

In [77]:
Xtmp, Ytmp = load_from_tsfile(datasets["Car"])

In [79]:
Ytmp

array(['1', '4', '2', '1', '2', '4', '1', '4', '4', '4', '1', '2', '3',
       '1', '2', '2', '1', '1', '4', '2', '1', '4', '4', '1', '3', '3',
       '1', '4', '3', '4', '2', '2', '3', '4', '3', '3', '1', '3', '2',
       '2', '4', '1', '1', '2', '4', '2', '4', '1', '3', '2', '1', '4',
       '2', '4', '1', '3', '2', '4', '3', '2'], dtype='<U1')

In [97]:
np.unique(Ytmp)

array(['1', '2', '3', '4'], dtype='<U1')

In [80]:
from tensorflow import keras

In [95]:
from sklearn.preprocessing import LabelEncoder

LabelEncoder().fit_transform(Ytmp[0:15])

array([0, 3, 1, 0, 1, 3, 0, 3, 3, 3, 0, 1, 2, 0, 1])

In [96]:
keras.utils.to_categorical(LabelEncoder().fit_transform(Ytmp[0:15]))

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)

### Example: pandas dataframe to numpy3d

In [10]:
df1 = pd.DataFrame({
      "s_0": [1, 1, 1, 1, 2, 2],
      "s_1": [1, 2, 3, 4, 8, 9],
      "s_2": [1, 2, 3, 4, 10, 11],
      "s_3": [5, 6, 7, 8, 12, 13],
   })

df2 = pd.DataFrame({
       "s_0": [10, 10, 11, 11, 12, 12],
       "s_1": [11, 12, 13, 14, 18, 19],
       "s_2": [11, 12, 13, 14, 12, 21],
       "s_3": [51, 62, 71, 82, 22, 53],
    })

dfn1 = df1.to_numpy()[np.newaxis, :]
dfn2 = df2.to_numpy()[np.newaxis, :]

np.vstack((dfn1, dfn2))

array([[[ 1,  1,  1,  5],
        [ 1,  2,  2,  6],
        [ 1,  3,  3,  7],
        [ 1,  4,  4,  8],
        [ 2,  8, 10, 12],
        [ 2,  9, 11, 13]],

       [[10, 11, 11, 51],
        [10, 12, 12, 62],
        [11, 13, 13, 71],
        [11, 14, 14, 82],
        [12, 18, 12, 22],
        [12, 19, 21, 53]]])

In [12]:
np.vstack((dfn1, dfn2)).shape

(2, 6, 4)

In [13]:
os.sep

'/'

### Pandas to numpy npz

In [10]:
X, y = load_from_tsfile("/mnt/f/Downloads/FordA/FordA_TRAIN.ts")

X = convert_to(X, to_type="numpy3D")
X = np.swapaxes(X, 1, 2)

In [11]:
X.shape, y.shape

((3601, 500, 1), (3601,))

In [15]:
X_train, X_test, y_train, y_test = split_data(X, y)
X_train, X_val, y_train, y_val = split_data(X_train, y_train)

In [16]:
X_train.shape, X_val.shape, X_test.shape

((2304, 500, 1), (576, 500, 1), (721, 500, 1))

In [17]:
y_train.shape, y_val.shape, y_test.shape

((2304,), (576,), (721,))

In [21]:
%%time 

for i in range(X_train.shape[0]):
    np.savez_compressed(f'/mnt/f/Downloads/FordA/npz/train/{i}', x=X_train[i, :, :], y=y_train[i])
    
for i in range(X_test.shape[0]):
    np.savez_compressed(f'/mnt/f/Downloads/FordA/npz/test/{i}', x=X_test[i, :, :], y=y_test[i])

for i in range(X_val.shape[0]):
    np.savez_compressed(f'/mnt/f/Downloads/FordA/npz/val/{i}', x=X_val[i, :, :], y=y_val[i])

CPU times: user 11 s, sys: 10.7 s, total: 21.6 s
Wall time: 1min 17s


## Label Encoding

In [13]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, LabelEncoder

y_tmp = np.array([1, 0, 2, 0, 1, 0, 2])
y_tmp = np.array(["1", "0", "2", "0", "1", "0", "2"])
y_tmp = np.array(["B", "A", "C", "A", "B", "A", "C"])
y_tmp = np.array(["dog", "cat", "bird", "cat", "bird", "dog", "dog"])
y_tmp = np.array([1, 0, 0, 0, 1, 0, 1])

print("OneHotEncoder:\n", OneHotEncoder().fit_transform(y_tmp.reshape(-1, 1)).toarray())
print("LabelBinarizer:\n", LabelBinarizer().fit_transform(y_tmp.reshape(-1, 1)))
print("LabelEncoder:\n", LabelEncoder().fit_transform(y_tmp.reshape(-1, 1)))


OneHotEncoder:
 [[0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]]
LabelBinarizer:
 [[1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]]
LabelEncoder:
 [1 0 0 0 1 0 1]


  y = column_or_1d(y, warn=True)


In [20]:
ohe.inverse_transform(
    [[0., 1., 0.],
     [1., 0., 0.]]).reshape(1, -1)[0]

array(['cat', 'bird'], dtype='<U4')

In [None]:
2+2