## libs

In [1]:
# python utils
import itertools
import inspect

# pre-processing and exploring data
import numpy as np
import pandas as pd
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import StratifiedKFold
import statistics

# viz
import matplotlib.pyplot as plt

# model building
from pyod.models.auto_encoder import AutoEncoder
from pyod.utils.data import generate_data
from keras.losses import mean_squared_error

Using TensorFlow backend.


## Import Data

In [2]:
# synthetic dataset with injected outliers, produced by pyod's generate_data

contamination = 0.1  # fraction of samples generated as outliers
n_train = 700  # number of training points
n_test = 700  # number of testing points (presumably ignored while train_only=True; confirm)
n_features = 18  # number of features

X_data, y_data = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=n_features,
    train_only=True,
    contamination=contamination,
    random_state=1234,
)

## Preprocessing Data

In [3]:
# wrap the generated samples in a pandas DataFrame (later cells index it with .iloc)
X_data = pd.DataFrame(data=X_data)
X_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,3.892732,2.805184,2.550995,3.552799,3.535617,2.603377,3.009781,1.602563,3.716597,3.618090,3.594025,1.740538,2.791833,3.001320,3.252642,3.180136,3.823225,2.036110
1,2.873729,2.591260,3.120523,3.344853,3.821352,2.707572,3.420944,1.867794,2.885903,3.659853,2.752102,3.210260,3.652755,3.651733,3.538190,2.923924,3.077710,2.798864
2,3.524455,4.489828,3.047481,2.647043,3.022520,1.707063,3.154401,2.440974,2.914762,3.011396,3.470705,3.134136,3.524040,2.099103,2.126418,2.937117,2.658385,2.909886
3,3.220593,2.977872,3.352517,3.963114,2.392945,2.956167,3.191898,2.870083,3.644170,1.504256,4.265287,2.288016,3.132026,3.439118,2.510588,3.287913,3.438811,3.326202
4,2.422843,4.251105,3.141422,2.281768,3.393792,3.024621,3.289367,0.779542,3.823192,3.095105,3.102520,2.732004,3.478154,3.613712,3.168760,3.867359,3.049750,2.750779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1.281831,1.062482,-0.439999,0.445105,1.399909,-0.780047,0.327582,-1.303468,2.596278,-1.252098,-2.640238,-2.654077,2.520779,-1.295039,-2.593610,2.974788,-0.992141,-0.669199
696,1.108874,0.942599,-1.632943,-2.813374,-1.537338,-1.230528,2.424851,2.072350,2.839880,-0.686806,-0.234186,-0.709740,-0.640154,1.011155,-1.067273,1.911158,2.666463,0.780191
697,0.667630,-0.741268,0.492865,-1.617740,2.404900,-2.150404,-1.008687,-1.290717,-1.428679,-2.289012,-1.264462,-2.905294,1.925509,-2.995778,0.159113,2.839631,0.013598,0.681762
698,-2.522522,-0.878053,-1.548773,2.826363,0.922174,2.248154,-1.393076,2.926527,0.494016,2.740795,-0.361400,1.060685,1.927476,2.978506,-0.538000,1.899725,-0.097139,-1.321670


In [73]:
# labels returned by generate_data alongside X_data
# presumably 0 = inlier, 1 = outlier — confirm against the pyod docs
y_data

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [4]:
# # normalizing values of input df between -1 and 1
# norm_transformer = MaxAbsScaler().fit(X_train.append(X_test, ignore_index=True))

# X_train_norm_values = norm_transformer.transform(X_train)
# X_train = pd.DataFrame(X_train_norm_values, index=X_train.index, columns=X_train.columns)

# X_test_norm_values = norm_transformer.transform(X_test)
# X_test = pd.DataFrame(X_test_norm_values, index=X_test.index, columns=X_test.columns)

# X_train

In [74]:
# sanity check on the dataset dimensions (rows, columns)
print("X_data shape = {}".format(X_data.shape))

X_data shape = (700, 18)


## Model Builder

In [6]:
def autoencoder_builder(
    hidden_layers, dropout_rate, l2_regularizer, epochs, batch_size,
):
    """
    Create an unfitted pyod AutoEncoder for one hyper-parameter combination.

    hidden_layers: neurons per layer, forwarded as `hidden_neurons`
    dropout_rate: forwarded unchanged as `dropout_rate`
    l2_regularizer: forwarded unchanged as `l2_regularizer`
    epochs, batch_size: forwarded unchanged as training settings

    Activations, optimizer, loss, validation_size=0 and preprocessing=True
    are the same for every model built here.
    """
    # settings explored by the grid search
    tuned_settings = dict(
        hidden_neurons=hidden_layers,
        dropout_rate=dropout_rate,
        l2_regularizer=l2_regularizer,
        epochs=epochs,
        batch_size=batch_size,
    )

    # settings shared by every candidate
    # NOTE(review): with preprocessing=True the inputs go through the model's
    # scaler and can be negative, which a sigmoid output cannot reproduce —
    # confirm that 'sigmoid' is really the intended output activation.
    fixed_settings = dict(
        hidden_activation='relu',
        output_activation='sigmoid',
        optimizer='adam',
        loss=mean_squared_error,
        validation_size=0,
        preprocessing=True,
    )

    return AutoEncoder(**tuned_settings, **fixed_settings)


def build_models(builder_method, *args_tests):
    """
    Instantiate one model for every combination of the supplied candidate values.

    builder_method: callable invoked positionally with one value taken from
        each iterable in args_tests, in the order given.
    *args_tests: one iterable of candidate values per builder parameter.

    Returns a list of dicts, one per combination (cartesian-product order),
    holding the built object under "model" followed by each value keyed by
    the matching parameter name of builder_method.
    """
    param_names = list(inspect.signature(builder_method).parameters)
    built = []
    for combo in itertools.product(*args_tests):
        entry = {"model": builder_method(*combo)}
        # pair each positional value with the builder's parameter name
        entry.update(zip(param_names, combo))
        built.append(entry)
    return built


## Models Definition

In [7]:
# Parameters definition
input_dim = 18

# Encoder halves (widest layer first, bottleneck last); each one is mirrored
# below and wrapped with input_dim on both ends to form a symmetric stack.
_encoder_halves = [
    (8, 4),
    (12, 8, 4),
    (10, 6),
    (14, 10, 6),
    (16, 8),
    (16, 12, 8),
]
hidden_layers_tests = [
    [input_dim, *half, *reversed(half[:-1]), input_dim]
    for half in _encoder_halves
]
dropout_rate_tests = [0.2, 0.1, 0.05]
l2_regularizer_tests = [0.1, 0.05, 0.01]

# fixed (single-element lists so they can take part in the cartesian product)
epochs = [50]
batch_size = [25]

In [8]:
# one model per point of the hyper-parameter grid (cartesian product);
# the order must match autoencoder_builder's positional parameters
search_space = (
    hidden_layers_tests,
    dropout_rate_tests,
    l2_regularizer_tests,
    epochs,
    batch_size,
)
models = build_models(autoencoder_builder, *search_space)
models

[{'model': AutoEncoder(batch_size=25, contamination=0.1, dropout_rate=0.2, epochs=50,
        hidden_activation='relu', hidden_neurons=[18, 8, 4, 8, 18],
        l2_regularizer=0.1,
        loss=<function mean_squared_error at 0x7ff32a0c2830>,
        optimizer='adam', output_activation='sigmoid', preprocessing=True,
        random_state=None, validation_size=0, verbose=1),
  'hidden_layers': [18, 8, 4, 8, 18],
  'dropout_rate': 0.2,
  'l2_regularizer': 0.1,
  'epochs': 50,
  'batch_size': 25},
 {'model': AutoEncoder(batch_size=25, contamination=0.1, dropout_rate=0.2, epochs=50,
        hidden_activation='relu', hidden_neurons=[18, 8, 4, 8, 18],
        l2_regularizer=0.05,
        loss=<function mean_squared_error at 0x7ff32a0c2830>,
        optimizer='adam', output_activation='sigmoid', preprocessing=True,
        random_state=None, validation_size=0, verbose=1),
  'hidden_layers': [18, 8, 4, 8, 18],
  'dropout_rate': 0.2,
  'l2_regularizer': 0.05,
  'epochs': 50,
  'batch_size': 25}

## Models K-fold evaluation

In [9]:
%%time
kfold = StratifiedKFold(n_splits=5, shuffle=True)

for model in models[]:
    k_fold_val_losses = []
    temp_model = autoencoder_builder(
        model["hidden_layers"],
        model["dropout_rate"],
        model["l2_regularizer"],
        model["epochs"],
        model["batch_size"],
    )
    
    for i, indexes in enumerate(kfold.split(X_data, np.zeros(np.shape(X_data)[0]))):
        train_index, test_index = indexes
        print("Fold:", i)

        temp_model.fit(X_data.iloc[train_index])

        val_loss = temp_model.model_.evaluate(
            temp_model.scaler_.transform(X_data.iloc[test_index]),
            temp_model.scaler_.transform(X_data.iloc[test_index]),
            verbose=False,
        )

        print("val_loss = ", val_loss)
        k_fold_val_losses.append(val_loss)
    
    model["k_fold_evaluation_loss"] = statistics.mean(k_fold_val_losses)

Fold: 0
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 18)                342       
_________________________________________________________________
dropout_1 (Dropout)          (None, 18)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 18)                342       
_________________________________________________________________
dropout_2 (Dropout)          (None, 18)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 18)                342       
_________________________________________________________________
dropout_3 (Dropout)          (None, 18)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 8)        

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
val_loss =  2.6594114303588867
Fold: 2
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 18)                342       
_________________________________________________________________
dropout_15 (Dropout)         (None, 18)                0         
__________________

Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
val_loss =  2.934698976789202
Fold: 3
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_25 (Dense)             (None, 18)                342       
_________________________________________________________________
dropout_22 (Dropout)         (None, 18)                0         
_________________________________________________________________
dense_26 (Dense)             (None, 18)                342       
_________________________________________________________________
dropout_23 (Dropout)         (None, 18)                0        

Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
val_loss =  3.5224688393729076
Fold: 4
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 18)                342       
_________________________________________________________________
dropout_29 (Dropout)         (None, 18)                0         
_________________________________________________________________
dense_34 (Dense)             (None, 18)                342       
_________________________________________________________________
dropout_30 (Dropout)         (None, 18)                0         
_________________________________________________________________
dense_35 (Dense)             (None, 18)                342       
_________________________________________________________________
dropout_31 (Dropout)         (None, 18)

In [75]:
# b = X_data.iloc[[10]]
# a = pd.DataFrame(temp_model.scaler_.transform(b))
# print(b.values)
# print()
# print(a.values)
# print()
# print(temp_model.model_.predict(a))
# print()
# print(temp_model.model_.evaluate(a, a, verbose=False))
# print()
# print(temp_model.decision_function(b))

[[3.22654026 2.99080783 3.79284053 2.09676233 2.25505867 2.63120499
  2.7417184  2.1115748  3.13047573 2.63056753 2.08208824 2.44133275
  3.68813115 2.73109771 2.89959418 3.55404183 3.17969014 2.34477686]]

[[ 0.40482614  0.21759861  0.89099185 -0.5057645  -0.42358449 -0.07648761
   0.01039764 -0.50844703  0.33959281 -0.02704015 -0.63818407 -0.21748741
   0.86589665 -0.04365958  0.16333738  0.71720844  0.38898887 -0.31806403]]

[[0.12418073 0.12204647 0.11525089 0.13150167 0.11556122 0.12325948
  0.10748079 0.11822954 0.11672813 0.12141585 0.10796648 0.13041407
  0.11734381 0.12280864 0.12171713 0.12984389 0.12582947 0.12142564]]

0.27736037969589233

[1.94297717]


### Top Models in K-fold evaluation

In [None]:
# Ranking by mean k-fold validation loss, lowest (best) first
sorted_models = sorted(models, key=lambda d: d["k_fold_evaluation_loss"])
for rank, model in enumerate(sorted_models, start=1):
    print("TOP {}:".format(rank))
    # keys below are the autoencoder_builder parameter names stored by build_models
    print("    hidden_layers: {}".format(model["hidden_layers"]))
    print("    dropout_rate: {}".format(model["dropout_rate"]))
    print("    l2_regularizer: {}".format(model["l2_regularizer"]))
    print("    k_fold_evaluation_loss: {}".format(model["k_fold_evaluation_loss"]))
    print()

## Retraining TOP 5 models - Building Ensemble

In [None]:
# Top 1 history


In [None]:
# Top 2 history


In [None]:
# Top 3 history


In [None]:
# Top 4 history


In [None]:
# Top 5 history
