In [1]:
import sys
assert sys.version_info >= (3, 5)

import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

from pathlib import Path
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from mpl_toolkits import mplot3d

from keras import optimizers, Sequential
from keras.models import Model, Sequential, save_model, load_model
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Conv1D
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from keras.callbacks import CSVLogger, TensorBoard, ModelCheckpoint, EarlyStopping, LearningRateScheduler

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
# Compare the version numerically: a plain string comparison is lexicographic
# and would, for example, order "10.0" before "2.0".
assert tuple(int(part) for part in tf.__version__.split(".")[:2]) >= (2, 0)

from ipynb.fs.full.rcids_functions import *

# Global display options used for every output in this notebook
np.set_printoptions(suppress=True) # print NumPy floats in fixed-point instead of exponential notation
pd.set_option('display.max_colwidth', None) # do not truncate long cell contents
pd.set_option('display.max_rows', None) # do not truncate the number of displayed rows
pd.set_option('display.float_format', lambda x: '%.4f' % x) # show pandas floats with 4 decimals (no scientific notation)

## Reading from disk

In [None]:
# Load the previously saved DataFrame from disk.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
benign_pickle_path = "pkl/df_proc_benign_data.pkl"
df_benign_data = pd.read_pickle(benign_pickle_path)

## Pre-processing the data

In [None]:
# Settings shared by the pre-processing and model cells
norm_function = "mm"  # scaler selector: std (StandardScaler), norm (Normalizer), mm (MinMaxScaler)
window_size = 6  # window length passed to sliding_window and used as the model's time dimension
n_features = df_benign_data.shape[1]  # number of columns in the loaded data

### Splitting Train / Test

In [None]:
# Ordered 80/20 split: shuffle=False keeps the row order, so the test set is the last 20% of rows
df_train_data, df_test_data = train_test_split(
    df_benign_data,
    test_size=0.2,
    shuffle=False,
)

### Training data

In [None]:
# Fit a MinMaxScaler on the training rows and scale them with it
mm = MinMaxScaler()
mm_train = mm.fit(df_train_data)  # fit() returns the scaler itself
train_data = mm_train.transform(df_train_data)

print("Train data numpy.ndarray shape:", train_data.shape)

In [None]:
# Build the windowed (3D) training array.
# An LSTM autoencoder expects input shaped n_samples x window_size x n_features.
train_data_wz = sliding_window(pd.DataFrame(train_data), window_size)

### Test data

In [None]:
# Scale the test rows with a scaler fitted on the training data
mm = MinMaxScaler().fit(df_train_data)  # the fit must use training data only
mm_test = mm
test_data = mm_test.transform(df_test_data)  # test data is only transformed, never fitted

print("Test data numpy.ndarray shape:", test_data.shape)

In [None]:
# Build the windowed (3D) test array.
# An LSTM autoencoder expects input shaped n_samples x window_size x n_features.
test_data_wz = sliding_window(pd.DataFrame(test_data), window_size)

## Creating Tensorflow datasets

### Training dataset

In [None]:
# Training dataset: every window is both the model input and its reconstruction target
ds_train = (
    tf.data.Dataset
    .from_tensor_slices(train_data_wz)
    .map(lambda window: (window, window))
)
# Batch, then cache the batched elements and prefetch them
ds_train_batch = ds_train.batch(1024).cache().prefetch(tf.data.AUTOTUNE)

### Test dataset

In [None]:
# Test dataset: every window is both the model input and its reconstruction target
ds_test = (
    tf.data.Dataset
    .from_tensor_slices(test_data_wz)
    .map(lambda window: (window, window))
)
# Batch, then cache the batched elements and prefetch them
ds_test_batch = ds_test.batch(1024).cache().prefetch(tf.data.AUTOTUNE)

## Hyperparameter search space

In [None]:
# Results table filled by objective(): one row per evaluated configuration
df_hyperopt = pd.DataFrame(
    columns=["Model", "Params", "Max_Loss", "Loss_99"],
)
index = 0  # row counter; objective() increments it after every trial

In [None]:
def _add_lstm_block(model, units, space, return_sequences):
    """Append one CuDNNLSTM -> BatchNormalization -> Activation group to ``model``."""
    model.add(CuDNNLSTM(units, kernel_initializer=space['kernel_init'], return_sequences=return_sequences))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation(space['activ']))


def objective(space):
    """Train and score one autoencoder configuration for hyperopt.

    Builds a Conv1D front-end followed by a three-layer CuDNNLSTM encoder, a
    RepeatVector bridge and a mirrored three-layer decoder, trains it on
    ``ds_train_batch`` and measures the per-window reconstruction MAE on
    ``ds_test_batch``.

    Args:
        space: dict of sampled hyperparameters. Keys read here: 'filters',
            'kernel_size', 'first', 'second', 'third', 'kernel_init', 'activ',
            'loss_ob', 'optimizer', 'metrics', 'epochs'.

    Returns:
        dict with 'status' (STATUS_OK), 'loss' (99th percentile of the
        per-window test MAE, the value hyperopt minimizes) and 'Params'
        (the ``space`` dict that was received).

    Side effects:
        Appends a row to the global ``df_hyperopt``, rewrites
        'pkl/df_hyperopt.pkl', increments the global ``index`` and prints the
        parameters together with the maximum test loss.
    """
    global index

    # Every trial builds a new model. Without this call Keras keeps the global
    # state of all previous models, so memory grows steadily over a long search.
    keras.backend.clear_session()

    model = Sequential()

    # Conv1D front-end; "same" padding keeps the window length unchanged
    model.add(keras.layers.Conv1D(filters=space['filters'], kernel_size=space['kernel_size'], strides=1, padding="same", activation="relu", input_shape=(window_size, n_features)))

    # Encoder: first -> second -> third; only the last layer collapses the sequence
    for units in (space['first'], space['second']):
        _add_lstm_block(model, units, space, return_sequences=True)
    _add_lstm_block(model, space['third'], space, return_sequences=False)

    # Repeat the encoded vector once per time step so the decoder receives a sequence again
    model.add(RepeatVector(window_size))

    # Decoder: mirrored widths, third -> second -> first
    for units in (space['third'], space['second'], space['first']):
        _add_lstm_block(model, units, space, return_sequences=True)

    # Project every time step back to n_features values
    model.add(TimeDistributed(Dense(n_features)))

    # NOTE(review): the metric passed here is not used for the trial score below.
    model.compile(loss=space['loss_ob'], optimizer=space['optimizer'], metrics=space['metrics'])

    model.fit(ds_train_batch, epochs=space['epochs'], shuffle=False, verbose=0)

    # Reconstruct the test windows with the trained model
    pred = model.predict(ds_test_batch)

    # Flatten each window to a single row (window_size * n_features values)
    # so that the MAE is computed once per window.
    pred_2d = pred.reshape(pred.shape[0], -1)
    test_2d = test_data_wz.reshape(test_data_wz.shape[0], -1)

    # Per-window test loss with MAE (Mean Absolute Error)
    loss_mae = np.asarray(tf.metrics.MAE(test_2d, pred_2d))
    loss_threshold_max = np.round(loss_mae.max(), 4)

    # Loss threshold = 99th percentile of the loss on the test data
    loss_threshold_99 = np.percentile(loss_mae, 99)

    # Record this trial and persist the table after every evaluation,
    # so partial results survive an interrupted search.
    df_hyperopt.loc[index, :] = [index, space, loss_threshold_max, loss_threshold_99]
    df_hyperopt.to_pickle('pkl/df_hyperopt.pkl')
    index = index + 1

    print(space, loss_threshold_max)

    return {'status': STATUS_OK, 'loss': loss_threshold_99, 'Params': space}


# Search space sampled by hyperopt. Entries with a single option are effectively fixed.
space = {
    # Conv1D front-end
    'filters': hp.choice('filters', np.arange(1, 10, 1)),
    'kernel_size': hp.choice('kernel_size', list(range(1, 17))),
    # LSTM layer widths, read by objective() for the encoder and the decoder
    'first': hp.choice('first', np.arange(96, 192, 8)),
    'second': hp.choice('second', np.arange(64, 96, 4)),
    'third': hp.choice('third', np.arange(16, 32, 2)),
    # Layer initialisation and activation
    'kernel_init': hp.choice('kernel_init', ["he_normal"]),
    'activ': hp.choice('activ', ["tanh"]),
    # 'dropout': hp.choice('dropout', [0.0, 0.05, 0.1, 0.15, 0.2]),  # currently disabled
    # Training setup
    'loss_ob': hp.choice('loss_ob', ["mae"]),
    'optimizer': hp.choice('optimizer', ["nadam"]),
    'metrics': hp.choice('metrics', ["accuracy"]),
    'epochs': hp.choice('epochs', [10]),
}

In [None]:
# Reset Keras global state, then seed TensorFlow and NumPy before starting the search
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

# Run a TPE search: objective() is evaluated max_evals times, every result is
# recorded in `trials`, and `best` holds the value returned by fmin.
# NOTE(review): fmin is called without `rstate`, so hyperopt's own sampling may not be
# governed by the seeds above — confirm against the installed hyperopt version.
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=300,
            trials=trials, 
            verbose=0)

In [None]:
# Quick sanity check of the results table: (rows, columns) and the column labels
(df_hyperopt.shape, df_hyperopt.columns)

In [None]:
# Collect the per-trial result records stored by hyperopt into a DataFrame
# (presumably the dicts returned by objective(): status, loss, Params — verify)
df_trials = pd.DataFrame(trials.results)

In [None]:
# Persist the trial results so they can be reloaded without re-running the search
trials_pickle_path = 'pkl/df_trials_results.pkl'
df_trials.to_pickle(trials_pickle_path)

In [None]:
# Display the trial results (all rows are shown because display.max_rows is set to None)
df_trials

In [None]:
# Rank the evaluated configurations: lowest 99th-percentile test loss first
df_hyperopt.sort_values(by="Loss_99", ascending=True)