In [1]:
import sys
assert sys.version_info >= (3, 5)

import numpy as np
np.set_printoptions(suppress=True) #prevent numpy exponential

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

#from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
# Require TensorFlow 2.x or newer. The numeric (major, minor) parts are
# compared because a plain string comparison is lexicographic, so a version
# such as "10.0" would wrongly be considered smaller than "2.0".
assert tuple(int(part) for part in tf.__version__.split(".")[:2]) >= (2, 0)

#from keras import optimizers, Sequential, metrics
from elasticsearch import Elasticsearch
from keras.models import Sequential, save_model, load_model
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Conv1D
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping, LearningRateScheduler
from ipynb.fs.full.rcids_functions import *

# Notebook-wide display settings.
# pandas: never truncate cell contents and never elide rows.
for _option, _value in (
    ('display.max_colwidth', None),
    ('display.max_rows', None),
):
    pd.set_option(_option, _value)
# numpy: print small values without exponent notation
np.set_printoptions(suppress=True)
# pandas: print floats with four fixed decimals instead of scientific notation
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [None]:
#from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())

### Reading from Elasticsearch

In [None]:
# Test connectivity with Elasticsearch.
# The host, user and password can be overridden through the ES_HOST, ES_USER
# and ES_PASSWORD environment variables, so the real secret does not have to
# be written into the notebook. The previous literals remain the fallbacks,
# which keeps the behaviour unchanged when the variables are not set.
es = Elasticsearch(
    host=os.environ.get("ES_HOST", "192.168.201.2"),
    http_auth=(
        os.environ.get("ES_USER", "elastic"),
        os.environ.get("ES_PASSWORD", "##redacted##"),
    ),
)
es.info(pretty=True)

In [None]:
# Name of the Elasticsearch index the benign records are read from
index = "proc-public-benign"

# Ask the cluster how many documents the index holds; the response is
# indexed with the 'count' key below to obtain the number.
n_docs = es.count(index=index)
print("Number of documents in the index", index, "-->", n_docs['count'])

In [None]:
# Load the contents of the index into the dataframe used for training.
# NOTE(review): read_from_elastic comes from rcids_functions (star import);
# it presumably returns a pandas DataFrame with a 'timestamp' column, since
# the following cells drop that column and read .shape — confirm.
df_benign_data = read_from_elastic(index, es)

In [None]:
# Excluding timestamp column.
# The frame is rebound (not modified in place) and a missing column is
# ignored, so the cell is idempotent: running it a second time no longer
# raises KeyError because 'timestamp' was already removed.
df_benign_data = df_benign_data.drop(columns=['timestamp'], errors='ignore')

### Parameters

In [3]:
# Run parameters shared by the file names and the model definition below
window_size = 6       # number of consecutive rows grouped into one window
norm_function = "mm"  # std (StandardScaler), norm (Normalizer), mm (MinMaxScaler)
n_features = len(df_benign_data.columns)  # one feature per dataframe column

### Benign HashDB

In [4]:
# Pickle file name; it encodes the window size, feature count and scaler tag
hashdb_name = f"df_proc_benign_hashdb_wz{window_size}_ft{n_features}_{norm_function}.pkl"

In [None]:
# Build (or reload from cache) the table of unique hashes of benign windows.
hashdb_path = "pkl/" + hashdb_name

if os.path.isfile(hashdb_path):
    # Loading existent df from disk
    df_benign_hashdb = pd.read_pickle(hashdb_path)
else:
    # Normalizing data with a min-max scaler fitted on the whole benign set
    mm = MinMaxScaler()
    mm_benign = mm.fit(df_benign_data)
    benign_data = mm_benign.transform(df_benign_data)

    # Grouping the scaled rows into windows. The result is reshaped below
    # using three dimensions: n_samples x window_size x n_features.
    benign_data_wz = pd.DataFrame(benign_data)
    benign_data_wz = sliding_window(benign_data_wz, window_size)

    # Flattening every window into a single row of a 2D pandas df
    benign_data_wz_2d = benign_data_wz.reshape(benign_data_wz.shape[0], benign_data_wz.shape[1] * benign_data_wz.shape[2])
    df_benign_data_wz_2d = pd.DataFrame(benign_data_wz_2d)
    # Calculating rows (windows) hash: the built-in hash of the row values,
    # written as a signed 8-byte big-endian integer in hexadecimal
    df_benign_data_hash = df_benign_data_wz_2d.apply(lambda x: hash(tuple(x)).to_bytes(8, "big", signed=True).hex(), axis=1)
    # Removing duplicates
    df_benign_hashdb = pd.DataFrame(df_benign_data_hash.unique())
    # Make sure the cache directory exists so the result of the computation
    # above is not lost to a missing-directory error when saving
    os.makedirs("pkl", exist_ok=True)
    # Saving df to disk
    df_benign_hashdb.to_pickle(hashdb_path)

print(hashdb_name + " size: ")
df_benign_hashdb.shape[0]

## Splitting Train / Test

In [None]:
# Hold out 20% of the benign rows for testing. shuffle=False keeps the
# original row order instead of sampling the rows at random.
df_train_data, df_test_data = train_test_split(df_benign_data, test_size=0.2, shuffle=False)

## Pre-processing the data

### Training data

In [None]:
# Scale the training rows with a min-max scaler fitted on those same rows.
# (A Normalizer-based variant was tried previously and is no longer used.)
mm = MinMaxScaler()
mm_train = mm.fit(df_train_data)  # fit() returns the fitted scaler itself
train_data = mm_train.transform(df_train_data)

print("Train data numpy.ndarray shape:", train_data.shape)

In [None]:
# Group the scaled training rows into windows.
# An LSTM autoencoder expects input shaped n_samples x window_size x n_features.
train_data_wz = sliding_window(pd.DataFrame(train_data), window_size)

## Creating Tensorflow datasets

### Training dataset

In [None]:
# Full training dataset: slicing along the first axis makes every window of
# train_data_wz one dataset element.
ds_train_full = tf.data.Dataset.from_tensor_slices(train_data_wz)

In [None]:
# Split the windows into a training part (first 95%) and a validation part
# (the remaining ~5%).
# - take() needs an integer element count, so the fraction is truncated.
# - The validation part is obtained with skip(); taking the first 5% again
#   would make the validation data a subset of the training data.
n_windows_total = int(ds_train_full.cardinality().numpy())
n_windows_train = int(0.95 * n_windows_total)
ds_train = ds_train_full.take(n_windows_train)
ds_validation = ds_train_full.skip(n_windows_train)

In [None]:
# An autoencoder reconstructs its input, so each window is its own target
ds_train = ds_train.map(lambda window: (window, window))
# Batches of 1024 windows, cached and prefetched
ds_train_batch = (
    ds_train
    .batch(1024)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Validation windows are likewise paired with themselves as the target
ds_validation = ds_validation.map(lambda window: (window, window))
# Batches of 1024 windows, cached and prefetched
ds_validation_batch = (
    ds_validation
    .batch(1024)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

## Defining and training the model

In [None]:
# Model identifier used in the log and checkpoint file names
model_name = f'tfds_lstm_160_64_24_conv1d_relu_5_bn_tahn_wz{window_size}_ft{n_features}_{norm_function}'
model_name

In [None]:
# Conv1D + stacked-LSTM autoencoder over windows of shape
# (window_size, n_features).
model = Sequential()

# Convolutional front-end; padding="same" and strides=1 are used so the
# sequence length is kept for the recurrent layers that follow.
model.add(keras.layers.Conv1D(
    filters=n_features,
    kernel_size=window_size,
    strides=1,
    padding="same",
    activation="relu",
    input_shape=(window_size, n_features),
))

# Encoder: 160 -> 64 -> 24 units, each followed by batch normalisation and
# tanh. Only the last stage drops the time axis (return_sequences=False).
for lstm_units, keep_sequence in ((160, True), (64, True), (24, False)):
    model.add(CuDNNLSTM(lstm_units, kernel_initializer='he_normal', return_sequences=keep_sequence))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation('tanh'))

# Repeat the encoded vector once per time step for the decoder
model.add(RepeatVector(window_size))

# Decoder: mirror of the encoder, 24 -> 64 -> 160 units, always sequences
for lstm_units in (24, 64, 160):
    model.add(CuDNNLSTM(lstm_units, kernel_initializer='he_normal', return_sequences=True))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation('tanh'))

# Project every time step back to n_features values
model.add(TimeDistributed(Dense(n_features)))

model.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])


In [None]:
# Training callbacks
# Per-epoch metrics are appended to a CSV log under models/
train_log = CSVLogger('models/log-' + str(model_name) + '.log', separator=',', append=True)
# Stop when val_loss has not improved by at least 0.001 for 10 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, mode='min', min_delta=0.001, verbose=1)
# Halve the learning rate after 5 epochs without improvement
learning_rate = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)
# Keep only the best model (lowest val_loss). The checkpoint is written under
# models/ so that it is the same file the load cell reads back
# ('models/model-<model_name>.h5'); it was previously written to the working
# directory, which left the load cell reading a missing or stale file.
mc = tf.keras.callbacks.ModelCheckpoint(filepath='models/model-' + str(model_name) + '.h5', monitor='val_loss', mode='min', save_best_only=True)

In [None]:
# Fit the autoencoder on the batched training windows, validating on the
# held-out windows after every epoch.
model.fit(
    ds_train_batch,
    validation_data=ds_validation_batch,
    epochs=100,
    shuffle=False,
    callbacks=[train_log, early_stopping, learning_rate, mc],
)

In [None]:
# Reload the trained model from disk (the manual save is kept disabled)
filepath = f'models/model-{model_name}.h5'
#save_model(model, filepath)
model = load_model(filepath, compile=True)

In [None]:
# Load the per-epoch training log written by the CSVLogger callback
df_history = pd.read_csv(
    f'models/log-{model_name}.log',
    sep=',',
    engine='python',
)

## Loss distribution for training data

In [None]:
# Kernel-density plot of the validation loss recorded for each epoch
plot = sns.displot(
    data=df_history['val_loss'],
    kind='kde',
    color='blue',
    height=5,
    aspect=2,
)
plot.set_axis_labels("Validation Loss", "Density")
plot.set(title='Training Validation Loss Distribution')

## Defining the Loss Threshold

### Test data

In [None]:
# Scale the test rows. The scaler must be fitted on the TRAINING data only;
# the test rows are merely transformed with it.
# (A Normalizer-based variant was tried previously and is no longer used.)
mm_test = mm = MinMaxScaler()
mm_test.fit(df_train_data)              # fit on training data
test_data = mm.transform(df_test_data)  # transform only, no re-fit

print("Test data numpy.ndarray shape:", test_data.shape)

In [None]:
# Group the scaled test rows into windows.
# An LSTM autoencoder expects input shaped n_samples x window_size x n_features.
test_data_wz = sliding_window(pd.DataFrame(test_data), window_size)

### Tensorflow test dataset

In [None]:
# Test dataset: slicing along the first axis makes every window of
# test_data_wz one dataset element.
ds_test = tf.data.Dataset.from_tensor_slices(test_data_wz)

In [None]:
# Pair each test window with itself, matching the (input, target) layout
# used for the training dataset
ds_test = ds_test.map(lambda window: (window, window))
# Batches of 1024 windows, cached and prefetched
ds_test_batch = (
    ds_test
    .batch(1024)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

### Predicting test data using the model

In [None]:
# Reconstruct the test windows with the trained model. The result is treated
# as a 3-D array by the reshape in the following cell.
pred = model.predict(ds_test_batch)

In [None]:
# Flatten each predicted window into one row:
# (n_windows, steps, features) -> (n_windows, steps * features)
n_pred_windows, n_pred_steps, n_pred_cols = pred.shape
pred = pred.reshape(n_pred_windows, n_pred_steps * n_pred_cols)
df_pred = pd.DataFrame(pred)

In [None]:
# Flatten each real test window the same way, so its rows line up with df_pred:
# (n_windows, steps, features) -> (n_windows, steps * features)
n_test_windows, n_test_steps, n_test_cols = test_data_wz.shape
test = test_data_wz.reshape(n_test_windows, n_test_steps * n_test_cols)
df_test = pd.DataFrame(test)

### Calculating the loss

In [None]:
# Calculating test loss with MAE (Mean Absolute Error) between the real
# (df_test) and reconstructed (df_pred) flattened windows. The result is
# stored in the 'Loss_mae' column of a frame that shares df_pred's index.
df_test_loss = pd.DataFrame(index=df_pred.index)
df_test_loss['Loss_mae'] = tf.metrics.MAE(df_test, df_pred)

### Defining the loss threshold

In [None]:
# Summary statistics of the test loss, as a reference for picking a threshold
df_test_loss['Loss_mae'].describe()

In [None]:
# Candidate thresholds derived from the loss measured on the benign test
# windows: mean + one standard deviation, the maximum, and the 99th percentile.
loss_values = df_test_loss['Loss_mae'].values

df_test_loss_mean = loss_values.mean()
df_test_loss_std = loss_values.std()

loss_threshold_mean_std = np.round(df_test_loss_mean + df_test_loss_std, 4)
loss_threshold_max = np.round(df_test_loss.values.max(), 4)
loss_threshold_percentile = np.round(np.percentile(loss_values, 99), 4)

print("Threshold based on the max loss during the tests --> ", loss_threshold_max)
print("Threshold calculated through the mean + std deviation --> ", loss_threshold_mean_std)
print("Threshold basead on the 99 percentil of loss during the tests --> ", loss_threshold_percentile)

thresholds = (loss_threshold_mean_std, loss_threshold_max, loss_threshold_percentile)

In [None]:
# Confidence levels for which a threshold is computed
confidence_levels =  [1, 0.995, 0.99, 0.98, 0.97, 0.96, 0.95]

# Dataframe of thresholds; it is read below through its first row, using the
# confidence levels as column labels
thresholds = tunable_threshold(df_test_loss, confidence_levels)

# Report the threshold obtained for every confidence level
print("--- Threshold for each confidence interval ---")
first_row = thresholds.iloc[0]
for level in confidence_levels:
    print("Confidence Interval [", level, "] --> ", first_row[level])

In [None]:
# Append this run's thresholds to the history of thresholds kept on disk.
# Value stored in the "Model" column to identify the model of this run
filepath = 'model-' + str(model_name) + '.h5'

df_name = 'df_proc_thresholds.pkl'
thresholds_path = 'pkl/' + df_name

if os.path.isfile(thresholds_path):
    # Loading existent df from disk
    df_thresholds = pd.read_pickle(thresholds_path)
else:
    # First run: start from an empty dataframe with the expected columns
    df_thresholds = pd.DataFrame(columns=["Model", "Confidence_Level", "Threshold"])

# Adding last execution results to the dataframe, one row per confidence
# level. The append and save steps are the same whether or not the file
# already existed, so they are written once instead of in both branches.
for i in confidence_levels:
    df_thresholds.loc[df_thresholds.shape[0]] = [filepath, i, thresholds.iloc[0][i]]

# Make sure the target directory exists, then save df to disk
os.makedirs('pkl', exist_ok=True)
df_thresholds.to_pickle(thresholds_path)

In [None]:
# Count the stored threshold rows, grouped by model
(
    df_thresholds
    .sort_values(['Model'])
    .groupby(['Model'])
    .value_counts()
)
#df_thresholds.sort_values(['Confidence_Level'], ascending=(False)).groupby(['Model']).head(7)

In [None]:
# Show the computed thresholds and let the analyst type the one to use
threshold_menu = thresholds.to_string(header=None, index=False)
loss_threshold = float(input("Choose one of the thresholds: \n{}".format(threshold_menu)))
print("Chosen threshold --> ", loss_threshold)