In [None]:
import sys
assert sys.version_info >= (3, 5)

import numpy as np
np.set_printoptions(suppress=True) #prevent numpy exponential

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

#from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
# Compare the numeric (major, minor) components of the version. A plain string comparison is
# lexicographic and would wrongly reject a version such as "10.0" (because "1" < "2").
assert tuple(int(part) for part in tf.__version__.split(".")[:2]) >= (2, 0)

#from keras import optimizers, Sequential, metrics
from elasticsearch import Elasticsearch
from espandas import Espandas
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Conv1D
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from keras.callbacks import CSVLogger, ModelCheckpoint, EarlyStopping
from ipynb.fs.full.rcids_functions import *

# Display configuration used by every cell below
np.set_printoptions(suppress=True)  # numpy: fixed-point output instead of exponent notation
for option_name in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(option_name, None)  # None removes the limit
pd.set_option('display.float_format', '{:.4f}'.format)  # pandas: 4 decimals, no scientific notation

In [None]:
#from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())

### Reading from Elasticsearch

In [None]:
# Connecting to Elasticsearch (used below to count and read the attack documents).
# Credentials are read from the environment so that no secret is stored in the notebook:
#   ES_PASSWORD is required; ES_HOST and ES_USER fall back to the previous values.
es = Elasticsearch(
    host=os.environ.get("ES_HOST", "192.168.201.2"),
    http_auth=(os.environ.get("ES_USER", "elastic"), os.environ["ES_PASSWORD"]),
)
#es.info(pretty=True)

In [None]:
# Defining the Elasticsearch index to read from.
# Attack scenarios evaluated with this notebook; each one has its own "proc-public-<name>" index.
ATTACK_NAMES = (
    "bruteforcelogin",
    "dockerescape",
    "maliciousscript",
    "meterpreter",
    "remoteshell",
    "sqlinjection",
    "sqlmisbehavior",
)

# Exactly one scenario must be active, otherwise every cell below fails with a NameError on a
# fresh kernel. Change this line to evaluate another scenario.
attack_name = ATTACK_NAMES[0]

index = "proc-public-" + str(attack_name)

# Counting number of documents in index
n_docs = es.count(index=index)
print("Number of documents in the index ", index, "-->", n_docs['count'])

In [None]:
# Reading the documents of the selected attack index into a dataframe.
# This data is only evaluated here: the model itself is loaded from disk further down.
# read_from_elastic is not defined in this notebook — presumably it comes from the
# rcids_functions star import.
df_attack = read_from_elastic(index, es)
df_attack.shape

In [None]:
# Preserving the timestamp column in its own dataframe (same index as df_attack) and removing
# it from the feature dataframe.
# The guard keeps this cell re-runnable: after a first run 'timestamp' is already gone from
# df_attack, and an unconditional drop would raise a KeyError.
if 'timestamp' in df_attack.columns:
    df_attack_timestamp = df_attack[['timestamp']].copy()
    # Excluding timestamp column (non in-place, the dataframe read from Elasticsearch is not mutated)
    df_attack = df_attack.drop(columns=['timestamp'])
elif 'df_attack_timestamp' not in globals():
    # Neither the column nor a previously extracted copy exists: fail loudly as before
    raise KeyError("'timestamp' column not found in df_attack")

In [None]:
# Sliding-window length and number of feature columns left in df_attack
window_size = 6
n_features = len(df_attack.columns)

## Pre-processing the data

In [None]:
# Scaling the attack data with a scaler fitted on the benign data

# Loading the existing benign dataframe from disk
# (pd.read_pickle can execute arbitrary code: only load pickles from a trusted source)
df_benign_data = pd.read_pickle("pkl/df_proc_benign_data.pkl")

# Alternative scaler, currently disabled:
#norm = Normalizer()
#norm_attack = norm.fit(df_benign_data)
#attack = norm_attack.transform(df_attack)

# fit() is called on the benign data only; the attack data is then transformed with that scaler
mm = MinMaxScaler()
mm_attack = mm.fit(df_benign_data)
attack = mm_attack.transform(df_attack)

print(attack_name, "numpy.ndarray shape:", attack.shape)

In [None]:
# Building the 3D input for the attack data.
# An LSTM autoencoder expects input shaped as: n_samples x window_size x n_features
attack_wz = sliding_window(pd.DataFrame(attack), window_size)

## Creating Tensorflow datasets

### Attack dataset

In [None]:
# Attack dataset: every element is an (input, target) pair in which the target is the input itself
ds_attack = tf.data.Dataset.from_tensor_slices(attack_wz).map(lambda window: (window, window))

# Batched, cached and prefetched view used for prediction
ds_attack_batch = (
    ds_attack
    .batch(1024)
    .cache()
    .prefetch(tf.data.AUTOTUNE)
)

## Loading the trained model

In [None]:
# Name of the trained model to evaluate; it is used below to build the path of the .h5 file
model_name = 'tfds_lstm_160_64_24_conv1d_relu_5_bn_tahn_wz6_ft5_mm'

In [None]:
# Loading the trained model from disk
filepath = f'models/model-{model_name}.h5'
model = load_model(filepath, compile=True)

## Predicting test data using the model

In [None]:
# Predicting values using the trained model
# ds_attack_batch yields (input, input) pairs in batches of 1024 (see the dataset cell above).
# The next cell reshapes the result as a 3D array — presumably one reconstructed window per
# input window.
pred = model.predict(ds_attack_batch)

In [None]:
# Flattening the predictions to a 2D dataframe: one row per window, with
# window_size * n_features columns (axis 1 x axis 2 of the prediction array)
#X_pred.shape #--> (samples - window_size, window_size, n_features)
# The result gets its own name instead of overwriting `pred`, so re-running this cell no longer
# fails with an IndexError on an already flattened array.
pred_2d = pred.reshape(pred.shape[0], int(np.prod(pred.shape[1:])))
df_pred = pd.DataFrame(pred_2d)

In [None]:
# Flattening the real windows to a 2D dataframe, one row per window
#X_test.shape # --> (samples - window_size, window_size, n_features)
n_windows, steps_per_window, features_per_step = attack_wz.shape
attack_2d = attack_wz.reshape(n_windows, steps_per_window * features_per_step)
df_attack_2d = pd.DataFrame(attack_2d)

### Calculating the loss

In [None]:
# Calculating test loss with MAE (Mean Absolute Error)
# Each row of df_attack_2d / df_pred is one flattened window, so the goal is one loss per window.
# NOTE(review): tf.metrics.MAE is expected to average over the last axis (i.e. per row) —
# confirm against the TensorFlow documentation.
df_test_loss = pd.DataFrame(index=df_pred.index)
df_test_loss['Loss_mae'] = tf.metrics.MAE(df_attack_2d, df_pred)

In [None]:
# Kernel density estimate of the per-window loss
plot = sns.displot(
    data=df_test_loss['Loss_mae'],
    kind='kde',
    color='blue',
    height=5,
    aspect=2,
)
plot.set_axis_labels("Loss", "Density").set(title='Loss Distribution')

## Malicious data x Loss Threshold

In [None]:
# Loading thresholds dataframe
# (pd.read_pickle can execute arbitrary code: only load pickles from a trusted source)
df_thresholds = pd.read_pickle('pkl/df_thresholds.pkl')
# Number of occurrences of each distinct row, grouped by the 'Model' column
df_thresholds.groupby(['Model']).value_counts()

In [None]:
# Confidence level and loss threshold, chosen from the values obtained with the test data
# NOTE(review): presumably picked by hand from df_thresholds above — confirm
confidence_level = 0.99
loss_threshold = 0.0164

### Verifying test data loss against defined threshold

#### Labeling Malicious Dataframe

In [None]:
def _window_hash(row):
    """Return the built-in hash of a row's values as a hex string (8 bytes, big-endian, signed)."""
    return hash(tuple(row)).to_bytes(8, "big", signed=True).hex()


# Calculating one hash per row (window)
# NOTE(review): the hashes are compared with a database stored on disk, so both sides presumably
# have to be computed by the same Python version — confirm
attack_hash = df_attack_2d.apply(_window_hash, axis=1)
df_attack_hash = pd.DataFrame(attack_hash)

In [None]:
# Loading benign_hashdb
# NOTE(review): presumably holds the hashes of the benign windows, computed the same way as
# attack_hash above — confirm.
# pd.read_pickle can execute arbitrary code: only load pickles from a trusted source.
df_bening_hashdb = pd.read_pickle('pkl/df_proc_benign_hashdb_wz6_ft5_mm.pkl')

In [None]:
# Checking which attack-window hashes also exist in benign_hashdb:
# shape of the hash dataframe, absolute counts and percentages
found_in_benign_db = df_attack_hash[0].isin(df_bening_hashdb[0])
df_attack_hash.shape, found_in_benign_db.value_counts(), found_in_benign_db.value_counts(normalize=True).mul(100).round(3).astype(str) + '%'

In [None]:
# Adding real label to df_test_loss
# real == True means the window's hash was NOT found in the benign hash database
df_test_loss['real'] = ~df_attack_hash[0].isin(df_bening_hashdb[0])

In [None]:
# Creating dataframe with test data results
# window_loss is not defined in this notebook (presumably it comes from rcids_functions).
# The cells below read the columns 'window_number', 'loss', 'anomaly' and 'real' from its result.
df_test_results = window_loss(df_test_loss, loss_threshold)

### Plotting the loss of the test data predictions

In [None]:
# Per-window loss as a line chart, with the chosen threshold drawn as a red horizontal line
df_test_results_plot = df_test_results[['window_number', 'loss']]
loss_ax = df_test_results_plot.plot(
    kind='line',
    marker='H',
    x='window_number',
    y='loss',
    ylabel='Loss',
    xlabel='Window number',
    figsize=(20, 7),
)
loss_ax.axhline(y=loss_threshold, linewidth=1, color='r')

In [None]:
# Absolute and relative (percentage) counts of the values in the 'anomaly' column
anomaly_counts = df_test_results['anomaly'].value_counts()
anomaly_share = df_test_results['anomaly'].value_counts(normalize=True).mul(100).round(2).astype(str) + '%'
anomaly_counts, anomaly_share

## Metrics

In [None]:
# Storing metrics results
# metrics is not defined in this notebook (presumably it comes from rcids_functions); its nine
# return values are unpacked here and then printed, plotted and persisted by the cells below.
cm, accuracy, precision, tpr, npv, tnr, fpr, f1, roc_auc = metrics(df_test_results)

In [None]:
# Printing every metric as a percentage rounded to two decimals
print("--- MÉTRICAS ---")
metric_lines = (
    ("Acurácia:", accuracy),
    ("Precisão:", precision),
    ("TPR ou Recall:", tpr),
    ("NPV:", npv),
    ("TNR ou Especificidade:", tnr),
    ("FPR ou FAR:", fpr),
    ("F1 Score:", f1),
    ("ROC AUC:", roc_auc),
)
for metric_label, metric_value in metric_lines:
    print(metric_label, np.round(metric_value * 100, 2), "%")

In [None]:
# Confusion matrix rendered with a blue colour map and the default number format
class_labels = ['Normal', 'Anomaly']
cmd = ConfusionMatrixDisplay(cm, display_labels=class_labels)
cmd.plot(cmap="Blues", values_format='')

In [None]:
# Points of the ROC curve; the thresholds returned by roc_curve are discarded
# NOTE(review): the second argument is the 'anomaly' decision column rather than the
# continuous 'loss' score. If 'anomaly' is boolean the curve has a single operating point —
# confirm this is intended.
fpr_roc, tpr_roc, _ = roc_curve(df_test_results['real'], df_test_results['anomaly'])

In [None]:
# ROC curve with the AUC in the legend and the diagonal as a dashed red reference line
fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr_roc, tpr_roc, label='AUC = %0.2f' % roc_auc)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

### Saving metrics

In [None]:
# Accumulated metrics of all executions are kept in one pickled dataframe.
# Every run of this cell appends one more row to it.
metrics_path = 'pkl/df_metrics_exp1.pkl'
metrics_columns = [
    "Model", "Attack", "Confidence_Level", "Threshold",
    "Accuracy", "Precision", "TPR", "NPV", "TNR", "FPR",
    "F1-Score", "ROC-AUC", "CM",
]

if not os.path.isfile(metrics_path):
    # First execution: start from an empty dataframe with the expected columns
    df_metrics = pd.DataFrame(columns=metrics_columns)
else:
    # Loading the existing dataframe from disk
    df_metrics = pd.read_pickle(metrics_path)

# Removing ./ from model name
#filepath = filepath.replace('./',"")

# Adding the results of the last execution as a new row
last_execution = [filepath, attack_name, confidence_level, loss_threshold, accuracy, precision, tpr, npv, tnr, fpr, f1, roc_auc, cm]
df_metrics.loc[len(df_metrics)] = last_execution

# Saving the dataframe back to disk
df_metrics.to_pickle(metrics_path)

### Checking the results

In [None]:
# Up to 20 best executions per attack (highest accuracy first), selected columns only
overview_columns = ['Attack', 'Accuracy', 'NPV', 'TPR', 'FPR', 'Confidence_Level', 'Model']
(
    df_metrics[overview_columns]
    .sort_values(['Attack', 'Accuracy'], ascending=[True, False])
    .groupby('Attack')
    .head(20)
)

In [None]:
# Three best executions per attack (highest accuracy first), all columns
(
    df_metrics
    .sort_values(['Attack', 'Accuracy'], ascending=[True, False])
    .groupby('Attack')
    .head(3)
)

### Writing results in Elasticsearch

In [None]:
# Connecting to Elasticsearch before writing the results.
# Credentials are read from the environment so that no secret is stored in the notebook:
#   ES_PASSWORD is required; ES_HOST and ES_USER fall back to the previous values.
es = Elasticsearch(
    host=os.environ.get("ES_HOST", "192.168.201.2"),
    http_auth=(os.environ.get("ES_USER", "elastic"), os.environ["ES_PASSWORD"]),
)

In [None]:
# Name of the index that receives the loss of each window: "scan-" + name of the index read above
# (from here on `index` refers to the output index, no longer to the input index)
source_index = f"proc-public-{attack_name}"
index = f"scan-{source_index}"

In [None]:
# Building the dataframe that is sent to Elastic from df_test_results (which is left untouched)
df_result_es = (
    df_test_results
    # Removing window_number column to reduce size of dataframe
    .drop(columns=['window_number'])
    .assign(
        # The dataframe to insert in elasticsearch must have a column with name 'indexId'
        # (https://github.com/dashaub/espandas#usage)
        indexId=lambda frame: frame.index.astype(str),
        # 'anomaly' as lower-case string due to Elastic requirements for boolean mapping type
        anomaly=lambda frame: frame['anomaly'].astype('string').str.lower(),
        # Chosen loss threshold and confidence level, repeated on every row
        threshold=loss_threshold,
        confidence_level=confidence_level,
        # Timestamp taken from df_attack_timestamp, matched by index label
        # (intended to be the first system call of each window)
        timestamp=df_attack_timestamp['timestamp'],
    )
)

In [None]:
# Configuring Elastic credentials.
# They are read from the environment so that no secret is stored in the notebook:
#   ES_PASSWORD is required; ES_HOST and ES_USER fall back to the previous values.
esp = Espandas(
    host=os.environ.get("ES_HOST", "192.168.201.2"),
    http_auth=(os.environ.get("ES_USER", "elastic"), os.environ["ES_PASSWORD"]),
)

# Writing index in Elastic
esp.es_write(df_result_es, index=index, doc_type=None)