In [None]:
# Environment setup: install TensorFlow pinned to release 2.8, then the
# standalone keras package (no version given, so pip decides which one is used).
!pip install tensorflow==2.8
!pip install keras

In [None]:
# gensim provides the Word2Vec class imported in the imports cell.
# NOTE(review): python-Levenshtein is not imported anywhere in the visible
# notebook; presumably an optional gensim speed-up - confirm it is still needed.
!pip install gensim
!pip install python-Levenshtein

In [None]:
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/Advanced_ML_anomaly_detection_L3/DataSets.zip

In [None]:
# NOTE(review): no cell in the visible notebook imports np_utils - confirm this
# install is still required before keeping it.
!pip install np_utils

In [None]:
!pip install --upgrade numpy

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ElTree
import re, h5py, itertools, math, glob, zipfile, os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import log_loss, auc, roc_curve
from tensorflow.keras import layers
from tensorflow.keras.layers import Masking,Activation
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, TimeDistributed,Bidirectional
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.python.client import device_lib
from lxml import etree
from gensim.models import Word2Vec

# %matplotlib inline
plt.rcParams['figure.figsize'] = (15, 5)
plt.style.use('ggplot')

# Apply the seed to NumPy and TensorFlow so that weight initialisation and dropout
# are as repeatable as the backend allows; previously `seed` was defined but never
# passed to any random number generator.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

import warnings
# Suppress every warning category for the rest of the notebook.
warnings.filterwarnings(action = "ignore")

In [None]:
# Show floats with three decimals.
# The fully qualified key "display.precision" is required: on pandas >= 1.4 the
# bare pattern "precision" also matches "styler.format.precision", and
# set_option raises OptionError because the pattern is ambiguous.
pd.set_option("display.precision", 3)
pd.options.display.float_format = '{:.3f}'.format

In [None]:
def plot_history(history):
    """Plot the loss and accuracy curves stored in a training history.

    Parameters
    ----------
    history : object
        Any object exposing a ``history`` dict that maps metric names to lists
        of per-epoch values (this notebook passes the value returned by the
        model's fit call).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. When no training-loss key
        (a key containing 'loss' but not 'val') is present, a message is
        printed and the function returns without creating a figure.

    Notes
    -----
    The function changes global matplotlib rc settings (legend/figure font
    sizes and text/axes/tick colours); they stay in effect after the call.
    """
    metrics = history.history

    def _select(token, validation):
        # Metric names containing `token`, split by presence of the 'val' marker.
        return [name for name in metrics if token in name and ('val' in name) == validation]

    loss_list = _select('loss', False)
    val_loss_list = _select('loss', True)
    acc_list = _select('accuracy', False)
    val_acc_list = _select('accuracy', True)

    # Check before touching matplotlib so a history without loss does not leave
    # an empty figure (and modified rc settings) behind.
    if len(loss_list) == 0:
        print('Loss is missing in history')
        return

    plt.figure(figsize = (12, 5), dpi = 100)
    COLOR = 'gray'

    plt.rc('legend', fontsize = 14)   # legend fontsize
    plt.rc('figure', titlesize = 12)  # fontsize of the figure title

    ## As loss always exists
    epochs = range(1, len(metrics[loss_list[0]]) + 1)

    def _draw_panel(position, title, train_keys, valid_keys):
        # Draw one of the two side-by-side panels (1 = left, 2 = right).
        plt.subplot(1, 2, position)
        plt.subplots_adjust(wspace = 2, hspace = 2)
        plt.rcParams['text.color'] = 'black'
        plt.rcParams['axes.titlecolor'] = 'black'
        plt.rcParams['axes.labelcolor'] = COLOR
        plt.rcParams['xtick.color'] = COLOR
        plt.rcParams['ytick.color'] = COLOR
        # The legend label carries the value reached in the last epoch.
        for name in train_keys:
            plt.plot(epochs, metrics[name], 'b-o',
                     label = 'Train (' + format(metrics[name][-1], '.4f') + ')')
        for name in valid_keys:
            plt.plot(epochs, metrics[name], 'g',
                     label = 'Valid (' + format(metrics[name][-1], '.4f') + ')')

        plt.title(title)
        plt.xlabel('Epochs')
        plt.legend(facecolor = 'gray', loc = 'best')
        plt.grid(True)
        plt.tight_layout()

    _draw_panel(1, 'Loss', loss_list, val_loss_list)
    _draw_panel(2, 'Accuracy', acc_list, val_acc_list)
    plt.show()

class B_Generator(object):
    """Endless source of (inputs, one-hot targets) mini-batches.

    BZ  : batch size (number of rows per batch).
    XX  : input rows, sliced along the first axis.
    YY  : target values aligned with XX; must support ``reshape``.
    ohe : object with a ``transform`` method applied to each target batch
          reshaped into a single column.

    Only whole batches are served: rows beyond ``n_b * BZ`` are never yielded.
    """

    def __init__(self, BZ, XX, YY, ohe):
        row_count = np.shape(XX)[0]
        self.BZ = BZ
        self.XX = XX
        self.YY = YY
        self.ohe = ohe
        # Number of complete batches and the row offset at which each one starts.
        self.n_b = int(math.floor(row_count / BZ))
        self.b_index = [step * BZ for step in range(self.n_b)]

    def __iter__(self):
        # Cycle over the batch offsets forever; the consumer decides when to stop.
        for start in itertools.cycle(self.b_index):
            stop = start + self.BZ
            targets = self.YY[start:stop]
            encoded = self.ohe.transform(targets.reshape(len(targets), 1))
            yield (self.XX[start:stop,], encoded)

In [None]:
#reading the dataset
number = 4  # position of the wanted member in the archive's name list

# Stop with a clear error when the archive is missing or not a ZIP file;
# previously only a message was printed and the next line failed with a
# NameError on `file_1`.
if not zipfile.is_zipfile('DataSets.zip'):
    raise ValueError('Type file isn`t ZIP')

# The context manager closes the archive handle once the member is extracted.
with zipfile.ZipFile('DataSets.zip', 'r') as file_1:
    name_dataset = file_1.namelist()[number]
    file_1.extract(name_dataset)
print("File", name_dataset, "has been read")

In [None]:
#parse the unzipped file of the dataset from XML
tree_set = ElTree.parse(name_dataset)
root_tree_set = tree_set.getroot()

# Tag name of the record elements: the value of the root's first attribute with
# its last four characters removed.
var_1 = root_tree_set.items()[0][1][:-4]

# One dict per record, mapping every node's tag to its text. Element.iter()
# replaces Element.getiterator(), which was removed in Python 3.9; it visits the
# record element itself followed by its descendants.
result = [{node.tag: node.text for node in item.iter()}
          for item in root_tree_set.findall(var_1)]


In [None]:
#convert it into data frame
dSET = pd.DataFrame(result)
# Drop the first column (presumably the record element's own tag - TODO confirm)
# and remove exact duplicate rows.
dSET = dSET.drop(dSET.columns[[0]], axis = 1)
dSET = dSET.drop_duplicates()

dSET = dSET.sort_values('startDateTime')
# Grouping key: source, destination and the first 13 characters of the start timestamp.
dSET['IPs_sequence'] = dSET['source'] + '_' + dSET['destination'] + '_' + dSET['startDateTime'].str[:13]

# The port columns hold XML text, so comparing them directly is lexicographic
# (e.g. '80' > '1024') and picks the wrong side. Compare numeric copies instead,
# but keep the original text values in res_port so later cells see the same dtype.
dest_port_num = pd.to_numeric(dSET['destinationPort'])
src_port_num = pd.to_numeric(dSET['sourcePort'])
dSET['res_port'] = np.where(dest_port_num <= src_port_num,
                            dSET['destinationPort'],
                            dSET['sourcePort'])

# rename some of the columns for more convenience
dSET = dSET.rename(columns = {'totalSourceBytes': 'totSB',
                        'totalDestinationBytes': 'totDB',
                        'totalDestinationPackets': 'totDP',
                        'totalSourcePackets': 'totSP',
                        'sourcePayloadAsBase64': 'sourB64',
                        'sourcePayloadAsUTF': 'sourUTF',
                        'destinationPayloadAsBase64': 'destB64',
                        'destinationPayloadAsUTF': 'destUTF',
                        'direction': 'direct',
                        'sourceTCPFlagsDescription': 'sourTCPFd',
                        'destinationTCPFlagsDescription': 'destTCPFd',
                        'protocolName': 'pName',
                        'sourcePort': 'sPort',
                        'destination': 'dest',
                        'destinationPort': 'dPort'})
print("Preparation process has been finished")

In [None]:
# Dataset size as (rows, columns) together with the column (feature) names of
# dSET; the tuple is displayed as this cell's output.
dSET.shape, dSET.columns

In [None]:
# Preview the first five rows of dSET.
dSET.head(5)

In [None]:
## Build the sets (keys and sequences) in hour slices
print("Stage I. Keys building\n")


def _join_tags(tags):
    """Concatenate all Tag values of one group into a comma-separated string."""
    return ','.join([tag for tag in tags])


def _join_ports(ports):
    """Concatenate the ports of one group; any port of 10000 or above becomes "10000"."""
    capped = []
    for port in ports:
        capped.append(str(port) if int(port) < 10000 else "10000")
    return ','.join(capped)


grouped = dSET.groupby('IPs_sequence')[['Tag', 'res_port']]
key = grouped.agg({"Tag": _join_tags, "res_port": _join_ports})

print("Unique keys:\n" + str(key.count()))
# Split the joined strings back into per-group lists, aligned by group order.
attacks = [joined.split(",") for joined in key.Tag.tolist()]
sequences = [joined.split(",") for joined in key.res_port.tolist()]

In [None]:
print("Stage II. Label encoding\n")
# Distinct port tokens across all sequences.
U_tokens = list({token for seq in sequences for token in seq})
print("Number of unique tokens :", len(U_tokens))
LE = LabelEncoder().fit(U_tokens)
# Encode every token and shift the codes by one, so the smallest code is 1
# and 0 is never produced by the encoder.
sequences = [(LE.transform(seq) + 1).tolist() for seq in sequences]
print("Number of sequences :", len(sequences))
# Materialise the pairs: a bare zip object is a one-shot iterator, so running
# the cell that consumes it a second time would silently loop over nothing.
sequence_attack = list(zip(attacks, sequences))

In [None]:
print("Stage III. Sequences generating for the future model\n")
var_8 = np.float32(0)  # padding value put in front of every sequence
len_sequence = 10      # number of preceding tokens in each input window
print("Length of the primary sequence :", len_sequence)
seq_IDX, seq_X, seq_Y, seq_ATT = [], [], [], []
for seq_no, (tags, tokens) in enumerate(sequence_attack):
    # Left-pad both lists so that the first real token already has a full window.
    padded_tokens = [var_8] * len_sequence + tokens
    padded_tags = [var_8] * len_sequence + tags
    for pos in range(len_sequence, len(padded_tokens)):
        # Input: the len_sequence items before `pos`; target: the item at `pos`.
        # (The former try/except around list.append could never trigger and
        # was removed.)
        seq_X.append(padded_tokens[pos - len_sequence:pos])
        seq_Y.append(padded_tokens[pos])
        seq_IDX.append(seq_no)            # index of the originating sequence
        seq_ATT.append(padded_tags[pos])  # tag aligned with the target token
print("Length of X & Y sets :", len(seq_X))

In [None]:
#One-hot-encoder initializing
print("Stage IV. One-hot-encoder initializing\n")
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and later
# releases dropped the old name; try the new spelling first and fall back for
# older installations so the cell runs on either.
try:
    OHE = OneHotEncoder(sparse_output = False, categories = 'auto')
except TypeError:
    OHE = OneHotEncoder(sparse = False, categories = 'auto')
# Fit on the distinct target values, one per row.
OHE = OHE.fit(np.unique(seq_Y).reshape(-1, 1))

X = np.array(seq_X)
print("Dimensionality size of set X :", X.shape)

In [None]:
#model building
print("Stage V. Model building\n")
drop_level = 0.35 # You can choose a drop level yourself
N_neurons = 50   # You can choose the number of neurons yourself

# Same stack as before, declared as a single list of layers:
# embedding -> two bidirectional LSTMs -> dense -> softmax over the tokens,
# with dropout after each of the three middle layers.
model = Sequential([
    # One extra embedding row because index 0 is masked (mask_zero = True).
    layers.Embedding(input_dim = len(U_tokens) + 1,
                     output_dim = 100,
                     mask_zero = True),
    layers.Bidirectional(layers.LSTM(N_neurons, return_sequences = True)),
    layers.Dropout(drop_level),
    layers.Bidirectional(layers.LSTM(N_neurons, activation = "relu", return_sequences = False)),
    layers.Dropout(drop_level),
    layers.Dense(N_neurons, activation = "linear"),
    layers.Dropout(drop_level),
    layers.Dense(len(U_tokens), activation = "softmax"),
])

model.summary()

In [None]:
#model compiling and fitting
print("Stage VI. Compile and fit the model\n")

batch_size = 512
n_epochs = 10

optim = tf.keras.optimizers.Nadam()   # You can choose an optimizer yourself
# Take the loss from the losses namespace rather than from metrics.
loss_f = tf.keras.losses.categorical_crossentropy

# Endless generator of (inputs, one-hot targets) batches.
T_data = B_Generator(batch_size, np.asarray(X), np.asarray(seq_Y), OHE)

model.compile(loss = loss_f,
              optimizer = optim,
              metrics = ['accuracy'])

# Model.fit accepts generators directly; fit_generator is deprecated.
# steps_per_epoch is required because the generator never ends.
history = model.fit(iter(T_data),
    steps_per_epoch = T_data.n_b,
    epochs = n_epochs,
    verbose = 1)

In [None]:
# Results: draw the loss and accuracy curves recorded in `history` during training.
print("Stage VII. Results visualization\n")
plot_history(history)

In [None]:
#model saving
print("Stage VIII. Model saving & prediction checking\n")

M_name = 'Detection_model'
filepath = M_name + '.h5'

# Write the network (including optimizer state) as an HDF5 file, replacing any
# existing file, then report how large the file is on disk.
model.save(filepath, overwrite = True, include_optimizer = True, save_format = 'h5')
saved_bytes = os.path.getsize(filepath)
print("Size of the saved model :", saved_bytes, "bytes")

In [None]:
# Reload the saved model from disk and run it over the whole input set as a
# sanity check of the saved file.
model_L = load_model(filepath)
predicts = model_L.predict(X, batch_size = batch_size)

print("Dimensionality sizes of model predicts :", predicts.shape, "\n")
n_samples, n_tokens = len(seq_X), len(U_tokens)
print("Compare with length of X & Y sets :\t", n_samples, "\nand with number of tokens :\t\t", n_tokens)