# BoT-IoT Model Generation | CNN-B
Training and generation of the CNN-B model using the BoT-IoT dataset.

In [None]:
# Imports
import tensorflow as tf
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from tensorflow.keras import datasets, layers, models
from tensorflow.keras.callbacks import EarlyStopping
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Silence pandas' chained-assignment (SettingWithCopy) warning; the option defaults to 'warn'
pd.set_option('mode.chained_assignment', None)

# Apply seaborn's default theme to the figures drawn in this notebook
sns.set()

## BoT-IoT Model

In [None]:
# Read the BoT-IoT CSV in a single pass; low_memory=False makes pandas infer
# each column's dtype from the whole file instead of chunk by chunk
dataset_csv = '../datasets/BoT-IoT.csv'
data = pd.read_csv(dataset_csv, low_memory=False)

In [None]:
# Number of rows per distinct value of the 'category' column
data['category'].value_counts()

In [None]:
# Keep only rows whose 'proto' and 'state' values belong to the supported sets
wanted_protos = ['tcp', 'udp', 'icmp', 'arp', 'igmp']
wanted_states = ['RST', 'REQ', 'INT', 'FIN', 'CON', 'ECO', 'ACC', 'PAR']

data = data[data['proto'].isin(wanted_protos)]
data = data[data['state'].isin(wanted_states)]

In [None]:
# Rows remaining after the proto/state filter
len(data)

In [None]:
# Target: the 'attack' flag, kept as a one-column DataFrame
label_columns = ['attack']
data_labels = data[label_columns]

# Flow attributes used as model inputs
feature_columns = [
    'proto', 'saddr', 'sport', 'daddr', 'dport',
    'spkts', 'dpkts', 'sbytes', 'dbytes',
    'state', 'stime', 'ltime', 'dur',
]

# Missing values are replaced by the sentinel -1
data_features = data[feature_columns].fillna(-1)

In [None]:
"""PREPROCESSING"""

# Encode the address and port columns as integers


def _last_group_as_int(address):
    """Return the last '.'- / ':'-separated group of an address string, parsed as base 16.

    NOTE(review): the group is parsed as hexadecimal even when it comes from a
    dotted address (e.g. '...147' becomes 0x147 = 327) - confirm this encoding
    is intended before changing it, since it defines the feature values.
    """
    tail = address.split(".")[-1].split(":")[-1]
    return int(tail, 16)


def _port_as_int(port):
    """Convert a raw port value to an int.

    Steps: strip any '0x' marker, drop a trailing '.0', map purely alphabetic
    text to -1, and parse everything else as base 16.

    NOTE(review): values that never carried a '0x' prefix are also parsed as
    hexadecimal (e.g. '80' becomes 128), and all-letter hex such as 'dead' is
    mapped to -1 by the isalpha() check - confirm both are intended.
    """
    if "0x" in str(port):
        port = port.replace('0x', '')
    text = str(port)
    if text[-2:] == '.0':
        text = text[:-2]
    return -1 if text.isalpha() else int(text, 16)


# Source / destination addresses
for addr_col in ('saddr', 'daddr'):
    data_features[addr_col] = data_features[addr_col].apply(_last_group_as_int).astype(int)

# Source / destination ports
for port_col in ('sport', 'dport'):
    data_features[port_col] = data_features[port_col].apply(_port_as_int).astype(int)

# Replace the listed columns by log(1 + x)
# NOTE(review): NaNs were filled with -1 earlier and log1p(-1) is -inf - confirm
# these columns never contain missing values. 'dpkts' is not in the list although
# 'spkts' is - confirm that is deliberate.
log1p_col = ['dur', 'sbytes', 'dbytes', 'spkts']

data_features = data_features.assign(
    **{name: data_features[name].apply(np.log1p) for name in log1p_col}
)
    
# Build the two-column one-hot target: 'attack' and its complement 'normal'.
# Assumes 'attack' is a binary 0/1 numeric column - TODO confirm.
normal = 1 - data_labels['attack']

# The frame is rebuilt from the 'attack' column with assign() rather than
# insert()-ing into a slice of `data`: insert() raises "already exists" when
# this step runs a second time, whereas this form can be repeated safely.
# Column order stays ['attack', 'normal'].
data_labels = data_labels[['attack']].assign(normal=normal)

# get_dummies only expands object/string/category columns, so for numeric
# label columns it returns them unchanged; kept for parity with the previous
# behaviour.
data_labels = pd.get_dummies(data_labels)

# One-hot encode the non-numeric feature columns
data_features = pd.get_dummies(data_features)

# Standardise every column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in the notebook, so test-set statistics influence the
# scaling - consider fitting on the training rows only.
scaler = StandardScaler()
data_features = scaler.fit_transform(data_features)

# Append two singleton axes: (rows, features) -> (rows, features, 1, 1)
data_features = data_features[:, :, np.newaxis, np.newaxis]

In [None]:
# Shape of the prepared feature tensor: (rows, features, 1, 1)
np.shape(data_features)

In [None]:
# CNN-B model building and definition
#
# Each sample is fed as a (features, 1, 1) tensor. The feature count of 24
# matches the 13 selected columns once 'proto' (5 kept values) and 'state'
# (8 kept values) are one-hot encoded: 11 + 5 + 8 = 24 - assuming the other
# 11 columns are numeric and every kept proto/state value occurs in the data.
N_FEATURES = 24
input_shape = (N_FEATURES, 1, 1)

# Fail early with a readable message if preprocessing produced another layout,
# instead of hitting a shape error in the middle of model.fit().
actual_shape = tuple(data_features.shape[1:])
if actual_shape != input_shape:
    raise ValueError(
        f"Prepared features have per-sample shape {actual_shape}, "
        f"but the model expects {input_shape}"
    )

# NOTE(review): with the default channels-last layout the second spatial axis of
# a (24, 1, 1) input has length 1, so the (1, 10) kernels and (1, 2) pools act
# along that length-1 axis and do not span neighbouring features - confirm that
# (10, 1) / (2, 1) was not the intended orientation.
#
# All layers come from the same `tensorflow.keras.layers` namespace (Dense was
# previously taken from the standalone `keras` package), and `input_shape` is
# given only on the first layer, the only place Sequential uses it.
model = models.Sequential([
    layers.Conv2D(filters=32, input_shape=input_shape, kernel_size=(1, 10), activation='relu', padding='same'),
    layers.MaxPooling2D(pool_size=(1, 2), padding='same'),
    layers.Conv2D(filters=64, kernel_size=(1, 10), activation='relu', padding='same'),
    layers.MaxPooling2D(pool_size=(1, 2), padding='same'),
    layers.Flatten(),
    layers.Dense(444, activation='relu'),
    layers.Dense(2, activation='softmax'),  # two outputs: attack / normal
])

# Model summary
model.summary()

In [None]:
# Early stopping and best-model checkpointing
#
# Both callbacks come from the tf.keras namespace used to build the model
# (EarlyStopping is the name already imported from tensorflow.keras.callbacks).
# NOTE(review): newer Keras releases only accept checkpoint paths ending in
# '.keras' or '.h5' - confirm '.hdf5' is accepted by the installed version.
filepath = '../models/tmp/BoT-IoT_CNN_Detection.hdf5'  # where the best model is written
callbacks = [
    EarlyStopping(
        monitor='val_loss',  # watch the validation loss
        patience=2,          # stop after 2 consecutive epochs without improvement
    ),
    tf.keras.callbacks.ModelCheckpoint(
        filepath=filepath,    # file where the checkpoint is saved
        monitor='val_loss',   # quantity used to decide what "best" means
        save_best_only=True,  # overwrite the file only when val_loss improves
    ),
]

# Train-test split: 75% train / 25% test.
# A fixed seed makes the shuffle deterministic, so the same rows land in each
# partition on every run.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    data_features,
    data_labels,
    train_size=0.75,
    shuffle=True,
    random_state=SPLIT_SEED,
)

In [None]:
# Compile with Adam (learning rate 5e-4) and categorical cross-entropy, then train.
# The test split is passed as validation data, so it drives both callbacks.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=25,
    batch_size=2048,
    callbacks=callbacks,
)

In [None]:
# Optional: reload a previously saved model instead of using the one trained above.
# model = models.load_model('../models/BoT-IoT_CNN_Detection_Best.hdf5')

In [None]:
# Score the current in-memory model on the test split; evaluate() returns
# the loss followed by the compiled metrics
results = model.evaluate(x=x_test, y=y_test, verbose=0)
print("test loss, test acc:", results)

In [None]:
# Loss per epoch for the training data and for the validation (test) data
val_losses = history.history['val_loss']
train_losses = history.history['loss']
epochs_axis = range(1, len(val_losses) + 1)

fig, ax = plt.subplots()
ax.plot(epochs_axis, val_losses, color='orange', label='Test loss')
ax.plot(epochs_axis, train_losses, label='Training loss')
ax.legend()
ax.grid(visible=True)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Loss vs. Epoch', weight='bold', fontsize=18)
plt.show()

In [None]:
# Accuracy per epoch for the training data and for the validation (test) data
val_accuracies = history.history['val_accuracy']
train_accuracies = history.history['accuracy']
epochs_axis = range(1, len(val_accuracies) + 1)

fig, ax = plt.subplots()
ax.plot(epochs_axis, val_accuracies, color='orange', label='Test accuracy')
ax.plot(epochs_axis, train_accuracies, label='Training accuracy')
ax.legend()
ax.grid(visible=True)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs. Epoch', weight='bold', fontsize=18)
plt.show()

In [None]:
# Prediction: per-class scores for every test sample
predicted = model.predict(x_test)

# Collapse score rows and one-hot label rows to class indices
# (index 0 is labelled 'Attack' and index 1 'Normal' below).
y_class = np.argmax(predicted, axis=1)
# np.asarray accepts a DataFrame as well as an ndarray and y_test itself is
# left untouched, so this cell can be re-run without failing on .to_numpy().
y_check = np.argmax(np.asarray(y_test), axis=1)

# Confusion matrix: rows are the true classes, columns the predicted classes.
# labels=[0, 1] pins a 2x2 result even if one class is missing from the data.
cmatrix = confusion_matrix(y_check, y_class, labels=[0, 1])

# Plotting Confusion Matrix
cmatrix_df = pd.DataFrame(cmatrix, index=['Attack', 'Normal'], columns=['Attack', 'Normal'])
ax = sns.heatmap(cmatrix_df, annot=True, fmt="d", cbar=False)

# Title and axis labels are applied after heatmap(): heatmap() sets the axis
# labels itself (from the index/column names, empty here), so labels set
# beforehand are overwritten. The y axis carries the true classes (matrix rows)
# and the x axis the predictions (matrix columns).
ax.set_title('Confusion matrix of the test/predicted attacks in BoT-IoT CNN-B', weight='bold', fontsize=13)
ax.tick_params(length=0)
ax.set_xlabel('Predicted Attacks')
ax.set_ylabel('Attacks')
plt.show()

In [None]:
# Per-class precision, recall and F1-score shown as a heatmap
class_rep = classification_report(
    y_check,
    y_class,
    target_names=['Attack', 'Normal'],
    output_dict=True,
)

# Drop the last row and the last three columns of the report table, then
# transpose so that each class becomes a row
report_table = pd.DataFrame(class_rep).iloc[:-1, :-3].T

# Put the column labels on top and hide the tick marks
plt.tick_params(axis='both', which='major', labelsize=10,
                labelbottom=False, bottom=False, top=False, labeltop=True)
plt.tick_params(length=0)

sns.heatmap(report_table, cmap="YlGnBu", square=True, cbar=False, annot=True)
plt.show()

In [None]:
# Display the full classification-report dictionary, including the entries left out of the heatmap
class_rep