<a href="https://colab.research.google.com/github/r-dube/CICIDS/blob/main/ids_keras_tf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the top modules that are used in multiple places
import numpy as np
import pandas as pd

In [None]:
# Global settings that drive the script
# data_url is the location of the data:
#   the data is not read from a local file, it is downloaded
#   from a preprocessed dataset hosted at this URL
data_url="https://raw.githubusercontent.com/r-dube/CICIDS/main/MachineLearningCVE/processed/bal-cicids2017.csv"

In [None]:
# label names (YY) in the data and their
# mapping to numerical values
# The three 'Web Attack' variants share the single numerical label 8,
# so the 15 label names collapse into 13 numerical classes
# NOTE(review): the '�' inside the Web Attack keys is presumably an
#   undecodable dash taken over from the raw labels; the keys are kept
#   exactly as they are because they have to match the data - confirm
label_map = {
    'BENIGN' : 0,
    'FTP-Patator' : 1,
    'SSH-Patator' : 2,
    'DoS slowloris' : 3,
    'DoS Slowhttptest': 4,
    'DoS Hulk' : 5,
    'DoS GoldenEye' : 6,
    'Heartbleed' : 7,
    'Web Attack � Brute Force' : 8,
    'Web Attack � XSS' : 8,
    'Web Attack � Sql Injection' : 8,
    'Infiltration' : 9,
    'Bot' : 10,
    'PortScan' : 11,
    'DDoS' : 12,
}

# number of covariate columns and number of distinct numerical labels
num_ids_features = 76
num_ids_classes = 13

# Display name of every numerical label: ids_classes[k] names label k
# The list has exactly num_ids_classes entries; the merged web attacks
# are listed once (index 8) so that the names that follow line up with
# the values in label_map
ids_classes = [
    'BENIGN',
    'FTP-Patator',
    'SSH-Patator',
    'DoS slowloris',
    'DoS Slowhttptest',
    'DoS Hulk',
    'DoS GoldenEye',
    'Heartbleed',
    'Web Attack',
    'Infiltration',
    'Bot',
    'PortScan',
    'DDoS',
]

In [None]:
# Utility functions used by classifiers
# In particular to load and split data and output results
def ids_load_df_from_csv(url=None):
    """
    Load dataframe from csv file
    Input:
        url: optional location of the csv file (anything accepted by
             pandas.read_csv: path, URL or file-like object);
             when None (the default) the global data_url is used
    Returns:
        Dataframe with the contents of the csv file
    Print: the shape of the loaded dataframe
    """

    # Fall back to the global location so that existing calls
    # without arguments keep working unchanged
    source = data_url if url is None else url

    df = pd.read_csv(source)

    print ("load Dataframe shape", df.shape)

    return df

def ids_split(df):
    """
    Split a dataframe into standardized train / validation / test sets
    Input:
        Dataframe that has columns of covariates followed by a column of
        labels; the labels are read from the column named 'YY'
    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test as numpy arrays
    Print: the shape of the dataframe and of every split
    """

    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    print("df.shape", df.shape)

    # every column except the last one is a covariate
    features = df.iloc[:, :-1]
    labels = df['YY']

    # first hold out 15% for testing, then 15% of the remainder for validation
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, labels, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.15, random_state=42)
    print ("X_train.shape", X_train.shape, "y_train.shape", y_train.shape)
    print ("X_val.shape", X_val.shape, "y_val.shape", y_val.shape)
    print ("X_test.shape", X_test.shape, "y_test.shape", y_test.shape)

    # the scaling statistics come from the training split only and are
    # then applied to all three splits
    scaler = StandardScaler().fit(X_train)
    scaled_train = scaler.transform(X_train)
    scaled_val = scaler.transform(X_val)
    scaled_test = scaler.transform(X_test)

    return (
        scaled_train,
        scaled_val,
        scaled_test,
        y_train.values,
        y_val.values,
        y_test.values,
    )

def ids_accuracy(y_actual, y_pred):
    """
    Score predictions as a multiclass and as a two class problem
    Input:
        Numpy arrays with actual and predicted labels
    Returns:
        multiclass accuracy and (macro averaged) f1 score;
        two class accuracy and f1 score
    """

    from sklearn.metrics import accuracy_score, f1_score

    # two class view of the labels: 0 stays 0, any label above 0 becomes 1
    actual_binary = (y_actual > 0).astype(int)
    pred_binary = (y_pred > 0).astype(int)

    multi_acc = accuracy_score(y_actual, y_pred)
    multi_f1 = f1_score(y_actual, y_pred, average='macro')

    binary_acc = accuracy_score(actual_binary, pred_binary)
    binary_f1 = f1_score(actual_binary, pred_binary)

    return multi_acc, multi_f1, binary_acc, binary_f1
    

def ids_metrics(y_actual, y_pred):
    """
    Report how well the predicted labels match the actual labels
    Input:
        Numpy arrays with actual and predicted labels
    Returns:
        None
    Print: the confusion matrix, then accuracy and f1 score for the
        multiclass and for the two class view of the labels
    """

    from sklearn.metrics import confusion_matrix

    print(confusion_matrix(y_actual, y_pred))

    scores = ids_accuracy(y_actual, y_pred)
    multi_acc, multi_f1, binary_acc, binary_f1 = scores

    multi_report = (
        'Classifier accuracy : {:.4f}'.format(multi_acc),
        'F1 score: {:.4f}'.format(multi_f1),
    )
    binary_report = (
        'Two class classifier accuracy : {:.4f}'.format(binary_acc),
        'F1 score: {:.4f}'.format(binary_f1),
    )
    print(*multi_report)
    print(*binary_report)

In [None]:
# FCNN model developed using the deeplizard tutorial
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import TensorBoard
import math
import datetime

# For reproducible results
import random as rn
import os
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts;
#   setting it here presumably only reaches child processes - confirm
os.environ['PYTHONHASHSEED'] = '42'
# An empty device list hides the CUDA devices
# NOTE(review): presumably meant to keep training on the CPU for
#   repeatable results; tensorflow is already imported at this point - confirm
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# Seed the numpy, python and tensorflow random number generators
np.random.seed(42)
rn.seed(42)
tf.random.set_seed(42)

# Load the dataset and split it into train, validation and test arrays
df = ids_load_df_from_csv ()
X_train, X_val, X_test, y_train, y_val, y_test = ids_split(df)

# Fully connected network built with the Keras functional API
# To use sparse_categorical_crossentropy as the loss function
#   use softmax as the activation function in the output layer
# The input takes one value per feature (num_ids_features)
inputs = keras.Input(shape=(num_ids_features,), name="ids_input")
# Two relu hidden layers, each with num_ids_features units
hl1 = Dense(num_ids_features, activation="relu", name="dense_1")(inputs)
hl2 = Dense(num_ids_features, activation="relu", name="dense_2")(hl1)
# The output layer has one softmax unit per class (num_ids_classes)
outputs = Dense(num_ids_classes, activation="softmax", name="output")(hl2)

model = keras.Model(inputs=inputs, outputs=outputs)

# Print the layers and the parameter counts of the model
model.summary()

# Learning rate settings used by the schedules below
initial_learning_rate = 0.004
epochs = 30
decay = initial_learning_rate / epochs

# learning scheduler 1
def lr_time_based_decay(epoch, lr):
    """
    Time based decay of the learning rate
    Input:
        epoch: epoch index
        lr: learning rate to decay
    Returns:
        lr divided by (1 + decay * epoch)
    """
    # NOTE(review): when lr is the rate produced for the previous epoch
    #   the reduction compounds from epoch to epoch - confirm this is intended
    shrink = 1 + decay * epoch
    return lr / shrink

# learning scheduler 2
def lr_step_decay(epoch, lr):
    """
    Step decay of the learning rate
    Input:
        epoch: epoch index
        lr: not used; the schedule always starts from initial_learning_rate
    Returns:
        initial_learning_rate multiplied by 0.75 once for every
        2 completed epochs
    """
    factor = 0.75
    epochs_per_step = 2
    completed_steps = epoch // epochs_per_step
    return initial_learning_rate * math.pow(factor, completed_steps)

# TensorBoard logs of every run go to their own time stamped
# sub-directory of "logs"
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tb_cbk = TensorBoard(logdir, histogram_freq=1)

# Adam starts from initial_learning_rate; the loss works on integer
# labels (sparse) and accuracy is tracked during training
model.compile(
    optimizer=Adam(learning_rate=initial_learning_rate), 
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy']
)

# Train on the training split and validate on the validation split
# The learning rate follows lr_time_based_decay (lr_step_decay is
# defined but not used here); tb_cbk writes the TensorBoard logs
model.fit(
    x=X_train, 
    y=y_train, 
    batch_size=64, 
    shuffle=True,
    epochs=epochs, 
    validation_data=(X_val, y_val),
    callbacks=[LearningRateScheduler(lr_time_based_decay, verbose=1), tb_cbk],
    verbose=2
)

In [None]:
# Show the training logs with TensorBoard inside the notebook
from tensorboard import notebook
notebook.list() # View open TensorBoard instances

# Control TensorBoard display. If no port is provided, 
# the most recently launched TensorBoard is used
# notebook.display(port=6006, height=1000) 
# Load the TensorBoard notebook extension and start TensorBoard
# on the "logs" directory
%load_ext tensorboard
%tensorboard --logdir logs
# NOTE(review): assumes that TensorBoard is serving on port 6006 - confirm
notebook.display(port=6006, height=1000) 

In [None]:
# prediction step and metrics similar to logistic and knn classifiers
# The predictions are made for the validation split (X_val);
# the test split (X_test) is not evaluated in this cell
predictions = model.predict(
    x=X_val,
    batch_size=64,
    verbose=0,
) 

# The predicted label of a row is the index of its largest model output
y_pred = np.argmax(predictions, axis=1)
# Print the confusion matrix, accuracy and f1 scores
ids_metrics(y_val, y_pred) 