<a href="https://colab.research.google.com/github/r-dube/CICIDS/blob/main/ids_keras_tf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the top modules that are used in multiple places
import numpy as np
import pandas as pd

In [None]:
# Some global variables to drive the script
# data_url is the location of the data
# Data is not loaded from a local file
# Data is loaded from a preprocessed dataset
data_url="https://raw.githubusercontent.com/r-dube/CICIDS/main/MachineLearningCVE/processed/bal-cicids2017.csv"

In [None]:
# Label names (column YY) in the data and their mapping to numerical values.
# The three 'Web Attack' variants all map to class 8, so the 15 raw label
# strings below collapse onto 13 numeric classes (0..12).
label_map = {
    'BENIGN': 0,
    'FTP-Patator': 1,
    'SSH-Patator': 2,
    'DoS slowloris': 3,
    'DoS Slowhttptest': 4,
    'DoS Hulk': 5,
    'DoS GoldenEye': 6,
    'Heartbleed': 7,
    'Web Attack � Brute Force': 8,
    'Web Attack � XSS': 8,
    'Web Attack � Sql Injection': 8,
    'Infiltration': 9,
    'Bot': 10,
    'PortScan': 11,
    'DDoS': 12,
}

# Number of covariate columns fed to the models
num_ids_features = 76
# Number of distinct numeric classes produced by label_map
num_ids_classes = 13
# Display names indexed by numeric class id: ids_classes[k] names class k.
# The list must have exactly num_ids_classes entries, so the three web
# attack variants share the single 'Web Attack' entry at index 8.
ids_classes = [
    'BENIGN',
    'FTP-Patator',
    'SSH-Patator',
    'DoS slowloris',
    'DoS Slowhttptest',
    'DoS Hulk',
    'DoS GoldenEye',
    'Heartbleed',
    'Web Attack',
    'Infiltration',
    'Bot',
    'PortScan',
    'DDoS',
]

In [None]:
# Utility functions used by classifiers
# In particular to load and split data and output results
def ids_load_df_from_csv(source=None):
    """
    Load dataframe from csv file
    Input:
        source: optional path, URL or file-like object to read with
                pandas.read_csv; when None the global data_url is used
    Returns:
        The loaded pandas DataFrame
    Print: the shape of the loaded dataframe
    """

    # Fall back to the notebook-wide dataset location when no explicit
    # source is supplied, which keeps existing zero-argument calls working
    if source is None:
        source = data_url

    df = pd.read_csv(source)

    print ("load Dataframe shape", df.shape)

    return df

def ids_split(df, label_col='YY'):
    """
    Split a dataframe into standardized train, validation and test sets
    Input:
        df: Dataframe that has columns of covariates and one column of labels
        label_col: name of the label column (defaults to 'YY')
    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test as numpy arrays
    Print: shapes of the input dataframe and of each split
    """

    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    print("df.shape", df.shape)

    # Select the covariates by dropping the label column by name. Slicing
    # "all but the last column" would leak the label into the features
    # (and drop a real feature) whenever the label is not the last column.
    X = df.drop(columns=[label_col])
    y = df.loc[:, label_col]

    # First hold out 15% for test, then 15% of the remainder for validation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)
    print ("X_train.shape", X_train.shape, "y_train.shape", y_train.shape)
    print ("X_val.shape", X_val.shape, "y_val.shape", y_val.shape)
    print ("X_test.shape", X_test.shape, "y_test.shape", y_test.shape)

    # Fit the scaler on the training split only and reuse its statistics
    # for the validation and test splits
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Convert the label Series to plain numpy arrays
    y_train = y_train.values
    y_val = y_val.values
    y_test = y_test.values

    return X_train, X_val, X_test, y_train, y_val, y_test

def ids_accuracy (y_actual, y_pred):
    """
    Score predictions both as a multiclass and as a two class problem
    Input:
        Numpy arrays with actual and predicted labels
    Returns:
        Tuple of (multiclass accuracy, multiclass macro-averaged f1,
        two class accuracy, two class f1)
    """

    from sklearn.metrics import accuracy_score, f1_score

    # Two class view: labels greater than 0 become 1, everything else 0
    # (label 0 is 'BENIGN' in label_map)
    binary_actual = (y_actual > 0).astype(int)
    binary_pred = (y_pred > 0).astype(int)

    multiclass_scores = (
        accuracy_score(y_actual, y_pred),
        f1_score(y_actual, y_pred, average='macro'),
    )
    binary_scores = (
        accuracy_score(binary_actual, binary_pred),
        f1_score(binary_actual, binary_pred),
    )

    return multiclass_scores + binary_scores
    

def ids_metrics(y_actual, y_pred):
    """
    Report classification quality on standard output
    Input:
        Numpy arrays with actual and predicted labels
    Returns:
        None
    Print: the confusion matrix, then multiclass and two class
           accuracy and f1 scores (four decimal places)
    """

    from sklearn.metrics import confusion_matrix

    # Confusion matrix is printed first, followed by the summary scores
    print (confusion_matrix (y_actual, y_pred))

    scores = ids_accuracy (y_actual, y_pred)
    multi_acc, multi_f1, binary_acc, binary_f1 = scores

    f1_text = 'F1 score: {:.4f}'
    print('Classifier accuracy : {:.4f}'.format(multi_acc), f1_text.format(multi_f1))
    print('Two class classifier accuracy : {:.4f}'.format(binary_acc), f1_text.format(binary_f1))


In [None]:
# Original model based on Colab tutorials
# Load Keras and Tensorflow modules
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

df = ids_load_df_from_csv ()
X_train, X_val, X_test, y_train, y_val, y_test = ids_split(df)

# Fully connected network: two hidden ReLU layers as wide as the input,
# followed by an output layer with one unit per class
inputs = keras.Input(shape=(num_ids_features,), name="ids_fcnn")
x1 = layers.Dense(num_ids_features, activation="relu", name="dense_1")(inputs)
x2 = layers.Dense(num_ids_features, activation="relu", name="dense_2")(x1)
# No activation on the output layer: the model emits raw logits
outputs = layers.Dense(num_ids_classes, name="output")(x2)

model = keras.Model(inputs=inputs, outputs=outputs)

# One-hot encode the integer labels to match CategoricalCrossentropy
y_train = keras.utils.to_categorical(y_train, num_ids_classes)
y_val = keras.utils.to_categorical(y_val, num_ids_classes)
print ("X_train.shape", X_train.shape, "y_train.shape", y_train.shape)
print ("X_val.shape", X_val.shape, "y_val.shape", y_val.shape)

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),  # Optimizer
    # Loss function to minimize.
    # from_logits=True because the output layer has no softmax; the
    # default (from_logits=False) would treat the logits as probabilities
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    # List of metrics to monitor
    metrics=[keras.metrics.CategoricalAccuracy()],
)

print("Fit model on training data")
history = model.fit(
    X_train,
    y_train,
    batch_size=8,
    epochs=10,
    # We pass some validation for
    # monitoring validation loss and metrics
    # at the end of each epoch
    validation_data=(X_val, y_val),
)

In [None]:
# New FFNN model developed using the deeplizard tutorial
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy

df = ids_load_df_from_csv ()
X_train, X_val, X_test, y_train, y_val, y_test = ids_split(df)

# Feed-forward network: two hidden ReLU layers as wide as the input,
# followed by a linear output layer with one unit per class (raw logits)
model = Sequential([
    Dense(units=num_ids_features, input_shape=(num_ids_features, ), activation='relu'),
    Dense(units=num_ids_features, activation='relu'),
    Dense(units=num_ids_classes, activation='linear')
])

model.summary()

model.compile(
    optimizer=Adam(learning_rate=0.0001),
    # Labels stay as integer class ids, hence the sparse loss.
    # from_logits=True because the output layer is linear; the string
    # alias 'sparse_categorical_crossentropy' assumes probabilities
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

model.fit(
    x=X_train,
    y=y_train,
    batch_size=10,
    epochs=2,
    verbose=2
)

In [None]:
()