## Imports

In [None]:
# Install catboost
! pip install catboost
# Install bayes_opt
!pip install bayesian-optimization
# XGBoost update (support for categorical features)
!pip install xgboost==1.6.1

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import warnings
import xgboost as xgb
from bayes_opt import BayesianOptimization
from catboost import CatBoostClassifier
from google.colab import drive
from imblearn.under_sampling import RandomUnderSampler
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report, confusion_matrix, accuracy_score, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, StratifiedKFold
from time import time

warnings.filterwarnings('ignore') # Silence every Python warning for the rest of the session
drive.mount('/content/drive') # Mount Google Drive; the dataset path used later points inside this mount

# Seed shared by the undersampler default, the StratifiedKFold splitter and the Bayesian optimizer
RANDOM_STATE = 42

## Functions

In [None]:
def split_x_y(df):
    """Separate a dataset into its features and its labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'class' column holding the labels.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every column except 'class' and
        ``y`` is the 'class' column.
    """
    # Boolean mask that is True for every column other than the label
    is_feature = df.columns != 'class'
    return df.loc[:, is_feature], df['class']



def resampling_dataset(X, y, random_state=RANDOM_STATE):
    """Balance the dataset with Random Undersampling (RUS).

    Parameters
    ----------
    X : features to resample.
    y : labels matching ``X``.
    random_state : seed handed to ``RandomUnderSampler`` (defaults to RANDOM_STATE).

    Returns
    -------
    The resampled ``(X, y)`` pair produced by ``fit_resample``.
    """
    undersampler = RandomUnderSampler(random_state=random_state)
    balanced_X, balanced_y = undersampler.fit_resample(X, y)
    return balanced_X, balanced_y



def plot_roc_curve(model, fpr, tpr, roc_auc, k):
    """Plot the ROC curve of every fold together with their average curve.

    Parameters
    ----------
    model : not used by this function.
    fpr, tpr : sequences holding, per fold, the false/true positive rates.
    roc_auc : sequence with the AUC of each fold (shown in the legend).
    k : number of folds to draw; ``fpr``/``tpr``/``roc_auc`` are indexed 0..k-1.
    """
    # Common FPR grid on which every fold's TPR is interpolated before averaging
    fpr_grid = np.linspace(0, 1, 100)
    tpr_on_grid = [np.interp(fpr_grid, fpr[fold], tpr[fold]) for fold in range(k)]
    mean_tpr = np.mean(tpr_on_grid, axis=0)

    # Legend text of the average curve: its area and twice the std of the fold AUCs
    mean_area = auc(fpr_grid, mean_tpr)
    auc_spread = np.std(roc_auc) * 2
    mean_label = f'Average Curve, Area: {mean_area:.4f} (+/- {auc_spread:.2f})'

    # Figure, grid, chance diagonal and axes
    plt.figure(figsize=(10, 10))
    plt.grid()
    plt.plot([0, 1], [0, 1], ls='--')
    plt.ylim([-0.01, 1.01])
    plt.xlim([-0.01, 1.01])
    plt.xlabel('False Positive Rate', size=16)
    plt.ylabel('True Positive Rate', size=16)

    # One curve per fold, then the average curve
    for fold in range(k):
        fold_label = f'fold {fold}, Area: {roc_auc[fold]:.4f}'
        plt.plot(fpr[fold], tpr[fold], label=fold_label)
    plt.plot(fpr_grid, mean_tpr, label=mean_label)

    plt.legend(prop={'size': 16})
    plt.show()



def classification_whit_report(model, X, y, k):
    """Cross-validate ``model`` with stratified k-fold and print a report.

    For every fold the model is fitted on the training split and evaluated on
    the test split. The per-fold ROC curves are plotted, then a classification
    report, the per-fold AUC values, a confusion matrix (both computed over the
    predictions of all folds together) and the elapsed time are printed.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        It is refitted in place on every fold, so after the call it holds the
        fit of the last fold.
    X : pandas.DataFrame with the features (indexed by position with ``.iloc``).
    y : labels; a pandas Series or any array that supports integer-array indexing.
    k : number of folds.
    """
    # Per-fold ROC data and pooled predictions
    fpr = []
    tpr = []
    roc_auc = []
    original_class = []
    predicted_class = []

    # The indices produced by the splitter are positions, not index labels.
    # Select labels by position (like X) so that a Series whose index is not
    # 0..n-1 is not looked up by label, which would pick the wrong rows or raise.
    y_by_position = y.iloc if hasattr(y, 'iloc') else y

    # Stratified K-Folds cross-validator
    skf = StratifiedKFold(k, shuffle=True, random_state=RANDOM_STATE)

    # Time counting (covers fitting and prediction of every fold)
    start = time()

    # Train and test the model for each 'k' fold in all the data
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y_by_position[train_index], y_by_position[test_index]

        model.fit(X_train, y_train)
        y_pred_probability = model.predict_proba(X_test)

        # Predict to generate classification report
        y_pred = model.predict(X_test)
        predicted_class.extend(y_pred)
        original_class.extend(y_test)

        # ROC curve and area of this fold, from the probability of the second class
        temp_fpr, temp_tpr, _ = metrics.roc_curve(y_test, y_pred_probability[:, 1])
        fpr.append(temp_fpr)
        tpr.append(temp_tpr)
        roc_auc.append(auc(temp_fpr, temp_tpr))

    # Total time spent in the cross-validation loop
    total_time = time() - start

    # Plot the graph
    plot_roc_curve(model, fpr, tpr, roc_auc, k)

    # Print the results
    print('>> CLASSIFICATION REPORT')
    print(classification_report(original_class, predicted_class))
    print('\n>> AUC')
    print(roc_auc)
    print('\n>> CONFUSION MATRIX')
    print(confusion_matrix(original_class, predicted_class))
    print('\n>> TOTAL TIME: ' + str(total_time) + ' seconds\n')



def confusion_matrix_plot(y, pred, normalized):
    """Draw the confusion matrix of ``pred`` against ``y``.

    Parameters
    ----------
    y : true labels.
    pred : predicted labels.
    normalized : forwarded as ``normalize`` to ``confusion_matrix``.
    """
    class_names = ['Normal', 'Attack']  # Tick labels of the matrix
    matrix = confusion_matrix(y, pred, normalize=normalized)
    matrix_display = ConfusionMatrixDisplay(matrix, display_labels=class_names)

    # 7x7 figure; the font size is set after the axes exist and before drawing
    _, axes = plt.subplots(figsize=(7,7))
    plt.rcParams.update({'font.size': 14})
    matrix_display.plot(ax=axes)

    # Replace the default axis titles
    matrix_display.ax_.set(xlabel='Predicted Label', ylabel='True Label')



def test_model(df, model, normalized='true'):
    """Evaluate an already fitted model on a labelled dataset.

    Prints the classification report of the predictions and draws the
    confusion matrix.

    Parameters
    ----------
    df : pandas.DataFrame with the features plus the 'class' label column.
    model : fitted estimator exposing ``predict``.
    normalized : normalization mode forwarded to ``confusion_matrix_plot``
        (default 'true'; pass None for raw counts).
    """
    # Dividing into X and y
    X_test, y_test = split_x_y(df)

    # Model report (a single prediction pass feeds both the report and the matrix)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Plot font size
    plt.rcParams.update({'font.size': 14})
    # Confusion matrix
    confusion_matrix_plot(y_test, y_pred, normalized)



def lgb_bayesian_optimization(init_points, n_iter):
    """Search LightGBM hyperparameters with Bayesian optimization.

    Maximizes the value returned by ``lightgbm_hyper_param`` over the ranges
    defined below.

    Parameters
    ----------
    init_points : forwarded to ``BayesianOptimization.maximize``.
    n_iter : forwarded to ``BayesianOptimization.maximize``.

    Returns
    -------
    BayesianOptimization
        The optimizer after the search, so the caller can inspect the outcome
        (e.g. through its ``max`` attribute) instead of copying values from the log.
    """
    # Optimization range of each hyperparameter
    pbounds = {
        'lambda_l1': (1e-8, 10.0),
        'lambda_l2': (1e-8, 10.0),
        'num_leaves': (2, 256),
        "feature_fraction": (0.4, 1.0),
        "bagging_fraction": (0.4, 1.0),
        "bagging_freq": (1, 7),
        "min_child_samples": (5, 100)
    }

    # Initializing Bayesian optimization
    optimizer = BayesianOptimization(
        f=lightgbm_hyper_param,
        pbounds=pbounds,
        random_state=RANDOM_STATE,
    )

    # Optimizing... (slow) 'take a nap'
    optimizer.maximize(init_points=init_points, n_iter=n_iter)

    return optimizer



def lightgbm_hyper_param(lambda_l1, lambda_l2, num_leaves, feature_fraction,
                         bagging_fraction, bagging_freq, min_child_samples):
    """Objective function for the Bayesian optimization of LightGBM.

    Runs a stratified, shuffled 5-fold ``lgb.cv`` with the given
    hyperparameters and returns the mean AUC of the last boosting round.
    Reads the module-level ``train_data`` (the ``lgb.Dataset`` to validate on)
    and ``cat_columns`` (names of the categorical features).

    The optimizer proposes floats for every parameter, so the integer-valued
    ones (``num_leaves``, ``bagging_freq``, ``min_child_samples``) are
    truncated with ``int``.
    """
    # Optimized hyperparameters
    params = {"boosting_type": "gbdt",
              "objective": "binary",
              "metric": "auc",
              "feature_pre_filter": False, # Set to false to search hyperparameters in min_child_samples
              "lambda_l1": lambda_l1,
              "lambda_l2": lambda_l2,
              "num_leaves": int(num_leaves),
              "feature_fraction": feature_fraction,
              "bagging_fraction": bagging_fraction,
              "bagging_freq": int(bagging_freq),
              "min_child_samples": int(min_child_samples),
              "categorical_feature": cat_columns,
              "verbosity": -1}

    # Getting the result of the iteration
    cv_result = lgb.cv(params, train_data, stratified=True, shuffle=True, nfold=5)

    # LightGBM >= 4.0 prefixes the validation metrics with 'valid ';
    # older releases use the bare metric name.
    auc_key = 'valid auc-mean' if 'valid auc-mean' in cv_result else 'auc-mean'

    # Mean AUC obtained
    return cv_result[auc_key][-1]

## Main

In [None]:
# Dataset path
dataset_path = '/content/drive/MyDrive/DoS_Detection/datasets/'

# Loading the processed training dataset
df_train = pd.read_csv(f'{dataset_path}train_dataset.csv', index_col=0)
# Loading the processed test dataset
df_test = pd.read_csv(f'{dataset_path}test_dataset.csv', index_col=0)

# Categorical columns: everything except the three columns below and the label.
# Built by walking the columns in order (instead of set arithmetic) so that the
# list has the same order on every run.
non_categorical = {'frame.len', 'radiotap.dbm_antsignal', 'wlan.duration', 'class'}
cat_columns = [column for column in df_train.columns if column not in non_categorical]

# Convert to 'category' type
df_train[cat_columns] = df_train[cat_columns].astype('category')
df_test[cat_columns] = df_test[cat_columns].astype('category')

In [None]:
# Recurrent columns in all sets
recurrent_columns = 'frame.len|radiotap.dbm_antsignal|wlan.duration|class'

# Pattern of each feature set (set 1 to set 4): the recurrent columns plus one extra field.
# NOTE(review): DataFrame.filter treats these as regular expressions applied with
# re.search, so '.' matches any character and a pattern also keeps any column that
# merely contains it - confirm this is intended.
set_patterns = [f'{recurrent_columns}|{extra_field}'
                for extra_field in ('wlan.fc.subtype', 'wlan.fc.type', 'wlan.fc.ds', 'wlan.fc.protected')]

# Build each set of columns in the training dataset
df_train_set1, df_train_set2, df_train_set3, df_train_set4 = [
    df_train.filter(regex=pattern) for pattern in set_patterns
]

# Build each set of columns in the test dataset
df_test_set1, df_test_set2, df_test_set3, df_test_set4 = [
    df_test.filter(regex=pattern) for pattern in set_patterns
]

### Training, Test and Validation

#### **All columns**

In [None]:
# Features and labels of the full training dataset
X, y = split_x_y(df=df_train)
# Class-balanced copy obtained by random undersampling
X_res, y_res = resampling_dataset(X=X, y=y)

Algorithms with the default hyperparameters:

In [None]:
# XGBoost: histogram tree method with categorical support enabled, silent
xgb_options = dict(enable_categorical=True, tree_method="hist", verbosity=0)
model = xgb.XGBClassifier(**xgb_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

In [None]:
# CatBoost: silent training, categorical columns passed by name
catboost_options = {'cat_features': cat_columns, 'verbose': False}
model = CatBoostClassifier(**catboost_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

In [None]:
# LightGBM: only the categorical columns are specified
lgbm_options = {'categorical_feature': cat_columns}
model = lgb.LGBMClassifier(**lgbm_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

Bayesian Optimization (BO) in LightGBM:

In [None]:
# Dataset that lightgbm_hyper_param reads as a global during the search
train_data = lgb.Dataset(data=X_res, label=y_res)
# Optimizing...
lgb_bayesian_optimization(n_iter=500, init_points=50)

In [None]:
# LightGBM with tuned hyperparameters; the integer-valued ones are
# truncated with int(), as in lightgbm_hyper_param
model = lgb.LGBMClassifier(bagging_fraction=0.9853,
                           bagging_freq=int(4.486),
                           feature_fraction=0.9675,
                           lambda_l1=0.3368,
                           lambda_l2=0.5105,
                           min_child_samples=int(82.14),
                           num_leaves=int(93.86),
                           categorical_feature=cat_columns)
# Training and test
classification_whit_report(model, X_res, y_res, k=5)

# Save model (the context manager closes the file even if pickling fails)
# NOTE(review): `model` was last fitted inside classification_whit_report on the
# training split of the final fold, not on all of X_res - confirm this is the
# model that should be saved.
with open('lgb_bo_all.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

Validate the model (with best AUC) in other data:

In [None]:
# Evaluate on the test dataset twice: first without normalizing the
# confusion matrix (None), then normalizing ('true')
for normalization in (None, 'true'):
    test_model(df_test, model, normalized=normalization)

#### **Set 1**

In [None]:
# Features and labels of training set 1
X, y = split_x_y(df=df_train_set1)
# Class-balanced copy obtained by random undersampling
X_res, y_res = resampling_dataset(X=X, y=y)
# Categorical columns present in this set
cat_columns = X_res.select_dtypes(include=['category']).columns.tolist()

Algorithms with the default hyperparameters:

In [None]:
# XGBoost: histogram tree method with categorical support enabled, silent
xgb_options = dict(enable_categorical=True, tree_method="hist", verbosity=0)
model = xgb.XGBClassifier(**xgb_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

In [None]:
# CatBoost: silent training, categorical columns passed by name
catboost_options = {'cat_features': cat_columns, 'verbose': False}
model = CatBoostClassifier(**catboost_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

In [None]:
# LightGBM: only the categorical columns are specified
lgbm_options = {'categorical_feature': cat_columns}
model = lgb.LGBMClassifier(**lgbm_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

Bayesian Optimization (BO) in LightGBM:

In [None]:
# Dataset that lightgbm_hyper_param reads as a global during the search
train_data = lgb.Dataset(data=X_res, label=y_res)
# Optimizing...
lgb_bayesian_optimization(n_iter=500, init_points=50)

In [None]:
# LightGBM with tuned hyperparameters; the integer-valued ones are
# truncated with int(), as in lightgbm_hyper_param
model = lgb.LGBMClassifier(bagging_fraction=0.7462,
                           bagging_freq=int(2.993),
                           feature_fraction=0.9239,
                           lambda_l1=0.1172,
                           lambda_l2=0.5827,
                           min_child_samples=int(78.15),
                           num_leaves=int(93.14),
                           categorical_feature=cat_columns)
# Training and test
classification_whit_report(model, X_res, y_res, k=5)

# Save model (the context manager closes the file even if pickling fails)
# NOTE(review): `model` was last fitted inside classification_whit_report on the
# training split of the final fold, not on all of X_res - confirm this is the
# model that should be saved.
with open('lgb_bo_set1.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

Validate the model (with best AUC) in other data:

In [None]:
# Evaluate on test set 1 twice: first without normalizing the
# confusion matrix (None), then normalizing ('true')
for normalization in (None, 'true'):
    test_model(df_test_set1, model, normalized=normalization)

#### **Set 2**

In [None]:
# Features and labels of training set 2
X, y = split_x_y(df=df_train_set2)
# Class-balanced copy obtained by random undersampling
X_res, y_res = resampling_dataset(X=X, y=y)
# Categorical columns present in this set
cat_columns = X_res.select_dtypes(include=['category']).columns.tolist()

Algorithms with the default hyperparameters:

In [None]:
# XGBoost: histogram tree method with categorical support enabled, silent
xgb_options = dict(enable_categorical=True, tree_method="hist", verbosity=0)
model = xgb.XGBClassifier(**xgb_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

In [None]:
# CatBoost: silent training, categorical columns passed by name
catboost_options = {'cat_features': cat_columns, 'verbose': False}
model = CatBoostClassifier(**catboost_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

In [None]:
# LightGBM: only the categorical columns are specified
lgbm_options = {'categorical_feature': cat_columns}
model = lgb.LGBMClassifier(**lgbm_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

Bayesian Optimization (BO) in LightGBM:

In [None]:
# Dataset that lightgbm_hyper_param reads as a global during the search
train_data = lgb.Dataset(data=X_res, label=y_res)
# Optimizing...
lgb_bayesian_optimization(n_iter=500, init_points=50)

In [None]:
# LightGBM with tuned hyperparameters; the integer-valued ones are
# truncated with int(), as in lightgbm_hyper_param
model = lgb.LGBMClassifier(bagging_fraction=0.8733,
                           bagging_freq=int(5.149),
                           feature_fraction=0.7795,
                           lambda_l1=0.2649,
                           lambda_l2=0.02365,
                           min_child_samples=int(28.74),
                           num_leaves=int(111.1),
                           categorical_feature=cat_columns)
# Training and test
classification_whit_report(model, X_res, y_res, k=5)

# Save model (the context manager closes the file even if pickling fails)
# NOTE(review): `model` was last fitted inside classification_whit_report on the
# training split of the final fold, not on all of X_res - confirm this is the
# model that should be saved.
with open('lgb_bo_set2.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

Validate the model (with best AUC) in other data:

In [None]:
# Evaluate on test set 2 twice: first without normalizing the
# confusion matrix (None), then normalizing ('true')
for normalization in (None, 'true'):
    test_model(df_test_set2, model, normalized=normalization)

#### **Set 3**

In [None]:
# Features and labels of training set 3
X, y = split_x_y(df=df_train_set3)
# Class-balanced copy obtained by random undersampling
X_res, y_res = resampling_dataset(X=X, y=y)
# Categorical columns present in this set
cat_columns = X_res.select_dtypes(include=['category']).columns.tolist()

Algorithms with the default hyperparameters:

In [None]:
# XGBoost: histogram tree method with categorical support enabled, silent
xgb_options = dict(enable_categorical=True, tree_method="hist", verbosity=0)
model = xgb.XGBClassifier(**xgb_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

In [None]:
# CatBoost: silent training, categorical columns passed by name
catboost_options = {'cat_features': cat_columns, 'verbose': False}
model = CatBoostClassifier(**catboost_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

In [None]:
# LightGBM: only the categorical columns are specified
lgbm_options = {'categorical_feature': cat_columns}
model = lgb.LGBMClassifier(**lgbm_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

Bayesian Optimization (BO) in LightGBM:

In [None]:
# Dataset that lightgbm_hyper_param reads as a global during the search
train_data = lgb.Dataset(data=X_res, label=y_res)
# Optimizing...
lgb_bayesian_optimization(n_iter=500, init_points=50)

In [None]:
# LightGBM with tuned hyperparameters; the integer-valued ones are
# truncated with int(), as in lightgbm_hyper_param
model = lgb.LGBMClassifier(bagging_fraction=0.9834,
                           bagging_freq=int(4.268),
                           feature_fraction=0.923,
                           lambda_l1=0.05423,
                           lambda_l2=2.109,
                           min_child_samples=int(29.59),
                           num_leaves=int(172.8),
                           categorical_feature=cat_columns)
# Training and test
classification_whit_report(model, X_res, y_res, k=5)

# Save model (the context manager closes the file even if pickling fails)
# NOTE(review): `model` was last fitted inside classification_whit_report on the
# training split of the final fold, not on all of X_res - confirm this is the
# model that should be saved.
with open('lgb_bo_set3.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

Validate the model (with best AUC) in other data:

In [None]:
# Evaluate on test set 3 twice: first without normalizing the
# confusion matrix (None), then normalizing ('true')
for normalization in (None, 'true'):
    test_model(df_test_set3, model, normalized=normalization)

#### **Set 4**

In [None]:
# Features and labels of training set 4
X, y = split_x_y(df=df_train_set4)
# Class-balanced copy obtained by random undersampling
X_res, y_res = resampling_dataset(X=X, y=y)
# Categorical columns present in this set
cat_columns = X_res.select_dtypes(include=['category']).columns.tolist()

Algorithms with the default hyperparameters:

In [None]:
# XGBoost: histogram tree method with categorical support enabled, silent
xgb_options = dict(enable_categorical=True, tree_method="hist", verbosity=0)
model = xgb.XGBClassifier(**xgb_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

In [None]:
# CatBoost: silent training, categorical columns passed by name
catboost_options = {'cat_features': cat_columns, 'verbose': False}
model = CatBoostClassifier(**catboost_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

In [None]:
# LightGBM: only the categorical columns are specified
lgbm_options = {'categorical_feature': cat_columns}
model = lgb.LGBMClassifier(**lgbm_options)
classification_whit_report(model, X=X_res, y=y_res, k=5)

Bayesian Optimization (BO) in LightGBM:

In [None]:
# Dataset that lightgbm_hyper_param reads as a global during the search
train_data = lgb.Dataset(data=X_res, label=y_res)
# Optimizing...
lgb_bayesian_optimization(n_iter=500, init_points=50)

In [None]:
# LightGBM with tuned hyperparameters; the integer-valued ones are
# truncated with int(), as in lightgbm_hyper_param
model = lgb.LGBMClassifier(bagging_fraction=0.6902,
                           bagging_freq=int(4.271),
                           feature_fraction=0.9717,
                           lambda_l1=0.07227,
                           lambda_l2=0.0807,
                           min_child_samples=int(32.52),
                           num_leaves=int(41.31),
                           categorical_feature=cat_columns)
# Training and test
classification_whit_report(model, X_res, y_res, k=5)

# Save model (the context manager closes the file even if pickling fails)
# NOTE(review): `model` was last fitted inside classification_whit_report on the
# training split of the final fold, not on all of X_res - confirm this is the
# model that should be saved.
with open('lgb_bo_set4.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

Validate the model (with best AUC) in other data:

In [None]:
# Evaluate on test set 4 twice: first without normalizing the
# confusion matrix (None), then normalizing ('true')
for normalization in (None, 'true'):
    test_model(df_test_set4, model, normalized=normalization)