In [None]:
# Imports
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import pickle
import warnings
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score, log_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from time import time

# Silence every warning for the rest of the notebook session
warnings.filterwarnings(action="ignore")
# Default seed handed to the estimators that accept a random_state
RANDOM_STATE = 42

In [None]:
# Functions

def split_x_y(df):
    """Separate a dataset into its feature columns and its label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains a ``'class'`` column holding the labels.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the ``'class'`` column and
        ``y`` is the ``'class'`` column itself.
    """
    # Boolean mask that keeps every column except the label column
    is_feature = df.columns != 'class'
    features = df.loc[:, is_feature]
    labels = df.loc[:, 'class']
    return features, labels



def plot_confusion_matrix(y_true, y_pred, class_names, save_figure=False, reports_path=None, file_name=None):
    """Draw the confusion matrix of a set of predictions and optionally save it as a PNG.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, forwarded to ``confusion_matrix``.
    class_names : sequence of str
        Labels shown on the axes of the plot.
    save_figure : bool, default False
        When true, the figure is written to
        ``reports_path + 'images/' + file_name + '.png'``.
    reports_path : str, optional
        Path prefix of the reports location; required when ``save_figure`` is true.
    file_name : str, optional
        Image name without extension; required when ``save_figure`` is true.

    Raises
    ------
    ValueError
        If ``save_figure`` is true but ``reports_path`` or ``file_name`` is missing.
    """
    import os

    # Check the destination first so a missing argument is reported clearly
    # instead of surfacing later as a ``None + str`` TypeError.
    if save_figure and (reports_path is None or file_name is None):
        raise ValueError("reports_path and file_name are required when save_figure is True")

    # Get confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Confusion matrix display
    cm_p = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig, ax = plt.subplots(figsize=(10, 10))  # Plot size
    plt.rc('font', **{'size': 14})  # Setting font size (global matplotlib setting)
    cm_p.plot(ax=ax)  # Draw the matrix on the prepared axes

    if save_figure:
        target = reports_path + 'images/' + file_name + '.png'
        # Create the images directory on demand so the first run does not fail
        target_dir = os.path.dirname(target)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        # Save through the figure object rather than the implicit "current figure"
        fig.savefig(target)
        # In non-interactive (batch) mode the figure is not going to be shown,
        # so release it once saved; otherwise figures pile up in memory when
        # many reports are generated in a loop.
        if not plt.isinteractive():
            plt.close(fig)



def _take_rows(values, positions):
    """Select entries by integer position from a pandas object or an array."""
    if hasattr(values, 'iloc'):
        return values.iloc[positions]
    return values[positions]


def classification_with_report(model, X, y, k, class_names, save_report=False,
                               reports_path=None, file_name=None, verbose=True):
    """Cross-validate ``model`` with stratified k-fold and report the pooled results.

    The out-of-fold predictions of all folds are pooled, and the classification
    report, macro F1, weighted one-vs-rest ROC AUC and log loss are computed on
    the pooled predictions.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. The same
        instance is refitted on every fold.
    X : pandas.DataFrame
        Feature table (rows are selected with ``.iloc``).
    y : pandas.Series or array
        Labels aligned row by row with ``X``.
    k : int
        Number of stratified folds.
    class_names : sequence of str
        Class names used in the report and the confusion matrix.
    save_report : bool, default False
        When true, append the report to ``reports_path + file_name + '.txt'``
        and save the confusion matrix image.
    reports_path : str, optional
        Path prefix of the reports location; required when ``save_report`` is true.
    file_name : str, optional
        Report name without extension; required when ``save_report`` is true.
    verbose : bool, default True
        When true, print the results and plot the confusion matrix.

    Returns
    -------
    estimator
        ``model`` as left by the last ``fit`` call, i.e. fitted on the training
        part of the final fold only (not on the whole of ``X``).

    Raises
    ------
    ValueError
        If ``save_report`` is true but ``reports_path`` or ``file_name`` is missing.
    """
    # Fail before the (potentially long) cross-validation rather than after it.
    if save_report and (reports_path is None or file_name is None):
        raise ValueError("reports_path and file_name are required when save_report is True")

    # Pooled out-of-fold results
    predicted_class = []
    predicted_proba = []
    original_class = []

    # Stratified K-Folds cross-validator
    skf = StratifiedKFold(k)

    start = time()  # Time counting

    # Train and validate the model for each of the 'k' folds
    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        # The split yields integer positions, so select labels by position;
        # plain ``y[...]`` on a Series would look the numbers up as index labels.
        y_train, y_val = _take_rows(y, train_index), _take_rows(y, val_index)

        # Train model
        model.fit(X_train, y_train)

        # Class probabilities for the validation fold
        predicted_proba.extend(model.predict_proba(X_val))

        # Hard predictions for the classification report
        predicted_class.extend(model.predict(X_val))
        original_class.extend(y_val)

    # Elapsed time of the whole loop (fitting and predicting on every fold)
    total_time = time() - start

    roc_auc = roc_auc_score(  # Calculate ROC AUC
        original_class,
        predicted_proba,
        average="weighted",
        multi_class="ovr")

    if verbose or save_report:
        # Compute every metric once and reuse it for both console and file output
        report_text = classification_report(original_class, predicted_class,
                                            target_names=class_names, digits=5)
        f1_text = 'F1-Score: ' + str(round(f1_score(original_class, predicted_class, average='macro'), 5))
        auc_text = 'AUC: ' + str(round(roc_auc, 5))
        loss_text = 'Log Loss: ' + str(round(log_loss(original_class, predicted_proba), 5))
        time_text = 'Total Time: ' + str(round(total_time, 5)) + ' seconds'

    if verbose:  # Show results
        print('\t\t\tClassification Report\n\n')
        print(report_text)
        print(f1_text)
        print(auc_text)
        print(loss_text)
        print(time_text)
        print('Confusion Matrix:\n')
        plot_confusion_matrix(original_class, predicted_class, class_names)

    if save_report:  # Save report
        # file_name (without extension); the report is appended to an existing file
        with open(reports_path + file_name + '.txt', 'a+') as f:
            f.write(type(model).__name__ + '\n ')
            f.write(str(model))
            f.write('\n\n\n')
            f.write('\t\t\tClassification Report\n\n')
            f.write(report_text)
            f.write('\n\n' + f1_text)
            f.write('\n' + auc_text)
            f.write('\n' + loss_text)
            f.write('\n' + time_text)

        # Plot after the text file has been closed; the two outputs are independent
        plot_confusion_matrix(original_class, predicted_class, class_names,
                              save_figure=True, reports_path=reports_path, file_name=file_name)

    return model  # Estimator as fitted on the last fold



def objective(trial, X, y, k):
    """Optuna objective: mean macro F1 of a LightGBM classifier over ``k`` stratified folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X : pandas.DataFrame
        Feature table (rows are selected with ``.iloc``).
    y : pandas.Series
        Labels aligned row by row with ``X``.
    k : int
        Number of stratified folds.

    Returns
    -------
    float
        Mean of the per-fold macro F1 scores. Each fold's validation split is
        also the early-stopping evaluation set, so the score is measured on
        the same data that decides when training stops.
    """
    params = {  # Fixed settings plus the hyperparameters that will be optimized
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_class': y.nunique(),
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
    }

    # Stratified K-Folds cross-validator
    skf = StratifiedKFold(k)
    # One score per fold; a list always matches the real number of folds,
    # unlike a buffer pre-sized to a fixed length.
    scores = []

    # Train and validate the model for each of the 'k' folds
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        # The split yields integer positions, so select the labels by position
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)  # Start model
        # Early stopping is requested through a callback; the
        # ``early_stopping_rounds`` fit argument was removed in LightGBM 4.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
        )

        # Score the fold with the macro F1 of its predictions
        preds = model.predict(X_val)
        scores.append(f1_score(y_val, preds, average='macro'))

    return np.mean(scores)



def report_model(model, X, y, class_names):
    """Print a classification report for ``model`` on ``(X, y)`` and plot its confusion matrix.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    X : array-like
        Features to predict on.
    y : array-like
        True labels for ``X``.
    class_names : sequence of str
        Class names used in the report and the confusion matrix.
    """
    predictions = model.predict(X)

    # Textual results
    print('\t\t\tClassification Report\n\n')
    report_text = classification_report(y, predictions, target_names=class_names, digits=5)
    print(report_text)
    macro_f1 = round((f1_score(y, predictions, average='macro')), 5)
    print('F1-Score: ' + str(macro_f1))

    # Graphical results
    print('\nConfusion Matrix:\n')
    plot_confusion_matrix(y, predictions, class_names)



def save_object(obj, file_name):
    """Pickle ``obj`` into ``<file_name>.pkl``.

    Parameters
    ----------
    obj : object
        Any picklable object.
    file_name : str
        Destination path without the ``.pkl`` extension.
    """
    target = f'{file_name}.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle)


        
def load_object(file_name):
    """Load and return the object pickled in ``<file_name>.pkl``.

    Parameters
    ----------
    file_name : str
        Source path without the ``.pkl`` extension.

    Returns
    -------
    object
        The unpickled object.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this notebook or another trusted source.
    with open(f'{file_name}.pkl', 'rb') as handle:
        return pickle.load(handle)



def lgb_plot_importance(booster, figsize, **kwargs):
    """Plot LightGBM feature importances on a new figure of the given size.

    Parameters
    ----------
    booster : object
        Model forwarded to ``lgb.plot_importance`` as ``booster``.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    **kwargs
        Extra keyword arguments forwarded to ``lgb.plot_importance``.

    Returns
    -------
    object
        Whatever ``lgb.plot_importance`` returns.
    """
    # Only the axes are needed; the figure handle is not used afterwards
    _, axis = plt.subplots(nrows=1, ncols=1, figsize=figsize)

    # Bar chart with the importance of each feature, drawn on the prepared axes
    importance_plot = lgb.plot_importance(booster=booster, ax=axis, **kwargs)
    return importance_plot

In [None]:
# Main

# NOTE(review): both locations are absolute paths on one machine; adjust them
# (or derive them from a configurable base directory) before running elsewhere.
# Location of the WSN-DS CSV files
datasets_path = '/home/leandro/remy-project/centralized/datasets/WSN-DS/'
# Location where the reports are written
reports_path = '/home/leandro/remy-project/centralized/network_attacks/reports/'

# Class names handed to the report and confusion-matrix helpers
class_names = ["Normal", "Grayhole", "Blackhole", "Flooding"]

# Read the held-out test set
test_csv = f'{datasets_path}test_data.csv'
df_test = pd.read_csv(test_csv)

# Features and labels of the test set
X_test, y_test = split_x_y(df_test)

In [None]:
# Compare the default algorithms on every version of the training data.
# Figure display is switched off once for the whole batch; the try/finally
# guarantees it is switched back on even if one of the runs fails.
plt.ioff()  # Turn off plot show
try:
    for dataset_type in ['none', 'ros', 'sm', 'k-sm', 'gans']:
        # Load dataset, and split into X and y
        X_train, y_train = split_x_y(pd.read_csv(f'{datasets_path}balanced/data_{dataset_type}.csv'))

        # One freshly configured estimator per algorithm (all with default hyperparameters)
        models = {
            'dt': DecisionTreeClassifier(random_state=RANDOM_STATE),    # Decision Tree
            'rf': RandomForestClassifier(random_state=RANDOM_STATE),    # Random Forest
            'nb': GaussianNB(),                                         # Gaussian Naive Bayes
            'mlp': MLPClassifier(random_state=RANDOM_STATE),            # Multi-Layer Perceptron
            'xgb': xgb.XGBClassifier(verbosity=0, random_state=RANDOM_STATE),              # XGBoost
            'lgb': lgb.LGBMClassifier(random_state=RANDOM_STATE, objective='multiclass'),  # LightGBM
        }

        # Train and validate each algorithm on the current dataset
        for model_acronym, estimator in models.items():
            classification_with_report(estimator, X_train, y_train, 5, class_names,
                                       save_report=True, reports_path=reports_path,
                                       file_name=f'{dataset_type}-{model_acronym}', verbose=False)
            # The confusion matrix has already been written to disk; close the
            # figure so 30 runs do not accumulate 30 open figures in memory.
            plt.close('all')
finally:
    plt.ion()  # Turn on plot show

In [None]:
# Use data balanced with GANs to optimize LightGBM
X_train, y_train = split_x_y(pd.read_csv(f'{datasets_path}balanced/data_gans.csv'))

# Optimize LightGBM hyperparameters.
# The sampler is seeded so the search draws from a fixed random stream.
# NOTE(review): with n_jobs=-1 the trials run in parallel, so the order in
# which they consume that stream can still differ between runs; use n_jobs=1
# if fully repeatable results are required.
study = optuna.create_study(
    direction="maximize",
    study_name="lightgbm",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)


def func(trial):
    """Bind the training data and the fold count to the Optuna objective."""
    return objective(trial, X_train, y_train, 5)


study.optimize(func, n_trials=10, n_jobs=-1)

# Save the best parameters in a .pkl file
save_object(study.best_params, 'best_paramns')

In [None]:
# Load the best parameters
best_params = load_object('best_paramns')
# LightGBM (optimized)
model = lgb.LGBMClassifier(**best_params)
# Cross-validated report of the optimized configuration
model = classification_with_report(model, X_train, y_train, 5, class_names,
                                   save_report=True, reports_path=reports_path,
                                   file_name='gans-lgb-o', verbose=False)
# classification_with_report returns the estimator as fitted inside its last
# fold, i.e. on only part of X_train. Refit on the complete training set so
# the model that gets saved and later tested has seen all the training data.
model.fit(X_train, y_train)
# Save optimized LightGBM model
save_object(model, 'model')

In [None]:
# Restore the optimized LightGBM model from 'model.pkl'
model = load_object(file_name='model')

In [None]:
# Feature importances of the trained model on a 10x10 figure
lgb_plot_importance(booster=model, figsize=(10, 10))

In [None]:
# Evaluate the optimized LightGBM model on the held-out test data
report_model(model=model, X=X_test, y=y_test, class_names=class_names)

In [None]:
# Next Step: Test Federated Learning with the Flower framework
# https://flower.dev/