# Extreme Gradient Boosting Classifier

By Steven Sison on December 16, 2023

## Description

This document will be used for the preliminary training and evaluation of the extreme gradient boosting classifier. The document includes the necessary processes taken to train the model with the default hyperparameters. This also evaluates the performance of the classifier in terms of accuracy, precision, recall, F1-score, training time, and detection time. Furthermore, this document will only use lexical features and will observe the effect of increasing the number of features used in the model. As this is only for preliminary work, no optimizations, except a simple train-test validation, will be carried out.

### Preliminaries

In [None]:
import pandas as pd                     # For data transformation
import numpy as numpy                   # For scientific calculations
import seaborn as sns                   # For data visualizations
import matplotlib.pyplot as plt         # For plotting
import plotly.graph_objects as go       # For plotting
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
import time
from datetime import datetime
import joblib

# Load the dataset with the generated lexical features, then hold out 20% of
# the rows (fixed seed) for testing.
dataset = pd.read_csv("final_unbalanced_withLexical.csv")

features = dataset.drop(columns=['url_type'])   # every column except the label
labels = dataset['url_type']                    # class label column
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

## Model Training and Evaluation

### Base Model using All Lexical Features Generated

In [None]:
# Baseline model: XGBoost with its default hyperparameters, trained on every
# feature column of the unbalanced training split.
pipeline = XGBClassifier()

# Fit exactly once. A second identical fit() call only retrains the same
# model and doubles the training time without changing the result.
pipeline.fit(x_train, y_train)

In [None]:
# Predict the held-out rows with the baseline model and show the per-class
# precision / recall / F1 report.
y_pred = pipeline.predict(x_test)
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

In [None]:
# Confusion matrix of the baseline model; rows and columns follow the class
# order stored on the fitted classifier.
class_labels = pipeline.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
plt.show()

### Evaluating the Effect of Balanced and Unbalanced Datasets

In [None]:
# Rows per URL class in the full dataset (shown as the cell output).
url_type_counts = dataset['url_type'].value_counts()
url_type_counts

In [None]:
# Upsampling
#
# Balance the classes by resampling (with replacement) every smaller class up
# to the size of the largest one. Only the TRAINING rows are resampled: when
# duplicates are created before the train/test split, copies of the same row
# end up on both sides of the split and inflate every test metric.

from sklearn.utils import resample

LABEL_COLUMN = 'url_type'

# Hold out the test rows first, using the same test_size / random_state as the
# unbalanced split, so the held-out rows contain no resampled duplicates.
train_rows, test_rows = train_test_split(dataset, test_size=0.2, random_state=42)

train_class_counts = train_rows[LABEL_COLUMN].value_counts()
majority_size = int(train_class_counts.max())

balanced_parts = []
for label, count in train_class_counts.items():
    class_rows = train_rows[train_rows[LABEL_COLUMN] == label]
    if count < majority_size:
        class_rows = resample(class_rows,
                              replace=True,
                              n_samples=majority_size,
                              random_state=15)
    # The largest class is kept as-is: bootstrapping it to its own size would
    # only trade unique rows for duplicates.
    balanced_parts.append(class_rows)

# dataset_upsampled holds the balanced training rows only; shuffle it so the
# rows are not ordered by class.
dataset_upsampled = pd.concat(balanced_parts).sample(frac=1, random_state=42)

# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(dataset_upsampled[LABEL_COLUMN].value_counts())

x_up_train = dataset_upsampled.drop(columns=[LABEL_COLUMN])
y_up_train = dataset_upsampled[LABEL_COLUMN]
x_up_test = test_rows.drop(columns=[LABEL_COLUMN])
y_up_test = test_rows[LABEL_COLUMN]


In [None]:
# Default XGBoost classifier (wrapped in a one-step Pipeline) fitted on the
# upsampled training data.
upsampled_steps = [('classifier', XGBClassifier())]
pipeline_up = Pipeline(upsampled_steps)
pipeline_up.fit(x_up_train, y_up_train)

# Per-class metrics on the matching held-out split.
y_up_pred = pipeline_up.predict(x_up_test)
upsampled_report = classification_report(y_up_test, y_up_pred)
print(upsampled_report)

In [None]:
# Confusion matrix of the model trained on the upsampled data; rows and
# columns follow the class order exposed by the fitted pipeline.
up_class_labels = pipeline_up.classes_
cm_up = confusion_matrix(y_up_test, y_up_pred, labels=up_class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_up, display_labels=up_class_labels)
disp.plot()
plt.show()

In [None]:
from mlxtend.evaluate import bias_variance_decomp
from sklearn.metrics import zero_one_loss   # needed for the reference loss printed below

# Bias/variance decomposition of the 0-1 loss for a default XGBoost classifier
# on the upsampled train/test split (mlxtend expects NumPy arrays).
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        XGBClassifier(),
        x_up_train.to_numpy(), y_up_train.to_numpy(),
        x_up_test.to_numpy(), y_up_test.to_numpy(),
        loss='0-1_loss',
        random_seed=42)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

# Reference value from a single fit: it must use the predictions of the model
# trained on the upsampled data (y_up_pred), scored on the same test rows the
# decomposition used -- not the predictions of the unbalanced baseline.
print('Sklearn 0-1 loss: %.3f' % zero_one_loss(y_up_test, y_up_pred))

### Evaluating the Effect of Adding more Lexical Features

In [None]:
# Train one model per feature budget (first 25, 50 and 75 columns), save it,
# and collect its class-weighted metrics on the held-out split.
FEATURE_STEP = 25   # number of feature columns added per run
results = []

for step in range(1, 4):
    n_features = FEATURE_STEP * step

    # A separate local model, so the baseline `pipeline` fitted earlier in the
    # notebook is not overwritten by this experiment.
    model = XGBClassifier()
    model.fit(x_up_train.iloc[:, :n_features], y_up_train)

    model.save_model('xgb_lexical_{}.json'.format(n_features))

    url_type_predict = model.predict(x_up_test.iloc[:, :n_features])

    accuracy = accuracy_score(y_up_test, url_type_predict)
    recall = recall_score(y_up_test, url_type_predict, average = 'weighted')
    precision = precision_score(y_up_test, url_type_predict, average = 'weighted', zero_division=1)
    f1 = f1_score(y_up_test, url_type_predict, average = 'weighted')

    # Record the number of features that was actually used for this run
    # (the same value that names the saved model file).
    results.append((n_features, accuracy, recall, precision, f1))

In [None]:
# Turn the collected per-run tuples into a table ordered by feature count.
metric_columns = ['Number of Features', 'Accuracy', 'Recall', 'Precision', 'F1-Score']
results = pd.DataFrame(results, columns=metric_columns).sort_values(
    by='Number of Features',
    ascending=True,
)
print(results)

Observations:
- Increasing the number of features improves all the class-weighted metrics of the model, at the cost of a higher training time.

## Model Optimizations

### Sensitivity Test for K (No Hyperparameter Tuning but using Balanced Dataset)

In [None]:
from numpy import mean
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from matplotlib import pyplot

def evaluating_model(cv):
    """Cross-validate a default XGBClassifier on ``dataset_upsampled``.

    Args:
        cv: Value forwarded to the ``cv`` argument of ``cross_val_score``
            (for example a ``KFold`` or ``LeaveOneOut`` instance).

    Returns:
        Tuple ``(mean, minimum, maximum)`` of the per-fold accuracy scores.
    """
    features = dataset_upsampled.drop(columns=['url_type'])
    target = dataset_upsampled['url_type']

    fold_scores = cross_val_score(
        XGBClassifier(),
        features,
        target,
        scoring="accuracy",
        cv=cv,
        n_jobs=1,
    )
    return mean(fold_scores), fold_scores.min(), fold_scores.max()

# Getting the Ideal Score
# (leave-one-out reference run; disabled by keeping it inside a string literal)
'''ideal, _, _ = evaluating_model(LeaveOneOut())
print('Ideal: %.3f' % ideal)'''

# Fold counts to compare; range(10,11) means only k=10 is evaluated.
folds = range(10,11)

# One (mean, distance below mean, distance above mean) tuple per fold count.
fold_stats = []
for n_splits in folds:
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    fold_mean, fold_min, fold_max = evaluating_model(splitter)
    print('> folds=%d, accuracy=%.3f (%.3f,%.3f)' % (n_splits, fold_mean, fold_min, fold_max))
    fold_stats.append((fold_mean, fold_mean - fold_min, fold_max - fold_mean))

# Error-bar lengths are measured relative to the mean, which is the form
# errorbar() takes for asymmetric bars.
means = [stats[0] for stats in fold_stats]
mins = [stats[1] for stats in fold_stats]
maxs = [stats[2] for stats in fold_stats]

# Mean accuracy per fold count, with min/max error bars.
pyplot.errorbar(folds, means, yerr=[mins, maxs], fmt='o')
# Ideal-case reference line (disabled together with the leave-one-out run):
# pyplot.plot(folds, [ideal for _ in range(len(folds))], color='r')
pyplot.show()


### Hyperparameter Optimization (Bayesian Optimization) and Cross-Validation Setup

In [None]:
from bayes_opt import BayesianOptimization

def gbm_cl_bo(colsample_bylevel, colsample_bytree, max_depth, learning_rate, n_estimators, reg_alpha, reg_lambda):
    """Objective for the Bayesian optimizer: mean 5-fold CV accuracy.

    Builds an ``XGBClassifier`` (``random_state=123``) from the proposed
    hyperparameters and cross-validates it on ``x_up_train`` / ``y_up_train``.

    Args:
        colsample_bylevel: Column subsample ratio per tree level.
        colsample_bytree: Column subsample ratio per tree.
        max_depth: Maximum tree depth; rounded to the nearest integer.
        learning_rate: Boosting learning rate.
        n_estimators: Number of boosting rounds; rounded to the nearest integer.
        reg_alpha: L1 regularization weight.
        reg_lambda: L2 regularization weight.

    Returns:
        Mean accuracy over the 5 cross-validation folds.
    """
    params = {
        'colsample_bylevel': colsample_bylevel,
        'colsample_bytree': colsample_bytree,
        # The search bounds are continuous, so the integer-valued settings are
        # rounded; int() guarantees a plain Python int even when round()
        # hands back a NumPy scalar.
        'max_depth': int(round(max_depth)),
        'learning_rate': learning_rate,
        'n_estimators': int(round(n_estimators)),
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
    }
    model = XGBClassifier(random_state=123, **params)
    fold_scores = cross_val_score(model, x_up_train, y_up_train, scoring='accuracy', cv=5)
    # A single mean over the folds is the score to maximize.
    return fold_scores.mean()
# Run Bayesian Optimization
start = time.time()

# Search bounds (low, high) for each argument of gbm_cl_bo.
params_gbm = {
    'colsample_bylevel': (0, 1),
    'colsample_bytree': (0, 1),
    'max_depth': (3, 10),
    'learning_rate': (0.01, 1),
    'n_estimators': (80, 150),
    'reg_alpha': (0, 5),
    'reg_lambda': (0, 5),
}

gbm_bo = BayesianOptimization(f=gbm_cl_bo, pbounds=params_gbm, random_state=111)
gbm_bo.maximize(init_points=20, n_iter=4)

# Wall-clock duration of the whole search, in minutes.
elapsed_minutes = (time.time() - start) / 60
print('It takes %s minutes' % elapsed_minutes)

### Hyperparameter Optimization using Optuna

In [None]:
import optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error # or any other metric
from sklearn.model_selection import train_test_split

# Tune XGBoost with Optuna.
#
# url_type is a class label, so the booster is trained as a multi-class
# classifier and scored with multi-class log loss. A regression objective
# scored with RMSE would treat the class ids as points on a numeric scale.
#
# NOTE(review): this tunes on the unbalanced x_train / y_train, while the
# Bayesian-optimization section uses x_up_train / y_up_train -- confirm which
# split is intended.

MAX_BOOST_ROUNDS = 100000       # upper bound only; early stopping decides the real count
EARLY_STOPPING_ROUNDS = 100

# multi:softprob needs the number of classes. Labels are assumed to be
# integer-coded 0..K-1 (the upsampling cell filters on url_type == 0..3).
NUM_CLASSES = int(y_train.max()) + 1

# Split the training data into further training and validation sets. The split
# (fixed seed) and the DMatrix objects are identical for every trial, so they
# are built once instead of once per trial.
train_data, valid_data, train_target, valid_target = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
dtrain = xgb.DMatrix(train_data, label=train_target)
dvalid = xgb.DMatrix(valid_data, label=valid_target)


# Define the objective function for Optuna
def objective(trial):
    """Train one booster with the trial's hyperparameters.

    Args:
        trial: Optuna trial used to sample the hyperparameters and to report
            intermediate validation scores for pruning.

    Returns:
        The best validation multi-class log loss reached before early
        stopping (lower is better).
    """
    # Define the search space for hyperparameters.
    # The number of boosting rounds is not searched: it is capped by
    # MAX_BOOST_ROUNDS and chosen by early stopping.
    param = {
        'objective': 'multi:softprob',
        'num_class': NUM_CLASSES,
        'eval_metric': 'mlogloss',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
        'lambda': trial.suggest_float('lambda', 0.1, 10.0),
        'alpha': trial.suggest_float('alpha', 0.0, 10.0),
    }

    # Pruning callback: lets Optuna stop unpromising trials early. The observed
    # key is "<eval set name>-<eval metric>".
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-mlogloss')

    # Train the model with early stopping on the validation set.
    model = xgb.train(
        param,
        dtrain,
        num_boost_round=MAX_BOOST_ROUNDS,
        evals=[(dvalid, 'validation')],
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        callbacks=[pruning_callback],
        verbose_eval=False,     # avoid one log line per boosting round per trial
    )

    # best_score is the validation metric at the best iteration, so the value
    # is not degraded by the extra rounds trained before stopping triggered.
    return model.best_score


# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100) # Control the number of trials

# Print the best hyperparameters and the best validation log loss
best_params = study.best_params
best_mlogloss = study.best_value
print("Best Hyperparameters: ", best_params)
print("Best validation mlogloss: ", best_mlogloss)

#---------------------------------------------------------------------#
# You can also tune for multiple metrics. See here: https://stackoverflow.com/questions/69071684/how-to-optimize-for-multiple-metrics-in-optuna