# XGB Classifier with Filter-Based FS (Lexical)

Steven Sison | March 9, 2024

## Description

This document will be used to train a model using the reduced feature set obtained through filter-based feature selection. The model will be evaluated in terms of the usual metrics (accuracy, precision, F1-score, recall) as well as the training time. The model will also be stored for future evaluation purposes.

## Training the Model

### Preliminaries

#### 1. Loading the Dataset

In [None]:
import pandas as pd                     # For data transformation
import numpy as numpy                   # For scientific calculations
import seaborn as sns                   # For data visualizations
import matplotlib.pyplot as plt         # For plotting
import plotly.graph_objects as go       # For plotting
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay
from xgboost import XGBClassifier, DMatrix, train
from sklearn.pipeline import Pipeline
import time
from datetime import datetime
import joblib
import os
import optuna
from sklearn.metrics import mean_squared_error # or any other metric
from sklearn.model_selection import train_test_split

# Relative path of the CSV that holds the feature table used throughout this notebook.
DATASET_PATH = "../../../02_feature-engineering/final-datasets/binary_new_Bacud_unbalanced_lexical.csv"

# Read the whole file into a DataFrame; later cells split it into train and test sets.
dataset = pd.read_csv(DATASET_PATH)

# Preview the first rows to confirm the file loaded.
dataset.head()

In [None]:
# Show the first rows of the loaded DataFrame (five by default) as a quick sanity check.
dataset.head()

In [None]:
# Separate the label column from the predictors, then hold out 20% of the rows
# for testing. The fixed seed keeps the split reproducible between runs.
url_features = dataset.drop(columns=['url_type'])
url_labels = dataset['url_type']

x_train, x_test, y_train, y_test = train_test_split(
    url_features,
    url_labels,
    test_size=0.2,
    random_state=42,
)

#### 2. Preprocessing (Balancing)

In [None]:
# Count how many rows fall into each `url_type` class to inspect the class balance.
# NOTE(review): this cell only reports the counts; no resampling is applied here.
dataset['url_type'].value_counts()

#### 3. Removing Unnecessary Features

In [None]:
# Columns kept for the lexical model. The order of this list fixes the column
# order of the training and test matrices built from it.
important_features_lexical = [
    'url_host_length',
    'url_is_https',
    'url_ip_in_domain',
    'has_php_in_string',
    'url_number_of_parameters',
    'has_exe_in_string',
    'url_has_port',
    'url_is_digits_in_domain',
    'url_path_length',
    'url_num_question_mark',
    'url_query_length',
    'url_string_entropy',
    'url_num_periods',
    'get_tld',
    'url_scheme',
]

# Restrict both splits to the selected columns.
X_train_lexical = x_train[important_features_lexical]
X_test_lexical = x_test[important_features_lexical]

#### Hyper-parameter Optimization

In [None]:
# Define the objective function for Optuna
def objective_lexical(trial):
    """Score one Optuna trial of XGBoost hyper-parameters on a hold-out split.

    The training data is split 80/20 into a fitting set and a validation set.
    A booster is trained on the fitting set with early stopping (and Optuna
    pruning) driven by the validation error, and is then scored on the same
    validation set.

    Args:
        trial: The Optuna trial used to sample hyper-parameters and to report
            intermediate validation error to the pruner.

    Returns:
        The root mean squared error between the validation labels and the
        booster's predictions (lower is better).
    """
    # Search space. The number of boosting rounds is not tuned: it is capped by
    # `num_boost_round` below and cut short by early stopping. `n_estimators`
    # is a scikit-learn-wrapper argument, not a native `train` parameter, so it
    # is intentionally not passed here.
    param = {
        'objective': 'binary:hinge',
        'eval_metric': 'error',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
        'lambda': trial.suggest_float('lambda', 0.1, 10.0),
        'alpha': trial.suggest_float('alpha', 0.0, 10.0),
    }

    # Carve a validation set out of the training data so the real test split
    # stays untouched during tuning. The fixed seed gives every trial the
    # same split.
    train_data, valid_data, train_target, valid_target = train_test_split(
        X_train_lexical, y_train, test_size=0.2, random_state=42
    )

    # Convert the data into DMatrix format
    dtrain = DMatrix(train_data, label=train_target)
    dvalid = DMatrix(valid_data, label=valid_target)

    # Let Optuna prune unpromising trials based on the reported validation error
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-error')

    # Train with early stopping: stop once the validation error has not
    # improved for 100 consecutive rounds.
    model = train(
        param,
        dtrain,
        num_boost_round=100000,
        evals=[(dvalid, 'validation')],
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # Predict on the validation set (not the test set), using only the trees
    # up to the best early-stopping iteration so the score reflects the model
    # early stopping actually selected.
    y_pred = model.predict(dvalid, iteration_range=(0, model.best_iteration + 1))

    # Root mean squared error. The square root is taken explicitly because the
    # `squared=False` argument of `mean_squared_error` is deprecated and has
    # been removed in newer scikit-learn releases.
    error = mean_squared_error(valid_target, y_pred) ** 0.5

    return error

# Create an Optuna study that minimizes the value returned by the objective
study_lexical = optuna.create_study(direction='minimize')
study_lexical.optimize(objective_lexical, n_trials=100) # Control the number of trials

# Print the best hyperparameters and the best RMSE.
# The feature count in the label is taken from the selected-feature list so the
# message always matches the feature set that was actually tuned.
best_params_lexical = study_lexical.best_params
best_error_lexical = study_lexical.best_value
n_features_lexical = len(important_features_lexical)
print(f"Best Hyperparameters ({n_features_lexical} Features): ", best_params_lexical)
print(f"Best Error ({n_features_lexical} Features): ", best_error_lexical)

#### Model Training

In [None]:
from sklearn.model_selection import KFold
from xgboost import DMatrix, train

# 10-fold shuffled splitter with a fixed seed (not consumed by the training below).
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# The study only returns the sampled values, so the fixed objective and
# evaluation metric are added back before training.
best_params_lexical.update({
    'objective': 'binary:hinge',
    'eval_metric': 'error',
})

# Wrap the full training split and the held-out test split for the native XGBoost API.
lexical_train = DMatrix(X_train_lexical, label=y_train)
lexical_valid = DMatrix(X_test_lexical, label=y_test)

# Fit for a fixed 3000 boosting rounds, then predict on the held-out test split.
xgb_classifier_lexical = train(
    best_params_lexical,
    lexical_train,
    num_boost_round=3000,
)
y_pred_lexical = xgb_classifier_lexical.predict(lexical_valid)

print("Model training done.")

In [None]:
# Classification Report
# Prints scikit-learn's text report comparing the test labels with the
# predictions produced by the trained booster.
print(classification_report(y_test, y_pred_lexical))

#### Practical Evaluation

In [None]:
# Dumping the model
# Serializes the trained booster to 'xgb_filter_lexical.sav' in the current working directory.
# NOTE(review): joblib pickles the Python object; XGBoost's model-IO guide recommends
# `Booster.save_model` for a version-stable format — confirm what downstream loaders expect.
joblib.dump(xgb_classifier_lexical, 'xgb_filter_lexical.sav')

In [None]:
import lexical_generator_filter_lexical
import time

def xgb_predict_maliciousness(url):
    """Classify a single URL with the trained lexical XGBoost booster.

    Args:
        url: The URL string to turn into lexical features and classify.

    Returns:
        The label for the predicted class id ("Benign", "Malware", "Phishing"
        or "Defacement"), or None when the predicted value is not one of the
        known class ids.

    Raises:
        ValueError: If the booster does not return exactly one prediction.
    """
    # Class id -> label. NOTE(review): the booster in this notebook is trained
    # with the 'binary:hinge' objective, so only ids 0 and 1 are expected; the
    # remaining entries are kept so the mapping stays unchanged for callers.
    class_names = {
        0: "Benign",
        1: "Malware",
        2: "Phishing",
        3: "Defacement",
    }

    numerical_values = lexical_generator_filter_lexical.lexical_generator(url)
    numerical_values = DMatrix(numerical_values)

    # `predict` returns a NumPy array. Extract its single element as a Python
    # scalar instead of matching the array itself against integer literals,
    # which only works through the truthiness of a one-element comparison.
    predicted_class = xgb_classifier_lexical.predict(numerical_values).item()

    # A float such as 1.0 hashes and compares equal to the int key 1, so the
    # lookup matches the same values the literal comparison did.
    return class_names.get(predicted_class)

# Sample URL used for the latency check below.
url = "www.facebook.com/"
print(f"Current URL: {url}")

print("------------- Filter-Based (Lexical) -------------")
# Run the same prediction 15 times and print the wall-clock time of each call
# (feature generation plus model inference).
for i in range(15):
    start = time.perf_counter()
    prediction = xgb_predict_maliciousness(url)
    end = time.perf_counter()
    elapsed = end - start
    print(f"Trial {i}")
    print(prediction)
    print(elapsed)

#### Evaluation

In [None]:
# Disabled cell: the plotting code below is wrapped in a string literal, so none of it runs.
# NOTE(review): it refers to `y_pred` and `xgb_classifier`, which are not defined in the
# cells visible here — update the names before re-enabling.
'''# Confusion Matrix for 12 Features
cm_up = confusion_matrix(y_test, y_pred, labels=xgb_classifier.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm_up, display_labels = xgb_classifier.classes_)
disp.plot()
plt.show()'''

In [None]:
# Disabled cell: the cross-validation code below is wrapped in a string literal, so none of it runs.
# NOTE(review): it refers to `params_gbm` and `X_train`, which are not defined in the
# cells visible here — update the names before re-enabling.
'''# Cross Validation Score
scores = cross_val_score(XGBClassifier(**params_gbm),
                        X_train, y_train, scoring='accuracy', cv=cv).mean()

print(scores)'''