# XGB Classifier with Wrapper-Based FS

Steven Sison | March 9, 2024

## Description

This document is used to train a model on the reduced feature set obtained with the wrapper-based method, forward feature selection. The model will be evaluated in terms of the usual metrics (accuracy, precision, F1-score, recall) as well as the training time. The model will also be stored for future evaluation purposes.

## Training the Model

### Preliminaries

#### 1. Loading the Dataset

In [None]:
import pandas as pd                     # For data transformation
import numpy as numpy                   # For scientific calculations
import seaborn as sns                   # For data visualizations
import matplotlib.pyplot as plt         # For plotting
import plotly.graph_objects as go       # For plotting
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay
from xgboost import XGBClassifier, DMatrix, train
from sklearn.pipeline import Pipeline
import time
from datetime import datetime
import joblib
import os

# Read the unbalanced binary lexical dataset from the feature-engineering output folder.
dataset = pd.read_csv(
    "../../../02_feature-engineering/final-datasets/binary_new_Bacud_unbalanced_lexical.csv"
)

# Preview the first rows to confirm the load worked.
dataset.head()



In [None]:
# Preview the first rows of the loaded dataset (same call as at the end of the loading cell).
dataset.head()

In [None]:
# Hold out 20% of the rows for testing; `url_type` is the label column and the
# fixed seed keeps the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    dataset.drop(columns=['url_type']),
    dataset['url_type'],
    random_state=42,
    test_size=0.2,
)

#### 2. Preprocessing (Balancing)

In [None]:
# Show how many rows fall into each `url_type` class. This cell only inspects
# the class distribution; no resampling or rebalancing is performed here.
dataset['url_type'].value_counts()

#### 3. Removing Unnecessary Features

In [None]:
# Column names kept for the larger reduced feature set.
# NOTE(review): the variable name says 33, but this list contains 34 entries.
# Confirm which column (if any) should be dropped, or rename the variable.
important_features_wrapper_33 = ['url_length',
 'url_domain_entropy',
 'url_is_digits_in_domain',
 'url_number_of_parameters',
 'url_number_of_digits',
 'url_string_entropy',
 'url_path_length',
 'url_host_length',
 'get_tld',
 'url_domain_len',
 'url_num_subdomain',
 'url_number_of_fragments',
 'url_is_encoded',
 'url_number_of_letters',
 'url_num_periods',
 'url_num_of_hyphens',
 'url_num_underscore',
 'url_num_forward_slash',
 'url_num_semicolon',
 'url_num_mod_sign',
 'has_login_in_string',
 'has_signin_in_string',
 'has_logon_in_string',
 'has_loginasp_in_string',
 'has_exe_in_string',
 'has_viewerphp_in_string',
 'has_getImageasp_in_string',
 'has_paypal_in_string',
 'has_dbsysphp_in_string',
 'has_shopping_in_string',
 'has_php_in_string',
 'has_bin_in_string',
 'has_personal_in_string',
 'url_scheme'
 ]

# Column names kept for the smaller reduced feature set.
# NOTE(review): the variable name says 12, but this list contains 17 entries.
# Confirm the intended selection, or rename the variable.
important_features_wrapper_12 = ['url_domain_entropy', 
                              'url_number_of_parameters', 
                              'url_number_of_digits', 
                              'url_path_length', 
                              'url_host_length', 
                              'get_tld', 
                              'url_domain_len', 
                              'url_num_subdomain', 
                              'url_number_of_letters', 
                              'url_num_periods', 
                              'url_num_of_hyphens', 
                              'url_num_forward_slash', 
                              'url_num_semicolon', 
                              'has_login_in_string', 
                              'has_exe_in_string', 
                              'has_php_in_string', 
                              'url_scheme']

# Restrict the train and test splits to each selected column list.
X_train_12 = x_train[important_features_wrapper_12]
X_test_12 = x_test[important_features_wrapper_12]

X_train_33 = x_train[important_features_wrapper_33]
X_test_33 = x_test[important_features_wrapper_33]

# Preview the reduced test frame for the larger feature set.
X_test_33.head()

#### Hyper-parameter Tuning

##### 12 Features (Purely Lexical)

In [None]:
import optuna
from sklearn.metrics import mean_squared_error # or any other metric
from sklearn.model_selection import train_test_split

# Define the objective function for Optuna
def objective_12(trial):
    """Optuna objective for the XGBoost model built on ``X_train_12``.

    Samples one hyperparameter configuration from ``trial``, trains a booster
    on an 80/20 split of ``X_train_12`` / ``y_train`` with early stopping, and
    scores it on the held-out 20%.

    Args:
        trial: Optuna trial used to sample hyperparameters. It is also handed
            to the pruning callback, which reports the validation error so
            that unpromising trials can be stopped early.

    Returns:
        Root mean squared error of the booster's predictions on the
        validation split (lower is better).
    """
    # Search space. The number of boosting rounds is intentionally not part of
    # it: the native `train()` API takes it through `num_boost_round`, and
    # early stopping decides how many rounds are actually used.
    param = {
        'objective': 'binary:hinge',
        'eval_metric': 'error',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
        'lambda': trial.suggest_float('lambda', 0.1, 10.0),
        'alpha': trial.suggest_float('alpha', 0.0, 10.0),
    }

    # Carve a validation set out of the training data so the outer test split
    # stays untouched during tuning. The fixed seed gives every trial the same split.
    train_data, valid_data, train_target, valid_target = train_test_split(
        X_train_12, y_train, test_size=0.2, random_state=42
    )

    # Convert the data into DMatrix format
    dtrain = DMatrix(train_data, label=train_target)
    dvalid = DMatrix(valid_data, label=valid_target)

    # 'validation-error' = evaluation-set name ('validation') + eval_metric ('error')
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-error')

    # Train with a very high round cap and let early stopping pick the real count
    model = train(
        param,
        dtrain,
        num_boost_round=100000,
        evals=[(dvalid, 'validation')],
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # Score on the validation split; the DMatrix built above is reused
    y_pred = model.predict(dvalid)

    # RMSE computed as sqrt(MSE) so the code does not depend on the
    # `squared=` keyword of `mean_squared_error`.
    error = mean_squared_error(valid_target, y_pred) ** 0.5

    return error

# Run the hyperparameter search for objective_12: minimise its return value over 100 trials
study_12 = optuna.create_study(direction='minimize')
study_12.optimize(objective_12, n_trials=100)

# Keep the winning configuration and its objective value, then report both
best_params_12, best_error = study_12.best_params, study_12.best_value
print("Best Hyperparameters (12 Features): ", best_params_12)
print("Best Error (12 Features): ", best_error)

##### 33 Features (Purely Lexical)

In [None]:
# Define the objective function for Optuna
def objective_33(trial):
    """Optuna objective for the XGBoost model built on ``X_train_33``.

    Samples one hyperparameter configuration from ``trial``, trains a booster
    on an 80/20 split of ``X_train_33`` / ``y_train`` with early stopping, and
    scores it on the held-out 20%.

    Args:
        trial: Optuna trial used to sample hyperparameters. It is also handed
            to the pruning callback, which reports the validation error so
            that unpromising trials can be stopped early.

    Returns:
        Root mean squared error of the booster's predictions on the
        validation split (lower is better).
    """
    # Search space. The number of boosting rounds is intentionally not part of
    # it: the native `train()` API takes it through `num_boost_round`, and
    # early stopping decides how many rounds are actually used.
    param = {
        'objective': 'binary:hinge',
        'eval_metric': 'error',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
        'lambda': trial.suggest_float('lambda', 0.1, 10.0),
        'alpha': trial.suggest_float('alpha', 0.0, 10.0),
    }

    # Carve a validation set out of the training data so the outer test split
    # stays untouched during tuning. The fixed seed gives every trial the same split.
    train_data, valid_data, train_target, valid_target = train_test_split(
        X_train_33, y_train, test_size=0.2, random_state=42
    )

    # Convert the data into DMatrix format
    dtrain = DMatrix(train_data, label=train_target)
    dvalid = DMatrix(valid_data, label=valid_target)

    # 'validation-error' = evaluation-set name ('validation') + eval_metric ('error')
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-error')

    # Train with a very high round cap and let early stopping pick the real count
    model = train(
        param,
        dtrain,
        num_boost_round=100000,
        evals=[(dvalid, 'validation')],
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # Score on the validation split; the DMatrix built above is reused
    y_pred = model.predict(dvalid)

    # RMSE computed as sqrt(MSE) so the code does not depend on the
    # `squared=` keyword of `mean_squared_error`.
    error = mean_squared_error(valid_target, y_pred) ** 0.5

    return error

# Create an Optuna study and optimize the objective function
study_33 = optuna.create_study(direction='minimize')
study_33.optimize(objective_33, n_trials=100) # Control the number of trials

# Print the best hyperparameters and the best RMSE for this study.
# The error printed must be `best_error_33`; `best_error` belongs to the
# other study and would silently report the wrong model's score.
best_params_33 = study_33.best_params
best_error_33 = study_33.best_value
print("Best Hyperparameters (33 Features): ", best_params_33)
print("Best Error (33 Features): ", best_error_33)

In [None]:
# Show the tuned hyperparameters of both studies, one dict per line.
print(best_params_12, best_params_33, sep="\n")

#### Model Training

In [None]:
from sklearn.model_selection import KFold
import optuna

# Initialize CV (only referenced by the disabled cross-validation cell further down)
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Hard-coded hyperparameters kept for reference; the training calls below use
# the dictionaries produced by the Optuna studies instead.
params_12 = { 'objective': 'binary:hinge',
              'eta': 0.2769198716475172, 
              'max_depth': 10, 
              'subsample': 0.82358642201105, 
              'colsample_bytree': 0.8286208231929323, 
              'gamma': 0.7766612702667438, 
              'min_child_weight': 8.319434489010376, 
              'lambda': 4.43143014244566, 
              'alpha': 4.399273966701367,
              'eval_metric': 'error'
              }

# `study.best_params` only contains the sampled values, so the fixed
# objective and metric have to be added back before training.
best_params_12['objective'] = 'binary:hinge'
best_params_12['eval_metric'] = 'error'

best_params_33['objective'] = 'binary:hinge'
best_params_33['eval_metric'] = 'error'

# Convert the data into DMatrix format.
# The `dvalid_*` matrices hold the held-out test split (x_test / y_test).
dtrain_12 = DMatrix(X_train_12, label=y_train)
dvalid_12 = DMatrix(X_test_12, label=y_test)

dtrain_33 = DMatrix(X_train_33, label=y_train)
dvalid_33 = DMatrix(X_test_33, label=y_test)

# Train the Model
# Both boosters are trained here because the evaluation, model-dump and
# prediction cells below reference `xgb_classifier_12` / `y_pred_12` as well
# as the 33-feature counterparts.
xgb_classifier_12 = train(best_params_12, dtrain_12, num_boost_round=3000)
y_pred_12 = xgb_classifier_12.predict(dvalid_12)

print("Model with 12 Features Done.")

xgb_classifier_33 = train(best_params_33, dtrain_33, num_boost_round=3000)
y_pred_33 = xgb_classifier_33.predict(dvalid_33)

print("Model with 33 Features Done.")

#### Evaluation

In [None]:
# Classification report for each model's test-set predictions (12-feature model first)
for _predictions in (y_pred_12, y_pred_33):
    print(classification_report(y_test, _predictions))

In [None]:
# Disabled cell: the confusion-matrix code is wrapped in a string literal, so
# none of it executes.
# NOTE(review): it reads `xgb_classifier_12.classes_`; in this file the
# classifiers come from `train(...)` — confirm that object exposes `classes_`
# before re-enabling.
'''# Confusion Matrix for 12 Features
cm_up = confusion_matrix(y_test, y_pred_12, labels=xgb_classifier_12.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm_up, display_labels = xgb_classifier_12.classes_)
disp.plot()
plt.show()'''

In [None]:
# Disabled cell: the cross-validation code is wrapped in a string literal, so
# none of it executes.
# NOTE(review): it references `params_gbm` and `X_train`, neither of which is
# assigned in the visible cells — define or rename them before re-enabling.
'''# Cross Validation Score
scores = cross_val_score(XGBClassifier(**params_gbm),
                        X_train, y_train, scoring='accuracy', cv=cv).mean()

print(scores)'''

In [None]:
# Persist both trained boosters to the working directory for later evaluation
joblib.dump(value=xgb_classifier_12, filename='xgb_ffs_12.sav')
joblib.dump(value=xgb_classifier_33, filename='xgb_ffs_33.sav')

In [None]:
import lexical_generator_12
import lexical_generator_33
import time

def xgb_predict_maliciousness_12(url):
    """Classify a single URL with ``xgb_classifier_12`` and return its label.

    The URL is turned into features by
    ``lexical_generator_12.lexical_generator``, wrapped in a ``DMatrix`` and
    scored by ``xgb_classifier_12``.

    Args:
        url: The URL string to classify.

    Returns:
        "Benign", "Malware", "Phishing" or "Defacement" when the prediction
        equals 0, 1, 2 or 3 respectively; ``None`` for any other value.

    Raises:
        ValueError: If the model does not return exactly one prediction.
    """
    # NOTE(review): the boosters in this file are trained with
    # 'binary:hinge', so labels 2 and 3 look unreachable — confirm whether
    # they are still needed.
    labels = {0: "Benign", 1: "Malware", 2: "Phishing", 3: "Defacement"}

    features = DMatrix(lexical_generator_12.lexical_generator(url))

    # Pull out the single prediction as a plain Python number instead of
    # comparing the whole result array against each label.
    predicted_class = xgb_classifier_12.predict(features).item()

    # Float predictions such as 1.0 hash and compare equal to the int keys.
    return labels.get(predicted_class)
        
def xgb_predict_maliciousness_33(url):
    """Classify a single URL with ``xgb_classifier_33`` and return its label.

    The URL is turned into features by
    ``lexical_generator_33.lexical_generator``, wrapped in a ``DMatrix`` and
    scored by ``xgb_classifier_33``.

    Args:
        url: The URL string to classify.

    Returns:
        "Benign", "Malware", "Phishing" or "Defacement" when the prediction
        equals 0, 1, 2 or 3 respectively; ``None`` for any other value.

    Raises:
        ValueError: If the model does not return exactly one prediction.
    """
    # NOTE(review): the boosters in this file are trained with
    # 'binary:hinge', so labels 2 and 3 look unreachable — confirm whether
    # they are still needed.
    labels = {0: "Benign", 1: "Malware", 2: "Phishing", 3: "Defacement"}

    features = DMatrix(lexical_generator_33.lexical_generator(url))

    # Pull out the single prediction as a plain Python number instead of
    # comparing the whole result array against each label.
    predicted_class = xgb_classifier_33.predict(features).item()

    # Float predictions such as 1.0 hash and compare equal to the int keys.
    return labels.get(predicted_class)

# Sample URL used for the single-prediction latency check
url = "youtube.com/watch?v=LF4Q4bR9SPw"
print("Current URL: "+url)

# Time one end-to-end prediction (feature generation + model inference)
start = time.perf_counter()
prediction = xgb_predict_maliciousness_12(url)
end = time.perf_counter()

# Banner, predicted label and elapsed seconds, one per line
print("------- 12 Features -------------", prediction, end-start, sep="\n")

# The equivalent check for the 33-feature model is disabled (kept as a string)
'''start = time.perf_counter()
prediction = xgb_predict_maliciousness_33(url)
end = time.perf_counter()
print("------- 33 Features -------------")
print(prediction)
print(end-start)'''