# XGB Classifier with Wrapper-Based FS

Steven Sison | March 9, 2024

## Description

This document will be used to train a model using the reduced feature set obtain by using the wrapper-based method, forward feature selection. The model will be evaluated in terms of the usual metrics (accuracy, precision, F1-score, recall) as well as the training time. The model will also be stored for future evaluation purposes.

## Training the Model

### Preliminaries

#### 1. Loading the Dataset

In [2]:
import pandas as pd                     # For data transformation
import numpy as numpy                   # For scientific calculations
import seaborn as sns                   # For data visualizations
import matplotlib.pyplot as plt         # For plotting
import plotly.graph_objects as go       # For plotting
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay
from xgboost import XGBClassifier, DMatrix, train
from sklearn.pipeline import Pipeline
import time
from datetime import datetime
import joblib
import os

dataset = pd.read_csv("../../../02_feature-engineering/final-datasets/binary_new_Bacud_unbalanced_lexical.csv")      # Loading the dataset

dataset.head()



Unnamed: 0,url_type,url_length,url_ip_in_domain,url_domain_entropy,url_is_digits_in_domain,url_query_length,url_number_of_parameters,url_number_of_digits,url_string_entropy,url_is_https,...,has_swf_in_string,has_cgi_in_string,has_php_in_string,has_abuse_in_string,has_admin_in_string,has_bin_in_string,has_personal_in_string,has_update_in_string,has_verification_in_string,url_scheme
0,1,16,0,3.169925,0,0,0,0,3.375,0,...,0,0,0,0,0,0,0,0,0,0
1,0,35,0,2.807355,1,0,0,1,4.079143,0,...,0,0,0,0,0,0,0,0,0,0
2,0,31,0,2.921928,0,0,0,1,3.708093,0,...,0,0,0,0,0,0,0,0,0,0
3,1,88,0,2.896292,0,49,4,7,4.660343,0,...,0,0,1,0,0,0,0,0,0,27
4,1,235,0,3.405822,0,194,3,22,4.980518,0,...,0,0,1,0,0,0,0,0,0,27


In [3]:
x_train, x_test, y_train, y_test = train_test_split(dataset.drop(columns=['url_type']), dataset['url_type'], test_size = 0.2, random_state=42)

In [4]:
valid = pd.read_csv("../../../02_feature-engineering/final-datasets/valid_unbalanced_with_lexical.csv")
valid.head()

y_valid = valid['url_type']
X_valid = valid.drop(columns=['url_type'])

#### 2. Preprocessing (Balancing)

In [5]:
dataset['url_type'].value_counts()

url_type
0    724778
1    380244
Name: count, dtype: int64

#### 3. Removing Unnecessary Features

In [6]:
important_features_wrapper_33 = ['url_length',
 'url_domain_entropy',
 'url_is_digits_in_domain',
 'url_number_of_parameters',
 'url_number_of_digits',
 'url_string_entropy',
 'url_path_length',
 'url_host_length',
 'get_tld',
 'url_domain_len',
 'url_num_subdomain',
 'url_number_of_fragments',
 'url_is_encoded',
 'url_number_of_letters',
 'url_num_periods',
 'url_num_of_hyphens',
 'url_num_underscore',
 'url_num_forward_slash',
 'url_num_semicolon',
 'url_num_mod_sign',
 'has_login_in_string',
 'has_signin_in_string',
 'has_logon_in_string',
 'has_loginasp_in_string',
 'has_exe_in_string',
 'has_viewerphp_in_string',
 'has_getImageasp_in_string',
 'has_paypal_in_string',
 'has_dbsysphp_in_string',
 'has_shopping_in_string',
 'has_php_in_string',
 'has_bin_in_string',
 'has_personal_in_string',
 'url_scheme'
 ]

important_features_wrapper_12 = ['url_domain_entropy', 
                              'url_number_of_parameters', 
                              'url_number_of_digits', 
                              'url_path_length', 
                              'url_host_length', 
                              'get_tld', 
                              'url_domain_len', 
                              'url_num_subdomain', 
                              'url_number_of_letters', 
                              'url_num_periods', 
                              'url_num_of_hyphens', 
                              'url_num_forward_slash', 
                              'url_num_semicolon', 
                              'has_login_in_string', 
                              'has_exe_in_string', 
                              'has_php_in_string', 
                              'url_scheme']

X_test_12 = x_test[important_features_wrapper_12]
X_train_12 = x_train[important_features_wrapper_12]

X_test_33 = x_test[important_features_wrapper_33]
X_train_33 = x_train[important_features_wrapper_33]
X_valid_33 = X_valid[important_features_wrapper_33]

X_test_33.head()

Unnamed: 0,url_length,url_domain_entropy,url_is_digits_in_domain,url_number_of_parameters,url_number_of_digits,url_string_entropy,url_path_length,url_host_length,get_tld,url_domain_len,...,has_exe_in_string,has_viewerphp_in_string,has_getImageasp_in_string,has_paypal_in_string,has_dbsysphp_in_string,has_shopping_in_string,has_php_in_string,has_bin_in_string,has_personal_in_string,url_scheme
165686,58,1.921928,0,1,14,4.659537,32,0,0,5,...,0,0,0,0,0,0,0,0,0,0
712465,76,2.75,0,0,10,4.626107,42,26,152,8,...,0,0,0,0,0,0,0,0,0,2
335773,133,2.664498,0,7,5,4.65759,10,21,332,14,...,0,0,0,0,0,0,1,0,0,27
533676,30,3.38158,0,0,0,3.989898,1,21,320,17,...,0,0,0,0,0,0,0,0,0,28
642230,44,2.699514,0,0,0,3.772185,44,0,202,14,...,0,0,0,0,0,0,0,0,0,0


#### Hyper-parameter Tuning

##### 12 Features (Purely Lexical)

In [7]:
import optuna
from sklearn.metrics import mean_squared_error # or any other metric
from sklearn.model_selection import train_test_split

# Define the objective function for Optuna
'''def objective_12(trial):
    # Define the search space for hyperparameters
    param = {
        'objective': 'binary:hinge',
        'eval_metric': 'error',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'n_estimators': 100000, # Fix the boosting round and use early stopping
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
        'lambda': trial.suggest_float('lambda', 0.1, 10.0),
        'alpha': trial.suggest_float('alpha', 0.0, 10.0),
    }
    
    # Split the data into further training and validation sets (three sets are preferable)
    train_data, valid_data, train_target, valid_target = train_test_split(X_train_12, y_train, test_size=0.2, random_state=42)
    
    # Convert the data into DMatrix format
    dtrain = DMatrix(train_data, label=train_target)
    dvalid = DMatrix(valid_data, label=valid_target)
    
    # Define the pruning callback for early stopping
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-error')
    
    # Train the model with early stopping
    model = train(param, dtrain, num_boost_round=100000, evals=[(dvalid, 'validation')], early_stopping_rounds=100, callbacks=[pruning_callback])
    
    # Make predictions on the test set
    dtest = DMatrix(valid_data)
    y_pred = model.predict(dtest)
    
    # Calculate the root mean squared error
    error = mean_squared_error(valid_target, y_pred, squared=False)
    
    return error

# Create an Optuna study and optimize the objective function
study_12 = optuna.create_study(direction='minimize')
study_12.optimize(objective_12, n_trials=100) # Control the number of trials

# Print the best hyperparameters and the best RMSE
best_params_12 = study_12.best_params
best_error = study_12.best_value
print("Best Hyperparameters (12 Features): ", best_params_12)
print("Best Error (12 Features): ", best_error)'''

  from .autonotebook import tqdm as notebook_tqdm


'def objective_12(trial):\n    # Define the search space for hyperparameters\n    param = {\n        \'objective\': \'binary:hinge\',\n        \'eval_metric\': \'error\',\n        \'eta\': trial.suggest_float(\'eta\', 0.01, 0.3),\n        \'n_estimators\': 100000, # Fix the boosting round and use early stopping\n        \'max_depth\': trial.suggest_int(\'max_depth\', 3, 10),\n        \'subsample\': trial.suggest_float(\'subsample\', 0.5, 1.0),\n        \'colsample_bytree\': trial.suggest_float(\'colsample_bytree\', 0.5, 1.0),\n        \'gamma\': trial.suggest_float(\'gamma\', 0.0, 10.0),\n        \'min_child_weight\': trial.suggest_float(\'min_child_weight\', 0.1, 10.0),\n        \'lambda\': trial.suggest_float(\'lambda\', 0.1, 10.0),\n        \'alpha\': trial.suggest_float(\'alpha\', 0.0, 10.0),\n    }\n    \n    # Split the data into further training and validation sets (three sets are preferable)\n    train_data, valid_data, train_target, valid_target = train_test_split(X_train_12, 

##### 33 Features (Purely Lexical)

In [15]:
# Define the objective function for Optuna
def objective_33(trial):
    # Define the search space for hyperparameters
    param = {
        'objective': 'binary:hinge',
        'eval_metric': 'error',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'n_estimators': 100000, # Fix the boosting round and use early stopping
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
        'lambda': trial.suggest_float('lambda', 0.1, 10.0),
        'alpha': trial.suggest_float('alpha', 0.0, 10.0),
    }
    
    # Split the data into further training and validation sets (three sets are preferable)
    train_data, valid_data, train_target, valid_target = train_test_split(X_train_33, y_train, test_size=0.2, random_state=42)
    
    # Convert the data into DMatrix format
    dtrain = DMatrix(train_data, label=train_target)
    dvalid = DMatrix(valid_data, label=valid_target)
    
    # Define the pruning callback for early stopping
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-error')
    
    # Train the model with early stopping
    model = train(param, dtrain, num_boost_round=100000, evals=[(dvalid, 'validation')], early_stopping_rounds=100, callbacks=[pruning_callback])
    
    # Make predictions on the test set
    dtest = DMatrix(valid_data)
    y_pred = model.predict(dtest)
    
    # Calculate the root mean squared error
    error = mean_squared_error(valid_target, y_pred, squared=False)
    
    return error

# Create an Optuna study and optimize the objective function
study_33 = optuna.create_study(direction='minimize')
optuna.logging.disable_propagation
study_33.optimize(objective_33, n_trials=100) # Control the number of trials

# Print the best hyperparameters and the best RMSE
best_params_33 = study_33.best_params
best_error_33 = study_33.best_value
print("Best Hyperparameters (33 Features): ", best_params_33)
print("Best Error (33 Features): ", best_error_33)

Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65653
[1]	validation-error:0.65653
[2]	validation-error:0.65653
[3]	validation-error:0.39471
[4]	validation-error:0.18653
[5]	validation-error:0.11853
[6]	validation-error:0.09378
[7]	validation-error:0.08795
[8]	validation-error:0.07723
[9]	validation-error:0.07084
[10]	validation-error:0.06605
[11]	validation-error:0.06537
[12]	validation-error:0.06444
[13]	validation-error:0.06175
[14]	validation-error:0.06095
[15]	validation-error:0.06018
[16]	validation-error:0.05981
[17]	validation-error:0.05950
[18]	validation-error:0.05940
[19]	validation-error:0.05904
[20]	validation-error:0.05901
[21]	validation-error:0.05853
[22]	validation-error:0.05822
[23]	validation-error:0.05676
[24]	validation-error:0.05671
[25]	validation-error:0.05630
[26]	validation-error:0.05596
[27]	validation-error:0.05570
[28]	validation-error:0.05545
[29]	validation-error:0.05524
[30]	validation-error:0.05489
[31]	validation-error:0.05446


Trial 0 failed with parameters: {'eta': 0.15942487787595352, 'max_depth': 6, 'subsample': 0.9868074499358724, 'colsample_bytree': 0.7565282277150545, 'gamma': 1.3640586103424746, 'min_child_weight': 8.713758328180456, 'lambda': 7.558214814876345, 'alpha': 5.466847792880651} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\sison\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\sison\AppData\Local\Temp\ipykernel_28244\2053542314.py", line 29, in objective_33
    model = train(param, dtrain, num_boost_round=100000, evals=[(dvalid, 'validation')], early_stopping_rounds=100, callbacks=[pruning_callback])
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sison\AppData\Local\Programs

KeyboardInterrupt: 

In [None]:
# print(best_params_12)
print(best_params_33)

{'eta': 0.24226975546058, 'max_depth': 10, 'subsample': 0.6466791951857404, 'colsample_bytree': 0.9360515577115375, 'gamma': 0.7881935821756819, 'min_child_weight': 5.796268337379454, 'lambda': 8.312555103835237, 'alpha': 2.489715207685439}


#### Model Training

In [None]:
from sklearn.model_selection import KFold
import optuna

# Initialize CV
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# best_params_12['objective'] = 'binary:hinge'
# best_params_12['eval_metric'] = 'error'

best_params_33['objective'] = 'binary:hinge'
best_params_33['eval_metric'] = 'error'

# Convert the data into DMatrix format
# dtrain_12 = DMatrix(X_train_12, label=y_train)
# dvalid_12 = DMatrix(X_test_12, label=y_test)

dtrain_33 = DMatrix(X_train_33, label=y_train)
dvalid_33 = DMatrix(X_test_33, label=y_test)
lexical_valid = DMatrix(X_valid_33, label = y_valid)

# Train the Model
# xgb_classifier_12 = train(best_params_12, dtrain_12, num_boost_round=3000)
# y_pred_12 = xgb_classifier_12.predict(dvalid_12)

# print("Model with 12 Features Done.")

xgb_classifier_33 = train(best_params_33, dtrain_33, num_boost_round=3000)
y_pred_33 = xgb_classifier_33.predict(dvalid_33)
y_pred_valid = xgb_classifier_33.predict(lexical_valid)

print("Model with 33 Features Done.")

Model with 33 Features Done.


In [None]:
# Exporting some stuff for concept drift

# Actual Values
y_test.to_csv("warm-up-actual.csv", encoding='utf-8', index=False)
y_valid.to_csv("testing-actual.csv", encoding='utf-8', index=False)

temp_pred_test = pd.DataFrame(y_pred_33)
temp_pred_valid = pd.DataFrame(y_pred_valid)

temp_pred_test.to_csv("warm-up-predicted.csv", encoding='utf-8', index=False)
temp_pred_valid.to_csv("testing-predicted.csv", encoding='utf-8', index=False)

#### Evaluation

In [None]:
# Classification Report
print("------------- Test Evaluation -------------")
# print(classification_report(y_test, y_pred_12))
print(classification_report(y_test, y_pred_33))

------------- Test Evaluation -------------
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    144867
           1       0.96      0.95      0.96     76138

    accuracy                           0.97    221005
   macro avg       0.97      0.97      0.97    221005
weighted avg       0.97      0.97      0.97    221005



In [None]:
# Classification Report
print("------------- Practical Evaluation -------------")

print(classification_report(y_valid, y_pred_valid))

------------- Practical Evaluation -------------
              precision    recall  f1-score   support

         0.0       1.00      0.01      0.02      1000
         1.0       0.00      0.00      0.00         0

    accuracy                           0.01      1000
   macro avg       0.50      0.01      0.01      1000
weighted avg       1.00      0.01      0.02      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
'''# Confusion Matrix for 12 Features
cm_up = confusion_matrix(y_test, y_pred_12, labels=xgb_classifier_12.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm_up, display_labels = xgb_classifier_12.classes_)
disp.plot()
plt.show()'''

'# Confusion Matrix for 12 Features\ncm_up = confusion_matrix(y_test, y_pred_12, labels=xgb_classifier_12.classes_)\ndisp = ConfusionMatrixDisplay(confusion_matrix = cm_up, display_labels = xgb_classifier_12.classes_)\ndisp.plot()\nplt.show()'

In [None]:
'''# Cross Validation Score
scores = cross_val_score(XGBClassifier(**params_gbm),
                        X_train, y_train, scoring='accuracy', cv=cv).mean()

print(scores)'''

"# Cross Validation Score\nscores = cross_val_score(XGBClassifier(**params_gbm),\n                        X_train, y_train, scoring='accuracy', cv=cv).mean()\n\nprint(scores)"

In [None]:
# Dumping the model
# joblib.dump(xgb_classifier_12, 'xgb_ffs_12.sav')
# joblib.dump(xgb_classifier_33, 'xgb_ffs_33.sav')

In [None]:
import lexical_generator_12
import lexical_generator_33
import time

'''def xgb_predict_maliciousness_12(url):

    numerical_values = lexical_generator_12.lexical_generator(url)
    # print(numerical_values)
    numerical_values = DMatrix(numerical_values)

    match xgb_classifier_12.predict(numerical_values):
        case 0:
            return "Benign"
        case 1:
            return "Malware"
        case 2:
            return "Phishing"
        case 3:
            return "Defacement"'''
        
def xgb_predict_maliciousness_33(url):

    numerical_values = lexical_generator_33.lexical_generator(url)
    # print(numerical_values)
    numerical_values = DMatrix(numerical_values)

    match xgb_classifier_33.predict(numerical_values):
        case 0:
            return "Benign"
        case 1:
            return "Malware"
        case 2:
            return "Phishing"
        case 3:
            return "Defacement"

url = "www.facebook.com/"
print("Current URL: "+url)

'''print("------------- Wrapper-Based (12 Features) -------------")
for i in range(15):
    start = time.perf_counter()
    prediction = xgb_predict_maliciousness_12(url)
    end = time.perf_counter()
    print("Trial "+str(i))
    print(prediction)
    print(end-start)'''

print("------------- Wrapper-Based (33 Features) -------------")
for i in range(15):
    start = time.perf_counter()
    prediction = xgb_predict_maliciousness_33(url)
    end = time.perf_counter()
    print("Trial "+str(i))
    print(prediction)
    print(end-start)

Current URL: www.facebook.com/
------------- Wrapper-Based (33 Features) -------------
Trial 0
Benign
0.14897370000835508
Trial 1
Benign
0.018642800045199692
Trial 2
Benign
0.018542799982242286
Trial 3
Benign
0.017739899980369955
Trial 4
Benign
0.019178599992301315
Trial 5
Benign
0.018538899952545762
Trial 6
Benign
0.018301800009794533
Trial 7
Benign
0.017975899972952902
Trial 8
Benign
0.018234599963761866
Trial 9
Benign
0.018532600020989776
Trial 10
Benign
0.01923849998274818
Trial 11
Benign
0.01895750005496666
Trial 12
Benign
0.017861199972685426
Trial 13
Benign
0.017586899979505688
Trial 14
Benign
0.018745700013823807
