# XGB Classifier with Filter-Based FS

Steven Sison | March 9, 2024

## Description

This document will be used to train a model using the reduced feature set obtain by using the wrapper-based method, forward feature selection. The model will be evaluated in terms of the usual metrics (accuracy, precision, F1-score, recall) as well as the training time. The model will also be stored for future evaluation purposes.

## Training the Model

### Preliminaries

#### 1. Loading the Dataset

In [2]:
import pandas as pd                     # For data transformation
import numpy as numpy                   # For scientific calculations
import seaborn as sns                   # For data visualizations
import matplotlib.pyplot as plt         # For plotting
import plotly.graph_objects as go       # For plotting
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay
from xgboost import XGBClassifier, DMatrix, train
from sklearn.pipeline import Pipeline
import time
from datetime import datetime
import joblib
import os
import optuna
from sklearn.metrics import mean_squared_error # or any other metric
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("../../../02_feature-engineering/final-datasets/binary_unbalanced_with_lexicalcontent.csv")      # Loading the dataset

dataset.head()

Unnamed: 0,url_type,blank_lines_count,blank_spaces_count,word_count,average_word_len,webpage_size,webpage_entropy,js_count,sus_js_count,js_eval_count,...,has_swf_in_string,has_cgi_in_string,has_php_in_string,has_abuse_in_string,has_admin_in_string,has_bin_in_string,has_personal_in_string,has_update_in_string,has_verification_in_string,url_scheme
0,1,14,36,0,0.0,1186,5.269303,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,415,3952,11450,4.68393,26155,4.565537,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,5715,88450,420,5.833333,339327,4.656704,13,10,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1461,6292,3049,4.515907,89249,5.14958,11,2,0,...,0,0,0,0,0,0,0,0,0,0
4,0,43,273,0,0.0,1530,4.754726,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
dataset.head()

Unnamed: 0,url_type,blank_lines_count,blank_spaces_count,word_count,average_word_len,webpage_size,webpage_entropy,js_count,sus_js_count,js_eval_count,...,has_swf_in_string,has_cgi_in_string,has_php_in_string,has_abuse_in_string,has_admin_in_string,has_bin_in_string,has_personal_in_string,has_update_in_string,has_verification_in_string,url_scheme
0,1,14,36,0,0.0,1186,5.269303,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,415,3952,11450,4.68393,26155,4.565537,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,5715,88450,420,5.833333,339327,4.656704,13,10,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1461,6292,3049,4.515907,89249,5.14958,11,2,0,...,0,0,0,0,0,0,0,0,0,0
4,0,43,273,0,0.0,1530,4.754726,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
x_train, x_test, y_train, y_test = train_test_split(dataset.drop(columns=['url_type']), dataset['url_type'], test_size = 0.2, random_state=42)

#### 2. Preprocessing (Balancing)

In [5]:
dataset['url_type'].value_counts()

url_type
0    120080
1     62334
Name: count, dtype: int64

#### 3. Removing Unnecessary Features

In [6]:
important_features_lexical = ['url_host_length',
                              'url_is_https',  
                              'url_num_periods',
                              'url_path_length',
                              'js_count',
                              'has_log_in_html',
                              'meta_tag_count',
                              'js_search_count',
                              'url_num_forward_slash',
                              'sus_js_count',
                              'js_link_count',
                              'webpage_entropy',
                              'url_num_ampersand',
                              'url_num_subdomain',
                              'webpage_size',
                              'has_free_in_html',
                              'has_php_in_string',
                              'url_num_of_hyphens',
                              'url_query_length',
                              'get_tld',
                              'url_scheme']

X_test_lexical_content = x_test[important_features_lexical]
X_train_lexical_content = x_train[important_features_lexical]

print("Lexical+Content-Based XGB Model has a total of "+str(len(important_features_lexical))+" features.")

Lexical+Content-Based XGB Model has a total of 21 features.


#### Hyper-parameter Optimization

In [7]:
# Define the objective function for Optuna
def objective_lexical_content(trial):
    # Define the search space for hyperparameters
    param = {
        'objective': 'binary:hinge',
        'eval_metric': 'error',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'n_estimators': 100000, # Fix the boosting round and use early stopping
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
        'lambda': trial.suggest_float('lambda', 0.1, 10.0),
        'alpha': trial.suggest_float('alpha', 0.0, 10.0),
    }
    
    # Split the data into further training and validation sets (three sets are preferable)
    train_data, valid_data, train_target, valid_target = train_test_split(X_train_lexical_content, y_train, test_size=0.2, random_state=42)
    
    # Convert the data into DMatrix format
    dtrain = DMatrix(train_data, label=train_target)
    dvalid = DMatrix(valid_data, label=valid_target)
    
    # Define the pruning callback for early stopping
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-error')
    
    # Train the model with early stopping
    model = train(param, dtrain, num_boost_round=100000, evals=[(dvalid, 'validation')], early_stopping_rounds=100, callbacks=[pruning_callback])
    
    # Make predictions on the test set
    dtest = DMatrix(valid_data)
    y_pred = model.predict(dtest)
    
    # Calculate the root mean squared error
    error = mean_squared_error(valid_target, y_pred, squared=False)
    
    return error

# Create an Optuna study and optimize the objective function
study_lexical_content = optuna.create_study(direction='minimize')
study_lexical_content.optimize(objective_lexical_content, n_trials=100) # Control the number of trials

# Print the best hyperparameters and the best RMSE
best_params_lexical_content = study_lexical_content.best_params
best_error_lexical_content = study_lexical_content.best_value
print("Best Hyperparameters: ", best_params_lexical_content)
print("Best Error: ", best_error_lexical_content)

[I 2024-04-09 10:35:22,231] A new study created in memory with name: no-name-4c24c94b-83c5-47e0-a777-f8075edf5024


[0]	validation-error:0.65995

Parameters: { "n_estimators" } are not used.




[1]	validation-error:0.65995
[2]	validation-error:0.20413
[3]	validation-error:0.17566
[4]	validation-error:0.12571
[5]	validation-error:0.12327
[6]	validation-error:0.11841
[7]	validation-error:0.11776
[8]	validation-error:0.11656
[9]	validation-error:0.11464
[10]	validation-error:0.09425
[11]	validation-error:0.09258
[12]	validation-error:0.09138
[13]	validation-error:0.08857
[14]	validation-error:0.08826
[15]	validation-error:0.08737
[16]	validation-error:0.08610
[17]	validation-error:0.08521
[18]	validation-error:0.08391
[19]	validation-error:0.08494
[20]	validation-error:0.08319
[21]	validation-error:0.08339
[22]	validation-error:0.08202
[23]	validation-error:0.07997
[24]	validation-error:0.07901
[25]	validation-error:0.07918
[26]	validation-error:0.07843
[27]	validation-error:0.07839
[28]	validation-error:0.07805
[29]	validation-error:0.07716
[30]	validation-error:0.07661
[31]	validation-error:0.07623
[32]	validation-error:0.07634
[33]	validation-error:0.07592
[34]	validation-er

[I 2024-04-09 10:35:30,054] Trial 0 finished with value: 0.2024281288757224 and parameters: {'eta': 0.24134137570737962, 'max_depth': 3, 'subsample': 0.605447303272991, 'colsample_bytree': 0.837582623308372, 'gamma': 3.6869901056534458, 'min_child_weight': 0.9105365592472066, 'lambda': 8.25265806591607, 'alpha': 2.6994109573039227}. Best is trial 0 with value: 0.2024281288757224.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65995
[3]	validation-error:0.65995
[4]	validation-error:0.65995
[5]	validation-error:0.21345
[6]	validation-error:0.13348
[7]	validation-error:0.09497
[8]	validation-error:0.07192
[9]	validation-error:0.06304
[10]	validation-error:0.05979
[11]	validation-error:0.05818
[12]	validation-error:0.05650
[13]	validation-error:0.05550
[14]	validation-error:0.05458
[15]	validation-error:0.05407
[16]	validation-error:0.05383
[17]	validation-error:0.05335
[18]	validation-error:0.05280
[19]	validation-error:0.05232
[20]	validation-error:0.05177
[21]	validation-error:0.05146
[22]	validation-error:0.05095
[23]	validation-error:0.05054
[24]	validation-error:0.05054
[25]	validation-error:0.05009
[26]	validation-error:0.04958
[27]	validation-error:0.04923
[28]	validation-error:0.04889
[29]	validation-error:0.04855
[30]	validation-error:0.04827
[31]	validation-error:0.04800
[32]	validation-error:0.04790
[33]	validation-erro

[I 2024-04-09 10:35:36,302] Trial 1 finished with value: 0.18675797658082016 and parameters: {'eta': 0.10031864027110182, 'max_depth': 9, 'subsample': 0.9231738329456532, 'colsample_bytree': 0.7841748561498321, 'gamma': 0.710300338796076, 'min_child_weight': 6.358346542127798, 'lambda': 8.191237945400758, 'alpha': 5.832972334974991}. Best is trial 1 with value: 0.18675797658082016.


[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65995
[3]	validation-error:0.65995
[4]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[5]	validation-error:0.65995
[6]	validation-error:0.65995
[7]	validation-error:0.65995
[8]	validation-error:0.65995
[9]	validation-error:0.65995
[10]	validation-error:0.65995
[11]	validation-error:0.65995
[12]	validation-error:0.56655
[13]	validation-error:0.42252
[14]	validation-error:0.28729
[15]	validation-error:0.22483
[16]	validation-error:0.17169
[17]	validation-error:0.14390
[18]	validation-error:0.11998
[19]	validation-error:0.10135
[20]	validation-error:0.09151
[21]	validation-error:0.08514
[22]	validation-error:0.07918
[23]	validation-error:0.07544
[24]	validation-error:0.07264
[25]	validation-error:0.06828
[26]	validation-error:0.06506
[27]	validation-error:0.06311
[28]	validation-error:0.06253
[29]	validation-error:0.06119
[30]	validation-error:0.06030
[31]	validation-error:0.05920
[32]	validation-error:0.05872
[33]	validation-error:0.05814
[34]	validation-error:0.05763
[35]	validation-error:0.05698
[36]	validation-error:0.05701
[37]	validation-error:0.05653
[38]	validation

[I 2024-04-09 10:35:45,422] Trial 2 finished with value: 0.18749036361299634 and parameters: {'eta': 0.04062732689081018, 'max_depth': 9, 'subsample': 0.5046019976822411, 'colsample_bytree': 0.6032282180687558, 'gamma': 0.3020848638754192, 'min_child_weight': 7.4374562582728, 'lambda': 6.873383676362839, 'alpha': 1.7653286513654776}. Best is trial 1 with value: 0.18675797658082016.


[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.24847
[3]	validation-error:0.12019


Parameters: { "n_estimators" } are not used.



[4]	validation-error:0.07938
[5]	validation-error:0.06472
[6]	validation-error:0.06105
[7]	validation-error:0.05814
[8]	validation-error:0.05698
[9]	validation-error:0.05564
[10]	validation-error:0.05561
[11]	validation-error:0.05496
[12]	validation-error:0.05365
[13]	validation-error:0.05266
[14]	validation-error:0.05139
[15]	validation-error:0.05146
[16]	validation-error:0.05098
[17]	validation-error:0.05006
[18]	validation-error:0.04978
[19]	validation-error:0.04845
[20]	validation-error:0.04769
[21]	validation-error:0.04718
[22]	validation-error:0.04646
[23]	validation-error:0.04584
[24]	validation-error:0.04557
[25]	validation-error:0.04505
[26]	validation-error:0.04509
[27]	validation-error:0.04468
[28]	validation-error:0.04475
[29]	validation-error:0.04454
[30]	validation-error:0.04420
[31]	validation-error:0.04403
[32]	validation-error:0.04358
[33]	validation-error:0.04351
[34]	validation-error:0.04331
[35]	validation-error:0.04310
[36]	validation-error:0.04303
[37]	validation-

[I 2024-04-09 10:35:48,804] Trial 3 finished with value: 0.20166504630217466 and parameters: {'eta': 0.21375017381007977, 'max_depth': 9, 'subsample': 0.7693609422486037, 'colsample_bytree': 0.6379551558126213, 'gamma': 3.152832144126104, 'min_child_weight': 8.31553189861156, 'lambda': 2.434435011659823, 'alpha': 9.174681854007432}. Best is trial 1 with value: 0.18675797658082016.


[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.32093
[3]	validation-error:0.10422


Parameters: { "n_estimators" } are not used.



[4]	validation-error:0.08134
[5]	validation-error:0.07503
[6]	validation-error:0.07250
[7]	validation-error:0.06541
[8]	validation-error:0.06366
[9]	validation-error:0.06099
[10]	validation-error:0.05886
[11]	validation-error:0.05770
[12]	validation-error:0.05592
[13]	validation-error:0.05547
[14]	validation-error:0.05393
[15]	validation-error:0.05331
[16]	validation-error:0.05266
[17]	validation-error:0.05245
[18]	validation-error:0.05218
[19]	validation-error:0.05187
[20]	validation-error:0.05122
[21]	validation-error:0.05040
[22]	validation-error:0.05023
[23]	validation-error:0.05009
[24]	validation-error:0.04999
[25]	validation-error:0.04992
[26]	validation-error:0.04920
[27]	validation-error:0.04910
[28]	validation-error:0.04896
[29]	validation-error:0.04869
[30]	validation-error:0.04865
[31]	validation-error:0.04869
[32]	validation-error:0.04851
[33]	validation-error:0.04797
[34]	validation-error:0.04773
[35]	validation-error:0.04738
[36]	validation-error:0.04718
[37]	validation-

[I 2024-04-09 10:35:50,081] Trial 4 finished with value: 0.21145138143279313 and parameters: {'eta': 0.19452717422014057, 'max_depth': 7, 'subsample': 0.8472049488739994, 'colsample_bytree': 0.9112861255626135, 'gamma': 6.308115863513408, 'min_child_weight': 0.5804285465650644, 'lambda': 8.217636396145547, 'alpha': 0.9658057928561159}. Best is trial 1 with value: 0.18675797658082016.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-09 10:35:50,233] Trial 5 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.23555
[3]	validation-error:0.14719
[4]	validation-error:0.11844
[5]	validation-error:0.09374
[6]	validation-error:0.08877
[7]	validation-error:0.08449
[8]	validation-error:0.07315


[I 2024-04-09 10:35:50,479] Trial 6 pruned. Trial was pruned at iteration 8.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-09 10:35:50,654] Trial 7 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65995


[I 2024-04-09 10:35:50,822] Trial 8 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.54819
[2]	validation-error:0.12016
[3]	validation-error:0.07558
[4]	validation-error:0.06876
[5]	validation-error:0.06678
[6]	validation-error:0.06530
[7]	validation-error:0.06397
[8]	validation-error:0.06301
[9]	validation-error:0.06188


[I 2024-04-09 10:35:51,106] Trial 9 pruned. Trial was pruned at iteration 10.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:35:51,325] Trial 10 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65995


[I 2024-04-09 10:35:51,576] Trial 11 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:35:51,810] Trial 12 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-09 10:35:51,992] Trial 13 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65995


[I 2024-04-09 10:35:52,201] Trial 14 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:35:52,424] Trial 15 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-09 10:35:52,608] Trial 16 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65995


[I 2024-04-09 10:35:52,803] Trial 17 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:35:53,009] Trial 18 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:35:53,187] Trial 19 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:35:53,371] Trial 20 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.40203


[I 2024-04-09 10:35:53,604] Trial 21 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.28742
[3]	validation-error:0.14911
[4]	validation-error:0.10042
[5]	validation-error:0.07476
[6]	validation-error:0.06774
[7]	validation-error:0.06342
[8]	validation-error:0.06092
[9]	validation-error:0.05903
[10]	validation-error:0.05903
[11]	validation-error:0.05794


[I 2024-04-09 10:35:53,911] Trial 22 pruned. Trial was pruned at iteration 12.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.65995
[2]	validation-error:0.65995


[I 2024-04-09 10:35:54,107] Trial 23 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995
[1]	validation-error:0.32086


Parameters: { "n_estimators" } are not used.



[2]	validation-error:0.12739
[3]	validation-error:0.08524
[4]	validation-error:0.06660
[5]	validation-error:0.05835
[6]	validation-error:0.05770
[7]	validation-error:0.05544
[8]	validation-error:0.05352
[9]	validation-error:0.05170
[10]	validation-error:0.05040
[11]	validation-error:0.04968
[12]	validation-error:0.04865
[13]	validation-error:0.04762
[14]	validation-error:0.04708
[15]	validation-error:0.04615
[16]	validation-error:0.04574
[17]	validation-error:0.04529
[18]	validation-error:0.04454
[19]	validation-error:0.04403
[20]	validation-error:0.04409
[21]	validation-error:0.04399
[22]	validation-error:0.04331
[23]	validation-error:0.04300
[24]	validation-error:0.04262
[25]	validation-error:0.04242
[26]	validation-error:0.04224
[27]	validation-error:0.04214
[28]	validation-error:0.04211
[29]	validation-error:0.04190
[30]	validation-error:0.04183
[31]	validation-error:0.04170
[32]	validation-error:0.04142
[33]	validation-error:0.04146
[34]	validation-error:0.04098
[35]	validation-er

[I 2024-04-09 10:35:56,937] Trial 24 finished with value: 0.19209370036899007 and parameters: {'eta': 0.2972187080577119, 'max_depth': 9, 'subsample': 0.7179162585730621, 'colsample_bytree': 0.6263605153203864, 'gamma': 0.966641139595508, 'min_child_weight': 8.898608830461171, 'lambda': 3.035243921865512, 'alpha': 6.361470164414561}. Best is trial 1 with value: 0.18675797658082016.


[0]	validation-error:0.65995
[1]	validation-error:0.38394


Parameters: { "n_estimators" } are not used.



[2]	validation-error:0.16545
[3]	validation-error:0.11502
[4]	validation-error:0.09336
[5]	validation-error:0.08178
[6]	validation-error:0.08113
[7]	validation-error:0.07534
[8]	validation-error:0.06626
[9]	validation-error:0.06102
[10]	validation-error:0.05944


[I 2024-04-09 10:35:57,193] Trial 25 pruned. Trial was pruned at iteration 10.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.62939
[2]	validation-error:0.15956
[3]	validation-error:0.10313
[4]	validation-error:0.07822
[5]	validation-error:0.07185
[6]	validation-error:0.07000
[7]	validation-error:0.07037


[I 2024-04-09 10:35:57,437] Trial 26 pruned. Trial was pruned at iteration 8.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.65995


[I 2024-04-09 10:35:57,637] Trial 27 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:35:57,836] Trial 28 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.63922
[2]	validation-error:0.14226
[3]	validation-error:0.07736
[4]	validation-error:0.06362
[5]	validation-error:0.05989
[6]	validation-error:0.05893
[7]	validation-error:0.05828
[8]	validation-error:0.05691
[9]	validation-error:0.05612
[10]	validation-error:0.05437
[11]	validation-error:0.05338
[12]	validation-error:0.05242
[13]	validation-error:0.05177
[14]	validation-error:0.05105
[15]	validation-error:0.05006
[16]	validation-error:0.05002
[17]	validation-error:0.04944
[18]	validation-error:0.04851
[19]	validation-error:0.04797
[20]	validation-error:0.04749
[21]	validation-error:0.04721
[22]	validation-error:0.04701
[23]	validation-error:0.04646
[24]	validation-error:0.04622
[25]	validation-error:0.04581
[26]	validation-error:0.04567
[27]	validation-error:0.04519
[28]	validation-error:0.04516
[29]	validation-error:0.04533
[30]	validation-error:0.04516
[31]	validation-error:0.04488
[32]	validation-error:0.04464
[33]	validation-erro

[I 2024-04-09 10:35:59,687] Trial 29 pruned. Trial was pruned at iteration 269.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-09 10:35:59,859] Trial 30 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.22260
[3]	validation-error:0.11536
[4]	validation-error:0.07586
[5]	validation-error:0.06338
[6]	validation-error:0.06088
[7]	validation-error:0.05890
[8]	validation-error:0.05698
[9]	validation-error:0.05605
[10]	validation-error:0.05451
[11]	validation-error:0.05266
[12]	validation-error:0.05222
[13]	validation-error:0.05084
[14]	validation-error:0.05047
[15]	validation-error:0.04958
[16]	validation-error:0.04930
[17]	validation-error:0.04886
[18]	validation-error:0.04804
[19]	validation-error:0.04756
[20]	validation-error:0.04684
[21]	validation-error:0.04615
[22]	validation-error:0.04612
[23]	validation-error:0.04581
[24]	validation-error:0.04499
[25]	validation-error:0.04478
[26]	validation-error:0.04379
[27]	validation-error:0.04362
[28]	validation-error:0.04324
[29]	validation-error:0.04314
[30]	validation-error:0.04290
[31]	validation-error:0.04286
[32]	validation-error:0.04286
[33]	validation-erro

[I 2024-04-09 10:36:01,309] Trial 31 pruned. Trial was pruned at iteration 225.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:36:01,516] Trial 32 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.62082
[2]	validation-error:0.19382
[3]	validation-error:0.12975


[I 2024-04-09 10:36:01,777] Trial 33 pruned. Trial was pruned at iteration 4.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.22383
[3]	validation-error:0.12622
[4]	validation-error:0.09583
[5]	validation-error:0.07853
[6]	validation-error:0.07606
[7]	validation-error:0.07089
[8]	validation-error:0.06092
[9]	validation-error:0.05681
[10]	validation-error:0.05434
[11]	validation-error:0.05256
[12]	validation-error:0.05235
[13]	validation-error:0.05050
[14]	validation-error:0.04930
[15]	validation-error:0.04889
[16]	validation-error:0.04783
[17]	validation-error:0.04670
[18]	validation-error:0.04615
[19]	validation-error:0.04536
[20]	validation-error:0.04447
[21]	validation-error:0.04420
[22]	validation-error:0.04379
[23]	validation-error:0.04341
[24]	validation-error:0.04324
[25]	validation-error:0.04324
[26]	validation-error:0.04314
[27]	validation-error:0.04286
[28]	validation-error:0.04266
[29]	validation-error:0.04245
[30]	validation-error:0.04228
[31]	validation-error:0.04183
[32]	validation-error:0.04177
[33]	validation-erro

[I 2024-04-09 10:36:03,823] Trial 34 finished with value: 0.19571592859602618 and parameters: {'eta': 0.23713103461886798, 'max_depth': 10, 'subsample': 0.5871887575124759, 'colsample_bytree': 0.5017286880187946, 'gamma': 1.2804497665347958, 'min_child_weight': 7.515768625826826, 'lambda': 3.650566480522631, 'alpha': 0.04623758263714617}. Best is trial 1 with value: 0.18675797658082016.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.42625
[2]	validation-error:0.17436
[3]	validation-error:0.10433
[4]	validation-error:0.08487
[5]	validation-error:0.07431
[6]	validation-error:0.07346
[7]	validation-error:0.06935
[8]	validation-error:0.06013
[9]	validation-error:0.05376
[10]	validation-error:0.05249
[11]	validation-error:0.05136
[12]	validation-error:0.05040
[13]	validation-error:0.04882
[14]	validation-error:0.04793
[15]	validation-error:0.04742
[16]	validation-error:0.04697
[17]	validation-error:0.04612
[18]	validation-error:0.04588
[19]	validation-error:0.04505
[20]	validation-error:0.04502
[21]	validation-error:0.04461
[22]	validation-error:0.04399
[23]	validation-error:0.04403
[24]	validation-error:0.04358
[25]	validation-error:0.04338
[26]	validation-error:0.04300
[27]	validation-error:0.04266
[28]	validation-error:0.04276
[29]	validation-error:0.04262
[30]	validation-error:0.04214
[31]	validation-error:0.04190
[32]	validation-error:0.04156
[33]	validation-error:0.04149
[34]	validation-err

[I 2024-04-09 10:36:05,814] Trial 35 finished with value: 0.19483866766440525 and parameters: {'eta': 0.2785539882159368, 'max_depth': 10, 'subsample': 0.5835311363617826, 'colsample_bytree': 0.513046214315326, 'gamma': 1.400429248386738, 'min_child_weight': 7.611466961239313, 'lambda': 3.5973685983312578, 'alpha': 0.5208531547751275}. Best is trial 1 with value: 0.18675797658082016.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.35625
[2]	validation-error:0.15384
[3]	validation-error:0.10453
[4]	validation-error:0.08877
[5]	validation-error:0.07627
[6]	validation-error:0.07284
[7]	validation-error:0.06722
[8]	validation-error:0.05698
[9]	validation-error:0.05232
[10]	validation-error:0.05043
[11]	validation-error:0.04965
[12]	validation-error:0.04862
[13]	validation-error:0.04749
[14]	validation-error:0.04666
[15]	validation-error:0.04574
[16]	validation-error:0.04516
[17]	validation-error:0.04440
[18]	validation-error:0.04382
[19]	validation-error:0.04317
[20]	validation-error:0.04238
[21]	validation-error:0.04238
[22]	validation-error:0.04218
[23]	validation-error:0.04197
[24]	validation-error:0.04180
[25]	validation-error:0.04142
[26]	validation-error:0.04098
[27]	validation-error:0.04094
[28]	validation-error:0.04070
[29]	validation-error:0.04074
[30]	validation-error:0.04012
[31]	validation-error:0.04019
[32]	validation-error:0.04022
[33]	validation-error:0.04002
[34]	validation-err

[I 2024-04-09 10:36:08,600] Trial 36 finished with value: 0.19536549692977803 and parameters: {'eta': 0.29043302657211345, 'max_depth': 10, 'subsample': 0.5261062124285181, 'colsample_bytree': 0.5442881421173206, 'gamma': 0.4814682802070315, 'min_child_weight': 6.53939977382153, 'lambda': 4.7636249161879025, 'alpha': 1.1589800645142994}. Best is trial 1 with value: 0.18675797658082016.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.50224
[2]	validation-error:0.15099
[3]	validation-error:0.09422
[4]	validation-error:0.07383
[5]	validation-error:0.06260
[6]	validation-error:0.05931
[7]	validation-error:0.05533
[8]	validation-error:0.05424
[9]	validation-error:0.05249
[10]	validation-error:0.05026
[11]	validation-error:0.04862
[12]	validation-error:0.04834
[13]	validation-error:0.04629
[14]	validation-error:0.04588
[15]	validation-error:0.04533
[16]	validation-error:0.04475
[17]	validation-error:0.04341
[18]	validation-error:0.04310
[19]	validation-error:0.04248
[20]	validation-error:0.04252
[21]	validation-error:0.04262
[22]	validation-error:0.04204
[23]	validation-error:0.04221
[24]	validation-error:0.04201
[25]	validation-error:0.04166
[26]	validation-error:0.04105
[27]	validation-error:0.04132
[28]	validation-error:0.04101
[29]	validation-error:0.04101
[30]	validation-error:0.04081
[31]	validation-error:0.04101
[32]	validation-error:0.04050
[33]	validation-erro

[I 2024-04-09 10:36:10,876] Trial 37 finished with value: 0.19466274106630602 and parameters: {'eta': 0.27337453610411183, 'max_depth': 10, 'subsample': 0.5717102609802761, 'colsample_bytree': 0.5786093853904608, 'gamma': 1.5827365317368074, 'min_child_weight': 5.796489159946553, 'lambda': 8.799155831823297, 'alpha': 0.7722418649931301}. Best is trial 1 with value: 0.18675797658082016.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.65995
[2]	validation-error:0.17494
[3]	validation-error:0.11139
[4]	validation-error:0.08123
[5]	validation-error:0.06561
[6]	validation-error:0.06287
[7]	validation-error:0.06071
[8]	validation-error:0.05838
[9]	validation-error:0.05670


[I 2024-04-09 10:36:11,194] Trial 38 pruned. Trial was pruned at iteration 9.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.62192
[2]	validation-error:0.14928
[3]	validation-error:0.10011
[4]	validation-error:0.07490
[5]	validation-error:0.06592
[6]	validation-error:0.06486
[7]	validation-error:0.06239
[8]	validation-error:0.05951


[I 2024-04-09 10:36:11,449] Trial 39 pruned. Trial was pruned at iteration 9.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65995


[I 2024-04-09 10:36:11,681] Trial 40 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.42899
[2]	validation-error:0.16442
[3]	validation-error:0.10861
[4]	validation-error:0.08836


[I 2024-04-09 10:36:11,906] Trial 41 pruned. Trial was pruned at iteration 4.


[0]	validation-error:0.65995
[1]	validation-error:0.43345
[2]	validation-error:0.14445
[3]	validation-error:0.09069
[4]	validation-error:0.07277
[5]	validation-error:0.06249
[6]	validation-error:0.06027
[7]	validation-error:0.05790
[8]	validation-error:0.05530
[9]	validation-error:0.05328
[10]	validation-error:0.05167
[11]	validation-error:0.05036
[12]	validation-error:0.04862


Parameters: { "n_estimators" } are not used.



[13]	validation-error:0.04738
[14]	validation-error:0.04649
[15]	validation-error:0.04625
[16]	validation-error:0.04577
[17]	validation-error:0.04461
[18]	validation-error:0.04409
[19]	validation-error:0.04372
[20]	validation-error:0.04310
[21]	validation-error:0.04300
[22]	validation-error:0.04231
[23]	validation-error:0.04211
[24]	validation-error:0.04166
[25]	validation-error:0.04135
[26]	validation-error:0.04101
[27]	validation-error:0.04074
[28]	validation-error:0.04108
[29]	validation-error:0.04091
[30]	validation-error:0.04118
[31]	validation-error:0.04091
[32]	validation-error:0.04094
[33]	validation-error:0.04043
[34]	validation-error:0.04009
[35]	validation-error:0.04009
[36]	validation-error:0.04019
[37]	validation-error:0.04002
[38]	validation-error:0.03998
[39]	validation-error:0.04009
[40]	validation-error:0.03916
[41]	validation-error:0.03916
[42]	validation-error:0.03906
[43]	validation-error:0.03947
[44]	validation-error:0.03954
[45]	validation-error:0.03968
[46]	valid

[I 2024-04-09 10:36:15,014] Trial 42 finished with value: 0.19536549692977803 and parameters: {'eta': 0.2800442701505011, 'max_depth': 10, 'subsample': 0.5266479093678422, 'colsample_bytree': 0.5937265912803098, 'gamma': 0.4660003491320919, 'min_child_weight': 6.158285974697966, 'lambda': 7.974837300674143, 'alpha': 0.5205420859048515}. Best is trial 1 with value: 0.18675797658082016.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.65995
[2]	validation-error:0.23240


[I 2024-04-09 10:36:15,215] Trial 43 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.35769
[2]	validation-error:0.15959


[I 2024-04-09 10:36:15,477] Trial 44 pruned. Trial was pruned at iteration 3.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65995


[I 2024-04-09 10:36:15,692] Trial 45 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995

Parameters: { "n_estimators" } are not used.




[1]	validation-error:0.65995


[I 2024-04-09 10:36:15,920] Trial 46 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.65995


[I 2024-04-09 10:36:16,130] Trial 47 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.33638
[2]	validation-error:0.11769
[3]	validation-error:0.07760
[4]	validation-error:0.06332
[5]	validation-error:0.05732
[6]	validation-error:0.05530
[7]	validation-error:0.05204
[8]	validation-error:0.05136
[9]	validation-error:0.04999
[10]	validation-error:0.04889
[11]	validation-error:0.04742
[12]	validation-error:0.04690
[13]	validation-error:0.04571
[14]	validation-error:0.04553
[15]	validation-error:0.04447
[16]	validation-error:0.04372
[17]	validation-error:0.04310
[18]	validation-error:0.04259
[19]	validation-error:0.04272
[20]	validation-error:0.04266
[21]	validation-error:0.04201
[22]	validation-error:0.04187
[23]	validation-error:0.04142
[24]	validation-error:0.04101
[25]	validation-error:0.04029
[26]	validation-error:0.04019
[27]	validation-error:0.04005
[28]	validation-error:0.04029
[29]	validation-error:0.04019
[30]	validation-error:0.03998
[31]	validation-error:0.04002
[32]	validation-error:0.03978
[33]	validation-erro

[I 2024-04-09 10:36:18,158] Trial 48 finished with value: 0.1969375289546276 and parameters: {'eta': 0.2990744798294893, 'max_depth': 10, 'subsample': 0.5587449565801841, 'colsample_bytree': 0.6548999405435446, 'gamma': 1.006977772749516, 'min_child_weight': 3.364885742973213, 'lambda': 3.8823229915244903, 'alpha': 3.107497423410337}. Best is trial 1 with value: 0.18675797658082016.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:18,372] Trial 49 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:18,566] Trial 50 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.42180
[2]	validation-error:0.16189
[3]	validation-error:0.10518


[I 2024-04-09 10:36:18,797] Trial 51 pruned. Trial was pruned at iteration 3.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.41402
[2]	validation-error:0.14085
[3]	validation-error:0.09062
[4]	validation-error:0.07287
[5]	validation-error:0.06123
[6]	validation-error:0.05735
[7]	validation-error:0.05478
[8]	validation-error:0.05232
[9]	validation-error:0.05108
[10]	validation-error:0.04975
[11]	validation-error:0.04848
[12]	validation-error:0.04804
[13]	validation-error:0.04697
[14]	validation-error:0.04639
[15]	validation-error:0.04564
[16]	validation-error:0.04499
[17]	validation-error:0.04485
[18]	validation-error:0.04457
[19]	validation-error:0.04396
[20]	validation-error:0.04334
[21]	validation-error:0.04303
[22]	validation-error:0.04262
[23]	validation-error:0.04262
[24]	validation-error:0.04238
[25]	validation-error:0.04204
[26]	validation-error:0.04177
[27]	validation-error:0.04159
[28]	validation-error:0.04118
[29]	validation-error:0.04129
[30]	validation-error:0.04101
[31]	validation-error:0.04129
[32]	validation-error:0.04067
[33]	validation-erro

[I 2024-04-09 10:36:20,828] Trial 52 finished with value: 0.19953006083707683 and parameters: {'eta': 0.28610320750949897, 'max_depth': 10, 'subsample': 0.5235967540866505, 'colsample_bytree': 0.6131759058310653, 'gamma': 0.524396146464646, 'min_child_weight': 6.54777887883144, 'lambda': 4.937778609668618, 'alpha': 1.1358144462871356}. Best is trial 1 with value: 0.18675797658082016.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.27776
[2]	validation-error:0.07092
[3]	validation-error:0.06160
[4]	validation-error:0.05821
[5]	validation-error:0.05520
[6]	validation-error:0.05317
[7]	validation-error:0.05215
[8]	validation-error:0.05019
[9]	validation-error:0.04965
[10]	validation-error:0.04838
[11]	validation-error:0.04776
[12]	validation-error:0.04759
[13]	validation-error:0.04625
[14]	validation-error:0.04553
[15]	validation-error:0.04547
[16]	validation-error:0.04447
[17]	validation-error:0.04413
[18]	validation-error:0.04396
[19]	validation-error:0.04324
[20]	validation-error:0.04307
[21]	validation-error:0.04269
[22]	validation-error:0.04262
[23]	validation-error:0.04248
[24]	validation-error:0.04190
[25]	validation-error:0.04224
[26]	validation-error:0.04190
[27]	validation-error:0.04190
[28]	validation-error:0.04159
[29]	validation-error:0.04135
[30]	validation-error:0.04132
[31]	validation-error:0.04146
[32]	validation-error:0.04118
[33]	validation-error:0.04101
[34]	validation-err

[I 2024-04-09 10:36:21,340] Trial 53 pruned. Trial was pruned at iteration 36.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:21,526] Trial 54 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995

Parameters: { "n_estimators" } are not used.




[1]	validation-error:0.37592
[2]	validation-error:0.09535
[3]	validation-error:0.06513
[4]	validation-error:0.05492
[5]	validation-error:0.05249
[6]	validation-error:0.04968
[7]	validation-error:0.04944
[8]	validation-error:0.04786
[9]	validation-error:0.04629
[10]	validation-error:0.04526
[11]	validation-error:0.04409
[12]	validation-error:0.04358
[13]	validation-error:0.04362
[14]	validation-error:0.04314
[15]	validation-error:0.04283
[16]	validation-error:0.04228
[17]	validation-error:0.04221
[18]	validation-error:0.04248
[19]	validation-error:0.04173
[20]	validation-error:0.04156
[21]	validation-error:0.04129
[22]	validation-error:0.04135
[23]	validation-error:0.04139
[24]	validation-error:0.04094
[25]	validation-error:0.04060
[26]	validation-error:0.04057
[27]	validation-error:0.04033
[28]	validation-error:0.04046
[29]	validation-error:0.04043
[30]	validation-error:0.04039
[31]	validation-error:0.03988
[32]	validation-error:0.03971
[33]	validation-error:0.03954
[34]	validation-er

[I 2024-04-09 10:36:23,475] Trial 55 finished with value: 0.19466274106630602 and parameters: {'eta': 0.2899814992441893, 'max_depth': 10, 'subsample': 0.5636935217210775, 'colsample_bytree': 0.7631039564924248, 'gamma': 0.07923156269697784, 'min_child_weight': 6.057268439524371, 'lambda': 3.007983277403706, 'alpha': 1.7007737661514328}. Best is trial 1 with value: 0.18675797658082016.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.65995


[I 2024-04-09 10:36:23,661] Trial 56 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:23,849] Trial 57 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:24,075] Trial 58 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995

Parameters: { "n_estimators" } are not used.






[I 2024-04-09 10:36:24,281] Trial 59 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.65995


[I 2024-04-09 10:36:24,468] Trial 60 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.31692
[2]	validation-error:0.15236
[3]	validation-error:0.10306
[4]	validation-error:0.08771


[I 2024-04-09 10:36:24,699] Trial 61 pruned. Trial was pruned at iteration 4.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.39189
[2]	validation-error:0.15661
[3]	validation-error:0.10392


[I 2024-04-09 10:36:24,942] Trial 62 pruned. Trial was pruned at iteration 4.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.48093


[I 2024-04-09 10:36:25,150] Trial 63 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:36:25,337] Trial 64 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.36475
[2]	validation-error:0.16024
[3]	validation-error:0.10947


[I 2024-04-09 10:36:25,595] Trial 65 pruned. Trial was pruned at iteration 3.


[0]	validation-error:0.65995

Parameters: { "n_estimators" } are not used.




[1]	validation-error:0.65995


[I 2024-04-09 10:36:25,796] Trial 66 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:26,010] Trial 67 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:26,214] Trial 68 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:26,424] Trial 69 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.54014


[I 2024-04-09 10:36:26,643] Trial 70 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.41741
[2]	validation-error:0.14301
[3]	validation-error:0.08977
[4]	validation-error:0.07250
[5]	validation-error:0.06253
[6]	validation-error:0.06057
[7]	validation-error:0.05718
[8]	validation-error:0.05444
[9]	validation-error:0.05352
[10]	validation-error:0.05088
[11]	validation-error:0.04886
[12]	validation-error:0.04766
[13]	validation-error:0.04694
[14]	validation-error:0.04632
[15]	validation-error:0.04519
[16]	validation-error:0.04423
[17]	validation-error:0.04386
[18]	validation-error:0.04276
[19]	validation-error:0.04238
[20]	validation-error:0.04214
[21]	validation-error:0.04142
[22]	validation-error:0.04159
[23]	validation-error:0.04146
[24]	validation-error:0.04108
[25]	validation-error:0.04067
[26]	validation-error:0.04067
[27]	validation-error:0.04063
[28]	validation-error:0.04077
[29]	validation-error:0.04060
[30]	validation-error:0.04063
[31]	validation-error:0.04057
[32]	validation-error:0.04029
[33]	validation-erro

[I 2024-04-09 10:36:29,548] Trial 71 finished with value: 0.19510226011999407 and parameters: {'eta': 0.28314803219900386, 'max_depth': 10, 'subsample': 0.528732931038588, 'colsample_bytree': 0.6038720953185559, 'gamma': 0.36120570026534815, 'min_child_weight': 6.149742076081085, 'lambda': 8.03133385115736, 'alpha': 0.5120524123490713}. Best is trial 1 with value: 0.18675797658082016.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.42104
[2]	validation-error:0.13002
[3]	validation-error:0.08589
[4]	validation-error:0.06746
[5]	validation-error:0.05759
[6]	validation-error:0.05496
[7]	validation-error:0.05126
[8]	validation-error:0.05095
[9]	validation-error:0.04917
[10]	validation-error:0.04841
[11]	validation-error:0.04663
[12]	validation-error:0.04571
[13]	validation-error:0.04543
[14]	validation-error:0.04485
[15]	validation-error:0.04444
[16]	validation-error:0.04382
[17]	validation-error:0.04334
[18]	validation-error:0.04310
[19]	validation-error:0.04262
[20]	validation-error:0.04242
[21]	validation-error:0.04204
[22]	validation-error:0.04201
[23]	validation-error:0.04201
[24]	validation-error:0.04166
[25]	validation-error:0.04149
[26]	validation-error:0.04101
[27]	validation-error:0.04060
[28]	validation-error:0.04029
[29]	validation-error:0.03998
[30]	validation-error:0.04012
[31]	validation-error:0.04015
[32]	validation-error:0.04012
[33]	validation-error:0.04012
[34]	validation-err

[I 2024-04-09 10:36:30,370] Trial 72 pruned. Trial was pruned at iteration 62.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:30,583] Trial 73 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995

Parameters: { "n_estimators" } are not used.






[I 2024-04-09 10:36:30,791] Trial 74 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-09 10:36:30,987] Trial 75 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:36:31,181] Trial 76 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.56577


[I 2024-04-09 10:36:31,402] Trial 77 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995

Parameters: { "n_estimators" } are not used.






[I 2024-04-09 10:36:31,601] Trial 78 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:31,800] Trial 79 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:31,999] Trial 80 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.41851
[2]	validation-error:0.14404
[3]	validation-error:0.09052
[4]	validation-error:0.07281
[5]	validation-error:0.06304
[6]	validation-error:0.05989
[7]	validation-error:0.05763
[8]	validation-error:0.05554
[9]	validation-error:0.05362


[I 2024-04-09 10:36:32,382] Trial 81 pruned. Trial was pruned at iteration 9.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.32871
[2]	validation-error:0.13708
[3]	validation-error:0.09059
[4]	validation-error:0.07277
[5]	validation-error:0.06150
[6]	validation-error:0.05807
[7]	validation-error:0.05537
[8]	validation-error:0.05437
[9]	validation-error:0.05341
[10]	validation-error:0.05211


[I 2024-04-09 10:36:32,694] Trial 82 pruned. Trial was pruned at iteration 10.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.41950
[2]	validation-error:0.14489
[3]	validation-error:0.09021
[4]	validation-error:0.07209
[5]	validation-error:0.06034
[6]	validation-error:0.05729
[7]	validation-error:0.05557
[8]	validation-error:0.05273
[9]	validation-error:0.05153
[10]	validation-error:0.04975
[11]	validation-error:0.04814
[12]	validation-error:0.04800
[13]	validation-error:0.04752
[14]	validation-error:0.04708


[I 2024-04-09 10:36:33,061] Trial 83 pruned. Trial was pruned at iteration 15.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.46140


[I 2024-04-09 10:36:33,286] Trial 84 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.33573
[2]	validation-error:0.12136
[3]	validation-error:0.08192
[4]	validation-error:0.06537
[5]	validation-error:0.05900
[6]	validation-error:0.05756
[7]	validation-error:0.05386
[8]	validation-error:0.05215
[9]	validation-error:0.05040
[10]	validation-error:0.04992
[11]	validation-error:0.04841
[12]	validation-error:0.04762
[13]	validation-error:0.04711
[14]	validation-error:0.04639
[15]	validation-error:0.04526
[16]	validation-error:0.04495
[17]	validation-error:0.04461
[18]	validation-error:0.04416
[19]	validation-error:0.04368
[20]	validation-error:0.04362
[21]	validation-error:0.04338
[22]	validation-error:0.04276
[23]	validation-error:0.04286
[24]	validation-error:0.04293


[I 2024-04-09 10:36:33,774] Trial 85 pruned. Trial was pruned at iteration 24.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:36:33,993] Trial 86 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:34,199] Trial 87 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:34,393] Trial 88 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:34,598] Trial 89 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:34,827] Trial 90 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:36:35,024] Trial 91 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:35,245] Trial 92 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:35,448] Trial 93 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:36:35,644] Trial 94 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-09 10:36:35,851] Trial 95 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.34611
[2]	validation-error:0.15466


Parameters: { "n_estimators" } are not used.

[I 2024-04-09 10:36:36,169] Trial 96 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-09 10:36:36,366] Trial 97 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:36,568] Trial 98 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-09 10:36:36,767] Trial 99 pruned. Trial was pruned at iteration 1.


Best Hyperparameters:  {'eta': 0.10031864027110182, 'max_depth': 9, 'subsample': 0.9231738329456532, 'colsample_bytree': 0.7841748561498321, 'gamma': 0.710300338796076, 'min_child_weight': 6.358346542127798, 'lambda': 8.191237945400758, 'alpha': 5.832972334974991}
Best Error:  0.18675797658082016


#### Model Training

In [8]:
from sklearn.model_selection import KFold
from xgboost import DMatrix, train

# Initialize CV
cv = KFold(n_splits=10, shuffle=True, random_state=1)

best_params_lexical_content['objective'] = 'binary:hinge'
best_params_lexical_content['eval_metric'] = 'error'

# Convert the data into DMatrix format
lexical_content_train = DMatrix(X_train_lexical_content, label=y_train)
lexical_content_valid = DMatrix(X_test_lexical_content, label=y_test)

# Train the Model
xgb_classifier = train(best_params_lexical_content, lexical_content_train, num_boost_round=1000)
y_pred_lexical_content = xgb_classifier.predict(lexical_content_valid)

print("Model training done.")

Model training done.


In [9]:
# Classification Report
print(classification_report(y_test, y_pred_lexical_content))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97     23942
           1       0.96      0.94      0.95     12541

    accuracy                           0.96     36483
   macro avg       0.96      0.96      0.96     36483
weighted avg       0.96      0.96      0.96     36483



#### Practical Evaluation

In [10]:
# Dumping the model
joblib.dump(xgb_classifier, 'xgb_filter_lexical-content.sav')

['xgb_filter_lexical-content.sav']

In [14]:
import content_generator_filter
import feature_generation_content_function_htmlin
import time

def xgb_predict_maliciousness(url):

    numerical_values = content_generator_filter.feature_generator(url)
    # print(numerical_values)
    numerical_values = DMatrix(numerical_values)

    match xgb_classifier.predict(numerical_values):
        case 0:
            return "Benign"
        case 1:
            return "Malware"
        case 2:
            return "Phishing"
        case 3:
            return "Defacement"

url = "www.facebook.com/"
print("Current URL: "+url)

print("------------- Filter-Based (Lexical + Content) -------------")
for i in range(15):
    start = time.perf_counter()
    prediction = xgb_predict_maliciousness(url)
    end = time.perf_counter()
    print("Trial "+str(i))
    print(prediction)
    print(end-start)

Current URL: www.facebook.com/
------------- Filter-Based (Lexical + Content) -------------
Trial 0
Benign
0.5154141000239179
Trial 1
Benign
0.4878694000071846
Trial 2
Benign
0.42477990000043064
Trial 3
Benign
0.4510425999760628
Trial 4
Benign
0.43153169995639473
Trial 5
Benign
0.4045920000062324
Trial 6
Benign
0.43066009995527565
Trial 7
Benign
0.4085478000342846
Trial 8
Benign
0.448262199992314
Trial 9
Benign
0.5521377000259236
Trial 10
Benign
0.4245381999644451
Trial 11
Benign
0.43366730003617704
Trial 12
Benign
0.5210599999991246
Trial 13
Benign
0.5085515999817289
Trial 14
Benign
0.4594624000019394


#### Evaluation

In [12]:
'''# Confusion Matrix for 12 Features
cm_up = confusion_matrix(y_test, y_pred, labels=xgb_classifier.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm_up, display_labels = xgb_classifier.classes_)
disp.plot()
plt.show()'''

'# Confusion Matrix for 12 Features\ncm_up = confusion_matrix(y_test, y_pred, labels=xgb_classifier.classes_)\ndisp = ConfusionMatrixDisplay(confusion_matrix = cm_up, display_labels = xgb_classifier.classes_)\ndisp.plot()\nplt.show()'

In [13]:
'''# Cross Validation Score
scores = cross_val_score(XGBClassifier(**params_gbm),
                        X_train, y_train, scoring='accuracy', cv=cv).mean()

print(scores)'''

"# Cross Validation Score\nscores = cross_val_score(XGBClassifier(**params_gbm),\n                        X_train, y_train, scoring='accuracy', cv=cv).mean()\n\nprint(scores)"