# XGB Classifier with Filter-Based FS

Steven Sison | March 9, 2024

## Description

This document will be used to train a model using the reduced feature set obtain by using the wrapper-based method, forward feature selection. The model will be evaluated in terms of the usual metrics (accuracy, precision, F1-score, recall) as well as the training time. The model will also be stored for future evaluation purposes.

## Training the Model

### Preliminaries

#### 1. Loading the Dataset

In [1]:
import pandas as pd                     # For data transformation
import numpy as numpy                   # For scientific calculations
import seaborn as sns                   # For data visualizations
import matplotlib.pyplot as plt         # For plotting
import plotly.graph_objects as go       # For plotting
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay
from xgboost import XGBClassifier, DMatrix, train
from sklearn.pipeline import Pipeline
import time
from datetime import datetime
import joblib
import os
import optuna
from sklearn.metrics import mean_squared_error # or any other metric
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("../../../02_feature-engineering/final-datasets/binary_unbalanced_with_lexical-content.csv")      # Loading the dataset

dataset.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,url_type,blank_lines_count,blank_spaces_count,word_count,average_word_len,webpage_size,webpage_entropy,js_count,sus_js_count,js_eval_count,...,has_swf_in_string,has_cgi_in_string,has_php_in_string,has_abuse_in_string,has_admin_in_string,has_bin_in_string,has_personal_in_string,has_update_in_string,has_verification_in_string,url_scheme
0,1,14,36,0,0.0,1186,5.269303,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,415,3952,11450,4.68393,26155,4.565537,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,5715,88450,420,5.833333,339327,4.656704,13,10,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1461,6292,3049,4.515907,89249,5.14958,11,2,0,...,0,0,0,0,0,0,0,0,0,0
4,0,43,273,0,0.0,1530,4.754726,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
dataset.head()

Unnamed: 0,url_type,blank_lines_count,blank_spaces_count,word_count,average_word_len,webpage_size,webpage_entropy,js_count,sus_js_count,js_eval_count,...,has_swf_in_string,has_cgi_in_string,has_php_in_string,has_abuse_in_string,has_admin_in_string,has_bin_in_string,has_personal_in_string,has_update_in_string,has_verification_in_string,url_scheme
0,1,14,36,0,0.0,1186,5.269303,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,415,3952,11450,4.68393,26155,4.565537,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,5715,88450,420,5.833333,339327,4.656704,13,10,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1461,6292,3049,4.515907,89249,5.14958,11,2,0,...,0,0,0,0,0,0,0,0,0,0
4,0,43,273,0,0.0,1530,4.754726,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
x_train, x_test, y_train, y_test = train_test_split(dataset.drop(columns=['url_type']), dataset['url_type'], test_size = 0.2, random_state=42)

#### 2. Preprocessing (Balancing)

In [4]:
dataset['url_type'].value_counts()

url_type
0    120080
1     62334
Name: count, dtype: int64

#### 3. Removing Unnecessary Features

In [5]:
important_features_lexical = ['url_host_length',
                              'url_is_https',  
                              'url_num_periods',
                              'url_path_length',
                              'js_count',
                              'has_log_in_html',
                              'meta_tag_count',
                              'js_search_count',
                              'url_num_forward_slash',
                              'sus_js_count',
                              'js_link_count',
                              'webpage_entropy',
                              'url_num_ampersand',
                              'url_num_subdomain',
                              'webpage_size',
                              'has_free_in_html',
                              'has_php_in_string',
                              'url_num_of_hyphens',
                              'url_query_length',
                              'get_tld',
                              'url_scheme']

X_test_lexical_content = x_test[important_features_lexical]
X_train_lexical_content = x_train[important_features_lexical]

print("Lexical+Content-Based XGB Model has a total of "+str(len(important_features_lexical))+" features.")

Lexical+Content-Based XGB Model has a total of 21 features.


#### Hyper-parameter Optimization

In [6]:
# Define the objective function for Optuna
def objective_lexical_content(trial):
    # Define the search space for hyperparameters
    param = {
        'objective': 'binary:hinge',
        'eval_metric': 'error',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'n_estimators': 100000, # Fix the boosting round and use early stopping
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
        'lambda': trial.suggest_float('lambda', 0.1, 10.0),
        'alpha': trial.suggest_float('alpha', 0.0, 10.0),
    }
    
    # Split the data into further training and validation sets (three sets are preferable)
    train_data, valid_data, train_target, valid_target = train_test_split(X_train_lexical_content, y_train, test_size=0.2, random_state=42)
    
    # Convert the data into DMatrix format
    dtrain = DMatrix(train_data, label=train_target)
    dvalid = DMatrix(valid_data, label=valid_target)
    
    # Define the pruning callback for early stopping
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-error')
    
    # Train the model with early stopping
    model = train(param, dtrain, num_boost_round=100000, evals=[(dvalid, 'validation')], early_stopping_rounds=100, callbacks=[pruning_callback])
    
    # Make predictions on the test set
    dtest = DMatrix(valid_data)
    y_pred = model.predict(dtest)
    
    # Calculate the root mean squared error
    error = mean_squared_error(valid_target, y_pred, squared=False)
    
    return error

# Create an Optuna study and optimize the objective function
study_lexical_content = optuna.create_study(direction='minimize')
study_lexical_content.optimize(objective_lexical_content, n_trials=100) # Control the number of trials

# Print the best hyperparameters and the best RMSE
best_params_lexical_content = study_lexical_content.best_params
best_error_lexical_content = study_lexical_content.best_value
print("Best Hyperparameters (33 Features): ", best_params_lexical_content)
print("Best Error (33 Features): ", best_error_lexical_content)

[I 2024-04-02 13:15:17,113] A new study created in memory with name: no-name-fd0287d0-2ee8-4d22-b338-f79e6dab5a76


[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65683
[3]	validation-error:0.24004
[4]	validation-error:0.18789


Parameters: { "n_estimators" } are not used.



[5]	validation-error:0.17597
[6]	validation-error:0.15404
[7]	validation-error:0.13359
[8]	validation-error:0.10052
[9]	validation-error:0.10083
[10]	validation-error:0.09610
[11]	validation-error:0.09545
[12]	validation-error:0.09347
[13]	validation-error:0.09062
[14]	validation-error:0.09079
[15]	validation-error:0.09014
[16]	validation-error:0.08870
[17]	validation-error:0.08956
[18]	validation-error:0.08764
[19]	validation-error:0.08754
[20]	validation-error:0.08761
[21]	validation-error:0.08627
[22]	validation-error:0.08661
[23]	validation-error:0.08648
[24]	validation-error:0.08545
[25]	validation-error:0.08422
[26]	validation-error:0.08404
[27]	validation-error:0.08356
[28]	validation-error:0.08216
[29]	validation-error:0.08161
[30]	validation-error:0.08158
[31]	validation-error:0.08117
[32]	validation-error:0.08000
[33]	validation-error:0.07993
[34]	validation-error:0.07747
[35]	validation-error:0.07699
[36]	validation-error:0.07572
[37]	validation-error:0.07514
[38]	validation

[I 2024-04-02 13:15:23,559] Trial 0 finished with value: 0.2157818932863543 and parameters: {'eta': 0.2069185763712498, 'max_depth': 3, 'subsample': 0.732808535536775, 'colsample_bytree': 0.5338231138458792, 'gamma': 3.859224125978178, 'min_child_weight': 4.623991397095878, 'lambda': 7.384862731691721, 'alpha': 7.32042329305727}. Best is trial 0 with value: 0.2157818932863543.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.44133
[2]	validation-error:0.10522
[3]	validation-error:0.07161
[4]	validation-error:0.06150
[5]	validation-error:0.05677
[6]	validation-error:0.05420
[7]	validation-error:0.05372
[8]	validation-error:0.05242
[9]	validation-error:0.05150
[10]	validation-error:0.05084
[11]	validation-error:0.04992
[12]	validation-error:0.04879
[13]	validation-error:0.04865
[14]	validation-error:0.04824
[15]	validation-error:0.04838
[16]	validation-error:0.04793
[17]	validation-error:0.04762
[18]	validation-error:0.04735
[19]	validation-error:0.04704
[20]	validation-error:0.04708
[21]	validation-error:0.04636
[22]	validation-error:0.04605
[23]	validation-error:0.04608
[24]	validation-error:0.04601
[25]	validation-error:0.04595
[26]	validation-error:0.04588
[27]	validation-error:0.04560
[28]	validation-error:0.04560
[29]	validation-error:0.04571
[30]	validation-error:0.04564
[31]	validation-error:0.04564
[32]	validation-error:0.04567
[33]	validation-erro

[I 2024-04-02 13:15:26,200] Trial 1 finished with value: 0.21039553924497312 and parameters: {'eta': 0.2838420857776958, 'max_depth': 10, 'subsample': 0.5917105756328409, 'colsample_bytree': 0.8039485449205859, 'gamma': 6.492852134765885, 'min_child_weight': 5.323448646252555, 'lambda': 9.727227527123738, 'alpha': 5.031160093275552}. Best is trial 1 with value: 0.21039553924497312.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65995
[3]	validation-error:0.64895
[4]	validation-error:0.13862
[5]	validation-error:0.13472
[6]	validation-error:0.10292
[7]	validation-error:0.09532
[8]	validation-error:0.09850
[9]	validation-error:0.08922
[10]	validation-error:0.08918
[11]	validation-error:0.08682
[12]	validation-error:0.08367
[13]	validation-error:0.08463
[14]	validation-error:0.08291
[15]	validation-error:0.08237
[16]	validation-error:0.08257
[17]	validation-error:0.08195
[18]	validation-error:0.08089
[19]	validation-error:0.08130
[20]	validation-error:0.08089
[21]	validation-error:0.08045
[22]	validation-error:0.08010
[23]	validation-error:0.07911
[24]	validation-error:0.07839
[25]	validation-error:0.07825
[26]	validation-error:0.07740
[27]	validation-error:0.07568
[28]	validation-error:0.07534
[29]	validation-error:0.07538
[30]	validation-error:0.07486
[31]	validation-error:0.07469
[32]	validation-error:0.07366
[33]	validation-erro

[I 2024-04-02 13:15:33,094] Trial 2 finished with value: 0.21689050480552063 and parameters: {'eta': 0.1573478471505321, 'max_depth': 4, 'subsample': 0.5071637320043747, 'colsample_bytree': 0.9939020392138341, 'gamma': 6.017115361962472, 'min_child_weight': 6.711522087349976, 'lambda': 5.561705469764615, 'alpha': 7.800634147842176}. Best is trial 1 with value: 0.21039553924497312.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65995
[3]	validation-error:0.65995
[4]	validation-error:0.65995
[5]	validation-error:0.26848
[6]	validation-error:0.15425
[7]	validation-error:0.10840
[8]	validation-error:0.08058
[9]	validation-error:0.07240
[10]	validation-error:0.06945
[11]	validation-error:0.06599
[12]	validation-error:0.06325
[13]	validation-error:0.06078
[14]	validation-error:0.05972
[15]	validation-error:0.05876
[16]	validation-error:0.05753
[17]	validation-error:0.05698
[18]	validation-error:0.05701
[19]	validation-error:0.05554
[20]	validation-error:0.05520
[21]	validation-error:0.05516
[22]	validation-error:0.05454
[23]	validation-error:0.05424
[24]	validation-error:0.05317
[25]	validation-error:0.05290
[26]	validation-error:0.05293
[27]	validation-error:0.05280
[28]	validation-error:0.05304
[29]	validation-error:0.05235
[30]	validation-error:0.05194
[31]	validation-error:0.05201
[32]	validation-error:0.05156
[33]	validation-erro

[I 2024-04-02 13:15:36,590] Trial 3 finished with value: 0.21434811067538695 and parameters: {'eta': 0.10018621639241859, 'max_depth': 9, 'subsample': 0.7154033585178307, 'colsample_bytree': 0.6487828254929053, 'gamma': 6.123041256532076, 'min_child_weight': 4.070528980759753, 'lambda': 2.4580671494994557, 'alpha': 9.782135987758071}. Best is trial 1 with value: 0.21039553924497312.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65683
[2]	validation-error:0.22404
[3]	validation-error:0.13588
[4]	validation-error:0.10066
[5]	validation-error:0.11913
[6]	validation-error:0.12070
[7]	validation-error:0.09508
[8]	validation-error:0.09309
[9]	validation-error:0.09258
[10]	validation-error:0.09206
[11]	validation-error:0.09288
[12]	validation-error:0.08942
[13]	validation-error:0.08877
[14]	validation-error:0.08720
[15]	validation-error:0.08730
[16]	validation-error:0.08583
[17]	validation-error:0.08535
[18]	validation-error:0.08339
[19]	validation-error:0.08226
[20]	validation-error:0.08117
[21]	validation-error:0.07983
[22]	validation-error:0.07938
[23]	validation-error:0.07956
[24]	validation-error:0.07829
[25]	validation-error:0.07808
[26]	validation-error:0.07716
[27]	validation-error:0.07589
[28]	validation-error:0.07397
[29]	validation-error:0.07363
[30]	validation-error:0.07246
[31]	validation-error:0.07202
[32]	validation-error:0.07144
[33]	validation-erro

[I 2024-04-02 13:15:46,038] Trial 4 finished with value: 0.2018348695291552 and parameters: {'eta': 0.2887729281323822, 'max_depth': 3, 'subsample': 0.6345495925728502, 'colsample_bytree': 0.597256726764093, 'gamma': 2.7894495299701005, 'min_child_weight': 3.7345369331262077, 'lambda': 8.239549731991243, 'alpha': 2.261862957626033}. Best is trial 4 with value: 0.2018348695291552.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.27783
[3]	validation-error:0.17028
[4]	validation-error:0.15606


[I 2024-04-02 13:15:46,301] Trial 5 pruned. Trial was pruned at iteration 4.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.56450
[3]	validation-error:0.18683
[4]	validation-error:0.10888
[5]	validation-error:0.07736
[6]	validation-error:0.06863
[7]	validation-error:0.06071
[8]	validation-error:0.05592
[9]	validation-error:0.05362
[10]	validation-error:0.05311
[11]	validation-error:0.05204
[12]	validation-error:0.05208
[13]	validation-error:0.05108
[14]	validation-error:0.05074
[15]	validation-error:0.04992
[16]	validation-error:0.04923
[17]	validation-error:0.04865
[18]	validation-error:0.04776
[19]	validation-error:0.04725
[20]	validation-error:0.04677
[21]	validation-error:0.04636
[22]	validation-error:0.04605
[23]	validation-error:0.04529
[24]	validation-error:0.04471
[25]	validation-error:0.04437
[26]	validation-error:0.04420
[27]	validation-error:0.04327
[28]	validation-error:0.04296
[29]	validation-error:0.04286
[30]	validation-error:0.04245
[31]	validation-error:0.04231
[32]	validation-error:0.04207
[33]	validation-erro

[I 2024-04-02 13:15:50,238] Trial 6 finished with value: 0.19858339573032507 and parameters: {'eta': 0.1746936939255727, 'max_depth': 10, 'subsample': 0.869295461565226, 'colsample_bytree': 0.5731651429489562, 'gamma': 3.468315576464658, 'min_child_weight': 8.433891536200917, 'lambda': 2.935876798202686, 'alpha': 3.70842216783001}. Best is trial 6 with value: 0.19858339573032507.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65995


[I 2024-04-02 13:15:50,489] Trial 7 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-02 13:15:50,730] Trial 8 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.65995


[I 2024-04-02 13:15:50,965] Trial 9 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:15:51,248] Trial 10 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995
[1]	validation-error:0.41580
[2]	validation-error:0.12906
[3]	validation-error:0.08994
[4]	validation-error:0.07538
[5]	validation-error:0.06287
[6]	validation-error:0.06311
[7]	validation-error:0.06054
[8]	validation-error:0.05804
[9]	validation-error:0.05756
[10]	validation-error:0.05694


Parameters: { "n_estimators" } are not used.



[11]	validation-error:0.05588
[12]	validation-error:0.05492
[13]	validation-error:0.05287
[14]	validation-error:0.05167
[15]	validation-error:0.05102
[16]	validation-error:0.05067
[17]	validation-error:0.04982
[18]	validation-error:0.04910
[19]	validation-error:0.04834
[20]	validation-error:0.04790
[21]	validation-error:0.04749
[22]	validation-error:0.04732
[23]	validation-error:0.04701
[24]	validation-error:0.04595
[25]	validation-error:0.04605
[26]	validation-error:0.04581
[27]	validation-error:0.04533
[28]	validation-error:0.04523
[29]	validation-error:0.04509
[30]	validation-error:0.04471
[31]	validation-error:0.04399
[32]	validation-error:0.04386
[33]	validation-error:0.04386
[34]	validation-error:0.04362
[35]	validation-error:0.04368
[36]	validation-error:0.04341
[37]	validation-error:0.04320
[38]	validation-error:0.04327
[39]	validation-error:0.04283
[40]	validation-error:0.04286
[41]	validation-error:0.04272
[42]	validation-error:0.04235
[43]	validation-error:0.04235
[44]	valid

[I 2024-04-02 13:15:57,722] Trial 11 finished with value: 0.19271694923832286 and parameters: {'eta': 0.29591564729388425, 'max_depth': 7, 'subsample': 0.80410766434227, 'colsample_bytree': 0.6449338176227917, 'gamma': 1.8878590520534537, 'min_child_weight': 2.243930473124974, 'lambda': 3.0861333225180765, 'alpha': 2.8962538933187076}. Best is trial 11 with value: 0.19271694923832286.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.17460
[3]	validation-error:0.10083
[4]	validation-error:0.07757
[5]	validation-error:0.06815
[6]	validation-error:0.06407
[7]	validation-error:0.06338
[8]	validation-error:0.06136
[9]	validation-error:0.06109
[10]	validation-error:0.05968
[11]	validation-error:0.05917
[12]	validation-error:0.05866
[13]	validation-error:0.05811
[14]	validation-error:0.05670
[15]	validation-error:0.05578
[16]	validation-error:0.05465
[17]	validation-error:0.05427
[18]	validation-error:0.05331
[19]	validation-error:0.05259
[20]	validation-error:0.05160
[21]	validation-error:0.05078
[22]	validation-error:0.04995
[23]	validation-error:0.04886
[24]	validation-error:0.04834
[25]	validation-error:0.04752
[26]	validation-error:0.04711
[27]	validation-error:0.04670
[28]	validation-error:0.04670
[29]	validation-error:0.04625
[30]	validation-error:0.04571
[31]	validation-error:0.04543
[32]	validation-error:0.04536
[33]	validation-erro

[I 2024-04-02 13:16:04,781] Trial 12 finished with value: 0.19146842277298975 and parameters: {'eta': 0.23289527322512946, 'max_depth': 7, 'subsample': 0.8426057739550341, 'colsample_bytree': 0.687956471511436, 'gamma': 0.9761944828203424, 'min_child_weight': 1.6790613962407819, 'lambda': 3.029134374457925, 'alpha': 3.557374053743783}. Best is trial 12 with value: 0.19146842277298975.


[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.17597
[3]	validation-error:0.10289
[4]	validation-error:0.08168
[5]	validation-error:0.07401


Parameters: { "n_estimators" } are not used.



[6]	validation-error:0.07181
[7]	validation-error:0.06688
[8]	validation-error:0.06397
[9]	validation-error:0.06225
[10]	validation-error:0.06147
[11]	validation-error:0.05996
[12]	validation-error:0.05886
[13]	validation-error:0.05777
[14]	validation-error:0.05674
[15]	validation-error:0.05605
[16]	validation-error:0.05417
[17]	validation-error:0.05311
[18]	validation-error:0.05252
[19]	validation-error:0.05208
[20]	validation-error:0.05156
[21]	validation-error:0.05088
[22]	validation-error:0.05030
[23]	validation-error:0.04947
[24]	validation-error:0.04903
[25]	validation-error:0.04845
[26]	validation-error:0.04804
[27]	validation-error:0.04738
[28]	validation-error:0.04773
[29]	validation-error:0.04694
[30]	validation-error:0.04571
[31]	validation-error:0.04543
[32]	validation-error:0.04502
[33]	validation-error:0.04468
[34]	validation-error:0.04420
[35]	validation-error:0.04409
[36]	validation-error:0.04368
[37]	validation-error:0.04344
[38]	validation-error:0.04320
[39]	validatio

[I 2024-04-02 13:16:19,171] Trial 13 finished with value: 0.19111020277209337 and parameters: {'eta': 0.23207638903397323, 'max_depth': 7, 'subsample': 0.8068504528551167, 'colsample_bytree': 0.6934325437479969, 'gamma': 0.840899278205417, 'min_child_weight': 1.5691143860163883, 'lambda': 1.167391404361846, 'alpha': 5.891158842216067}. Best is trial 13 with value: 0.19111020277209337.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.16483
[3]	validation-error:0.09830
[4]	validation-error:0.07740
[5]	validation-error:0.06541
[6]	validation-error:0.06102
[7]	validation-error:0.06099
[8]	validation-error:0.06075
[9]	validation-error:0.06016
[10]	validation-error:0.05907
[11]	validation-error:0.05825
[12]	validation-error:0.05708
[13]	validation-error:0.05561
[14]	validation-error:0.05475
[15]	validation-error:0.05454
[16]	validation-error:0.05355
[17]	validation-error:0.05290
[18]	validation-error:0.05304
[19]	validation-error:0.05215
[20]	validation-error:0.05156
[21]	validation-error:0.05023
[22]	validation-error:0.05013
[23]	validation-error:0.04965


[I 2024-04-02 13:16:19,888] Trial 14 pruned. Trial was pruned at iteration 23.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.14308
[3]	validation-error:0.08569
[4]	validation-error:0.07034
[5]	validation-error:0.05951
[6]	validation-error:0.05859
[7]	validation-error:0.05797
[8]	validation-error:0.05616
[9]	validation-error:0.05492
[10]	validation-error:0.05451
[11]	validation-error:0.05345
[12]	validation-error:0.05201
[13]	validation-error:0.05119
[14]	validation-error:0.04965
[15]	validation-error:0.04906
[16]	validation-error:0.04834
[17]	validation-error:0.04756
[18]	validation-error:0.04684
[19]	validation-error:0.04636
[20]	validation-error:0.04591
[21]	validation-error:0.04523
[22]	validation-error:0.04478
[23]	validation-error:0.04444
[24]	validation-error:0.04416
[25]	validation-error:0.04338
[26]	validation-error:0.04348
[27]	validation-error:0.04307
[28]	validation-error:0.04300
[29]	validation-error:0.04290
[30]	validation-error:0.04238
[31]	validation-error:0.04204
[32]	validation-error:0.04177
[33]	validation-erro

[I 2024-04-02 13:16:24,381] Trial 15 finished with value: 0.18885593419636032 and parameters: {'eta': 0.24920171651018697, 'max_depth': 8, 'subsample': 0.7969243749302927, 'colsample_bytree': 0.7025268248381582, 'gamma': 0.23722771541916587, 'min_child_weight': 1.9542417160152912, 'lambda': 1.7466848789938463, 'alpha': 5.8218178623611525}. Best is trial 15 with value: 0.18885593419636032.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.57413
[2]	validation-error:0.13794
[3]	validation-error:0.07555
[4]	validation-error:0.06325
[5]	validation-error:0.06010
[6]	validation-error:0.05811
[7]	validation-error:0.05763
[8]	validation-error:0.05633
[9]	validation-error:0.05564
[10]	validation-error:0.05468
[11]	validation-error:0.05348
[12]	validation-error:0.05187
[13]	validation-error:0.05132
[14]	validation-error:0.05108
[15]	validation-error:0.05108
[16]	validation-error:0.05026
[17]	validation-error:0.04917
[18]	validation-error:0.04858
[19]	validation-error:0.04807
[20]	validation-error:0.04752
[21]	validation-error:0.04708
[22]	validation-error:0.04642
[23]	validation-error:0.04612
[24]	validation-error:0.04612
[25]	validation-error:0.04526
[26]	validation-error:0.04547
[27]	validation-error:0.04519
[28]	validation-error:0.04464
[29]	validation-error:0.04461
[30]	validation-error:0.04423
[31]	validation-error:0.04423
[32]	validation-error:0.04358
[33]	validation-erro

[I 2024-04-02 13:16:28,998] Trial 16 finished with value: 0.19003149122404298 and parameters: {'eta': 0.26519684358892637, 'max_depth': 8, 'subsample': 0.688269501698383, 'colsample_bytree': 0.7273871242426084, 'gamma': 0.1619166019767153, 'min_child_weight': 2.3180605635942806, 'lambda': 1.5664736223508595, 'alpha': 6.642807480745711}. Best is trial 15 with value: 0.18885593419636032.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.61061
[2]	validation-error:0.12375
[3]	validation-error:0.07682
[4]	validation-error:0.06671
[5]	validation-error:0.05968
[6]	validation-error:0.05842
[7]	validation-error:0.05763
[8]	validation-error:0.05705
[9]	validation-error:0.05526
[10]	validation-error:0.05413
[11]	validation-error:0.05249
[12]	validation-error:0.05218
[13]	validation-error:0.05102
[14]	validation-error:0.05040
[15]	validation-error:0.04971
[16]	validation-error:0.04896
[17]	validation-error:0.04848
[18]	validation-error:0.04814
[19]	validation-error:0.04742
[20]	validation-error:0.04701
[21]	validation-error:0.04687
[22]	validation-error:0.04612
[23]	validation-error:0.04601
[24]	validation-error:0.04550
[25]	validation-error:0.04529
[26]	validation-error:0.04526
[27]	validation-error:0.04447
[28]	validation-error:0.04454
[29]	validation-error:0.04372
[30]	validation-error:0.04358
[31]	validation-error:0.04338
[32]	validation-error:0.04314
[33]	validation-erro

[I 2024-04-02 13:16:34,296] Trial 17 finished with value: 0.19289465047408436 and parameters: {'eta': 0.26459595298198874, 'max_depth': 8, 'subsample': 0.679155461427541, 'colsample_bytree': 0.7954120834169186, 'gamma': 1.8456639775842714, 'min_child_weight': 2.833104188497363, 'lambda': 1.4159353746321321, 'alpha': 8.41332933449683}. Best is trial 15 with value: 0.18885593419636032.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.57577
[2]	validation-error:0.09590
[3]	validation-error:0.06441
[4]	validation-error:0.05920
[5]	validation-error:0.05547
[6]	validation-error:0.05444
[7]	validation-error:0.05355
[8]	validation-error:0.05324
[9]	validation-error:0.05208
[10]	validation-error:0.05108
[11]	validation-error:0.05023
[12]	validation-error:0.04985
[13]	validation-error:0.04954
[14]	validation-error:0.04869
[15]	validation-error:0.04814
[16]	validation-error:0.04749
[17]	validation-error:0.04697
[18]	validation-error:0.04697
[19]	validation-error:0.04622
[20]	validation-error:0.04591
[21]	validation-error:0.04550
[22]	validation-error:0.04516
[23]	validation-error:0.04488
[24]	validation-error:0.04433
[25]	validation-error:0.04423
[26]	validation-error:0.04403
[27]	validation-error:0.04386
[28]	validation-error:0.04310
[29]	validation-error:0.04303
[30]	validation-error:0.04293
[31]	validation-error:0.04272
[32]	validation-error:0.04259
[33]	validation-erro

[I 2024-04-02 13:16:36,193] Trial 18 pruned. Trial was pruned at iteration 142.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-02 13:16:36,476] Trial 19 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.40491


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:16:36,826] Trial 20 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.14373
[3]	validation-error:0.08446
[4]	validation-error:0.06613


Parameters: { "n_estimators" } are not used.



[5]	validation-error:0.05910
[6]	validation-error:0.05759
[7]	validation-error:0.05540
[8]	validation-error:0.05359
[9]	validation-error:0.05245
[10]	validation-error:0.05153
[11]	validation-error:0.05064
[12]	validation-error:0.04865
[13]	validation-error:0.04800
[14]	validation-error:0.04766
[15]	validation-error:0.04649
[16]	validation-error:0.04557
[17]	validation-error:0.04488
[18]	validation-error:0.04433
[19]	validation-error:0.04331
[20]	validation-error:0.04272
[21]	validation-error:0.04286
[22]	validation-error:0.04211
[23]	validation-error:0.04170
[24]	validation-error:0.04132
[25]	validation-error:0.04135
[26]	validation-error:0.04105
[27]	validation-error:0.04105
[28]	validation-error:0.04081
[29]	validation-error:0.04043
[30]	validation-error:0.04029
[31]	validation-error:0.04019
[32]	validation-error:0.04022
[33]	validation-error:0.04026
[34]	validation-error:0.04015
[35]	validation-error:0.03988
[36]	validation-error:0.03964
[37]	validation-error:0.03950
[38]	validation

[I 2024-04-02 13:16:45,038] Trial 21 finished with value: 0.1920044996674685 and parameters: {'eta': 0.24428769084376534, 'max_depth': 9, 'subsample': 0.7777297737920733, 'colsample_bytree': 0.6705876325523696, 'gamma': 0.08834671798956428, 'min_child_weight': 1.4880429624106457, 'lambda': 1.3762879164796524, 'alpha': 5.844548819736611}. Best is trial 15 with value: 0.18885593419636032.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.57413
[2]	validation-error:0.13763
[3]	validation-error:0.07647
[4]	validation-error:0.06171
[5]	validation-error:0.05811
[6]	validation-error:0.05749
[7]	validation-error:0.05698
[8]	validation-error:0.05612
[9]	validation-error:0.05496
[10]	validation-error:0.05359
[11]	validation-error:0.05225
[12]	validation-error:0.05187
[13]	validation-error:0.05098
[14]	validation-error:0.05013
[15]	validation-error:0.04882
[16]	validation-error:0.04821
[17]	validation-error:0.04728
[18]	validation-error:0.04714
[19]	validation-error:0.04660
[20]	validation-error:0.04598
[21]	validation-error:0.04543
[22]	validation-error:0.04468
[23]	validation-error:0.04433
[24]	validation-error:0.04437
[25]	validation-error:0.04396
[26]	validation-error:0.04348
[27]	validation-error:0.04317
[28]	validation-error:0.04341
[29]	validation-error:0.04310
[30]	validation-error:0.04283
[31]	validation-error:0.04276
[32]	validation-error:0.04252
[33]	validation-erro

[I 2024-04-02 13:16:47,347] Trial 22 pruned. Trial was pruned at iteration 222.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.46134


[I 2024-04-02 13:16:47,569] Trial 23 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-02 13:16:47,790] Trial 24 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.14376
[3]	validation-error:0.07671
[4]	validation-error:0.06078
[5]	validation-error:0.05523
[6]	validation-error:0.05482
[7]	validation-error:0.05328
[8]	validation-error:0.05232
[9]	validation-error:0.05108
[10]	validation-error:0.04985
[11]	validation-error:0.04910
[12]	validation-error:0.04821
[13]	validation-error:0.04752
[14]	validation-error:0.04697
[15]	validation-error:0.04649
[16]	validation-error:0.04625
[17]	validation-error:0.04567
[18]	validation-error:0.04540
[19]	validation-error:0.04492
[20]	validation-error:0.04399
[21]	validation-error:0.04392
[22]	validation-error:0.04379
[23]	validation-error:0.04362
[24]	validation-error:0.04331
[25]	validation-error:0.04310
[26]	validation-error:0.04286
[27]	validation-error:0.04248
[28]	validation-error:0.04245
[29]	validation-error:0.04242
[30]	validation-error:0.04231
[31]	validation-error:0.04183
[32]	validation-error:0.04177
[33]	validation-erro

[I 2024-04-02 13:16:49,316] Trial 25 pruned. Trial was pruned at iteration 142.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.57663
[2]	validation-error:0.14030
[3]	validation-error:0.09090
[4]	validation-error:0.07044
[5]	validation-error:0.06314
[6]	validation-error:0.06246
[7]	validation-error:0.05986
[8]	validation-error:0.05787
[9]	validation-error:0.05715
[10]	validation-error:0.05650
[11]	validation-error:0.05561
[12]	validation-error:0.05420
[13]	validation-error:0.05235
[14]	validation-error:0.05136
[15]	validation-error:0.05095
[16]	validation-error:0.04961
[17]	validation-error:0.04923
[18]	validation-error:0.04855
[19]	validation-error:0.04824
[20]	validation-error:0.04725
[21]	validation-error:0.04694
[22]	validation-error:0.04660
[23]	validation-error:0.04595
[24]	validation-error:0.04526
[25]	validation-error:0.04492
[26]	validation-error:0.04430
[27]	validation-error:0.04399
[28]	validation-error:0.04392
[29]	validation-error:0.04348
[30]	validation-error:0.04317
[31]	validation-error:0.04300
[32]	validation-error:0.04252
[33]	validation-erro

[I 2024-04-02 13:16:52,748] Trial 26 finished with value: 0.19209370036899007 and parameters: {'eta': 0.2715767537984052, 'max_depth': 8, 'subsample': 0.7888106494491375, 'colsample_bytree': 0.6342872535208592, 'gamma': 0.04684229976559062, 'min_child_weight': 3.402286227933132, 'lambda': 1.9265974122178118, 'alpha': 6.436408770998912}. Best is trial 15 with value: 0.18885593419636032.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-02 13:16:53,067] Trial 27 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.46168


[I 2024-04-02 13:16:53,296] Trial 28 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995

Parameters: { "n_estimators" } are not used.




[1]	validation-error:0.65995


[I 2024-04-02 13:16:53,514] Trial 29 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.23541


[I 2024-04-02 13:16:53,764] Trial 30 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.17912


[I 2024-04-02 13:16:54,021] Trial 31 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.16826


[I 2024-04-02 13:16:54,270] Trial 32 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995
[1]	validation-error:0.40107
[2]	validation-error:0.11591
[3]	validation-error:0.07586
[4]	validation-error:0.06332
[5]	validation-error:0.06376
[6]	validation-error:0.06112
[7]	validation-error:0.06027


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:16:54,641] Trial 33 pruned. Trial was pruned at iteration 8.


[0]	validation-error:0.65995
[1]	validation-error:0.57974
[2]	validation-error:0.15675


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:16:54,901] Trial 34 pruned. Trial was pruned at iteration 3.


[0]	validation-error:0.65995
[1]	validation-error:0.52982
[2]	validation-error:0.12927
[3]	validation-error:0.07346
[4]	validation-error:0.06383
[5]	validation-error:0.06088
[6]	validation-error:0.06040
[7]	validation-error:0.05821
[8]	validation-error:0.05602


Parameters: { "n_estimators" } are not used.



[9]	validation-error:0.05513
[10]	validation-error:0.05427
[11]	validation-error:0.05328
[12]	validation-error:0.05198
[13]	validation-error:0.05043
[14]	validation-error:0.04992
[15]	validation-error:0.04968
[16]	validation-error:0.04906
[17]	validation-error:0.04899
[18]	validation-error:0.04882
[19]	validation-error:0.04797
[20]	validation-error:0.04690
[21]	validation-error:0.04636
[22]	validation-error:0.04639
[23]	validation-error:0.04642
[24]	validation-error:0.04622


[I 2024-04-02 13:16:55,418] Trial 35 pruned. Trial was pruned at iteration 24.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.18899


[I 2024-04-02 13:16:55,684] Trial 36 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-02 13:16:55,931] Trial 37 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.63922
[2]	validation-error:0.15096
[3]	validation-error:0.09271
[4]	validation-error:0.06866
[5]	validation-error:0.05958
[6]	validation-error:0.05629
[7]	validation-error:0.05218
[8]	validation-error:0.05013
[9]	validation-error:0.04906
[10]	validation-error:0.04773
[11]	validation-error:0.04656
[12]	validation-error:0.04502
[13]	validation-error:0.04447
[14]	validation-error:0.04365
[15]	validation-error:0.04368
[16]	validation-error:0.04324
[17]	validation-error:0.04248
[18]	validation-error:0.04266
[19]	validation-error:0.04207
[20]	validation-error:0.04129
[21]	validation-error:0.04125
[22]	validation-error:0.04081
[23]	validation-error:0.04084
[24]	validation-error:0.04039
[25]	validation-error:0.04046
[26]	validation-error:0.04036
[27]	validation-error:0.04009
[28]	validation-error:0.03988
[29]	validation-error:0.04002
[30]	validation-error:0.03964
[31]	validation-error:0.03954
[32]	validation-error:0.03944
[33]	validation-erro

[I 2024-04-02 13:16:59,239] Trial 38 finished with value: 0.19191525750616725 and parameters: {'eta': 0.25375655015354315, 'max_depth': 10, 'subsample': 0.8867336903284979, 'colsample_bytree': 0.660724945555454, 'gamma': 1.3092832300100392, 'min_child_weight': 1.98103218633461, 'lambda': 3.526673612750085, 'alpha': 4.874375798512224}. Best is trial 15 with value: 0.18885593419636032.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995
[2]	validation-error:0.63915


[I 2024-04-02 13:16:59,448] Trial 39 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-02 13:16:59,676] Trial 40 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-02 13:16:59,920] Trial 41 pruned. Trial was pruned at iteration 2.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.38017
[2]	validation-error:0.10405
[3]	validation-error:0.07466
[4]	validation-error:0.06171
[5]	validation-error:0.05427
[6]	validation-error:0.05187
[7]	validation-error:0.04858
[8]	validation-error:0.04769
[9]	validation-error:0.04663
[10]	validation-error:0.04553
[11]	validation-error:0.04499
[12]	validation-error:0.04457
[13]	validation-error:0.04375
[14]	validation-error:0.04338
[15]	validation-error:0.04279
[16]	validation-error:0.04231
[17]	validation-error:0.04190
[18]	validation-error:0.04156
[19]	validation-error:0.04156
[20]	validation-error:0.04098
[21]	validation-error:0.04098
[22]	validation-error:0.04070
[23]	validation-error:0.04050
[24]	validation-error:0.04070
[25]	validation-error:0.04015
[26]	validation-error:0.03998
[27]	validation-error:0.03992
[28]	validation-error:0.04002
[29]	validation-error:0.03978
[30]	validation-error:0.03985
[31]	validation-error:0.03968
[32]	validation-error:0.03968
[33]	validation-erro

[I 2024-04-02 13:17:04,775] Trial 42 finished with value: 0.1940457419197152 and parameters: {'eta': 0.28151922253912437, 'max_depth': 10, 'subsample': 0.8594666229412944, 'colsample_bytree': 0.6940307941966863, 'gamma': 0.6955753348347785, 'min_child_weight': 2.7626294717046207, 'lambda': 3.1453867680388283, 'alpha': 5.1217417561941545}. Best is trial 15 with value: 0.18885593419636032.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-02 13:17:05,039] Trial 43 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:05,307] Trial 44 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:05,546] Trial 45 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:05,774] Trial 46 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.40131
[2]	validation-error:0.11745
[3]	validation-error:0.08165
[4]	validation-error:0.06616
[5]	validation-error:0.05938
[6]	validation-error:0.05920
[7]	validation-error:0.05663
[8]	validation-error:0.05516


Parameters: { "n_estimators" } are not used.



[9]	validation-error:0.05389
[10]	validation-error:0.05228
[11]	validation-error:0.05139
[12]	validation-error:0.05060
[13]	validation-error:0.04899
[14]	validation-error:0.04831
[15]	validation-error:0.04766
[16]	validation-error:0.04639
[17]	validation-error:0.04543
[18]	validation-error:0.04485
[19]	validation-error:0.04499
[20]	validation-error:0.04392
[21]	validation-error:0.04368
[22]	validation-error:0.04314
[23]	validation-error:0.04248
[24]	validation-error:0.04218
[25]	validation-error:0.04194
[26]	validation-error:0.04139
[27]	validation-error:0.04135
[28]	validation-error:0.04125
[29]	validation-error:0.04132
[30]	validation-error:0.04149
[31]	validation-error:0.04142
[32]	validation-error:0.04129
[33]	validation-error:0.04129
[34]	validation-error:0.04094
[35]	validation-error:0.04057
[36]	validation-error:0.04043
[37]	validation-error:0.04057
[38]	validation-error:0.04046
[39]	validation-error:0.04029
[40]	validation-error:0.04012
[41]	validation-error:0.03978
[42]	valida

[I 2024-04-02 13:17:10,081] Trial 47 finished with value: 0.19209370036899007 and parameters: {'eta': 0.2835340696909532, 'max_depth': 8, 'subsample': 0.8709465137155672, 'colsample_bytree': 0.7065053664096445, 'gamma': 1.0687234689618577, 'min_child_weight': 3.009067057914921, 'lambda': 2.707707297645591, 'alpha': 4.1190834866464625}. Best is trial 15 with value: 0.18885593419636032.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-02 13:17:10,296] Trial 48 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995


[I 2024-04-02 13:17:10,553] Trial 49 pruned. Trial was pruned at iteration 1.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.34245
[2]	validation-error:0.11330
[3]	validation-error:0.08596


[I 2024-04-02 13:17:10,865] Trial 50 pruned. Trial was pruned at iteration 4.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.65995


[I 2024-04-02 13:17:11,119] Trial 51 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:11,410] Trial 52 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.47898
[2]	validation-error:0.11646
[3]	validation-error:0.08082
[4]	validation-error:0.06551
[5]	validation-error:0.05763
[6]	validation-error:0.05653


Parameters: { "n_estimators" } are not used.



[7]	validation-error:0.05434
[8]	validation-error:0.05434
[9]	validation-error:0.05242
[10]	validation-error:0.05033
[11]	validation-error:0.04913
[12]	validation-error:0.04872
[13]	validation-error:0.04708
[14]	validation-error:0.04649
[15]	validation-error:0.04595
[16]	validation-error:0.04557
[17]	validation-error:0.04457
[18]	validation-error:0.04406
[19]	validation-error:0.04406
[20]	validation-error:0.04296
[21]	validation-error:0.04255
[22]	validation-error:0.04221
[23]	validation-error:0.04149
[24]	validation-error:0.04135
[25]	validation-error:0.04139
[26]	validation-error:0.04156
[27]	validation-error:0.04091
[28]	validation-error:0.04057
[29]	validation-error:0.04067
[30]	validation-error:0.04067
[31]	validation-error:0.04026
[32]	validation-error:0.04019
[33]	validation-error:0.04036
[34]	validation-error:0.04029
[35]	validation-error:0.04009
[36]	validation-error:0.04002
[37]	validation-error:0.04012
[38]	validation-error:0.04019
[39]	validation-error:0.04026
[40]	validati

[I 2024-04-02 13:17:13,268] Trial 53 pruned. Trial was pruned at iteration 145.
Parameters: { "n_estimators" } are not used.



[0]	validation-error:0.65995
[1]	validation-error:0.58225
[2]	validation-error:0.12533
[3]	validation-error:0.08298
[4]	validation-error:0.06619
[5]	validation-error:0.05848
[6]	validation-error:0.05715
[7]	validation-error:0.05413
[8]	validation-error:0.05311
[9]	validation-error:0.05074
[10]	validation-error:0.05009
[11]	validation-error:0.04899
[12]	validation-error:0.04780
[13]	validation-error:0.04684
[14]	validation-error:0.04622
[15]	validation-error:0.04567
[16]	validation-error:0.04505
[17]	validation-error:0.04475
[18]	validation-error:0.04464
[19]	validation-error:0.04420
[20]	validation-error:0.04403
[21]	validation-error:0.04368
[22]	validation-error:0.04334
[23]	validation-error:0.04293
[24]	validation-error:0.04228
[25]	validation-error:0.04235
[26]	validation-error:0.04235
[27]	validation-error:0.04207
[28]	validation-error:0.04173
[29]	validation-error:0.04173
[30]	validation-error:0.04125
[31]	validation-error:0.04098
[32]	validation-error:0.04060
[33]	validation-erro

[I 2024-04-02 13:17:21,059] Trial 54 finished with value: 0.19413400471341732 and parameters: {'eta': 0.25882370562279666, 'max_depth': 9, 'subsample': 0.7099818686862374, 'colsample_bytree': 0.6857994000316666, 'gamma': 1.0122526661692863, 'min_child_weight': 9.9100700595468, 'lambda': 2.298006009216603, 'alpha': 5.1075640000765725}. Best is trial 15 with value: 0.18885593419636032.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:21,376] Trial 55 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:21,714] Trial 56 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:22,058] Trial 57 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:22,364] Trial 58 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:22,685] Trial 59 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.62850


Parameters: { "n_estimators" } are not used.



[2]	validation-error:0.12136
[3]	validation-error:0.07449
[4]	validation-error:0.06167
[5]	validation-error:0.05962
[6]	validation-error:0.05780
[7]	validation-error:0.05588
[8]	validation-error:0.05345
[9]	validation-error:0.05249
[10]	validation-error:0.05245
[11]	validation-error:0.05102
[12]	validation-error:0.05054
[13]	validation-error:0.04958
[14]	validation-error:0.04903
[15]	validation-error:0.04845
[16]	validation-error:0.04834
[17]	validation-error:0.04752
[18]	validation-error:0.04742
[19]	validation-error:0.04732
[20]	validation-error:0.04684
[21]	validation-error:0.04618
[22]	validation-error:0.04601
[23]	validation-error:0.04557
[24]	validation-error:0.04577
[25]	validation-error:0.04488
[26]	validation-error:0.04485
[27]	validation-error:0.04440
[28]	validation-error:0.04403
[29]	validation-error:0.04389
[30]	validation-error:0.04365
[31]	validation-error:0.04331
[32]	validation-error:0.04317
[33]	validation-error:0.04283


[I 2024-04-02 13:17:24,205] Trial 60 pruned. Trial was pruned at iteration 33.


[0]	validation-error:0.65995
[1]	validation-error:0.53520
[2]	validation-error:0.14064


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:24,747] Trial 61 pruned. Trial was pruned at iteration 3.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.



[1]	validation-error:0.52088
[2]	validation-error:0.17617


[I 2024-04-02 13:17:25,338] Trial 62 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995
[1]	validation-error:0.49238
[2]	validation-error:0.13102


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:25,676] Trial 63 pruned. Trial was pruned at iteration 3.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:26,048] Trial 64 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:26,377] Trial 65 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.57008
[2]	validation-error:0.13050


Parameters: { "n_estimators" } are not used.



[3]	validation-error:0.08281
[4]	validation-error:0.06777
[5]	validation-error:0.06140
[6]	validation-error:0.05979
[7]	validation-error:0.05759


[I 2024-04-02 13:17:26,998] Trial 66 pruned. Trial was pruned at iteration 8.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:27,347] Trial 67 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:27,667] Trial 68 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.44801
[2]	validation-error:0.10847
[3]	validation-error:0.06763


Parameters: { "n_estimators" } are not used.



[4]	validation-error:0.05502
[5]	validation-error:0.05146
[6]	validation-error:0.05074
[7]	validation-error:0.04841
[8]	validation-error:0.04742
[9]	validation-error:0.04560
[10]	validation-error:0.04512
[11]	validation-error:0.04416
[12]	validation-error:0.04386
[13]	validation-error:0.04355
[14]	validation-error:0.04334
[15]	validation-error:0.04146
[16]	validation-error:0.04111
[17]	validation-error:0.04081
[18]	validation-error:0.04029
[19]	validation-error:0.04002
[20]	validation-error:0.03981
[21]	validation-error:0.03995
[22]	validation-error:0.03971
[23]	validation-error:0.03981
[24]	validation-error:0.03981
[25]	validation-error:0.03954
[26]	validation-error:0.03923
[27]	validation-error:0.03899
[28]	validation-error:0.03865
[29]	validation-error:0.03861
[30]	validation-error:0.03875
[31]	validation-error:0.03889
[32]	validation-error:0.03878
[33]	validation-error:0.03868
[34]	validation-error:0.03858
[35]	validation-error:0.03844
[36]	validation-error:0.03844
[37]	validation-

[I 2024-04-02 13:17:31,635] Trial 69 finished with value: 0.19431041002442972 and parameters: {'eta': 0.27641831165135294, 'max_depth': 10, 'subsample': 0.7440457032325415, 'colsample_bytree': 0.7468173948917753, 'gamma': 0.27443442586990463, 'min_child_weight': 0.5760763410988008, 'lambda': 0.48620894948044047, 'alpha': 2.080479018853975}. Best is trial 15 with value: 0.18885593419636032.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:31,923] Trial 70 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.38853
[2]	validation-error:0.11666
[3]	validation-error:0.07973
[4]	validation-error:0.06791


Parameters: { "n_estimators" } are not used.



[5]	validation-error:0.05883
[6]	validation-error:0.05814
[7]	validation-error:0.05592
[8]	validation-error:0.05444
[9]	validation-error:0.05362
[10]	validation-error:0.05146
[11]	validation-error:0.04989
[12]	validation-error:0.04896
[13]	validation-error:0.04756
[14]	validation-error:0.04680
[15]	validation-error:0.04625
[16]	validation-error:0.04550
[17]	validation-error:0.04430
[18]	validation-error:0.04396
[19]	validation-error:0.04372
[20]	validation-error:0.04344
[21]	validation-error:0.04279
[22]	validation-error:0.04266
[23]	validation-error:0.04245
[24]	validation-error:0.04269
[25]	validation-error:0.04272
[26]	validation-error:0.04252
[27]	validation-error:0.04262
[28]	validation-error:0.04224
[29]	validation-error:0.04197
[30]	validation-error:0.04187
[31]	validation-error:0.04163
[32]	validation-error:0.04170
[33]	validation-error:0.04170
[34]	validation-error:0.04170
[35]	validation-error:0.04129
[36]	validation-error:0.04105
[37]	validation-error:0.04087
[38]	validation

[I 2024-04-02 13:17:33,086] Trial 71 pruned. Trial was pruned at iteration 47.


[0]	validation-error:0.65995
[1]	validation-error:0.57210
[2]	validation-error:0.13667
[3]	validation-error:0.07188


Parameters: { "n_estimators" } are not used.



[4]	validation-error:0.06167
[5]	validation-error:0.05735
[6]	validation-error:0.05794
[7]	validation-error:0.05674
[8]	validation-error:0.05485
[9]	validation-error:0.05359
[10]	validation-error:0.05263
[11]	validation-error:0.05143
[12]	validation-error:0.05023
[13]	validation-error:0.04913
[14]	validation-error:0.04841
[15]	validation-error:0.04766
[16]	validation-error:0.04714
[17]	validation-error:0.04598
[18]	validation-error:0.04550
[19]	validation-error:0.04553
[20]	validation-error:0.04475
[21]	validation-error:0.04461
[22]	validation-error:0.04399
[23]	validation-error:0.04365
[24]	validation-error:0.04327
[25]	validation-error:0.04307
[26]	validation-error:0.04276
[27]	validation-error:0.04238
[28]	validation-error:0.04221
[29]	validation-error:0.04235
[30]	validation-error:0.04201
[31]	validation-error:0.04187
[32]	validation-error:0.04156
[33]	validation-error:0.04087
[34]	validation-error:0.04091
[35]	validation-error:0.04057
[36]	validation-error:0.04060
[37]	validation-

[I 2024-04-02 13:17:40,149] Trial 72 finished with value: 0.19066148131992602 and parameters: {'eta': 0.26517599510812145, 'max_depth': 8, 'subsample': 0.8750666502675967, 'colsample_bytree': 0.7200994232871057, 'gamma': 1.1728891380407498, 'min_child_weight': 3.0346293488024165, 'lambda': 4.07139504455853, 'alpha': 4.137399946116095}. Best is trial 15 with value: 0.18885593419636032.


[0]	validation-error:0.65995
[1]	validation-error:0.57323
[2]	validation-error:0.13455
[3]	validation-error:0.07126
[4]	validation-error:0.06153


Parameters: { "n_estimators" } are not used.



[5]	validation-error:0.05687
[6]	validation-error:0.05701
[7]	validation-error:0.05530
[8]	validation-error:0.05496
[9]	validation-error:0.05393
[10]	validation-error:0.05211
[11]	validation-error:0.05105
[12]	validation-error:0.05009
[13]	validation-error:0.04910
[14]	validation-error:0.04838
[15]	validation-error:0.04783
[16]	validation-error:0.04704
[17]	validation-error:0.04660
[18]	validation-error:0.04663
[19]	validation-error:0.04584
[20]	validation-error:0.04519
[21]	validation-error:0.04471
[22]	validation-error:0.04392
[23]	validation-error:0.04355
[24]	validation-error:0.04327
[25]	validation-error:0.04290
[26]	validation-error:0.04218
[27]	validation-error:0.04173
[28]	validation-error:0.04163
[29]	validation-error:0.04177
[30]	validation-error:0.04173
[31]	validation-error:0.04194
[32]	validation-error:0.04177
[33]	validation-error:0.04163
[34]	validation-error:0.04142
[35]	validation-error:0.04146
[36]	validation-error:0.04139
[37]	validation-error:0.04125
[38]	validation

[I 2024-04-02 13:17:47,395] Trial 73 finished with value: 0.18994132206536074 and parameters: {'eta': 0.26541597325056676, 'max_depth': 8, 'subsample': 0.8786172077399419, 'colsample_bytree': 0.7243215849744432, 'gamma': 0.3382212812055816, 'min_child_weight': 2.203774762304369, 'lambda': 4.769902031896164, 'alpha': 3.506065913258626}. Best is trial 15 with value: 0.18885593419636032.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:47,736] Trial 74 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.60777
[2]	validation-error:0.15044


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:48,078] Trial 75 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:48,379] Trial 76 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:48,681] Trial 77 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:48,972] Trial 78 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.29047
[2]	validation-error:0.09933
[3]	validation-error:0.07325


Parameters: { "n_estimators" } are not used.



[4]	validation-error:0.06304
[5]	validation-error:0.05783
[6]	validation-error:0.05629
[7]	validation-error:0.05222
[8]	validation-error:0.05047
[9]	validation-error:0.04927
[10]	validation-error:0.04797
[11]	validation-error:0.04680
[12]	validation-error:0.04608
[13]	validation-error:0.04547
[14]	validation-error:0.04430
[15]	validation-error:0.04372
[16]	validation-error:0.04348
[17]	validation-error:0.04255
[18]	validation-error:0.04269
[19]	validation-error:0.04242
[20]	validation-error:0.04211
[21]	validation-error:0.04204
[22]	validation-error:0.04201
[23]	validation-error:0.04142
[24]	validation-error:0.04132
[25]	validation-error:0.04139
[26]	validation-error:0.04159
[27]	validation-error:0.04111
[28]	validation-error:0.04111
[29]	validation-error:0.04108
[30]	validation-error:0.04139
[31]	validation-error:0.04146
[32]	validation-error:0.04115
[33]	validation-error:0.04105
[34]	validation-error:0.04111
[35]	validation-error:0.04087
[36]	validation-error:0.04067
[37]	validation-

[I 2024-04-02 13:17:50,451] Trial 79 pruned. Trial was pruned at iteration 51.


[0]	validation-error:0.65995
[1]	validation-error:0.57210
[2]	validation-error:0.13139
[3]	validation-error:0.07126
[4]	validation-error:0.06174


Parameters: { "n_estimators" } are not used.



[5]	validation-error:0.05660
[6]	validation-error:0.05643
[7]	validation-error:0.05602
[8]	validation-error:0.05520
[9]	validation-error:0.05461
[10]	validation-error:0.05331
[11]	validation-error:0.05245
[12]	validation-error:0.05170
[13]	validation-error:0.05088
[14]	validation-error:0.04961
[15]	validation-error:0.04882
[16]	validation-error:0.04882
[17]	validation-error:0.04814
[18]	validation-error:0.04780


[I 2024-04-02 13:17:51,128] Trial 80 pruned. Trial was pruned at iteration 18.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:51,450] Trial 81 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:51,742] Trial 82 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.54168
[2]	validation-error:0.14270


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:52,117] Trial 83 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:52,423] Trial 84 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.38490
[2]	validation-error:0.11522
[3]	validation-error:0.08031
[4]	validation-error:0.06448
[5]	validation-error:0.05951


Parameters: { "n_estimators" } are not used.



[6]	validation-error:0.05859
[7]	validation-error:0.05753
[8]	validation-error:0.05592
[9]	validation-error:0.05424
[10]	validation-error:0.05225
[11]	validation-error:0.05084
[12]	validation-error:0.04917
[13]	validation-error:0.04879
[14]	validation-error:0.04769
[15]	validation-error:0.04738
[16]	validation-error:0.04632
[17]	validation-error:0.04601
[18]	validation-error:0.04588
[19]	validation-error:0.04519
[20]	validation-error:0.04468
[21]	validation-error:0.04433
[22]	validation-error:0.04447
[23]	validation-error:0.04399
[24]	validation-error:0.04389
[25]	validation-error:0.04372
[26]	validation-error:0.04283
[27]	validation-error:0.04276
[28]	validation-error:0.04242
[29]	validation-error:0.04228
[30]	validation-error:0.04214
[31]	validation-error:0.04173
[32]	validation-error:0.04159
[33]	validation-error:0.04129
[34]	validation-error:0.04135
[35]	validation-error:0.04091
[36]	validation-error:0.04070
[37]	validation-error:0.04043
[38]	validation-error:0.04060
[39]	validatio

[I 2024-04-02 13:17:57,261] Trial 85 pruned. Trial was pruned at iteration 243.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:57,548] Trial 86 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:57,843] Trial 87 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:58,174] Trial 88 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:58,453] Trial 89 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:17:58,747] Trial 90 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.48525
[2]	validation-error:0.11916
[3]	validation-error:0.08384


Parameters: { "n_estimators" } are not used.



[4]	validation-error:0.06489
[5]	validation-error:0.05907


[I 2024-04-02 13:17:59,296] Trial 91 pruned. Trial was pruned at iteration 6.


[0]	validation-error:0.65995
[1]	validation-error:0.37945
[2]	validation-error:0.11505
[3]	validation-error:0.07986
[4]	validation-error:0.06513


Parameters: { "n_estimators" } are not used.



[5]	validation-error:0.06023


[I 2024-04-02 13:17:59,735] Trial 92 pruned. Trial was pruned at iteration 5.


[0]	validation-error:0.65995
[1]	validation-error:0.51482
[2]	validation-error:0.12615
[3]	validation-error:0.08459


Parameters: { "n_estimators" } are not used.



[4]	validation-error:0.06688


[I 2024-04-02 13:18:00,162] Trial 93 pruned. Trial was pruned at iteration 5.


[0]	validation-error:0.65995
[1]	validation-error:0.51571
[2]	validation-error:0.13605
[3]	validation-error:0.09196


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:18:00,516] Trial 94 pruned. Trial was pruned at iteration 3.


[0]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:18:00,885] Trial 95 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.50766
[2]	validation-error:0.13838


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:18:01,242] Trial 96 pruned. Trial was pruned at iteration 3.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:18:01,547] Trial 97 pruned. Trial was pruned at iteration 1.


[0]	validation-error:0.65995
[1]	validation-error:0.58653


Parameters: { "n_estimators" } are not used.



[2]	validation-error:0.14325


[I 2024-04-02 13:18:01,992] Trial 98 pruned. Trial was pruned at iteration 2.


[0]	validation-error:0.65995
[1]	validation-error:0.65995


Parameters: { "n_estimators" } are not used.

[I 2024-04-02 13:18:02,401] Trial 99 pruned. Trial was pruned at iteration 1.


Best Hyperparameters (33 Features):  {'eta': 0.24920171651018697, 'max_depth': 8, 'subsample': 0.7969243749302927, 'colsample_bytree': 0.7025268248381582, 'gamma': 0.23722771541916587, 'min_child_weight': 1.9542417160152912, 'lambda': 1.7466848789938463, 'alpha': 5.8218178623611525}
Best Error (33 Features):  0.18885593419636032


#### Model Training

In [20]:
from sklearn.model_selection import KFold
from xgboost import DMatrix, train

# Initialize CV
cv = KFold(n_splits=10, shuffle=True, random_state=1)

best_params_lexical_content['objective'] = 'binary:hinge'
best_params_lexical_content['eval_metric'] = 'error'

# Convert the data into DMatrix format
lexical_content_train = DMatrix(X_train_lexical_content, label=y_train)
lexical_content_valid = DMatrix(X_test_lexical_content, label=y_test)

# Train the Model
xgb_classifier = train(best_params_lexical_content, lexical_content_train, num_boost_round=1000)
y_pred_lexical_content = xgb_classifier.predict(lexical_content_valid)

print("Model training done.")

Model training done.


In [21]:
# Classification Report
print(classification_report(y_test, y_pred_lexical_content))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97     23942
           1       0.95      0.93      0.94     12541

    accuracy                           0.96     36483
   macro avg       0.96      0.95      0.96     36483
weighted avg       0.96      0.96      0.96     36483



#### Practical Evaluation

In [22]:
# Dumping the model
joblib.dump(xgb_classifier, 'xgb_filter_lexical-content.sav')

['xgb_filter_lexical-content.sav']

In [28]:
import content_generator
import feature_generation_content_function_htmlin
import time

def xgb_predict_maliciousness(url):

    numerical_values = content_generator.feature_generator(url)
    # print(numerical_values)
    numerical_values = DMatrix(numerical_values)

    match xgb_classifier.predict(numerical_values):
        case 0:
            return "Benign"
        case 1:
            return "Malware"
        case 2:
            return "Phishing"
        case 3:
            return "Defacement"

url = "www.mizuhubasnk.top"
print("Current URL: "+url)

start = time.perf_counter()
prediction = xgb_predict_maliciousness(url)
end = time.perf_counter()
print("------- Lexical+Content-Based Model -------------")
print(prediction)
print(end-start)

Current URL: www.mizuhubasnk.top


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:js_count: object, has_log_in_html: object, meta_tag_count: object, js_search_count: object, sus_js_count: object, js_link_count: object, webpage_entropy: object, webpage_size: object, has_free_in_html: object

#### Evaluation

In [11]:
'''# Confusion Matrix for 12 Features
cm_up = confusion_matrix(y_test, y_pred, labels=xgb_classifier.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix = cm_up, display_labels = xgb_classifier.classes_)
disp.plot()
plt.show()'''

'# Confusion Matrix for 12 Features\ncm_up = confusion_matrix(y_test, y_pred, labels=xgb_classifier.classes_)\ndisp = ConfusionMatrixDisplay(confusion_matrix = cm_up, display_labels = xgb_classifier.classes_)\ndisp.plot()\nplt.show()'

In [12]:
'''# Cross Validation Score
scores = cross_val_score(XGBClassifier(**params_gbm),
                        X_train, y_train, scoring='accuracy', cv=cv).mean()

print(scores)'''

"# Cross Validation Score\nscores = cross_val_score(XGBClassifier(**params_gbm),\n                        X_train, y_train, scoring='accuracy', cv=cv).mean()\n\nprint(scores)"