In [1]:
# Approach: train only on cover types 1 and 2 (binary classification; labels are shifted to 0/1)

In [2]:
import datetime
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.inspection import PartialDependenceDisplay, partial_dependence
from sklearn.metrics import (ConfusionMatrixDisplay, accuracy_score, precision_recall_curve, roc_auc_score, roc_curve)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import label_binarize
from xgboost import XGBClassifier

import shap

%config InlineBackend.figure_format = 'retina'

In [3]:
from notebook import notebookapp
import urllib
import json
import ipykernel
from shutil import copy2

def notebook_path():
    """Return the absolute path of the running notebook, or None if unknown.

    The kernel id is parsed from the kernel connection file name and compared
    with the sessions reported by the ``api/sessions`` endpoint of every
    running notebook server.

    NOTE: works only when the security is token-based or there is also no password

    Returns:
        The server's ``notebook_dir`` joined with the matching session's
        notebook path, or ``None`` when the kernel id cannot be parsed or no
        server reports a session for this kernel.
    """
    # Imported here so the ``urllib.request`` / ``urllib.parse`` submodules are
    # guaranteed to be loaded; a plain ``import urllib`` does not load them.
    import urllib.parse
    import urllib.request

    connection_file = os.path.basename(ipykernel.get_connection_file())
    # The parsing assumes a name of the form ``<prefix>-<kernel id>.<ext>``.
    _, dash, remainder = connection_file.partition('-')
    if not dash:
        return None
    kernel_id = remainder.split('.')[0]

    for srv in notebookapp.list_running_servers():
        try:
            url = srv['url'] + 'api/sessions'
            if not (srv['token'] == '' and not srv['password']):
                # URL-encode the token so special characters cannot corrupt the query.
                url += '?token=' + urllib.parse.quote(srv['token'], safe='')
            # The context manager closes the HTTP response once it has been parsed.
            with urllib.request.urlopen(url) as response:
                sessions = json.load(response)
            for sess in sessions:
                if sess['kernel']['id'] == kernel_id:
                    return os.path.join(srv['notebook_dir'], sess['notebook']['path'])
        except Exception:
            # There may be stale entries in the runtime directory; skip them.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
    return None


def copy_current_nb(new_name):
    """Copy the running notebook next to itself as ``<new_name>.ipynb``.

    The source path comes from ``notebook_path()``. When that returns a falsy
    value, a message is printed and nothing is copied.

    Args:
        new_name: Name of the copy without the ``.ipynb`` extension; it is
            joined onto the directory of the current notebook.
    """
    source = notebook_path()
    if not source:
        print("Current notebook path cannot be determined.")
        return

    target = os.path.join(os.path.dirname(source), new_name + '.ipynb')
    copy2(source, target)

In [4]:
# Load the engineered cover-type table; the path is relative to the current working directory.
df = pd.read_csv('Data/cover_type_engineered.csv')

In [16]:
# Keep every column whose name does not start with 'Cover_Type_'
# (presumably one-hot indicator columns of the target -- TODO confirm).
df = df.loc[:, [name for name in df.columns if not name.startswith('Cover_Type_')]]
# Restrict the rows to cover types 1 and 2.
df = df[df['Cover_Type'].isin([1, 2])]

# Features: everything except the target and 'Aspect_Sector'.
X = df.drop(['Cover_Type', 'Aspect_Sector'], axis=1)
# Target: cover type shifted down by one, so the two kept classes become 0 and 1.
y = df['Cover_Type'].sub(1)

In [None]:
# Minute-resolution timestamp used to name this run's output directory and files.
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M")

warnings.filterwarnings("ignore", category=FutureWarning, module="pandas.api.types")

# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
# X and y must already be defined by an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

# Define the estimator
estimator = XGBClassifier(tree_method="hist")

# Define hyperparameters for tuning.
# scipy conventions: uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
hyperparameters = {
    "n_estimators": stats.randint(50, 150),  # Increased range
    "learning_rate": stats.uniform(0.05, 0.1),  # Adjusted range
    "colsample_bytree": stats.uniform(0.6, 0.4),
    "colsample_bylevel": stats.uniform(0.6, 0.4),
    "colsample_bynode": stats.uniform(0.6, 0.4),
    "max_depth": stats.randint(15, 50),  # Increased range
    "subsample": stats.uniform(0.6, 0.4),
    "gamma": stats.uniform(0, 2),  # Adjusted range
    "reg_lambda": stats.uniform(0, 2),  # Adjusted range
    "reg_alpha": stats.uniform(0, 2),  # Adjusted range
}

# Define the RandomizedSearchCV parameters.
# random_state fixes the sampled candidates so a re-run evaluates the same
# 500 hyperparameter combinations.
random_search = RandomizedSearchCV(estimator,
                                   param_distributions=hyperparameters,
                                   scoring='accuracy',
                                   return_train_score=True,
                                   n_iter=500,
                                   cv=5,
                                   verbose=10,
                                   n_jobs=-1,
                                   random_state=42)

# Fit the RandomizedSearchCV and persist its artefacts
try:
    random_search.fit(X_train, y_train)
    print("Best parameters found:", random_search.best_params_)
    print("Best score found:", random_search.best_score_)

    # Save results; exist_ok avoids the check-then-create race.
    results_path = f"./tuning_results/tuning_xgb/{timestamp}"
    os.makedirs(results_path, exist_ok=True)

    # Saving cross-validation results
    cv_results = pd.DataFrame(random_search.cv_results_)
    cv_results_file = f"{timestamp}_results.csv"
    cv_results.to_csv(os.path.join(results_path, cv_results_file), index=False)

    # Save .ipynb
    copy_current_nb(os.path.join(results_path, 'Evaluation_Notebook'))

    # Save Model; the context manager flushes and closes the file handle.
    file_name = f"xgb_{timestamp}.pkl"
    with open(os.path.join(results_path, file_name), "wb") as model_file:
        pickle.dump(random_search, model_file)

except Exception as e:
    print(f"An error occurred during model optimization: {e}")
    # Re-raise so the traceback is kept and later cells do not continue with
    # an unfitted or unsaved search.
    raise


Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV 2/5; 2/500] START colsample_bylevel=0.6398760460621272, colsample_bynode=0.6604297339327864, colsample_bytree=0.8909285194831833, gamma=0.4561738017712562, learning_rate=0.07795562585902209, max_depth=24, n_estimators=86, reg_alpha=0.010790441341474333, reg_lambda=1.8140258781293448, subsample=0.8589472484759024
[CV 2/5; 2/500] END colsample_bylevel=0.6398760460621272, colsample_bynode=0.6604297339327864, colsample_bytree=0.8909285194831833, gamma=0.4561738017712562, learning_rate=0.07795562585902209, max_depth=24, n_estimators=86, reg_alpha=0.010790441341474333, reg_lambda=1.8140258781293448, subsample=0.8589472484759024;, score=(train=0.997, test=0.803) total time=   0.7s
[CV 4/5; 3/500] START colsample_bylevel=0.7847967251785161, colsample_bynode=0.6626516517825252, colsample_bytree=0.922542827216404, gamma=0.2575271803657049, learning_rate=0.08655512044711312, max_depth=39, n_estimators=140, reg_alpha=1.97621676248

[CV 1/5; 1/500] START colsample_bylevel=0.9568623721184107, colsample_bynode=0.9675192403964803, colsample_bytree=0.7438389587386695, gamma=1.850842060817991, learning_rate=0.07355647037225387, max_depth=30, n_estimators=65, reg_alpha=1.5015903556461276, reg_lambda=1.7995961878471345, subsample=0.9897003095921357
[CV 1/5; 1/500] END colsample_bylevel=0.9568623721184107, colsample_bynode=0.9675192403964803, colsample_bytree=0.7438389587386695, gamma=1.850842060817991, learning_rate=0.07355647037225387, max_depth=30, n_estimators=65, reg_alpha=1.5015903556461276, reg_lambda=1.7995961878471345, subsample=0.9897003095921357;, score=(train=0.905, test=0.798) total time=   0.4s
[CV 2/5; 3/500] START colsample_bylevel=0.7847967251785161, colsample_bynode=0.6626516517825252, colsample_bytree=0.922542827216404, gamma=0.2575271803657049, learning_rate=0.08655512044711312, max_depth=39, n_estimators=140, reg_alpha=1.9762167624875888, reg_lambda=0.06384409799692592, subsample=0.6868162676021907
[C

[CV 2/5; 1/500] START colsample_bylevel=0.9568623721184107, colsample_bynode=0.9675192403964803, colsample_bytree=0.7438389587386695, gamma=1.850842060817991, learning_rate=0.07355647037225387, max_depth=30, n_estimators=65, reg_alpha=1.5015903556461276, reg_lambda=1.7995961878471345, subsample=0.9897003095921357
[CV 2/5; 1/500] END colsample_bylevel=0.9568623721184107, colsample_bynode=0.9675192403964803, colsample_bytree=0.7438389587386695, gamma=1.850842060817991, learning_rate=0.07355647037225387, max_depth=30, n_estimators=65, reg_alpha=1.5015903556461276, reg_lambda=1.7995961878471345, subsample=0.9897003095921357;, score=(train=0.910, test=0.807) total time=   0.5s
[CV 3/5; 3/500] START colsample_bylevel=0.7847967251785161, colsample_bynode=0.6626516517825252, colsample_bytree=0.922542827216404, gamma=0.2575271803657049, learning_rate=0.08655512044711312, max_depth=39, n_estimators=140, reg_alpha=1.9762167624875888, reg_lambda=0.06384409799692592, subsample=0.6868162676021907
[C

[CV 1/5; 2/500] START colsample_bylevel=0.6398760460621272, colsample_bynode=0.6604297339327864, colsample_bytree=0.8909285194831833, gamma=0.4561738017712562, learning_rate=0.07795562585902209, max_depth=24, n_estimators=86, reg_alpha=0.010790441341474333, reg_lambda=1.8140258781293448, subsample=0.8589472484759024
[CV 1/5; 2/500] END colsample_bylevel=0.6398760460621272, colsample_bynode=0.6604297339327864, colsample_bytree=0.8909285194831833, gamma=0.4561738017712562, learning_rate=0.07795562585902209, max_depth=24, n_estimators=86, reg_alpha=0.010790441341474333, reg_lambda=1.8140258781293448, subsample=0.8589472484759024;, score=(train=0.999, test=0.808) total time=   0.8s
[CV 5/5; 3/500] START colsample_bylevel=0.7847967251785161, colsample_bynode=0.6626516517825252, colsample_bytree=0.922542827216404, gamma=0.2575271803657049, learning_rate=0.08655512044711312, max_depth=39, n_estimators=140, reg_alpha=1.9762167624875888, reg_lambda=0.06384409799692592, subsample=0.6868162676021

[CV 2/5; 36/500] START colsample_bylevel=0.7086150588478461, colsample_bynode=0.9341157407916101, colsample_bytree=0.7062256458235723, gamma=1.1258743160663542, learning_rate=0.08117478432038895, max_depth=44, n_estimators=124, reg_alpha=1.0353303115188224, reg_lambda=1.7544976288320286, subsample=0.9415046135965655
[CV 2/5; 36/500] END colsample_bylevel=0.7086150588478461, colsample_bynode=0.9341157407916101, colsample_bytree=0.7062256458235723, gamma=1.1258743160663542, learning_rate=0.08117478432038895, max_depth=44, n_estimators=124, reg_alpha=1.0353303115188224, reg_lambda=1.7544976288320286, subsample=0.9415046135965655;, score=(train=0.955, test=0.810) total time=   0.5s
[CV 5/5; 37/500] START colsample_bylevel=0.7429053327961056, colsample_bynode=0.7015250604611424, colsample_bytree=0.8187127049444709, gamma=1.9442478980963538, learning_rate=0.051764122594187736, max_depth=30, n_estimators=92, reg_alpha=0.04113185720171986, reg_lambda=1.9538137911947664, subsample=0.99745659672

[CV 4/5; 38/500] START colsample_bylevel=0.8533379974187558, colsample_bynode=0.7689041541199688, colsample_bytree=0.6139042603778949, gamma=0.329774285497058, learning_rate=0.08880491966245235, max_depth=30, n_estimators=105, reg_alpha=1.8740961107816287, reg_lambda=1.0504147487068114, subsample=0.7300579341940454
[CV 4/5; 38/500] END colsample_bylevel=0.8533379974187558, colsample_bynode=0.7689041541199688, colsample_bytree=0.6139042603778949, gamma=0.329774285497058, learning_rate=0.08880491966245235, max_depth=30, n_estimators=105, reg_alpha=1.8740961107816287, reg_lambda=1.0504147487068114, subsample=0.7300579341940454;, score=(train=0.969, test=0.798) total time=   0.5s
[CV 1/5; 41/500] START colsample_bylevel=0.9125865355137719, colsample_bynode=0.7443192905376471, colsample_bytree=0.9793143700413002, gamma=1.1390459061657412, learning_rate=0.14689321274281575, max_depth=18, n_estimators=117, reg_alpha=1.9313116156304864, reg_lambda=0.8996207683343165, subsample=0.67715319063129

[CV 4/5; 37/500] START colsample_bylevel=0.7429053327961056, colsample_bynode=0.7015250604611424, colsample_bytree=0.8187127049444709, gamma=1.9442478980963538, learning_rate=0.051764122594187736, max_depth=30, n_estimators=92, reg_alpha=0.04113185720171986, reg_lambda=1.9538137911947664, subsample=0.9974565967215258
[CV 4/5; 37/500] END colsample_bylevel=0.7429053327961056, colsample_bynode=0.7015250604611424, colsample_bytree=0.8187127049444709, gamma=1.9442478980963538, learning_rate=0.051764122594187736, max_depth=30, n_estimators=92, reg_alpha=0.04113185720171986, reg_lambda=1.9538137911947664, subsample=0.9974565967215258;, score=(train=0.933, test=0.793) total time=   0.5s
[CV 1/5; 39/500] START colsample_bylevel=0.9732768042025277, colsample_bynode=0.7314931684323632, colsample_bytree=0.9865004749894739, gamma=1.0774663350270328, learning_rate=0.13909528437301172, max_depth=27, n_estimators=73, reg_alpha=1.8748166206170567, reg_lambda=0.463704109390902, subsample=0.650380890941

[CV 5/5; 38/500] START colsample_bylevel=0.8533379974187558, colsample_bynode=0.7689041541199688, colsample_bytree=0.6139042603778949, gamma=0.329774285497058, learning_rate=0.08880491966245235, max_depth=30, n_estimators=105, reg_alpha=1.8740961107816287, reg_lambda=1.0504147487068114, subsample=0.7300579341940454
[CV 5/5; 38/500] END colsample_bylevel=0.8533379974187558, colsample_bynode=0.7689041541199688, colsample_bytree=0.6139042603778949, gamma=0.329774285497058, learning_rate=0.08880491966245235, max_depth=30, n_estimators=105, reg_alpha=1.8740961107816287, reg_lambda=1.0504147487068114, subsample=0.7300579341940454;, score=(train=0.976, test=0.792) total time=   0.5s
[CV 5/5; 40/500] START colsample_bylevel=0.6076533718280501, colsample_bynode=0.6068662927764096, colsample_bytree=0.9677582122884986, gamma=0.03396464289255552, learning_rate=0.11886911473090288, max_depth=29, n_estimators=109, reg_alpha=1.973545593828676, reg_lambda=0.8073748274819197, subsample=0.67533248361171

[CV 4/5; 39/500] START colsample_bylevel=0.9732768042025277, colsample_bynode=0.7314931684323632, colsample_bytree=0.9865004749894739, gamma=1.0774663350270328, learning_rate=0.13909528437301172, max_depth=27, n_estimators=73, reg_alpha=1.8748166206170567, reg_lambda=0.463704109390902, subsample=0.6503808909417246
[CV 4/5; 39/500] END colsample_bylevel=0.9732768042025277, colsample_bynode=0.7314931684323632, colsample_bytree=0.9865004749894739, gamma=1.0774663350270328, learning_rate=0.13909528437301172, max_depth=27, n_estimators=73, reg_alpha=1.8748166206170567, reg_lambda=0.463704109390902, subsample=0.6503808909417246;, score=(train=0.944, test=0.791) total time=   0.3s
[CV 2/5; 41/500] START colsample_bylevel=0.9125865355137719, colsample_bynode=0.7443192905376471, colsample_bytree=0.9793143700413002, gamma=1.1390459061657412, learning_rate=0.14689321274281575, max_depth=18, n_estimators=117, reg_alpha=1.9313116156304864, reg_lambda=0.8996207683343165, subsample=0.677153190631297


[CV 4/5; 40/500] START colsample_bylevel=0.6076533718280501, colsample_bynode=0.6068662927764096, colsample_bytree=0.9677582122884986, gamma=0.03396464289255552, learning_rate=0.11886911473090288, max_depth=29, n_estimators=109, reg_alpha=1.973545593828676, reg_lambda=0.8073748274819197, subsample=0.6753324836117197
[CV 4/5; 40/500] END colsample_bylevel=0.6076533718280501, colsample_bynode=0.6068662927764096, colsample_bytree=0.9677582122884986, gamma=0.03396464289255552, learning_rate=0.11886911473090288, max_depth=29, n_estimators=109, reg_alpha=1.973545593828676, reg_lambda=0.8073748274819197, subsample=0.6753324836117197;, score=(train=0.989, test=0.794) total time=   0.6s
[CV 4/5; 42/500] START colsample_bylevel=0.7533888965104223, colsample_bynode=0.9462174535287875, colsample_bytree=0.8537537449392676, gamma=0.26531258551254067, learning_rate=0.05470070254621831, max_depth=41, n_estimators=142, reg_alpha=1.365932159306721, reg_lambda=1.229136439464359, subsample=0.9035896797932

[CV 5/5; 69/500] START colsample_bylevel=0.7861507629284736, colsample_bynode=0.8402329078763715, colsample_bytree=0.8519767244243206, gamma=0.7681756461861258, learning_rate=0.09417131749406626, max_depth=36, n_estimators=138, reg_alpha=1.7200422194259717, reg_lambda=0.2106821831343173, subsample=0.991100210338206
[CV 5/5; 69/500] END colsample_bylevel=0.7861507629284736, colsample_bynode=0.8402329078763715, colsample_bytree=0.8519767244243206, gamma=0.7681756461861258, learning_rate=0.09417131749406626, max_depth=36, n_estimators=138, reg_alpha=1.7200422194259717, reg_lambda=0.2106821831343173, subsample=0.991100210338206;, score=(train=0.957, test=0.803) total time=   0.4s
[CV 3/5; 71/500] START colsample_bylevel=0.9402478069478089, colsample_bynode=0.7537226030466246, colsample_bytree=0.8016641392648811, gamma=1.5532811698510567, learning_rate=0.06462268813024839, max_depth=34, n_estimators=65, reg_alpha=0.9866422357696696, reg_lambda=0.57644817920697, subsample=0.7243332644154905


[CV 2/5; 74/500] START colsample_bylevel=0.816513274959898, colsample_bynode=0.7643205836044628, colsample_bytree=0.8116128206396764, gamma=1.5171111189816406, learning_rate=0.1163525065671689, max_depth=44, n_estimators=103, reg_alpha=1.907337332030161, reg_lambda=0.2642178295276385, subsample=0.7154348405102912
[CV 2/5; 74/500] END colsample_bylevel=0.816513274959898, colsample_bynode=0.7643205836044628, colsample_bytree=0.8116128206396764, gamma=1.5171111189816406, learning_rate=0.1163525065671689, max_depth=44, n_estimators=103, reg_alpha=1.907337332030161, reg_lambda=0.2642178295276385, subsample=0.7154348405102912;, score=(train=0.914, test=0.788) total time=   0.4s
[CV 1/5; 75/500] START colsample_bylevel=0.6432688173800886, colsample_bynode=0.9222519408660952, colsample_bytree=0.8609422606129813, gamma=0.07074917738186981, learning_rate=0.059621776929956585, max_depth=43, n_estimators=54, reg_alpha=0.6139589343152598, reg_lambda=0.6733521168393279, subsample=0.8618663268289417


[CV 4/5; 76/500] START colsample_bylevel=0.8903535536433689, colsample_bynode=0.7053059820811457, colsample_bytree=0.6328506262259835, gamma=1.2652945461333378, learning_rate=0.14545119572063453, max_depth=17, n_estimators=95, reg_alpha=1.6876822667219638, reg_lambda=1.1240628556909877, subsample=0.6882788931698155
[CV 4/5; 76/500] END colsample_bylevel=0.8903535536433689, colsample_bynode=0.7053059820811457, colsample_bytree=0.6328506262259835, gamma=1.2652945461333378, learning_rate=0.14545119572063453, max_depth=17, n_estimators=95, reg_alpha=1.6876822667219638, reg_lambda=1.1240628556909877, subsample=0.6882788931698155;, score=(train=0.921, test=0.803) total time=   0.3s
[CV 1/5; 78/500] START colsample_bylevel=0.9320796107045175, colsample_bynode=0.9058789130114927, colsample_bytree=0.6567012482889769, gamma=1.7897900633382504, learning_rate=0.08638260457862318, max_depth=20, n_estimators=78, reg_alpha=1.5066309140721104, reg_lambda=0.5949072388301382, subsample=0.857761604487618

[CV 3/5; 76/500] END colsample_bylevel=0.8903535536433689, colsample_bynode=0.7053059820811457, colsample_bytree=0.6328506262259835, gamma=1.2652945461333378, learning_rate=0.14545119572063453, max_depth=17, n_estimators=95, reg_alpha=1.6876822667219638, reg_lambda=1.1240628556909877, subsample=0.6882788931698155;, score=(train=0.915, test=0.848) total time=   0.3s
[CV 1/5; 77/500] START colsample_bylevel=0.6133323463489875, colsample_bynode=0.8299173729347484, colsample_bytree=0.743983933363942, gamma=0.3116804377567177, learning_rate=0.07445551834626589, max_depth=41, n_estimators=102, reg_alpha=1.66953856429376, reg_lambda=1.6143257296824394, subsample=0.7901203117900335
[CV 1/5; 77/500] END colsample_bylevel=0.6133323463489875, colsample_bynode=0.8299173729347484, colsample_bytree=0.743983933363942, gamma=0.3116804377567177, learning_rate=0.07445551834626589, max_depth=41, n_estimators=102, reg_alpha=1.66953856429376, reg_lambda=1.6143257296824394, subsample=0.7901203117900335;, sc

[CV 4/5; 75/500] START colsample_bylevel=0.6432688173800886, colsample_bynode=0.9222519408660952, colsample_bytree=0.8609422606129813, gamma=0.07074917738186981, learning_rate=0.059621776929956585, max_depth=43, n_estimators=54, reg_alpha=0.6139589343152598, reg_lambda=0.6733521168393279, subsample=0.8618663268289417
[CV 4/5; 75/500] END colsample_bylevel=0.6432688173800886, colsample_bynode=0.9222519408660952, colsample_bytree=0.8609422606129813, gamma=0.07074917738186981, learning_rate=0.059621776929956585, max_depth=43, n_estimators=54, reg_alpha=0.6139589343152598, reg_lambda=0.6733521168393279, subsample=0.8618663268289417;, score=(train=0.984, test=0.798) total time=   0.7s
[CV 2/5; 78/500] START colsample_bylevel=0.9320796107045175, colsample_bynode=0.9058789130114927, colsample_bytree=0.6567012482889769, gamma=1.7897900633382504, learning_rate=0.08638260457862318, max_depth=20, n_estimators=78, reg_alpha=1.5066309140721104, reg_lambda=0.5949072388301382, subsample=0.85776160448

[CV 5/5; 77/500] START colsample_bylevel=0.6133323463489875, colsample_bynode=0.8299173729347484, colsample_bytree=0.743983933363942, gamma=0.3116804377567177, learning_rate=0.07445551834626589, max_depth=41, n_estimators=102, reg_alpha=1.66953856429376, reg_lambda=1.6143257296824394, subsample=0.7901203117900335
[CV 5/5; 77/500] END colsample_bylevel=0.6133323463489875, colsample_bynode=0.8299173729347484, colsample_bytree=0.743983933363942, gamma=0.3116804377567177, learning_rate=0.07445551834626589, max_depth=41, n_estimators=102, reg_alpha=1.66953856429376, reg_lambda=1.6143257296824394, subsample=0.7901203117900335;, score=(train=0.971, test=0.791) total time=   0.6s
[CV 4/5; 79/500] START colsample_bylevel=0.8826411750613326, colsample_bynode=0.977100894680274, colsample_bytree=0.9582322732187195, gamma=0.32877862014063486, learning_rate=0.1130179124122629, max_depth=19, n_estimators=62, reg_alpha=1.607002884208785, reg_lambda=0.5658585704091323, subsample=0.9182910308678451
[CV 

[CV 3/5; 75/500] START colsample_bylevel=0.6432688173800886, colsample_bynode=0.9222519408660952, colsample_bytree=0.8609422606129813, gamma=0.07074917738186981, learning_rate=0.059621776929956585, max_depth=43, n_estimators=54, reg_alpha=0.6139589343152598, reg_lambda=0.6733521168393279, subsample=0.8618663268289417
[CV 3/5; 75/500] END colsample_bylevel=0.6432688173800886, colsample_bynode=0.9222519408660952, colsample_bytree=0.8609422606129813, gamma=0.07074917738186981, learning_rate=0.059621776929956585, max_depth=43, n_estimators=54, reg_alpha=0.6139589343152598, reg_lambda=0.6733521168393279, subsample=0.8618663268289417;, score=(train=0.986, test=0.872) total time=   0.8s
[CV 3/5; 78/500] START colsample_bylevel=0.9320796107045175, colsample_bynode=0.9058789130114927, colsample_bytree=0.6567012482889769, gamma=1.7897900633382504, learning_rate=0.08638260457862318, max_depth=20, n_estimators=78, reg_alpha=1.5066309140721104, reg_lambda=0.5949072388301382, subsample=0.85776160448

[CV 2/5; 79/500] START colsample_bylevel=0.8826411750613326, colsample_bynode=0.977100894680274, colsample_bytree=0.9582322732187195, gamma=0.32877862014063486, learning_rate=0.1130179124122629, max_depth=19, n_estimators=62, reg_alpha=1.607002884208785, reg_lambda=0.5658585704091323, subsample=0.9182910308678451
[CV 2/5; 79/500] END colsample_bylevel=0.8826411750613326, colsample_bynode=0.977100894680274, colsample_bytree=0.9582322732187195, gamma=0.32877862014063486, learning_rate=0.1130179124122629, max_depth=19, n_estimators=62, reg_alpha=1.607002884208785, reg_lambda=0.5658585704091323, subsample=0.9182910308678451;, score=(train=0.989, test=0.826) total time=   0.6s
[CV 5/5; 80/500] START colsample_bylevel=0.6565090876939432, colsample_bynode=0.7861097787732053, colsample_bytree=0.8929703715099165, gamma=0.12570170590298702, learning_rate=0.09772771236756976, max_depth=37, n_estimators=72, reg_alpha=1.051036609441931, reg_lambda=1.4999332335139017, subsample=0.6841430481699251
[C

[CV 1/5; 111/500] START colsample_bylevel=0.6526929425861245, colsample_bynode=0.8087437276152152, colsample_bytree=0.8970005268703023, gamma=0.3994919930097731, learning_rate=0.11932554159823328, max_depth=24, n_estimators=76, reg_alpha=1.8778363988103959, reg_lambda=0.9879823726424515, subsample=0.6987083786537416
[CV 1/5; 111/500] END colsample_bylevel=0.6526929425861245, colsample_bynode=0.8087437276152152, colsample_bytree=0.8970005268703023, gamma=0.3994919930097731, learning_rate=0.11932554159823328, max_depth=24, n_estimators=76, reg_alpha=1.8778363988103959, reg_lambda=0.9879823726424515, subsample=0.6987083786537416;, score=(train=0.968, test=0.808) total time=   0.5s
[CV 1/5; 112/500] START colsample_bylevel=0.9865461065387513, colsample_bynode=0.9040759294639148, colsample_bytree=0.848613148841141, gamma=0.9125505517512384, learning_rate=0.1375535926819279, max_depth=40, n_estimators=116, reg_alpha=0.6301721474565924, reg_lambda=1.321488244662644, subsample=0.62703657657643

[CV 5/5; 110/500] START colsample_bylevel=0.8749334853040864, colsample_bynode=0.8321099170790713, colsample_bytree=0.9808834670017592, gamma=0.5717571625684303, learning_rate=0.08077677738430511, max_depth=25, n_estimators=75, reg_alpha=0.5223356002590851, reg_lambda=1.6989808747557944, subsample=0.8508108359120189
[CV 5/5; 110/500] END colsample_bylevel=0.8749334853040864, colsample_bynode=0.8321099170790713, colsample_bytree=0.9808834670017592, gamma=0.5717571625684303, learning_rate=0.08077677738430511, max_depth=25, n_estimators=75, reg_alpha=0.5223356002590851, reg_lambda=1.6989808747557944, subsample=0.8508108359120189;, score=(train=0.991, test=0.804) total time=   0.7s
[CV 4/5; 112/500] START colsample_bylevel=0.9865461065387513, colsample_bynode=0.9040759294639148, colsample_bytree=0.848613148841141, gamma=0.9125505517512384, learning_rate=0.1375535926819279, max_depth=40, n_estimators=116, reg_alpha=0.6301721474565924, reg_lambda=1.321488244662644, subsample=0.62703657657643

[CV 5/5; 111/500] START colsample_bylevel=0.6526929425861245, colsample_bynode=0.8087437276152152, colsample_bytree=0.8970005268703023, gamma=0.3994919930097731, learning_rate=0.11932554159823328, max_depth=24, n_estimators=76, reg_alpha=1.8778363988103959, reg_lambda=0.9879823726424515, subsample=0.6987083786537416
[CV 5/5; 111/500] END colsample_bylevel=0.6526929425861245, colsample_bynode=0.8087437276152152, colsample_bytree=0.8970005268703023, gamma=0.3994919930097731, learning_rate=0.11932554159823328, max_depth=24, n_estimators=76, reg_alpha=1.8778363988103959, reg_lambda=0.9879823726424515, subsample=0.6987083786537416;, score=(train=0.964, test=0.796) total time=   0.4s
[CV 3/5; 113/500] START colsample_bylevel=0.8063831730297228, colsample_bynode=0.8615401338759308, colsample_bytree=0.7377615258886423, gamma=0.15187009712020072, learning_rate=0.13671859056098, max_depth=47, n_estimators=71, reg_alpha=0.20019564664296352, reg_lambda=1.444314164963072, subsample=0.93318323355910

[CV 2/5; 113/500] END colsample_bylevel=0.8063831730297228, colsample_bynode=0.8615401338759308, colsample_bytree=0.7377615258886423, gamma=0.15187009712020072, learning_rate=0.13671859056098, max_depth=47, n_estimators=71, reg_alpha=0.20019564664296352, reg_lambda=1.444314164963072, subsample=0.9331832335591065;, score=(train=1.000, test=0.820) total time=   0.7s
[CV 1/5; 115/500] START colsample_bylevel=0.6800193110659308, colsample_bynode=0.815955478321755, colsample_bytree=0.9245299980193984, gamma=1.2610909199252818, learning_rate=0.12842311842297177, max_depth=30, n_estimators=82, reg_alpha=1.8742913918420754, reg_lambda=0.556439832330893, subsample=0.6508114997148012
[CV 1/5; 115/500] END colsample_bylevel=0.6800193110659308, colsample_bynode=0.815955478321755, colsample_bytree=0.9245299980193984, gamma=1.2610909199252818, learning_rate=0.12842311842297177, max_depth=30, n_estimators=82, reg_alpha=1.8742913918420754, reg_lambda=0.556439832330893, subsample=0.6508114997148012;, s

[CV 5/5; 112/500] START colsample_bylevel=0.9865461065387513, colsample_bynode=0.9040759294639148, colsample_bytree=0.848613148841141, gamma=0.9125505517512384, learning_rate=0.1375535926819279, max_depth=40, n_estimators=116, reg_alpha=0.6301721474565924, reg_lambda=1.321488244662644, subsample=0.6270365765764355
[CV 5/5; 112/500] END colsample_bylevel=0.9865461065387513, colsample_bynode=0.9040759294639148, colsample_bytree=0.848613148841141, gamma=0.9125505517512384, learning_rate=0.1375535926819279, max_depth=40, n_estimators=116, reg_alpha=0.6301721474565924, reg_lambda=1.321488244662644, subsample=0.6270365765764355;, score=(train=0.984, test=0.804) total time=   0.5s
[CV 3/5; 114/500] START colsample_bylevel=0.9853545158181803, colsample_bynode=0.8231630697300159, colsample_bytree=0.9571361144089734, gamma=0.9462390514323729, learning_rate=0.088130992414501, max_depth=46, n_estimators=80, reg_alpha=1.9446100698134519, reg_lambda=0.9379540576173735, subsample=0.7351985807590454
[

[CV 2/5; 120/500] START colsample_bylevel=0.916702932254086, colsample_bynode=0.7342209862786098, colsample_bytree=0.7447280002158199, gamma=0.7156543927283554, learning_rate=0.14008846098139324, max_depth=44, n_estimators=115, reg_alpha=1.6415506394478245, reg_lambda=0.03668023636495077, subsample=0.8978936663363877
[CV 2/5; 120/500] END colsample_bylevel=0.916702932254086, colsample_bynode=0.7342209862786098, colsample_bytree=0.7447280002158199, gamma=0.7156543927283554, learning_rate=0.14008846098139324, max_depth=44, n_estimators=115, reg_alpha=1.6415506394478245, reg_lambda=0.03668023636495077, subsample=0.8978936663363877;, score=(train=0.973, test=0.819) total time=   0.4s
[CV 1/5; 122/500] START colsample_bylevel=0.6133774079150762, colsample_bynode=0.7431368085367506, colsample_bytree=0.9848302193948584, gamma=1.620808407481306, learning_rate=0.12296086524994283, max_depth=39, n_estimators=62, reg_alpha=1.3428843388137093, reg_lambda=1.7485380798032848, subsample=0.79284215617

[CV 5/5; 113/500] START colsample_bylevel=0.8063831730297228, colsample_bynode=0.8615401338759308, colsample_bytree=0.7377615258886423, gamma=0.15187009712020072, learning_rate=0.13671859056098, max_depth=47, n_estimators=71, reg_alpha=0.20019564664296352, reg_lambda=1.444314164963072, subsample=0.9331832335591065
[CV 5/5; 113/500] END colsample_bylevel=0.8063831730297228, colsample_bynode=0.8615401338759308, colsample_bytree=0.7377615258886423, gamma=0.15187009712020072, learning_rate=0.13671859056098, max_depth=47, n_estimators=71, reg_alpha=0.20019564664296352, reg_lambda=1.444314164963072, subsample=0.9331832335591065;, score=(train=1.000, test=0.815) total time=   0.7s
[CV 5/5; 115/500] START colsample_bylevel=0.6800193110659308, colsample_bynode=0.815955478321755, colsample_bytree=0.9245299980193984, gamma=1.2610909199252818, learning_rate=0.12842311842297177, max_depth=30, n_estimators=82, reg_alpha=1.8742913918420754, reg_lambda=0.556439832330893, subsample=0.6508114997148012
[

[CV 3/5; 143/500] START colsample_bylevel=0.6570016620023355, colsample_bynode=0.8299915546111754, colsample_bytree=0.6645444945466868, gamma=0.3551396133888678, learning_rate=0.06773278948032882, max_depth=21, n_estimators=115, reg_alpha=0.5848349274222098, reg_lambda=0.9850477102622979, subsample=0.8773485165910452
[CV 3/5; 143/500] END colsample_bylevel=0.6570016620023355, colsample_bynode=0.8299915546111754, colsample_bytree=0.6645444945466868, gamma=0.3551396133888678, learning_rate=0.06773278948032882, max_depth=21, n_estimators=115, reg_alpha=0.5848349274222098, reg_lambda=0.9850477102622979, subsample=0.8773485165910452;, score=(train=0.997, test=0.864) total time=   0.7s
[CV 4/5; 145/500] START colsample_bylevel=0.8712711218593014, colsample_bynode=0.7045822290357917, colsample_bytree=0.8132805635934726, gamma=0.2840123251644795, learning_rate=0.11272604544173793, max_depth=27, n_estimators=123, reg_alpha=1.3947216938644549, reg_lambda=1.1900890707280667, subsample=0.944342826

[CV 2/5; 144/500] START colsample_bylevel=0.7271547058236041, colsample_bynode=0.9423863948431693, colsample_bytree=0.901825468102549, gamma=1.2471620778444616, learning_rate=0.08813465789670241, max_depth=45, n_estimators=147, reg_alpha=1.7269895654175473, reg_lambda=0.7850395898644076, subsample=0.8464749564338466
[CV 2/5; 144/500] END colsample_bylevel=0.7271547058236041, colsample_bynode=0.9423863948431693, colsample_bytree=0.901825468102549, gamma=1.2471620778444616, learning_rate=0.08813465789670241, max_depth=45, n_estimators=147, reg_alpha=1.7269895654175473, reg_lambda=0.7850395898644076, subsample=0.8464749564338466;, score=(train=0.941, test=0.817) total time=   0.5s
[CV 1/5; 145/500] START colsample_bylevel=0.8712711218593014, colsample_bynode=0.7045822290357917, colsample_bytree=0.8132805635934726, gamma=0.2840123251644795, learning_rate=0.11272604544173793, max_depth=27, n_estimators=123, reg_alpha=1.3947216938644549, reg_lambda=1.1900890707280667, subsample=0.94434282627

[CV 3/5; 151/500] START colsample_bylevel=0.7791791082765237, colsample_bynode=0.988489496807271, colsample_bytree=0.7149855273526661, gamma=0.5093366172966134, learning_rate=0.09370710880896604, max_depth=30, n_estimators=80, reg_alpha=0.17772480164434734, reg_lambda=1.9268576969267042, subsample=0.7960597047015683
[CV 3/5; 151/500] END colsample_bylevel=0.7791791082765237, colsample_bynode=0.988489496807271, colsample_bytree=0.7149855273526661, gamma=0.5093366172966134, learning_rate=0.09370710880896604, max_depth=30, n_estimators=80, reg_alpha=0.17772480164434734, reg_lambda=1.9268576969267042, subsample=0.7960597047015683;, score=(train=0.997, test=0.853) total time=   0.6s
[CV 1/5; 153/500] START colsample_bylevel=0.6630124279373604, colsample_bynode=0.8777497007641897, colsample_bytree=0.7246685035902694, gamma=0.6465660362095078, learning_rate=0.1245411042045488, max_depth=26, n_estimators=68, reg_alpha=1.0305871589550804, reg_lambda=1.9596415185143026, subsample=0.6121452742372

[CV 1/5; 147/500] START colsample_bylevel=0.6894728472099472, colsample_bynode=0.8620730544564424, colsample_bytree=0.8278940349416486, gamma=1.7575942720879016, learning_rate=0.12100213077774147, max_depth=28, n_estimators=137, reg_alpha=1.28270788019452, reg_lambda=1.191333308545089, subsample=0.6221402572210699
[CV 1/5; 147/500] END colsample_bylevel=0.6894728472099472, colsample_bynode=0.8620730544564424, colsample_bytree=0.8278940349416486, gamma=1.7575942720879016, learning_rate=0.12100213077774147, max_depth=28, n_estimators=137, reg_alpha=1.28270788019452, reg_lambda=1.191333308545089, subsample=0.6221402572210699;, score=(train=0.921, test=0.796) total time=   0.3s
[CV 1/5; 148/500] START colsample_bylevel=0.7572520821352974, colsample_bynode=0.6546631118486762, colsample_bytree=0.6093434971172785, gamma=0.9548414060590678, learning_rate=0.08871172463112575, max_depth=22, n_estimators=133, reg_alpha=0.3069789387022035, reg_lambda=0.7956252698005124, subsample=0.670331748393967

[CV 3/5; 150/500] END colsample_bylevel=0.6271672394339626, colsample_bynode=0.6685836471630136, colsample_bytree=0.9329400580481438, gamma=0.308324211087474, learning_rate=0.11005225388222703, max_depth=22, n_estimators=109, reg_alpha=1.4729545966288518, reg_lambda=1.273474134812241, subsample=0.7098758768418946;, score=(train=0.985, test=0.843) total time=   0.6s
[CV 2/5; 152/500] START colsample_bylevel=0.9407536556835913, colsample_bynode=0.8739428878675626, colsample_bytree=0.6286682209237492, gamma=0.8895799070999872, learning_rate=0.05429517688999516, max_depth=26, n_estimators=144, reg_alpha=0.8139697681771352, reg_lambda=1.4317213742717871, subsample=0.7008422164975224
[CV 2/5; 152/500] END colsample_bylevel=0.9407536556835913, colsample_bynode=0.8739428878675626, colsample_bytree=0.6286682209237492, gamma=0.8895799070999872, learning_rate=0.05429517688999516, max_depth=26, n_estimators=144, reg_alpha=0.8139697681771352, reg_lambda=1.4317213742717871, subsample=0.7008422164975

[CV 4/5; 152/500] START colsample_bylevel=0.9407536556835913, colsample_bynode=0.8739428878675626, colsample_bytree=0.6286682209237492, gamma=0.8895799070999872, learning_rate=0.05429517688999516, max_depth=26, n_estimators=144, reg_alpha=0.8139697681771352, reg_lambda=1.4317213742717871, subsample=0.7008422164975224
[CV 4/5; 152/500] END colsample_bylevel=0.9407536556835913, colsample_bynode=0.8739428878675626, colsample_bytree=0.6286682209237492, gamma=0.8895799070999872, learning_rate=0.05429517688999516, max_depth=26, n_estimators=144, reg_alpha=0.8139697681771352, reg_lambda=1.4317213742717871, subsample=0.7008422164975224;, score=(train=0.966, test=0.798) total time=   0.7s
[CV 2/5; 155/500] START colsample_bylevel=0.7895765652253706, colsample_bynode=0.8104976838265809, colsample_bytree=0.8850406615814057, gamma=1.0605609250951016, learning_rate=0.12891906277076692, max_depth=43, n_estimators=52, reg_alpha=0.883479460797552, reg_lambda=1.880981027576238, subsample=0.901492059369

[CV 2/5; 154/500] START colsample_bylevel=0.9226925573979037, colsample_bynode=0.724042526661451, colsample_bytree=0.726371814691965, gamma=1.7273635123679978, learning_rate=0.054520784884717954, max_depth=46, n_estimators=140, reg_alpha=0.6183867572449455, reg_lambda=1.2764056551635303, subsample=0.8080725877517597
[CV 2/5; 154/500] END colsample_bylevel=0.9226925573979037, colsample_bynode=0.724042526661451, colsample_bytree=0.726371814691965, gamma=1.7273635123679978, learning_rate=0.054520784884717954, max_depth=46, n_estimators=140, reg_alpha=0.6183867572449455, reg_lambda=1.2764056551635303, subsample=0.8080725877517597;, score=(train=0.940, test=0.810) total time=   0.6s
[CV 1/5; 156/500] START colsample_bylevel=0.7808854287060637, colsample_bynode=0.7949739104206494, colsample_bytree=0.7905287184298329, gamma=0.5221002017067107, learning_rate=0.1154475335028413, max_depth=15, n_estimators=136, reg_alpha=0.7534325573425025, reg_lambda=1.8648927372010733, subsample=0.841051105730

[CV 2/5; 179/500] START colsample_bylevel=0.9313541510585166, colsample_bynode=0.989615383367197, colsample_bytree=0.6917735828476701, gamma=1.3122953541901665, learning_rate=0.119205760263678, max_depth=29, n_estimators=69, reg_alpha=0.5193680845243374, reg_lambda=1.8175105647850247, subsample=0.9643145496234462
[CV 2/5; 179/500] END colsample_bylevel=0.9313541510585166, colsample_bynode=0.989615383367197, colsample_bytree=0.6917735828476701, gamma=1.3122953541901665, learning_rate=0.119205760263678, max_depth=29, n_estimators=69, reg_alpha=0.5193680845243374, reg_lambda=1.8175105647850247, subsample=0.9643145496234462;, score=(train=0.962, test=0.800) total time=   0.4s
[CV 2/5; 180/500] START colsample_bylevel=0.8418779163708849, colsample_bynode=0.9492707412962511, colsample_bytree=0.6004761927565422, gamma=0.665231297708192, learning_rate=0.09219934446016165, max_depth=36, n_estimators=143, reg_alpha=1.7707096402508509, reg_lambda=1.1290272556919185, subsample=0.8958387704991233
[

[CV 3/5; 180/500] START colsample_bylevel=0.8418779163708849, colsample_bynode=0.9492707412962511, colsample_bytree=0.6004761927565422, gamma=0.665231297708192, learning_rate=0.09219934446016165, max_depth=36, n_estimators=143, reg_alpha=1.7707096402508509, reg_lambda=1.1290272556919185, subsample=0.8958387704991233
[CV 3/5; 180/500] END colsample_bylevel=0.8418779163708849, colsample_bynode=0.9492707412962511, colsample_bytree=0.6004761927565422, gamma=0.665231297708192, learning_rate=0.09219934446016165, max_depth=36, n_estimators=143, reg_alpha=1.7707096402508509, reg_lambda=1.1290272556919185, subsample=0.8958387704991233;, score=(train=0.960, test=0.865) total time=   0.5s
[CV 1/5; 182/500] START colsample_bylevel=0.7864717139644052, colsample_bynode=0.6342106670750735, colsample_bytree=0.6535033289457751, gamma=1.379800615309562, learning_rate=0.07533539068960743, max_depth=38, n_estimators=129, reg_alpha=1.7523663783889547, reg_lambda=0.49346934787958596, subsample=0.90366074607

[CV 1/5; 186/500] END colsample_bylevel=0.638771506921351, colsample_bynode=0.9878044389084039, colsample_bytree=0.972443613061869, gamma=0.9864108040321056, learning_rate=0.07627093478516055, max_depth=38, n_estimators=61, reg_alpha=0.8060054128554563, reg_lambda=0.8419833997044317, subsample=0.6301949998061146;, score=(train=0.952, test=0.789) total time=   0.4s
[CV 4/5; 187/500] START colsample_bylevel=0.9060107643152893, colsample_bynode=0.7105637582095913, colsample_bytree=0.8374436658149662, gamma=0.8683895591105961, learning_rate=0.05950687082797343, max_depth=38, n_estimators=78, reg_alpha=1.163160148024224, reg_lambda=1.477713896496085, subsample=0.8466550289591769
[CV 4/5; 187/500] END colsample_bylevel=0.9060107643152893, colsample_bynode=0.7105637582095913, colsample_bytree=0.8374436658149662, gamma=0.8683895591105961, learning_rate=0.05950687082797343, max_depth=38, n_estimators=78, reg_alpha=1.163160148024224, reg_lambda=1.477713896496085, subsample=0.8466550289591769;, s

[CV 4/5; 186/500] START colsample_bylevel=0.638771506921351, colsample_bynode=0.9878044389084039, colsample_bytree=0.972443613061869, gamma=0.9864108040321056, learning_rate=0.07627093478516055, max_depth=38, n_estimators=61, reg_alpha=0.8060054128554563, reg_lambda=0.8419833997044317, subsample=0.6301949998061146
[CV 4/5; 186/500] END colsample_bylevel=0.638771506921351, colsample_bynode=0.9878044389084039, colsample_bytree=0.972443613061869, gamma=0.9864108040321056, learning_rate=0.07627093478516055, max_depth=38, n_estimators=61, reg_alpha=0.8060054128554563, reg_lambda=0.8419833997044317, subsample=0.6301949998061146;, score=(train=0.950, test=0.798) total time=   0.5s
[CV 2/5; 188/500] START colsample_bylevel=0.627838789807344, colsample_bynode=0.9994265349331235, colsample_bytree=0.6373510690481731, gamma=1.585457967192495, learning_rate=0.08217566751414102, max_depth=49, n_estimators=105, reg_alpha=0.01445109240177711, reg_lambda=0.8017715255778679, subsample=0.9944037836918367

[CV 2/5; 185/500] START colsample_bylevel=0.6552297264328412, colsample_bynode=0.7603335670280968, colsample_bytree=0.6922365703255811, gamma=1.150620950056974, learning_rate=0.09382494647471123, max_depth=38, n_estimators=87, reg_alpha=1.681725832266658, reg_lambda=0.129575434604843, subsample=0.9712539383979314
[CV 2/5; 185/500] END colsample_bylevel=0.6552297264328412, colsample_bynode=0.7603335670280968, colsample_bytree=0.6922365703255811, gamma=1.150620950056974, learning_rate=0.09382494647471123, max_depth=38, n_estimators=87, reg_alpha=1.681725832266658, reg_lambda=0.129575434604843, subsample=0.9712539383979314;, score=(train=0.939, test=0.807) total time=   0.3s
[CV 4/5; 185/500] START colsample_bylevel=0.6552297264328412, colsample_bynode=0.7603335670280968, colsample_bytree=0.6922365703255811, gamma=1.150620950056974, learning_rate=0.09382494647471123, max_depth=38, n_estimators=87, reg_alpha=1.681725832266658, reg_lambda=0.129575434604843, subsample=0.9712539383979314
[CV 

[CV 1/5; 189/500] START colsample_bylevel=0.8345613993604226, colsample_bynode=0.8096305249022304, colsample_bytree=0.7510366471125143, gamma=1.8593882749077053, learning_rate=0.11164953481454304, max_depth=22, n_estimators=127, reg_alpha=1.1039273297862957, reg_lambda=0.8080214252295805, subsample=0.6476557906123913
[CV 1/5; 189/500] END colsample_bylevel=0.8345613993604226, colsample_bynode=0.8096305249022304, colsample_bytree=0.7510366471125143, gamma=1.8593882749077053, learning_rate=0.11164953481454304, max_depth=22, n_estimators=127, reg_alpha=1.1039273297862957, reg_lambda=0.8080214252295805, subsample=0.6476557906123913;, score=(train=0.922, test=0.801) total time=   0.3s
[CV 1/5; 190/500] START colsample_bylevel=0.8651412035031776, colsample_bynode=0.9302956095071913, colsample_bytree=0.8200195071390025, gamma=0.018222869660705854, learning_rate=0.06044526574719425, max_depth=48, n_estimators=93, reg_alpha=1.4020227662549396, reg_lambda=1.331596231155463, subsample=0.845753961

In [None]:
# Preview the first rows of the cross-validation results table.
cv_results.head()

In [None]:
# Folder that receives every figure saved by the cells below for this run.
results_path = f"./tuning_results/tuning_xgb/{timestamp}/Assets"
# exist_ok=True creates missing parent folders and is a no-op when the folder
# already exists, so the cell can be re-run and has no check-then-create race.
os.makedirs(results_path, exist_ok=True)

# CV Evaluation

In [None]:
# List the columns available in the cross-validation results table.
cv_results.columns

In [None]:
# Show the five candidates with the lowest rank_test_score.
cv_results.sort_values(by='rank_test_score', ascending=True).head(5)

In [None]:
# CV results ordered by rank_test_score (ascending); reused by the plots below.
sorted_cv = cv_results.sort_values(by='rank_test_score', ascending=True)

# Train vs Test Comparison

In [None]:
# Mean train and validation score of every candidate, plotted against its validation rank.
fig, ax = plt.subplots(figsize=(16, 6))

rank_axis = sorted_cv['rank_test_score']
ax.plot(rank_axis, sorted_cv['mean_train_score'], label="Train Score")
ax.plot(rank_axis, sorted_cv['mean_test_score'], label="Validation Score")

ax.grid()
ax.set_xlabel('Sorted Validation Rank')
ax.set_ylabel('Accuracy')
ax.set_title('Train and Test Accuracy by Final Rank')
ax.legend(loc='best')

filename = "test_train_by_rank.png"
fig.savefig(os.path.join(results_path, filename))

plt.show()

In [None]:
# Spread of the per-fold test scores for the first ten rows of sorted_cv
# (intended: one box per rank, 1-10).
test_split_cols = [f'split{fold}_test_score' for fold in range(5)]
top10_test_scores = sorted_cv.iloc[:10, :][test_split_cols]

fig, ax = plt.subplots(figsize=(10, 3))
fig.suptitle('Test Accuracy by Rank')

ax.boxplot(top10_test_scores.T)
ax.set_xticklabels(range(1, 11))
ax.set_xlabel("Rank")
ax.set_ylabel("Accuracy")

filename = "test_accuracy_by_rank.png"
fig.savefig(os.path.join(results_path, filename))

plt.show()

In [None]:
# Spread of the per-fold train scores for the first ten rows of sorted_cv
# (intended: one box per rank, 1-10).
train_split_cols = [f'split{fold}_train_score' for fold in range(5)]
top10_train_scores = sorted_cv.iloc[:10, :][train_split_cols]

fig, ax = plt.subplots(figsize=(10, 3))
fig.suptitle('Train Accuracy by Rank')

ax.boxplot(top10_train_scores.T)
ax.set_xticklabels(range(1, 11))
ax.set_xlabel("Rank")
ax.set_ylabel("Accuracy")

filename = "train_accuracy_by_rank.png"
fig.savefig(os.path.join(results_path, filename))

plt.show()

In [None]:
# Row(s) of the CV table with rank_test_score == 1, and the `params` entry of the first such row.
max_params = cv_results.query("rank_test_score == 1")
best_params = max_params['params'].iloc[0]

In [None]:
# Report the mean train / validation score of the top-ranked candidate.
best_mean_train = max_params['mean_train_score'].iloc[0]
best_mean_validation = max_params['mean_test_score'].iloc[0]
print(f"Mean Train Accuracy = {best_mean_train:.2f}")
print(f"Mean Validation Accuracy = {best_mean_validation:.2f}")

In [None]:
# Reload the pickled search object of this run and take its best estimator.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load result files produced by this project or another trusted source.
search_pickle_path = os.path.join(f'./tuning_results/tuning_xgb/{timestamp}/', file_name)
with open(search_pickle_path, "rb") as search_file:  # the handle is closed even if loading fails
    random_search = pickle.load(search_file)
model = random_search.best_estimator_

# Alternative (not used): rebuild the model from `best_params` with
# XGBClassifier(**best_params) and fit it on X_train / y_train.

# Hard class predictions for both splits; reused by the accuracy report below.
y_train_prediction = model.predict(X_train)
y_test_prediction = model.predict(X_test)

In [None]:
# Accuracy of the selected model on the train and test splits.
train_accuracy = accuracy_score(y_train, y_train_prediction)
test_accuracy = accuracy_score(y_test, y_test_prediction)
print(f"Train set, Accuracy = {train_accuracy:.2f}")
print(f"Test set, Accuracy = {test_accuracy:.2f}")

In [None]:
# Horizontal bar chart of the (up to) 20 most important features of the model.
importances = model.feature_importances_
# Clamp k: np.argpartition(..., -20) raised when the model had fewer than 20 features.
top_k = min(20, len(importances))

# argsort is ascending, so the last `top_k` indices are the most important
# features in increasing order. barh draws from the bottom up, which puts the
# most important feature at the top. The previous argpartition call returned
# the top 20 in arbitrary order, so the bars were not sorted.
ind = np.argsort(importances)[-top_k:]

# Assumes the columns of X are in the same order as the model's features.
features = X.columns[ind]
importance = importances[ind]

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(range(len(ind)), importance, align='center')
ax.set_yticks(range(len(ind)))
ax.set_yticklabels(features)
ax.set_xlabel('Importance')
ax.set_title('Feature Importance XGB')
ax.grid()

filename = "feature_importance.png"
fig.savefig(os.path.join(results_path, filename))

plt.show()

XGBoost can compute the feature-importance scores shown above in several ways (the importance type):

- Weight: The number of times a feature is used to split the data across all trees in the ensemble.
- Gain: The average gain across all splits in which the feature is used.
- Cover: The average coverage across all splits in which the feature is used.

In [None]:
# For every tuned hyperparameter: scatter its sampled values against the mean
# validation score and overlay a degree-1 np.polyfit line (parameter as a
# function of score). One figure per parameter is saved to results_path.
validation_scores = sorted_cv['mean_test_score']

for col in ['param_colsample_bylevel', 'param_colsample_bynode',
            'param_colsample_bytree', 'param_gamma', 'param_learning_rate',
            'param_max_depth', 'param_n_estimators', 'param_reg_alpha',
            'param_reg_lambda', 'param_subsample']:

    fig, ax = plt.subplots(figsize=(16, 6))

    # Plain lists are handed to polyfit so it builds its own numeric arrays.
    slope, intercept = np.polyfit(validation_scores.tolist(), sorted_cv[col].tolist(), 1)
    ax.plot(validation_scores, slope * validation_scores + intercept, c='r', label="Regression Line")
    ax.scatter(validation_scores, sorted_cv[col], label=f"{col} Values")

    ax.grid()
    ax.set_xlabel('Mean Validation Score')
    ax.set_ylabel('Parameter Value')
    ax.set_title(col)
    ax.legend(loc='best')

    filename = f"{col}_by_rank.png"
    fig.savefig(os.path.join(results_path, filename))

    plt.show()


# Hyperparameter Evaluation

In [None]:

def plot_parameters(x_values, title):
    """Plot the mean CV test score against one hyperparameter, with linear trends.

    The left axis shows a scatter of ``cv_results['mean_test_score']`` over
    ``x_values`` together with its degree-1 ``np.polyfit`` line. A twin right
    axis shows the degree-1 trend of ``cv_results['std_test_score']``. The
    figure is saved in ``results_path`` as ``"{title}_test_score.png"`` and
    then shown.

    Reads the notebook-level names ``cv_results`` and ``results_path``.

    Parameters
    ----------
    x_values
        Hyperparameter values aligned with the rows of ``cv_results``; must
        expose ``.values`` (callers in this notebook pass a DataFrame column).
    title
        Plot title, also used as the prefix of the saved file name.
    """
    fig, score_ax = plt.subplots(figsize=(16, 6))
    std_ax = score_ax.twinx()

    mean_scores = cv_results['mean_test_score']
    std_scores = cv_results['std_test_score']
    x_list = list(x_values.values)

    # Left axis: raw mean scores plus their linear trend.
    score_ax.scatter(x_values, mean_scores, label='mean_test_score', c='b')
    mean_slope, mean_intercept = np.polyfit(x_list, list(mean_scores.values), 1)
    score_ax.plot(x_values, mean_slope * x_values + mean_intercept, c='b')

    # Right axis: linear trend of the score standard deviation only.
    std_slope, std_intercept = np.polyfit(x_list, list(std_scores.values), 1)
    std_ax.plot(x_values, std_slope * x_values + std_intercept, c='r', label='std_test_score')

    score_ax.set_title(title)
    score_ax.set_xlabel('Parameter Value')
    score_ax.set_ylabel('Mean Test Score')
    std_ax.set_ylabel('Standard Deviation of Test Score')
    score_ax.grid(True)

    # One legend holding the labelled artists of both axes.
    score_handles, score_labels = score_ax.get_legend_handles_labels()
    std_handles, std_labels = std_ax.get_legend_handles_labels()
    std_ax.legend(score_handles + std_handles, score_labels + std_labels, loc='upper right')

    fig.savefig(os.path.join(results_path, f"{title}_test_score.png"))

    plt.show()



In [None]:
# Draw and save one score-vs-parameter figure for each listed column of cv_results.
for param in ['param_colsample_bylevel', 'param_colsample_bynode', 'param_colsample_bytree',
                     'param_gamma', 'param_learning_rate', 'param_max_depth', 'param_n_estimators', 
                     'param_reg_alpha', 'param_reg_lambda', 'param_subsample']:
    x_values = cv_results[param]
    plot_parameters(x_values, param)

# Plotting Evaluation Metrics (Precision, Recall, F1-Score, AUC-ROC):


In [None]:

# One-vs-rest precision-recall and ROC curves on the held-out test set.
#
# These curves need continuous scores. Binarized hard predictions reduce every
# curve to a single operating point and understate the AUC. Binarizing y_test
# and the predictions with separately derived class lists can also misalign or
# drop columns. The predicted class probabilities of the model are used instead.
classes = np.asarray(model.classes_)
y_score = model.predict_proba(X_test)  # one column per entry of `classes`

# Indicator matrix with exactly one column per model class. It is built by
# comparison instead of label_binarize, which returns one column for two classes.
y_true_bin = (np.asarray(y_test).reshape(-1, 1) == classes.reshape(1, -1)).astype(int)

# NOTE: roc_auc_score needs both positives and negatives of every class in y_test.
auc_roc = roc_auc_score(y_true_bin, y_score, average='macro')
print(f"Macro-averaged one-vs-rest ROC AUC = {auc_roc:.3f}")

# Plot Precision-Recall curve for each class
precision = dict()
recall = dict()

fig, ax = plt.subplots(figsize=(16, 6))
for i in range(len(classes)):  # derived from the model instead of a hard-coded 7
    precision[i], recall[i], _ = precision_recall_curve(y_true_bin[:, i], y_score[:, i])
    # +1 assumes the labels are 0-based encodings of cover types 1..n, as in the
    # confusion-matrix and Kaggle cells of this notebook.
    ax.plot(recall[i], precision[i], label='Covertype {}'.format(classes[i] + 1))

ax.grid()
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend(loc='best')

filename = "precision_recall.png"
fig.savefig(os.path.join(results_path, filename))

plt.show()


# Plot ROC curve for each class
fpr = dict()
tpr = dict()

fig, ax = plt.subplots(figsize=(16, 6))
for i in range(len(classes)):
    fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_score[:, i])
    ax.plot(fpr[i], tpr[i], label='Covertype {}'.format(classes[i] + 1))

ax.grid()
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='best')

filename = "roc_curve.png"
fig.savefig(os.path.join(results_path, filename))

plt.show()


# Partial Dependence

In [None]:
# potentially iterate over features (and relation ie 0 to 1)

In [None]:

# One-way partial dependence of the model output on a single feature.
#
# The previous version computed the dependence for feature index 1 but built
# the display by hand with metadata for feature 0, placeholder feature names, a
# made-up "deciles" grid and hard-coded y-limits, so the figure was mislabelled.
# It also rebound the name `display`, shadowing IPython's display().
# from_estimator computes and draws everything from the same inputs.
pd_feature = 1  # positional index into X's columns; change it to inspect another feature

pdp_display = PartialDependenceDisplay.from_estimator(
    model,
    X,
    features=[pd_feature],
    target=model.classes_[0],  # class to explain; required for multiclass, ignored for binary models
    kind="average",
    grid_resolution=5,
)

# The x-axis keeps the label from_estimator assigns to the feature.
pdp_ax = pdp_display.axes_.ravel()[0]
pdp_ax.grid(True)
pdp_ax.set_ylabel('Partial Dependence')
pdp_ax.set_title('Partial Dependence')

filename = "partial_dependence.png"
pdp_display.figure_.savefig(os.path.join(results_path, filename))

plt.show()


# Confusion Matrix

In [None]:

class_names = np.unique(y)
np.set_printoptions(precision=2)

# (title, normalize) pairs: one matrix with the default normalize=None and one
# with normalize="true". Each figure is saved under a name derived from its title.
titles_options = [
    ("Confusion matrix without normalization", None),
    ("Normalized confusion matrix", "true"),
]
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        model, X_test, y_test,
        display_labels=class_names + 1,
        cmap=plt.cm.Blues,
        normalize=normalize,
    )
    disp.ax_.set_title(title)

    filename = "{}.png".format(title.lower().replace(" ", "_"))
    disp.figure_.savefig(os.path.join(results_path, filename))

plt.show()

# SHAP Values

In [None]:
# Build a SHAP TreeExplainer for the fitted model and compute SHAP values for
# the test rows, with SHAP's additivity check turned off.
explainer = shap.TreeExplainer(model)
explanation = explainer.shap_values(X_test, check_additivity=False)


In [None]:
# Bar-type SHAP summary. show=False is passed so the current figure can be
# saved to results_path and then closed.
shap.summary_plot(explanation, X_test, plot_type="bar", show=False)

filename = "shap_summary.png"
shap_figure = plt.gcf()
shap_figure.savefig(os.path.join(results_path, filename))
plt.close()

SHAP values show how each feature affects each final prediction, the significance of each feature compared to others, and the model's reliance on the interaction between features.


In [None]:
# IF WE SCALE: SCALE ONLY TRAIN DATA SET
# BOOSTING ALG:

# - Feature Selection: AUTOMATIC (NON CRITICAL)
# - Scaling: AUTOMATIC
# - Handling Missing Values: AUTOMATIC
# - Outliers: NEED TO DEAL WITH

# KAGGLE Prediction

In [None]:
# Load the engineered test set used for the Kaggle submission below.
test_processed = pd.read_csv('Data/test_engineered.csv')

In [None]:
# Preview the first rows of the loaded test set.
test_processed.head()

In [None]:
# Remove every column whose name starts with 'Cover_Type_', drop 'Aspect_Sector'
# from the model input, then predict.
is_cover_type_col = test_processed.columns.str.startswith('Cover_Type_')
test_processed = test_processed.loc[:, ~is_cover_type_col]
X_kaggle = test_processed.drop('Aspect_Sector', axis=1)
# +1 presumably maps 0-based encoded labels back to cover types starting at 1.
y_kaggle = model.predict(X_kaggle) + 1

In [None]:
# Preview Data/Kaggle/full_submission.csv (presumably the reference submission layout).
pd.read_csv("Data/Kaggle/full_submission.csv").head()

In [None]:
# Attach the predicted cover types to the test frame as a new column.
test_processed['Cover_Type'] = y_kaggle

In [None]:
# Keep only the two columns that are written to the submission file.
kaggle_submission = test_processed.loc[:, ['Id', 'Cover_Type']]

In [None]:
# Count how often each cover type was predicted.
kaggle_submission.Cover_Type.value_counts()

In [None]:
# Write the submission CSV, named after the run timestamp, without the index column.
kaggle_submission.to_csv(f'Data/kaggle_submission_{timestamp}.csv', index=False)

In [None]:
# Read the written file back to inspect its contents.
pd.read_csv(f'Data/kaggle_submission_{timestamp}.csv')

In [None]:
# Display the timestamp used in this run's output paths.
timestamp