# Hyperparam tuning

Code [here](https://www.kaggle.com/code/bigironsphere/parameter-tuning-in-one-function-with-hyperopt/notebook)

In [1]:
#import required packages
import lightgbm as lgb
import xgboost as xgb
import gc
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from hyperopt import hp, tpe, Trials, STATUS_OK
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# #optional but advised
import warnings
warnings.filterwarnings('ignore')

In [2]:
# import some test data
#GLOBAL HYPEROPT PARAMETERS
NUM_EVALS = 1000 #number of hyperopt evaluation rounds
N_FOLDS = 5 #number of cross-validation folds on data in each evaluation round

#LIGHTGBM PARAMETERS
LGBM_MAX_LEAVES = 2**8 #maximum number of leaves per tree for LightGBM
LGBM_MAX_DEPTH = 25 #maximum tree depth for LightGBM
EVAL_METRIC_LGBM_REG = 'mae' #LightGBM regression metric. Note that 'rmse' is more commonly used 
EVAL_METRIC_LGBM_CLASS = 'auc'#LightGBM classification metric

#XGBOOST PARAMETERS
XGB_MAX_LEAVES = 2**10 #maximum number of leaves when using histogram splitting
XGB_MAX_DEPTH = 25 #maximum tree depth for XGBoost
EVAL_METRIC_XGB_REG = 'mae' #XGBoost regression metric
EVAL_METRIC_XGB_CLASS = 'auc' #XGBoost classification metric

#OPTIONAL OUTPUT
BEST_SCORE = 0

In [13]:

def quick_hyperopt(data, labels, package='lgbm', num_evals=NUM_EVALS, diagnostic=False):
    
    #==========
    #LightGBM
    #==========
    
    if package=='lgbm':
        
        print('Running {} rounds of LightGBM parameter optimisation:'.format(num_evals))
        #clear space
        gc.collect()
        
        integer_params = ['max_depth',
                         'num_leaves',
                         'max_bin',
                         'min_data_in_leaf',
                         'min_data_in_bin']
        

        
        def objective(space_params):
            
            #cast integer params from float to int
            for param in integer_params:
                space_params[param] = int(space_params[param])
            
            #extract nested conditional parameters
            if space_params['boosting']['boosting'] == 'goss':
                top_rate = space_params['boosting'].get('top_rate')
                other_rate = space_params['boosting'].get('other_rate')
                #0 <= top_rate + other_rate <= 1
                top_rate = max(top_rate, 0)
                top_rate = min(top_rate, 0.5)
                other_rate = max(other_rate, 0)
                other_rate = min(other_rate, 0.5)
                space_params['top_rate'] = top_rate
                space_params['other_rate'] = other_rate
            
            subsample = space_params['boosting'].get('subsample', 1.0)
            space_params['boosting'] = space_params['boosting']['boosting']
            space_params['subsample'] = subsample
            space_params['verbose'] = -1
            
            
            #for classification, set stratified=True and metrics=EVAL_METRIC_LGBM_CLASS
#             cv_results = lgb.cv(space_params, train, nfold = N_FOLDS, stratified=False,
#                                 early_stopping_rounds=100, metrics=EVAL_METRIC_LGBM_REG, seed=42)

            train = lgb.Dataset(data, labels)


            #for reg, set stratified=False and metrics=EVAL_METRIC_LGBM_REG
            cv_results = lgb.cv(space_params, train, nfold = N_FOLDS, stratified=True,
                                 early_stopping_rounds=100, metrics=EVAL_METRIC_LGBM_CLASS, seed=42)
            
            #best_loss = cv_results['l1-mean'][-1] #'l2-mean' for rmse
            #for classification, comment out the line above and uncomment the line below:
            best_loss = 1 - cv_results['auc-mean'][-1]
            #if necessary, replace 'auc-mean' with '[your-preferred-metric]-mean'
            return{'loss':best_loss, 'status': STATUS_OK }
        
                
        #integer and string parameters, used with hp.choice()
        boosting_list = [{'boosting': 'gbdt',
                          'subsample': hp.uniform('subsample', 0.5, 1)},
                         {'boosting': 'goss',
                          'subsample': 1.0,
                         'top_rate': hp.uniform('top_rate', 0, 0.5),
                         'other_rate': hp.uniform('other_rate', 0, 0.5)}] #if including 'dart', make sure to set 'n_estimators'
        #metric_list = ['MAE', 'RMSE'] 
        #for classification comment out the line above and uncomment the line below
        metric_list = ['auc'] #modify as required for other classification metrics
        objective_list_reg = ['huber', 'gamma', 'fair', 'tweedie']
        objective_list_class = ['binary', 'cross_entropy']
        #for classification set objective_list = objective_list_class
        objective_list = objective_list_class

        space ={'boosting' : hp.choice('boosting', boosting_list),
                'num_leaves' : hp.quniform('num_leaves', 2, LGBM_MAX_LEAVES, 1),
                'max_depth': hp.quniform('max_depth', 2, LGBM_MAX_DEPTH, 1),
                'max_bin': hp.quniform('max_bin', 32, 64, 2),
                'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 128, 1),
                'min_data_in_bin': hp.quniform('min_data_in_bin', 1, 128, 1),
                'min_gain_to_split' : hp.quniform('min_gain_to_split', 0.1, 5, 0.01),
                'lambda_l1' : hp.uniform('lambda_l1', 0, 5),
                'lambda_l2' : hp.uniform('lambda_l2', 0, 5),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'metric' : hp.choice('metric', metric_list),
                'objective' : hp.choice('objective', objective_list),
                'feature_fraction' : hp.quniform('feature_fraction', 0.5, 1, 0.01),
                'bagging_fraction' : hp.quniform('bagging_fraction', 0.5, 1, 0.01)
            }
        
        #optional: activate GPU for LightGBM
        #follow compilation steps here:
        #https://www.kaggle.com/vinhnguyen/gpu-acceleration-for-lightgbm/
        #then uncomment lines below:
        #space['device'] = 'gpu'
        #space['gpu_platform_id'] = 0,
        #space['gpu_device_id'] =  0

        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
                
        #fmin() will return the index of values chosen from the lists/arrays in 'space'
        #to obtain actual values, index values are used to subset the original lists/arrays
        best['boosting'] = boosting_list[best['boosting']]['boosting']#nested dict, index twice
        best['metric'] = metric_list[best['metric']]
        best['objective'] = objective_list[best['objective']]
                
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    #==========
    #XGBoost
    #==========
    
    if package=='xgb':
        
        print('Running {} rounds of XGBoost parameter optimisation:'.format(num_evals))
        #clear space
        gc.collect()
        
        integer_params = ['max_depth']
        
        def objective(space_params):
            
            for param in integer_params:
                space_params[param] = int(space_params[param])
                
            #extract multiple nested tree_method conditional parameters
            #libera te tutemet ex inferis
            if space_params['tree_method']['tree_method'] == 'hist':
                max_bin = space_params['tree_method'].get('max_bin')
                space_params['max_bin'] = int(max_bin)
                if space_params['tree_method']['grow_policy']['grow_policy']['grow_policy'] == 'depthwise':
                    grow_policy = space_params['tree_method'].get('grow_policy').get('grow_policy').get('grow_policy')
                    space_params['grow_policy'] = grow_policy
                    space_params['tree_method'] = 'hist'
                else:
                    max_leaves = space_params['tree_method']['grow_policy']['grow_policy'].get('max_leaves')
                    space_params['grow_policy'] = 'lossguide'
                    space_params['max_leaves'] = int(max_leaves)
                    space_params['tree_method'] = 'hist'
            else:
                space_params['tree_method'] = space_params['tree_method'].get('tree_method')
                
            #for classification replace EVAL_METRIC_XGB_REG with EVAL_METRIC_XGB_CLASS
            cv_results = xgb.cv(space_params, train, nfold=N_FOLDS, metrics=[EVAL_METRIC_XGB_CLASS],
                             early_stopping_rounds=100, stratified=False, seed=42)
            
            #best_loss = cv_results['test-mae-mean'].iloc[-1] #or 'test-rmse-mean' if using RMSE
            #for classification, comment out the line above and uncomment the line below:
            best_loss = 1 - cv_results['test-auc-mean'].iloc[-1]
            #if necessary, replace 'test-auc-mean' with 'test-[your-preferred-metric]-mean'
            return{'loss':best_loss, 'status': STATUS_OK }
        
        train = xgb.DMatrix(data, labels)
        
        #integer and string parameters, used with hp.choice()
        boosting_list = ['gbtree', 'gblinear'] #if including 'dart', make sure to set 'n_estimators'
        #metric_list = ['MAE', 'RMSE'] 
        #for classification comment out the line above and uncomment the line below
        metric_list = ['auc']
        #modify as required for other classification metrics classification
        
        tree_method = [#{'tree_method' : 'exact'},
               #{'tree_method' : 'approx'},
               {'tree_method' : 'hist',
                'max_bin': hp.quniform('max_bin', 2**3, 2**7, 1),
                'grow_policy' : {'grow_policy': {'grow_policy':'depthwise'},
                                'grow_policy' : {'grow_policy':'lossguide',
                                                  'max_leaves': hp.quniform('max_leaves', 32, XGB_MAX_LEAVES, 1)}}}]
        
        #if using GPU, replace 'exact' with 'gpu_exact' and 'hist' with
        #'gpu_hist' in the nested dictionary above
        
        objective_list_reg = ['reg:linear', 'reg:gamma', 'reg:tweedie']
        objective_list_class = ['reg:logistic', 'binary:logistic']
        #for classification change line below to 'objective_list = objective_list_class'
        objective_list = objective_list_class
        
        space ={'boosting' : hp.choice('boosting', boosting_list),
                'tree_method' : hp.choice('tree_method', tree_method),
                'max_depth': hp.quniform('max_depth', 2, XGB_MAX_DEPTH, 1),
                'reg_alpha' : hp.uniform('reg_alpha', 0, 5),
                'reg_lambda' : hp.uniform('reg_lambda', 0, 5),
                'min_child_weight' : hp.uniform('min_child_weight', 0, 5),
                'gamma' : hp.uniform('gamma', 0, 5),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'eval_metric' : hp.choice('eval_metric', metric_list),
                'objective' : hp.choice('objective', objective_list),
                'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1, 0.01),
                'colsample_bynode' : hp.quniform('colsample_bynode', 0.1, 1, 0.01),
                'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),
                'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
                'nthread' : -1
            }
        
        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
        
        best['tree_method'] = tree_method[best['tree_method']]['tree_method']
        best['boosting'] = boosting_list[best['boosting']]
        best['eval_metric'] = metric_list[best['eval_metric']]
        best['objective'] = objective_list[best['objective']]
        
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        if 'max_leaves' in best:
            best['max_leaves'] = int(best['max_leaves'])
        if 'max_bin' in best:
            best['max_bin'] = int(best['max_bin'])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    else:
        print('Package not recognised. Please use "lgbm" for LightGBM, "xgb" for XGBoost or "cb" for CatBoost.')       

In [5]:
df = pd.read_csv('heart_2020_cleaned.csv')

# reformat
df['HeartDisease'] = [1 if i == 'Yes' else 0 for i in df.HeartDisease]

categoricals = list(df.columns[df.dtypes == 'object'])

df_cat_num = pd.get_dummies(df[categoricals],drop_first=True)

# fix names
df_cat_num = df_cat_num.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

df_num_only = df.drop(columns = categoricals)

df_all = pd.concat([df_num_only, df_cat_num], axis=1)

data = df_all.iloc[:,1:]
labels = df_all['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(data, labels)

In [None]:
# train lgb model
lgbm_params = quick_hyperopt(X_train, y_train, 'lgbm', 20)

In [24]:
# train xgb model

xgb.set_config(verbosity=0)
xgb_params = quick_hyperopt(X_train, y_train, 'xgb', 400)

Running 400 rounds of XGBoost parameter optimisation:
100%|██████████| 400/400 [34:07<00:00,  5.12s/trial, best loss: 0.1638643999999999] 
{boosting: gblinear
colsample_bylevel: 0.72
colsample_bynode: 0.47000000000000003
colsample_bytree: 0.92
eval_metric: auc
gamma: 4.270674828918584
learning_rate: 0.19540363900814706
max_bin: 86
max_depth: 17
max_leaves: 245
min_child_weight: 2.1891587625492277
objective: reg:logistic
reg_alpha: 0.20145542455429888
reg_lambda: 2.117907617599555
subsample: 1.0
tree_method: hist}


In [3]:
# try an oos evaluation

lgbm_params_default={}

lgbm_params['verbose'] = -1


lgb_train = lgb.Dataset(X_train, y_train)
lgb_model = lgb.train(params=lgbm_params, train_set=lgb_train)

NameError: name 'lgbm_params' is not defined

In [6]:
# default list
lgb_train = lgb.Dataset(X_train, y_train)

lgb_model_default = lgb.train(params=lgbm_params_default, train_set=lgb_train)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 405
[LightGBM] [Info] Number of data points in the train set: 239846, number of used features: 37
[LightGBM] [Info] Start training from score 0.086051


In [15]:
y_test_predict = lgb_model.predict(X_test)
roc_auc_score(y_true=y_test, y_score=y_test_predict)

NameError: name 'lgb_model' is not defined

In [8]:
y_test_predict = lgb_model_default.predict(X_test)
roc_auc_score(y_true=y_test, y_score=y_test_predict)

0.8404841555118137

In [None]:
plt.hist(y_test_predict, bins=50);

In [None]:
xgb_params

In [25]:
# xgb eval with nice params
#xgb_params['objective'] = 'binary:logistic'

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

xgb_model = xgb.train(xgb_params, dtrain)

In [26]:
y_test_predict_xgb = xgb_model.predict(dtest)
roc_auc_score(y_true=y_test, y_score=y_test_predict_xgb)

0.8314291482699596

In [27]:
# use default params
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

xgb_params_default = {}
xgb_params_default['objective'] = 'binary:logistic'

xgb_model = xgb.train(xgb_params_default, dtrain)

y_test_predict_xgb = xgb_model.predict(dtest)

roc_auc_score(y_true=y_test, y_score=y_test_predict_xgb)

0.8330652024874521