# Hyperparameter Optimization Notebook
* We will use Scikit-Optimize; refer to the docs [here](https://scikit-optimize.github.io/stable/)

## Variables

In [116]:
# dataset info
# NOTE: any dataset_format other than 'csv' is read with pd.read_excel in tune_models
dataset_name = 'Bank_Personal_Loan_Modelling_transformed.xlsx'
dataset_path = '../dataset/' + dataset_name
dataset_format = 'xlsx'

# target column for dataset
target_col = 'Personal Loan'

# directory the saved models are loaded from (model name is appended to this path)
model_store_location = '../store/model/'

# model names; each name must also be a key of params_grid_list below
model_names = ['LGBMClassifier', 'XGBClassifier', 'GradientBoostingClassifier']

## define the parameter search grid for all selected models here
## note: grid search needs the param grid as a dict, scikit-optimize needs a list
## uncomment the dict below (and its other parameters) before an actual grid-search run
#param_grid_search = {
#     'n_estimators': range(20, 400, 40),
#     'max_depth': range (1, 5, 1),
#     'min_child_weight': range(1, 4, 1),
#     'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
#     'subsample': [0.5, 0.6, 0.7, 0.8, 0.9,1],
#     'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.3, 0.5]  
#}
# scikit-optimize search space per model: a list of named dimensions.
# `Integer` comes from skopt.space, which is imported in the
# "IMPORTING LIBRARIES" cell, so that cell must be run before this one.
params_grid_list = {
    'LGBMClassifier': [
        Integer(20, 400, name='n_estimators'),
        Integer(1, 5, name='max_depth')
    ],
    'XGBClassifier': [
        Integer(20, 400, name='n_estimators'),
        Integer(1, 5, name='max_depth')
    ],
    'GradientBoostingClassifier': [
        Integer(20, 400, name='n_estimators'),
        Integer(1, 5, name='max_depth')
    ]
}


# which cv technique do you want to use? (these are the keys of cv_dispatcher)
    # 'kfold'
    # 'repeatedkfold'
    # 'leaveoneout'
    # 'leavepout'
    # 'stratifiedkfold'
    # 'groupkfold'
    # 'grouplogo'
    # 'timeserieskfold'

# choose from above
cv_technique = 'kfold'
# which metric to optimize? (passed as the `scoring` argument of cross_validate)
metric = 'roc_auc'
# maximize or minimize the metric? when True the objective returns the negated score,
# because scikit-optimize always minimizes
maximize_metric = True

# choose one search technique from below (these are the keys of search_method_dispatcher)
    # grid_search
    # random_search
    # bayesian_gp
    # bayesian_forestgp
    # bayesian_gbrtgp

# todo: as of now we can't use grid search, as it expects the params grid as a dictionary
# whereas scikit-optimize expects it as a list, hence the grid above is in list form.
# need to handle this internally so that we can define the params grid consistently
search_technique = 'random_search'

### Steps for Hyperparameter Optimization
1. Load pycaret's top 3 models saved in the modelling notebook
2. Define Hyperparameter Space
3. Choose performance metrics to minimize/maximize
4. Choose Cross Validation Technique:
    1. K-Fold Cross Validation
    2. Leave one out
    3. Leave p-out
    4. Repeated K-Fold Cross Validation
    5. Stratified K-Fold Cross Validation
    6. Group Cross Validation
    7. Nested Cross Validation
5. Choose hyperparameter search process:
    1. Manual approach
    2. Automated approach
       * Parallel 
        1. Grid Search 
        2. Random Search
       * Sequential
        1. Bayesian Search

### INSTALL REQUIRED DEPENDENCIES

In [85]:
# !pip install hyperopt==0.2.5
# !pip install scikit-optimize==0.8.1

### IMPORTING LIBRARIES

In [86]:
import pandas as pd
from pycaret.classification import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    KFold,
    RepeatedKFold,
    LeaveOneOut,
    LeavePOut,
    StratifiedKFold,
    GroupKFold,
    LeaveOneGroupOut,
    TimeSeriesSplit,
    cross_validate,
    cross_val_score,
    train_test_split,
    GridSearchCV
)

# skopt search function
from skopt import (
    dummy_minimize, # for the randomized search
    gp_minimize,
    forest_minimize,
    gbrt_minimize
)

# for the analysis
from skopt.plots import (
    plot_convergence,
    plot_evaluations,
)

from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args

In [87]:
## to-do: note that we should be here with less features, target around 15-20 features
## to-do: create research diagram for classification, regression

### CROSS VALIDATION TECHNIQUES

In [88]:
def perform_kfold_cv(model, x_train, y_train, n_splits = 5, metric = 'accuracy'):
    """Cross-validate `model` with shuffled K-Fold splits.

    Args:
        model: estimator handed to sklearn's `cross_validate`.
        x_train: training features.
        y_train: training target.
        n_splits: number of folds.
        metric: value passed as `scoring` to `cross_validate`.

    Returns:
        The result of `cross_validate`, including train scores.
    """
    # fixed seed so the shuffled folds are reproducible between runs
    splitter = KFold(random_state=4, shuffle=True, n_splits=n_splits)

    scores = cross_validate(
        model,
        x_train,
        y_train,
        cv=splitter,
        scoring=metric,
        return_train_score=True,
    )
    return scores

In [89]:
def perform_repeatedkfold_cv(model, x_train, y_train, n_splits = 5, n_repeats=10,  metric = 'accuracy'):
    """Cross-validate `model` with Repeated K-Fold splits.

    Args:
        model: estimator handed to sklearn's `cross_validate`.
        x_train: training features.
        y_train: training target.
        n_splits: number of folds per repetition.
        n_repeats: number of times the K-Fold procedure is repeated.
        metric: value passed as `scoring` to `cross_validate`.

    Returns:
        The result of `cross_validate`, including train scores.
    """
    # fixed seed so the repeated splits are reproducible between runs
    splitter = RepeatedKFold(n_repeats=n_repeats, n_splits=n_splits, random_state=4)

    scores = cross_validate(
        model,
        x_train,
        y_train,
        cv=splitter,
        scoring=metric,
        return_train_score=True,
    )
    return scores

In [90]:
def perform_leaveoneout_cv(model, x_train, y_train, metric = 'accuracy'):
    """Cross-validate `model` with Leave-One-Out splits.

    Args:
        model: estimator handed to sklearn's `cross_validate`.
        x_train: training features.
        y_train: training target.
        metric: value passed as `scoring` to `cross_validate`.

    Returns:
        The result of `cross_validate`, including train scores.
    """
    scores = cross_validate(
        model,
        x_train,
        y_train,
        cv=LeaveOneOut(),
        scoring=metric,
        return_train_score=True,
    )
    return scores

In [91]:
def perform_leavepout_cv(model, x_train, y_train, p=2,  metric = 'accuracy'):
    """Cross-validate `model` with Leave-P-Out splits.

    Args:
        model: estimator handed to sklearn's `cross_validate`.
        x_train: training features.
        y_train: training target.
        p: size of each held-out test set, passed to `LeavePOut`.
        metric: value passed as `scoring` to `cross_validate`.

    Returns:
        The result of `cross_validate`, including train scores.
    """
    splitter = LeavePOut(p=p)

    scores = cross_validate(
        model,
        x_train,
        y_train,
        cv=splitter,
        scoring=metric,
        return_train_score=True,
    )
    return scores

In [92]:
def perform_stratifiedkfold_cv(model, x_train, y_train, n_splits = 5, n_repeats=10,  metric = 'accuracy'):
    """Cross-validate `model` with shuffled Stratified K-Fold splits.

    Args:
        model: estimator handed to sklearn's `cross_validate`.
        x_train: training features.
        y_train: training target.
        n_splits: number of folds.
        n_repeats: accepted but not used by this function.
        metric: value passed as `scoring` to `cross_validate`.

    Returns:
        The result of `cross_validate`, including train scores.
    """
    # fixed seed so the shuffled folds are reproducible between runs
    splitter = StratifiedKFold(random_state=4, shuffle=True, n_splits=n_splits)

    scores = cross_validate(
        model,
        x_train,
        y_train,
        cv=splitter,
        scoring=metric,
        return_train_score=True,
    )
    return scores

#### Group cross validation technique

In [93]:
def perform_group_kfold_cv(model, x_train, y_train, group_col, n_splits = 5, metric = 'accuracy'):
    """Cross-validate `model` with Group K-Fold splits.

    The groups are read from the `group_col` column of `x_train`.

    Args:
        model: estimator handed to sklearn's `cross_validate`.
        x_train: training features; must contain the column `group_col`.
        y_train: training target.
        group_col: name of the column in `x_train` holding the group labels.
        n_splits: number of folds.
        metric: value passed as `scoring` to `cross_validate`.

    Returns:
        The result of `cross_validate`, including train scores.
    """
    # honour the caller's n_splits (this used to be hard-coded to 5)
    gkf = GroupKFold(n_splits=n_splits)

    # NOTE(review): the splits are generated on x_train without group_col, but the
    # model is still fitted on the full x_train (group column included) - confirm
    # whether the group column should be dropped from the features as well.
    splits = gkf.split(x_train.drop(group_col, axis=1), y_train, groups=x_train[group_col])

    # estimate generalization error
    return cross_validate(
        model,
        x_train,
        y_train,
        scoring=metric,
        return_train_score=True,
        cv=splits
    )

In [94]:
def perform_group_logo_cv(model, x_train, y_train, group_col, metric = 'accuracy'):
    """Cross-validate `model` with Leave-One-Group-Out splits.

    The groups are read from the `group_col` column of `x_train`.

    Args:
        model: estimator handed to sklearn's `cross_validate`.
        x_train: training features; must contain the column `group_col`.
        y_train: training target.
        group_col: name of the column in `x_train` holding the group labels.
        metric: value passed as `scoring` to `cross_validate`.

    Returns:
        The result of `cross_validate`, including train scores.
    """
    features_without_group = x_train.drop(group_col, axis=1)
    group_labels = x_train[group_col]
    splits = LeaveOneGroupOut().split(features_without_group, y_train, groups=group_labels)

    # the model itself is still evaluated on the full x_train
    scores = cross_validate(
        model,
        x_train,
        y_train,
        cv=splits,
        scoring=metric,
        return_train_score=True,
    )
    return scores

#### Time Series shuffle split

In [95]:
def perform_timeseries_kfold_cv(model, x_train, y_train, n_splits=5, gap=0, metric = 'accuracy'):
    """Cross-validate `model` with `TimeSeriesSplit` splits.

    Args:
        model: estimator handed to sklearn's `cross_validate`.
        x_train: training features.
        y_train: training target.
        n_splits: number of splits, passed to `TimeSeriesSplit`.
        gap: `gap` argument passed to `TimeSeriesSplit`.
        metric: value passed as `scoring` to `cross_validate`.

    Returns:
        The result of `cross_validate`, including train scores.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits, gap=gap)

    scores = cross_validate(
        model,
        x_train,
        y_train,
        cv=splitter,
        scoring=metric,
        return_train_score=True,
    )
    return scores

#### Create a dispatcher for cross validation methods

In [96]:
# Maps a `cv_technique` name to the function that runs that cross-validation.
cv_dispatcher = {
    # plain splitters
    'kfold':           perform_kfold_cv,
    'repeatedkfold':   perform_repeatedkfold_cv,
    'leaveoneout':     perform_leaveoneout_cv,
    'leavepout':       perform_leavepout_cv,
    'stratifiedkfold': perform_stratifiedkfold_cv,
    # group-aware splitters (these additionally require a `group_col` argument)
    'groupkfold':      perform_group_kfold_cv,
    'grouplogo':       perform_group_logo_cv,
    # time-ordered splitter
    'timeserieskfold': perform_timeseries_kfold_cv,
}

In [97]:
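## uncomment the code below to see the list of supported metrics in sklearn
# import sklearn
# sorted(sklearn.metrics.SCORERS.keys())
## on newer scikit-learn versions where SCORERS is no longer available, use:
# sorted(sklearn.metrics.get_scorer_names())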

In [98]:
def evaluate_models_using_cv(models, x_train, y_train, metric, cv_technique):
    """Cross-validate every model and print its mean train/test score.

    Args:
        models: dict mapping a model name to an estimator.
        x_train: training features.
        y_train: training target.
        metric: scoring metric forwarded to the cross-validation function.
        cv_technique: key of `cv_dispatcher` selecting the cross-validation function.

    Returns:
        None; the scores are only printed.
    """
    for model_name, model in models.items():
        score = cv_dispatcher[cv_technique](model, x_train, y_train, metric=metric)
        print("{} evaluation completed!".format(model_name))
        # label the scores with the metric actually used instead of a fixed "accuracy"
        for split in ('train', 'test'):
            values = score['{0}_score'.format(split)]
            print('mean {0} set {1}: '.format(split, metric), np.mean(values), ' +- ', np.std(values))
        print('*'*100, end='\n\n')

## Search Techniques

In [100]:
def perform_gridsearch(model,x_train, y_train, 
                          param_grid = None, metric=metric, cv_technique = 'kfold'):
    """Run an exhaustive grid search with 5-fold cross-validation.

    Args:
        model: estimator to tune.
        x_train: training features.
        y_train: training target.
        param_grid: parameter grid in the form `GridSearchCV` accepts. When
            omitted, a module-level `param_grid_search` is used if one is defined.
        metric: value passed as `scoring` to `GridSearchCV`; defaults to the
            module-level `metric` at the time this function is defined.
        cv_technique: currently ignored; only K-Fold is supported (see to-do below).

    Returns:
        Tuple `(search, best_params, results)`: the fitted `GridSearchCV`, its
        `best_params_`, and a DataFrame with the columns `params`,
        `mean_test_score` and `std_test_score`.

    Raises:
        ValueError: if no `param_grid` is given and `param_grid_search` is not defined.
    """
    # Resolve the default lazily: `param_grid_search` is commented out in the
    # variables cell, so naming it in the signature made this `def` itself
    # fail with a NameError.
    if param_grid is None:
        param_grid = globals().get('param_grid_search')
    if param_grid is None:
        raise ValueError(
            'param_grid is required: pass it explicitly or define param_grid_search'
        )

    # to-do: add capability to take all cv techniques
    # as of now gridsearch will support only kfold
    kf = KFold(n_splits=5, shuffle=True, random_state=4)
    
    # grid search, i.e search all combinations
    clf =  GridSearchCV(
        model,
        param_grid,
        scoring=metric,
        cv=kf, # k-fold
        refit=True, # refits best model to entire dataset
    )

    search = clf.fit(x_train, y_train)
    results = pd.DataFrame(search.cv_results_)[['params', 'mean_test_score', 'std_test_score']]
    best_params = search.best_params_
    print(best_params)
    return (search, best_params, results)

In [101]:
def perform_randomsearch(objective, param_grid, n_calls=50):
    """Run a randomized search with scikit-optimize's `dummy_minimize`.

    Args:
        objective: function to minimize.
        param_grid: list of named skopt dimensions (the hyperparameter space).
        n_calls: number of evaluations of `objective`.

    Returns:
        Dict with `best_score` (the `fun` attribute of the search result) and
        `best_params` (dimension name -> value taken from the result's `x`).
    """
    outcome = dummy_minimize(
        objective,
        param_grid,
        random_state=0,
        n_calls=n_calls,
    )

    # pair every dimension with the value found at the same position
    tuned = {dim.name: value for dim, value in zip(param_grid, outcome.x)}
    return {'best_score': outcome.fun, 'best_params': tuned}

In [102]:
def perform_bayesian_gp(objective, param_grid, n_calls=30):
    """Run a sequential search with scikit-optimize's `gp_minimize`.

    Uses 10 initial points and the 'EI' acquisition function.

    Args:
        objective: function to minimize.
        param_grid: list of named skopt dimensions (the hyperparameter space).
        n_calls: number of evaluations of `objective`.

    Returns:
        Dict with `best_score` (the `fun` attribute of the search result) and
        `best_params` (dimension name -> value taken from the result's `x`).
    """
    outcome = gp_minimize(
        objective,
        param_grid,
        n_calls=n_calls,
        n_initial_points=10,  # evaluations made before the surrogate is used
        acq_func='EI',
        random_state=0,
    )

    # pair every dimension with the value found at the same position
    tuned = {dim.name: value for dim, value in zip(param_grid, outcome.x)}
    return {'best_score': outcome.fun, 'best_params': tuned}

In [103]:
def perform_bayesian_forestgp(objective, param_grid, n_calls=50):
    """Run a sequential search with scikit-optimize's `forest_minimize`.

    Uses the 'RF' base estimator as surrogate, 10 initial points, the 'EI'
    acquisition function and 4 parallel jobs.

    Args:
        objective: function to minimize.
        param_grid: list of named skopt dimensions (the hyperparameter space).
        n_calls: number of evaluations of `objective`.

    Returns:
        Dict with `best_score` (the `fun` attribute of the search result) and
        `best_params` (dimension name -> value taken from the result's `x`).
    """
    outcome = forest_minimize(
        objective,
        param_grid,
        base_estimator='RF',
        n_calls=n_calls,
        n_initial_points=10,  # evaluations made before the surrogate is used
        acq_func='EI',
        random_state=0,
        n_jobs=4,  # should be equal to number of cores for parallelization
    )

    # pair every dimension with the value found at the same position
    tuned = {dim.name: value for dim, value in zip(param_grid, outcome.x)}
    return {'best_score': outcome.fun, 'best_params': tuned}

In [104]:
def perform_bayesian_gbrtgp(objective, param_grid, n_calls=50):
    """Run a sequential search with scikit-optimize's `gbrt_minimize`.

    Uses 10 initial points, the 'EI' acquisition function and 4 parallel jobs.

    Args:
        objective: function to minimize.
        param_grid: list of named skopt dimensions (the hyperparameter space).
        n_calls: number of evaluations of `objective`.

    Returns:
        Dict with `best_score` (the `fun` attribute of the search result) and
        `best_params` (dimension name -> value taken from the result's `x`).
    """
    outcome = gbrt_minimize(
        objective,
        param_grid,
        n_calls=n_calls,
        n_initial_points=10,  # evaluations made before the surrogate is used
        acq_func='EI',
        random_state=0,
        n_jobs=4,  # should be equal to number of cores for parallelization
    )

    # pair every dimension with the value found at the same position
    tuned = {dim.name: value for dim, value in zip(param_grid, outcome.x)}
    return {'best_score': outcome.fun, 'best_params': tuned}

In [123]:
# Maps a `search_technique` name to the function that runs that search.
search_method_dispatcher = {
    # exhaustive search (takes a model and data rather than an objective)
    'grid_search':       perform_gridsearch,
    # scikit-optimize based searches: called as f(objective, param_grid)
    'random_search':     perform_randomsearch,
    'bayesian_gp':       perform_bayesian_gp,
    'bayesian_forestgp': perform_bayesian_forestgp,
    'bayesian_gbrtgp':   perform_bayesian_gbrtgp,
}

In [129]:
def _skopt_space_to_grid(param_grid):
    """Turn a list of named skopt dimensions into a dict grid for GridSearchCV."""
    # a dict (or a list of dicts) is already in the form GridSearchCV accepts
    if isinstance(param_grid, dict) or all(isinstance(dim, dict) for dim in param_grid):
        return param_grid

    grid = {}
    for dim in param_grid:
        if isinstance(dim, Integer):
            # skopt Integer bounds are inclusive on both ends
            grid[dim.name] = list(range(int(dim.low), int(dim.high) + 1))
        elif isinstance(dim, Categorical):
            grid[dim.name] = list(dim.categories)
        else:
            raise ValueError(
                'cannot enumerate dimension {0!r} for grid search; '
                'only Integer and Categorical dimensions are supported'.format(dim)
            )
    return grid


def run_search(model, x_train, y_train, x_test, y_test, param_grid, cv_technique, search_technique, metric, maximize_metric):
    """Search the hyperparameter space of `model` with the chosen technique.

    Args:
        model: estimator to tune. For the scikit-optimize techniques it is
            modified in place through `set_params` on every evaluation.
        x_train: training features.
        y_train: training target.
        x_test: currently unused.
        y_test: currently unused.
        param_grid: list of named skopt dimensions. For 'grid_search' a dict
            grid is also accepted.
        cv_technique: key of `cv_dispatcher`. `perform_gridsearch` currently
            always uses K-Fold regardless of this value.
        search_technique: key of `search_method_dispatcher`.
        metric: scoring metric used during cross-validation.
        maximize_metric: True if a larger score is better.

    Returns:
        Dict with `best_params` and `best_score`. `best_score` is the value of
        the minimized objective, i.e. the negated mean test score when
        `maximize_metric` is True.

    Raises:
        ValueError: for 'grid_search' when a dimension cannot be enumerated.
    """
    if search_technique == 'grid_search':
        print('search technique: grid_search, cv technique: {0}'.format(cv_technique))
        # The previous call used an undefined name and passed the arguments in
        # the wrong positions; pass everything by keyword instead.
        _, _, results = perform_gridsearch(
            model,
            x_train,
            y_train,
            param_grid=_skopt_space_to_grid(param_grid),
            metric=metric,
            cv_technique=cv_technique,
        )
        # Pick the winner from the per-candidate scores so the direction of the
        # optimization and the sign of the score match the other techniques.
        scores = results['mean_test_score']
        best_idx = scores.idxmax() if maximize_metric else scores.idxmin()
        best_value = scores.loc[best_idx]
        return {
            'best_score': -best_value if maximize_metric else best_value,
            'best_params': results.loc[best_idx, 'params'],
        }

    # objective function, takes the sampled parameters and returns the optimization score
    @use_named_args(param_grid)
    def objective(**params):
        # model with new parameters
        model.set_params(**params)
        score = cv_dispatcher[cv_technique](model, x_train, y_train, metric=metric)
        value = np.mean(score['test_score'])

        # negate because we need to maximize and scikit-optimize by default minimizes
        return -value if maximize_metric else value

    print('search technique: {0}, cv technique: {1}'.format(search_technique, cv_technique))
    return search_method_dispatcher[search_technique](objective, param_grid)

In [135]:
def tune_models(dataset_path, dataset_format, target_col, test_size, model_store_location, model_names, params_grid_list, 
               cv_technique, search_technique, metric, maximize_metric):
    """Load the dataset and the stored models, then run the hyperparameter search for each model.

    Args:
        dataset_path: path of the dataset file.
        dataset_format: 'csv' reads with `pd.read_csv`; anything else reads
            with `pd.read_excel` using the first column as index.
        target_col: name of the target column.
        test_size: `test_size` passed to `train_test_split`.
        model_store_location: path prefix the model name is appended to for `load_model`.
        model_names: names of the stored models to load.
        params_grid_list: dict mapping a model name to its search space.
        cv_technique: key of `cv_dispatcher`.
        search_technique: key of `search_method_dispatcher`.
        metric: scoring metric used during cross-validation.
        maximize_metric: True if a larger score is better.

    Returns:
        The `tuned_models` dict. It is currently always empty, because the
        step that would build tuned models is still commented out; the best
        parameters and scores are only printed.
    """
    # load dataset
    if dataset_format == 'csv':
        df = pd.read_csv(dataset_path)
    else:
        df = pd.read_excel(dataset_path, index_col=0)
    n_rows, n_cols = df.shape[0], df.shape[1]
    print('Dataset loaded!, size:({0}, {1})'.format(str(n_rows), str(n_cols)), end='\n\n')

    # train - test split
    # specify high test size for faster experimentation runs
    features = df.drop(columns=[target_col])
    target = df[target_col]
    x_train, x_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=0,
    )
    train_rows, train_cols = x_train.shape[0], x_train.shape[1]
    print('Split done!, train set size:({0}, {1})'.format(str(train_rows), str(train_cols)), end='\n\n')

    # loading models from model store
    models = {}
    for name in model_names:
        loaded = load_model(model_store_location+name)
        # pycaret seems to store the model in the last index, review this later - todo
        last_index = len(loaded)-1
        models[name] = loaded[last_index]
        print('{0} loaded!'.format(name))
    print('all models loaded!', end='\n\n')

    # run hyperparameter search
    tuned_models = {}
    for model_name, model in models.items():
        search_space = params_grid_list[model_name]
        print('hyperparmeter search started for {0}!'.format(model_name), end=' ... ')
        res = run_search(
            model, x_train, y_train, x_test, y_test, search_space,
            cv_technique, search_technique, metric, maximize_metric,
        )
        print('search finished!')
        print('best hyperparmeters:{0}'.format(res['best_params']))
        print('best score:{0}'.format(res['best_score']), end='\n\n')
        # tune model using best params
        # tuned_model = tune_model(best_params)
        # tuned_models[model_name] = tuned_model

    return tuned_models

In [136]:
tune_models(dataset_path, dataset_format, target_col, 0.9, model_store_location, model_names, params_grid_list, 
               cv_technique, search_technique, metric, maximize_metric)

Dataset loaded!, size:(5000, 13)

Split done!, train set size:(500, 12)

Transformation Pipeline and Model Successfully Loaded
LGBMClassifier loaded!
Transformation Pipeline and Model Successfully Loaded
XGBClassifier loaded!
Transformation Pipeline and Model Successfully Loaded
GradientBoostingClassifier loaded!
all models loaded!

hyperparmeter search started for LGBMClassifier! ... search technique: random_search, cv technique: kfold
search finished!
best hyperparmeters:{'n_estimators': 51, 'max_depth': 3}
best score:-0.9925402854946739
hyperparmeter search started for XGBClassifier! ... search technique: random_search, cv technique: kfold
search finished!
best hyperparmeters:{'n_estimators': 357, 'max_depth': 2}
best score:-0.9899928835125035
hyperparmeter search started for GradientBoostingClassifier! ... search technique: random_search, cv technique: kfold
search finished!
best hyperparmeters:{'n_estimators': 285, 'max_depth': 2}
best score:-0.9929363932308171


{}