# Hyperparameter Optimization Notebook
* We will use Scikit-Optimize; refer to the docs [here](https://scikit-optimize.github.io/stable/)

### Steps for Hyperparameter Optimization
1. Load PyCaret's top 3 models saved in the modelling notebook
2. Define Hyperparameter Space
3. Choose performance metrics to minimize/maximize
4. Choose Cross Validation Technique:
    1. K-Fold Cross Validation
    2. Leave one out
    3. Leave p-out
    4. Repeated K-Fold Cross Validation
    5. Stratified K-Fold Cross Validation
    6. Group Cross Validation
    7. Nested Cross Validation
5. Choose hyperparameter search process:
    1. Manual approach
    2. Automated approach
       * Parallel 
        1. Grid Search 
        2. Random Search
       * Sequential
        1. Bayesian Search

### INSTALL REQUIRED DEPENDENCIES

In [1]:
# !pip install hyperopt==0.2.5
# !pip install scikit-optimize==0.8.1

### IMPORTING LIBRARIES

In [2]:
import pandas as pd
from pycaret.classification import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import (
    KFold,
    RepeatedKFold,
    LeaveOneOut,
    LeavePOut,
    StratifiedKFold,
    GroupKFold,
    LeaveOneGroupOut,
    TimeSeriesSplit,
    cross_validate,
    cross_val_score,
    train_test_split,
    GridSearchCV
)

# skopt search function
from skopt import (
    dummy_minimize, # for the randomized search
    gp_minimize,
    forest_minimize,
    gbrt_minimize
)

# for the analysis
from skopt.plots import (
    plot_convergence,
    plot_evaluations,
)

from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args

### Step 1: LOAD MODELS & DATASET

In [3]:
# Path template for saved models; '{0}' is filled in with the model name.
saved_models_path = './saved_models/{0}'
# Load the saved XGBClassifier artifact (the printed message reports a transformation pipeline plus model).
pipeline_and_model = load_model(saved_models_path.format('XGBClassifier'))

Transformation Pipeline and Model Successfully Loaded


In [4]:
# Load the transformed dataset, using the first spreadsheet column as the index,
# then preview the first rows.
transformed_df = pd.read_excel('../Bank_Personal_Loan_Modelling_transformed.xlsx', index_col=0)
transformed_df.head()

Unnamed: 0_level_0,Age,Experience,Income,ZIP Code,CCAvg,Mortgage,Family_-0.5,Family_0.0,Family_0.5,Family_1.0,Education_-0.5,Education_0.0,Education_0.5,Securities Account_0,CD Account_1,Online_0,CreditCard_1,Personal Loan
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3750,-0.1,-0.05,0.101695,-1.239155,0.462963,0.0,0,0,1,0,1,0,0,1,0,1,0,0
4698,0.2,0.1,0.661017,-0.781238,0.277778,1.653465,0,0,1,0,0,1,0,1,0,0,0,0
1738,-0.05,-0.05,0.101695,-0.384872,-0.722222,2.277228,1,0,0,0,0,1,0,0,0,0,0,0
319,-0.9,-0.9,0.779661,0.827957,0.166667,1.881188,0,0,0,1,0,0,1,1,0,1,0,0
1309,0.45,0.2,-0.237288,-0.519095,0.277778,0.0,0,0,1,0,0,0,1,1,0,1,1,0


In [5]:
# Name of the label column that is split off from the features below.
target = 'Personal Loan'

In [6]:
# Separate the feature columns from the label column.
df_x = transformed_df.drop(columns=[target])
df_y = transformed_df[target]

# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=0)

### STEP 2: DEFINE HYPERPARAMETER SPACE

In [7]:
# The estimator currently sits in the final position of the loaded pipeline object;
# this layout might change in future, so re-check it when upgrading.
model = pipeline_and_model[-1]
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              random_state=272, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='auto', validate_parameters=1,
              verbosity=0)

In [8]:
# Search space for the scikit-optimize searches: one named dimension per
# hyperparameter. Only two dimensions are active; the commented-out entries
# are further candidates that are currently disabled.
param_grid = [
    Integer(100, 400, name='n_estimators'),
    Integer(1, 5, name='max_depth'),
#     Integer(1, 4, name='min_child_weight'),
#     Real(0.5, 0.9, name='colsample_bytree'),
#     Real(0.5, 1, name='subsample'),
#     Real(0.01, 0.6, prior='log-uniform', name='learning_rate')
]

### STEP 4: CHOOSE CROSS VALIDATION TECHNIQUE

In [9]:
def perform_kfold_cv(model, x_train, y_train, n_splits = 5, metric = 'accuracy'):
    """Run shuffled K-fold cross-validation and return the raw scores.

    Parameters
    ----------
    model : estimator passed to ``cross_validate``.
    x_train, y_train : training features and labels.
    n_splits : number of folds.
    metric : scoring name forwarded to ``cross_validate``.

    Returns
    -------
    dict
        The ``cross_validate`` result, including train scores.
    """
    # Fixed seed so the shuffled folds are the same on every run.
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=4)
    cv_options = dict(scoring=metric, return_train_score=True, cv=splitter)
    return cross_validate(model, x_train, y_train, **cv_options)

In [10]:
def perform_repeatedkfold_cv(model, x_train, y_train, n_splits = 5, n_repeats=10,  metric = 'accuracy'):
    """Run repeated K-fold cross-validation and return the raw scores.

    Parameters
    ----------
    model : estimator passed to ``cross_validate``.
    x_train, y_train : training features and labels.
    n_splits : number of folds per repetition.
    n_repeats : how many times the K-fold procedure is repeated.
    metric : scoring name forwarded to ``cross_validate``.

    Returns
    -------
    dict
        The ``cross_validate`` result, including train scores.
    """
    # Fixed seed so every repetition uses the same partitions across runs.
    splitter = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=4)
    scores = cross_validate(
        model, x_train, y_train,
        scoring=metric, return_train_score=True, cv=splitter,
    )
    return scores

In [11]:
def perform_leaveoneout_cv(model, x_train, y_train, metric = 'accuracy'):
    """Run leave-one-out cross-validation and return the raw scores.

    Parameters
    ----------
    model : estimator passed to ``cross_validate``.
    x_train, y_train : training features and labels.
    metric : scoring name forwarded to ``cross_validate``.

    Returns
    -------
    dict
        The ``cross_validate`` result, including train scores.
    """
    cv_options = {
        'scoring': metric,
        'return_train_score': True,
        'cv': LeaveOneOut(),
    }
    return cross_validate(model, x_train, y_train, **cv_options)

In [12]:
def perform_leavepout_cv(model, x_train, y_train, p=2,  metric = 'accuracy'):
    """Run leave-p-out cross-validation and return the raw scores.

    Parameters
    ----------
    model : estimator passed to ``cross_validate``.
    x_train, y_train : training features and labels.
    p : size of each held-out test set.
    metric : scoring name forwarded to ``cross_validate``.

    Returns
    -------
    dict
        The ``cross_validate`` result, including train scores.
    """
    splitter = LeavePOut(p=p)
    scores = cross_validate(
        model, x_train, y_train,
        scoring=metric, return_train_score=True, cv=splitter,
    )
    return scores

In [13]:
def perform_stratifiedkfold_cv(model, x_train, y_train, n_splits = 5, n_repeats=10,  metric = 'accuracy'):
    """Run shuffled stratified K-fold cross-validation and return the raw scores.

    Parameters
    ----------
    model : estimator passed to ``cross_validate``.
    x_train, y_train : training features and labels.
    n_splits : number of folds.
    n_repeats : accepted for signature compatibility; not used by this function.
    metric : scoring name forwarded to ``cross_validate``.

    Returns
    -------
    dict
        The ``cross_validate`` result, including train scores.
    """
    # Fixed seed so the shuffled folds are the same on every run.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=4)
    cv_options = dict(scoring=metric, return_train_score=True, cv=splitter)
    return cross_validate(model, x_train, y_train, **cv_options)

In [14]:
def print_cvscore(score, metric_name='accuracy'):
    """Print mean and standard deviation of the train and test CV scores.

    Parameters
    ----------
    score : mapping with ``'train_score'`` and ``'test_score'`` entries
        (for example the dict returned by ``cross_validate`` with
        ``return_train_score=True``).
    metric_name : label used in the printed lines. Defaults to
        ``'accuracy'`` so existing calls print exactly what they did before;
        pass the actual scoring name (e.g. ``'f1'``) when the scores were
        computed with a different metric, so the output is not mislabeled.
    """
    for split in ('train', 'test'):
        values = score[split + '_score']
        print('mean ' + split + ' set ' + metric_name + ': ', np.mean(values), ' +- ', np.std(values))

#### Group cross validation technique

In [15]:
def perform_group_kfold_cv(model, x_train, y_train, group_col, n_splits = 5, metric = 'accuracy'):
    """Run group K-fold cross-validation and return the raw scores.

    Splits are generated from the values in ``x_train[group_col]``.
    Note that the model is still evaluated on the full ``x_train``, which
    includes the group column.

    Parameters
    ----------
    model : estimator passed to ``cross_validate``.
    x_train, y_train : training features and labels.
    group_col : name of the column in ``x_train`` holding the group labels.
    n_splits : number of folds. It is now honored; previously the splitter
        was always built with 5 folds regardless of this argument.
    metric : scoring name forwarded to ``cross_validate``.

    Returns
    -------
    dict
        The ``cross_validate`` result, including train scores.
    """
    # Group K-Fold Cross-Validation
    gkf = GroupKFold(n_splits=n_splits)

    groups = x_train[group_col]
    split_features = x_train.drop(group_col, axis=1)

    # estimate generalization error
    return cross_validate(
        model,
        x_train,
        y_train,
        scoring=metric,
        return_train_score=True,
        cv=gkf.split(split_features, y_train, groups=groups)
    )

In [16]:
def perform_group_logo_cv(model, x_train, y_train, group_col, metric = 'accuracy'):
    """Run leave-one-group-out cross-validation and return the raw scores.

    Splits are generated from the values in ``x_train[group_col]``.
    The model itself is evaluated on the full ``x_train``, which includes
    the group column.

    Parameters
    ----------
    model : estimator passed to ``cross_validate``.
    x_train, y_train : training features and labels.
    group_col : name of the column in ``x_train`` holding the group labels.
    metric : scoring name forwarded to ``cross_validate``.

    Returns
    -------
    dict
        The ``cross_validate`` result, including train scores.
    """
    split_features = x_train.drop(group_col, axis=1)
    group_labels = x_train[group_col]
    fold_indices = LeaveOneGroupOut().split(split_features, y_train, groups=group_labels)

    cv_options = dict(scoring=metric, return_train_score=True, cv=fold_indices)
    return cross_validate(model, x_train, y_train, **cv_options)

#### Time series split

In [17]:
def perform_timeseries_kfold_cv(model, x_train, y_train, n_splits=5, gap=0, metric = 'accuracy'):
    """Run time-series split cross-validation and return the raw scores.

    Parameters
    ----------
    model : estimator passed to ``cross_validate``.
    x_train, y_train : training features and labels.
    n_splits : number of splits given to ``TimeSeriesSplit``.
    gap : ``gap`` argument given to ``TimeSeriesSplit``.
    metric : scoring name forwarded to ``cross_validate``.

    Returns
    -------
    dict
        The ``cross_validate`` result, including train scores.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits, gap=gap)
    scores = cross_validate(
        model, x_train, y_train,
        scoring=metric, return_train_score=True, cv=splitter,
    )
    return scores

#### Create a dispatcher for cross validation methods

In [18]:
# Registry mapping a cross-validation technique name to the function that runs it.
# The group-based entries additionally require a `group_col` argument.
cv_dispatcher = dict(
    kfold=perform_kfold_cv,
    repeatedkfold=perform_repeatedkfold_cv,
    leaveoneout=perform_leaveoneout_cv,
    leavepout=perform_leavepout_cv,
    stratifiedkfold=perform_stratifiedkfold_cv,
    groupkfold=perform_group_kfold_cv,
    grouplogo=perform_group_logo_cv,
    timeserieskfold=perform_timeseries_kfold_cv,
)

#### Decide CV technique suitable to the dataset and the situation

In [19]:
## uncomment below code to see the list of supported metrics in sklearn
# import sklearn
# sorted(sklearn.metrics.SCORERS.keys())

In [20]:
# Key into cv_dispatcher selecting which cross-validation function to run.
cv_technique = 'kfold'
# Scoring name passed through to cross_validate / GridSearchCV.
metric = 'f1'
# When True, the objective function negates the score so that minimizing it maximizes the metric.
maximize_metric = True

In [21]:
## to-do: the perform-cv functions should accept a variable number of args, try using **kwargs
## (as written, this call cannot supply the `group_col` argument that the group-based techniques require)
res = cv_dispatcher[cv_technique](model, x_train, y_train, metric=metric)

In [22]:
# Show the raw cross-validation result dict (fit/score times plus per-fold train and test scores).
res

{'fit_time': array([0.18182945, 0.17050815, 0.16852045, 0.17255092, 0.17049623]),
 'score_time': array([0.00200868, 0.01003242, 0.00201321, 0.01002288, 0.01004124]),
 'test_score': array([0.96428571, 0.90666667, 0.93055556, 0.95081967, 0.944     ]),
 'train_score': array([1., 1., 1., 1., 1.])}

In [23]:
# NOTE(review): the printed labels say "accuracy", but `res` was computed with the configured `metric` ('f1' above).
print_cvscore(res)

mean train set accuracy:  1.0  +-  0.0
mean test set accuracy:  0.9392655217278169  +-  0.01959832938648923


### STEP 5: CHOOSE SEARCH TECHNIQUE

#### Manual Search
* A good strategy is to start with a manual search.
* It gives us an idea of the low effective dimensionality of the hyperparameters, i.e. which few hyperparameters actually influence the score.
* So try different hyperparameter values by hand and settle on a meaningful range for each.

#### Grid Search
* Grid Search is not part of scikit-optimize, so we will use the scikit-learn implementation (`GridSearchCV`).
* Note that `GridSearchCV` takes its `param_grid` in a different format from scikit-optimize.

In [25]:
## Grid for GridSearchCV: maps each hyperparameter name to the values to try.
## range() excludes its stop value, so n_estimators covers 100..380 (step 40) and max_depth covers 1..4.
## uncomment other parameters before actual run
param_grid_search = {
    'n_estimators': range(100, 400, 40),
    'max_depth': range (1, 5, 1),
#     'min_child_weight': range(1, 4, 1),
#     'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
#     'subsample': [0.5, 0.6, 0.7, 0.8, 0.9,1],
#     'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.3, 0.5]  
}

#### Define an objective function first (mandatory for scikit-optimize to work)

In [24]:
@use_named_args(param_grid)
def objective(**params):
    """Objective for the scikit-optimize searches: cross-validated score of one configuration.

    ``use_named_args`` converts the list of values proposed by the optimizer
    into keyword arguments named after the dimensions in ``param_grid``.

    The hyperparameters are applied to a clone of the notebook-level
    ``model`` rather than to ``model`` itself, so evaluating the objective no
    longer leaves the shared estimator configured with whichever point the
    search happened to try last.

    Returns
    -------
    float
        Mean test score over the folds of the configured CV technique,
        negated when ``maximize_metric`` is true (scikit-optimize minimizes).
    """
    # Local import: keeps this cell self-contained without touching the import cell.
    from sklearn.base import clone

    # Fresh, unfitted copy of the estimator carrying the candidate hyperparameters.
    candidate = clone(model).set_params(**params)

    # tech debt: relies on notebook-level cv_dispatcher / cv_technique / metric / data.
    score = cv_dispatcher[cv_technique](candidate, x_train, y_train, metric=metric)
    value = np.mean(score['test_score'])

    # Negate because we want to maximize and scikit-optimize minimizes by default.
    return -value if maximize_metric else value

In [26]:
def perform_gridsearch(model,x_train, y_train, 
                          param_grid = param_grid_search, metric=metric, cv_technique = 'kfold'):
    """Exhaustive grid search with 5-fold shuffled K-fold cross-validation.

    Parameters
    ----------
    model : estimator to tune.
    x_train, y_train : training features and labels.
    param_grid : mapping of hyperparameter name to candidate values.
    metric : scoring name given to ``GridSearchCV``.
    cv_technique : currently unused; only K-fold is supported
        (to-do: support all CV techniques).

    Returns
    -------
    tuple
        ``(search, best_params, results)`` where ``search`` is the fitted
        ``GridSearchCV``, ``best_params`` its best parameter dict (also
        printed), and ``results`` a DataFrame with the params and the
        mean/std test score of every combination.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=4)

    # Evaluate every combination; refit=True retrains the best one on all of x_train.
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=metric,
        cv=folds,
        refit=True,
    )
    search = grid.fit(x_train, y_train)

    summary_columns = ['params', 'mean_test_score', 'std_test_score']
    results = pd.DataFrame(search.cv_results_)[summary_columns]

    best_params = search.best_params_
    print(best_params)
    return (search, best_params, results)

In [27]:
# Run the grid search; the result is a (fitted search, best params, results table) tuple.
grd_srch_res = perform_gridsearch(model, x_train, y_train, param_grid_search, metric)

{'max_depth': 3, 'n_estimators': 340}


In [28]:
def perform_randomsearch(objective=objective, param_grid=param_grid, n_calls=50):
    """Random search over the hyperparameter space via ``dummy_minimize``.

    Parameters
    ----------
    objective : function to minimize.
    param_grid : list of named search-space dimensions.
    n_calls : number of evaluations of the objective.

    Returns
    -------
    dict
        ``'best_score'`` is the lowest objective value found and
        ``'best_params'`` maps each dimension name to its value at that point.
    """
    search = dummy_minimize(objective, param_grid, n_calls=n_calls, random_state=0)

    # search.x holds the best values in the same order as the dimensions.
    best_params = {dim.name: value for dim, value in zip(param_grid, search.x)}
    return {'best_score': search.fun, 'best_params': best_params}

In [29]:
# Random search with the defaults (50 objective evaluations); the cell displays the returned dict.
perform_randomsearch()

{'best_score': -0.9440164598221061,
 'best_params': {'n_estimators': 377, 'max_depth': 3}}

In [31]:
def perform_bayesian_gp(objective=objective, param_grid=param_grid, n_calls=30):
    """Bayesian optimization with ``gp_minimize``.

    Parameters
    ----------
    objective : function to minimize.
    param_grid : list of named search-space dimensions.
    n_calls : total number of evaluations of the objective.

    Returns
    -------
    dict
        ``'best_score'`` is the lowest objective value found and
        ``'best_params'`` maps each dimension name to its value at that point.
    """
    search = gp_minimize(
        objective,
        param_grid,
        n_initial_points=10,  # evaluations made before the surrogate model is used
        acq_func='EI',        # acquisition function
        n_calls=n_calls,
        random_state=0,
    )

    # search.x holds the best values in the same order as the dimensions.
    best_params = {dim.name: value for dim, value in zip(param_grid, search.x)}
    return {'best_score': search.fun, 'best_params': best_params}

In [32]:
# gp_minimize search with the defaults (30 objective evaluations); the cell displays the returned dict.
perform_bayesian_gp()

{'best_score': -0.9427149835206297,
 'best_params': {'n_estimators': 400, 'max_depth': 3}}

In [33]:
def perform_bayesian_forestgp(objective=objective, param_grid=param_grid, n_calls=50,
                              n_initial_points=10, n_jobs=4):
    """Sequential model-based optimization with ``forest_minimize`` (random-forest surrogate).

    Parameters
    ----------
    objective : function to minimize.
    param_grid : list of named search-space dimensions.
    n_calls : total number of evaluations of the objective.
    n_initial_points : number of points evaluated before the surrogate is
        used (previously hard-coded to 10).
    n_jobs : number of parallel jobs handed to ``forest_minimize``; ideally the
        number of available cores (previously hard-coded to 4).

    Returns
    -------
    dict
        ``'best_score'`` is the lowest objective value found and
        ``'best_params'`` maps each dimension name to its value at that point.
    """
    search = forest_minimize(
        objective,  # the objective function to minimize
        param_grid,  # the hyperparameter space
        base_estimator='RF',  # the surrogate
        n_initial_points=n_initial_points,
        acq_func='EI',  # the acquisition function
        n_calls=n_calls,
        random_state=0,
        n_jobs=n_jobs,
    )

    # search.x holds the best values in the same order as the dimensions.
    best_params = {dim.name: value for dim, value in zip(param_grid, search.x)}
    return {'best_score': search.fun, 'best_params': best_params}

In [34]:
# forest_minimize search with the defaults (50 objective evaluations); the cell displays the returned dict.
perform_bayesian_forestgp()

{'best_score': -0.9440164598221061,
 'best_params': {'n_estimators': 377, 'max_depth': 3}}

In [35]:
def perform_bayesian_gbrtgp(objective=objective, param_grid=param_grid, n_calls=50,
                            n_initial_points=10, n_jobs=4):
    """Sequential model-based optimization with ``gbrt_minimize``.

    Parameters
    ----------
    objective : function to minimize.
    param_grid : list of named search-space dimensions.
    n_calls : total number of evaluations of the objective.
    n_initial_points : number of points evaluated before the surrogate is
        used (previously hard-coded to 10).
    n_jobs : number of parallel jobs handed to ``gbrt_minimize``; ideally the
        number of available cores (previously hard-coded to 4).

    Returns
    -------
    dict
        ``'best_score'`` is the lowest objective value found and
        ``'best_params'`` maps each dimension name to its value at that point.
    """
    search = gbrt_minimize(
        objective,  # the objective function to minimize
        param_grid,  # the hyperparameter space
        n_initial_points=n_initial_points,
        acq_func='EI',  # the acquisition function
        n_calls=n_calls,
        random_state=0,
        n_jobs=n_jobs,
    )

    # search.x holds the best values in the same order as the dimensions.
    best_params = {dim.name: value for dim, value in zip(param_grid, search.x)}
    return {'best_score': search.fun, 'best_params': best_params}

In [36]:
# gbrt_minimize search with the defaults (50 objective evaluations); the cell displays the returned dict.
perform_bayesian_gbrtgp()

{'best_score': -0.9440164598221061,
 'best_params': {'n_estimators': 377, 'max_depth': 3}}