This file contains an example of tuning multiple models with BayesSearchCV.

In [1]:
import pickle
import time

import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

import plotly.io as pio
pio.renderers.default='notebook'

# Load Data

In [2]:
# Unpickle the training features and labels stored one directory above this notebook.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../X_train.pkl', 'rb') as features_file:
    X_train = pickle.load(features_file)

with open('../y_train.pkl', 'rb') as labels_file:
    y_train = pickle.load(labels_file)

In [3]:
# Per-column summary of the numeric features: null/zero counts, moments and percentiles.
hlp.pandas.numeric_summary(X_train, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,760,40,5.0%,0,0.0%,21.0,11.7,0.6,1.0,0.6,4.0,9.0,12.0,18.0,24.0,36.0,60.0
credit_amount,800,0,0.0%,38,5.0%,3203.9,2932.3,0.9,1.9,3.9,0.0,753.9,1300.8,2236.5,3951.5,7394.6,18424.0
installment_commitment,800,0,0.0%,0,0.0%,3.0,1.1,0.4,-0.5,-1.2,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,800,0,0.0%,0,0.0%,2.9,1.1,0.4,-0.3,-1.4,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,800,0,0.0%,0,0.0%,35.6,11.4,0.3,1.0,0.7,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,800,0,0.0%,0,0.0%,1.4,0.6,0.4,1.3,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,800,0,0.0%,0,0.0%,1.1,0.3,0.3,2.0,2.1,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [4]:
# Per-column summary of the non-numeric features: null counts, most frequent value and cardinality.
hlp.pandas.non_numeric_summary(X_train, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
checking_status,763,37,4.6%,no checking,4,0.5%
credit_history,800,0,0.0%,existing paid,5,0.6%
purpose,800,0,0.0%,radio/tv,10,1.2%
savings_status,800,0,0.0%,<100,5,0.6%
employment,800,0,0.0%,1<=X<4,5,0.6%
personal_status,800,0,0.0%,male single,4,0.5%
other_parties,800,0,0.0%,none,3,0.4%
property_magnitude,800,0,0.0%,car,4,0.5%
other_payment_plans,800,0,0.0%,none,3,0.4%
housing,800,0,0.0%,own,3,0.4%


In [5]:
# Peek at the first ten target labels.
y_train[:10]

array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])

In [6]:
# Distinct target labels and the number of observations carrying each label.
np.unique(y_train, return_counts=True)

(array([0, 1]), array([559, 241]))

In [7]:
# Class proportions: per-label counts divided by the total number of observations.
class_counts = np.unique(y_train, return_counts=True)[1]
class_counts / np.sum(class_counts)

array([0.69875, 0.30125])

# Transformation Pipeline

In [8]:
# Quick look at how sklearn's OrdinalEncoder codes two of the categorical columns as floats.
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [9]:
# Split the feature names by dtype; the two lists drive the ColumnTransformer built later.
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns, non_numeric_columns, sep='\n')

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [275]:
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from enum import unique, Enum, auto

@unique
class ClassifierSearchSpaceModels(Enum):
    """Identifiers for the classifiers that SklearnClassifierSearchSpace can build a search space for."""
    LogisticRegression = auto()
    XGBoost = auto()


#`XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded on Apple Silicon (ARM)`
#https://github.com/dmlc/xgboost/issues/6909
#```
#pip install --upgrade --force-reinstall xgboost --no-binary xgboost -v
#```

# need to make sure this works for a single model or multiple models
class SklearnClassifierSearchSpace:
    """Bundles the tuning pipeline and the per-model search spaces for BayesSearchCV.

    `pipeline()` returns a pipeline template whose preprocessing steps are `TransformerChooser`
    placeholders and whose model step is a `DummyClassifier`. `search_spaces()` returns one
    `(search space, number of iterations)` tuple per requested model; the keys of each search
    space address the steps of that pipeline template.
    """
    def __init__(self,
                 # remove these and pass data and get column names directly?
                 numeric_column_names,
                 non_numeric_column_names,
                 models=None,
                 iterations=None):
        """
        Args:
            numeric_column_names: columns routed to the numeric (imputer + scaler) sub-pipeline.
            non_numeric_column_names: columns routed to the non-numeric (encoder) sub-pipeline.
            models: `ClassifierSearchSpaceModels` members to tune; when omitted, both
                LogisticRegression and XGBoost are used.
            iterations: number of search iterations for each entry of `models` (same order);
                when omitted, `[7, 6]` is used.

        Raises:
            AssertionError: if `models` and `iterations` differ in length.
        """
        # Defaults are resolved here rather than in the signature so that no mutable list is
        # shared between instances.
        if models is None:
            models = [
                ClassifierSearchSpaceModels.LogisticRegression,
                ClassifierSearchSpaceModels.XGBoost
            ]
        if iterations is None:
            iterations = [7, 6]
        assert len(models) == len(iterations), \
            "`models` and `iterations` must have the same number of entries"
        self._numeric_column_names = numeric_column_names
        self._non_numeric_column_names = non_numeric_column_names
        # store copies so later changes to the caller's lists do not affect this object
        self._models = list(models)
        self._iterations = list(iterations)

    def pipeline(self):
        """Returns the pipeline template whose placeholder steps are filled in by the search spaces."""
        numeric_pipeline = Pipeline([
            # tune how we want to impute values
            # e.g. whether or not we want to impute (and how) or simply remove rows with missing values
            ('imputer', hlp.sklearn_pipeline.TransformerChooser()),
            # tune how we want to scale values
            # e.g. MinMax/Normalization/None
            ('scaler', hlp.sklearn_pipeline.TransformerChooser()),
        ])
        non_numeric_pipeline = Pipeline([
            # tune how we handle categoric values
            # e.g. One Hot, Custom-OrdinalEncoder
            ('encoder', hlp.sklearn_pipeline.TransformerChooser()),
        ])
        # associate numeric/non-numeric columns with corresponding pipeline
        transformations_pipeline = ColumnTransformer([
            ('numeric', numeric_pipeline, self._numeric_column_names),
            ('non_numeric', non_numeric_pipeline, self._non_numeric_column_names)
        ])
        # add model to create the full pipeline; the DummyClassifier is replaced by the `model`
        # entry of whichever search space is being evaluated
        full_pipeline = Pipeline([
            ('prep', transformations_pipeline),
            ('model', DummyClassifier())
        ])

        return full_pipeline

    @staticmethod
    def _build_imputers(strategies):
        """Returns the candidate imputers for a list of SimpleImputer strategy names.

        Each truthy entry becomes `SimpleImputer(strategy=entry)` and falsy entries (e.g. None)
        are kept as None. If `strategies` is empty or None, `[None]` is returned.
        """
        if strategies:
            imputers = [SimpleImputer(strategy=x) if x else None for x in strategies]
        else:
            imputers = [None]

        return imputers

    @staticmethod
    def _search_space_logistic(solver='lbfgs',
                               max_iter=1000,
                               C=(1e-6, 1e+2),
                               C_prior='log-uniform',
                               imputer_strategies=['mean', 'median', 'most_frequent'],
                               random_state=None):
        """Returns the search space (dict of skopt dimensions) for LogisticRegression.

        Args:
            solver, max_iter, random_state: passed to `LogisticRegression`.
            C: `(low, high)` bounds of the `C` dimension.
            C_prior: prior used for the `C` dimension.
            imputer_strategies: SimpleImputer strategies offered for the numeric columns.
        """
        from skopt.space import Real, Categorical
        # imported locally so this method does not rely on an import made in another cell
        from sklearn.linear_model import LogisticRegression

        model = LogisticRegression(
            solver=solver,
            max_iter=max_iter,
            random_state=random_state
        )

        logistic_search_space = {
            'model': Categorical([model]),
            'model__C': Real(C[0], C[1], prior=C_prior),
            # these steps correspond to the pipeline built in `pipeline()`
            'prep__numeric__imputer__transformer': Categorical(
                SklearnClassifierSearchSpace._build_imputers(imputer_strategies)
            ),
            'prep__numeric__scaler__transformer': Categorical([
                None,
                MinMaxScaler(),
                StandardScaler()
            ]),
            'prep__non_numeric__encoder__transformer': Categorical([
                OneHotEncoder(),
                hlp.sklearn_pipeline.CustomOrdinalEncoder()
            ]),
        }
        return logistic_search_space

    @staticmethod
    def _search_space_xgboost(eval_metric='logloss',
                                          use_label_encoder=False,
                                          max_depth = (3, 10),
                                          n_estimators = (50, 1000),
                                          learning_rate = (0.01, 0.3),
                                          colsample_bytree = (0.01, 1),
                                          subsample = (0.1, 1),
                                          imputer_strategies=None,
                                          random_state=None):
        """Returns the search space (dict of skopt dimensions) for XGBClassifier.

        Args:
            eval_metric, use_label_encoder, random_state: passed to `XGBClassifier`.
            max_depth, n_estimators: `(low, high)` bounds of the Integer dimensions.
            learning_rate, colsample_bytree, subsample: `(low, high)` bounds of the Real dimensions.
            imputer_strategies: SimpleImputer strategies offered for the numeric columns; the
                default of None results in a single choice of None for the imputer step.
        """
        from skopt.space import Real, Categorical, Integer
        from xgboost import XGBClassifier

        model = XGBClassifier(
            eval_metric=eval_metric,
            use_label_encoder=use_label_encoder,
            random_state=random_state,
        )
        # https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663
        # max_depth: 3–10
        # n_estimators: 100 (lots of observations) to 1000 (few observations)
        # learning_rate: 0.01–0.3
        # colsample_bytree: 0.5–1
        # subsample: 0.6–1
        # Then, you can focus on optimizing max_depth and n_estimators.
        # You can then play along with the learning_rate, and increase it to speed up the model
        # without decreasing the performances. If it becomes faster without losing in performances,
        # you can increase the number of estimators to try to increase the performances.
        xgb_search_space = {
            'model': Categorical([model]),
            'model__max_depth': Integer(max_depth[0], max_depth[1]),
            'model__n_estimators':  Integer(n_estimators[0], n_estimators[1]),
            'model__learning_rate': Real(learning_rate[0], learning_rate[1]),
            'model__colsample_bytree': Real(colsample_bytree[0], colsample_bytree[1]),
            'model__subsample': Real(subsample[0], subsample[1]),
            # these steps correspond to the pipeline built in `pipeline()`
            'prep__numeric__imputer__transformer': Categorical(
                SklearnClassifierSearchSpace._build_imputers(imputer_strategies)
            ),
            'prep__numeric__scaler__transformer': Categorical([None]),
            'prep__non_numeric__encoder__transformer': Categorical([
                OneHotEncoder(),
                hlp.sklearn_pipeline.CustomOrdinalEncoder()
            ]),
        }

        return xgb_search_space

    def search_spaces(self):
        """Returns a list of `(search space, number of iterations)` tuples, one per model.

        Raises:
            AssertionError: if a model has no search space defined in this class.
        """
        search_spaces = []
        for model_enum, num_iterations in zip(self._models, self._iterations):
            if model_enum == ClassifierSearchSpaceModels.LogisticRegression:
                space = SklearnClassifierSearchSpace._search_space_logistic()
            elif model_enum == ClassifierSearchSpaceModels.XGBoost:
                space = SklearnClassifierSearchSpace._search_space_xgboost()
            else:
                # raised explicitly (rather than `assert False`) so it still fires under `python -O`
                raise AssertionError(f"Unsupported model: {model_enum!r}")

            search_spaces.append((space, num_iterations))

        return search_spaces

    def param_name_mappings(self):
        """Not implemented yet; currently returns None."""
        pass

    def __str__(self):
        # TODO: placeholder text; replace with a description of the models/search spaces
        return 'asdf'

In [276]:
# Build the search-space helper from the column lists computed earlier, using the default
# models and iteration counts.
search_space = SklearnClassifierSearchSpace(numeric_column_names=numeric_columns,
                                            non_numeric_column_names=non_numeric_columns)

print(search_space)

asdf


In [277]:
# Inspect the pipeline template (placeholder transformers and model) that the search will tune.
search_space.pipeline()

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   TransformerChooser()),
                                                                  ('scaler',
                                                                   TransformerChooser())]),
                                                  ['duration', 'credit_amount',
                                                   'installment_commitment',
                                                   'residence_since', 'age',
                                                   'existing_credits',
                                                   'num_dependents']),
                                                 ('non_numeric',
                                                  Pipeline(steps=[('encoder',
                                                          

In [278]:
# Inspect the (search space, number of iterations) tuples, one per model.
search_space.search_spaces()

[({'model': Categorical(categories=(LogisticRegression(max_iter=1000),), prior=None),
   'model__C': Real(low=1e-06, high=100.0, prior='log-uniform', transform='identity'),
   'prep__numeric__imputer__transformer': Categorical(categories=(SimpleImputer(), SimpleImputer(strategy='median'), SimpleImputer(strategy='most_frequent')), prior=None),
   'prep__numeric__scaler__transformer': Categorical(categories=(None, MinMaxScaler(), StandardScaler()), prior=None),
   'prep__non_numeric__encoder__transformer': Categorical(categories=(OneHotEncoder(), CustomOrdinalEncoder()), prior=None)},
  7),
 ({'model': Categorical(categories=(XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
                 colsample_bynode=None, colsample_bytree=None,
                 enable_categorical=False, eval_metric='logloss', gamma=None,
                 gpu_id=None, importance_type=None, interaction_constraints=None,
                 learning_rate=None, max_delta_step=None, max_depth=None,
  

In [252]:
def build_imputers(strategies):
    """Return the candidate imputers for a list of SimpleImputer strategy names.

    Each truthy entry of `strategies` becomes `SimpleImputer(strategy=entry)`; falsy entries
    (e.g. None) are kept as None. When `strategies` itself is empty or None, the result is
    `[None]`.
    """
    if not strategies:
        return [None]

    choices = []
    for strategy_name in strategies:
        choices.append(SimpleImputer(strategy=strategy_name) if strategy_name else None)
    return choices

# With no strategies the only imputer choice is None.
build_imputers(None)

[None]

In [166]:
# A None entry is kept as a None choice alongside the SimpleImputer choices.
build_imputers([None, 'mean', 'median', 'most_frequent'])

[None,
 SimpleImputer(),
 SimpleImputer(strategy='median'),
 SimpleImputer(strategy='most_frequent')]

In [167]:
# One SimpleImputer per strategy name.
build_imputers(['mean', 'median', 'most_frequent'])

[SimpleImputer(),
 SimpleImputer(strategy='median'),
 SimpleImputer(strategy='most_frequent')]

In [168]:
def build_classifier_search_space_logistic(solver='lbfgs',
                                           max_iter=1000,
                                           C=(1e-6, 1e+2),
                                           C_prior='log-uniform',
                                           imputer_strategies=['mean', 'median', 'most_frequent'],
                                           random_state=None):
    """Returns the search space (dict of skopt dimensions) for LogisticRegression.

    Args:
        solver, max_iter, random_state: passed to `LogisticRegression`.
        C: `(low, high)` bounds of the `C` dimension.
        C_prior: prior used for the `C` dimension.
        imputer_strategies: SimpleImputer strategies offered for the numeric columns
            (see `build_imputers`).

    Returns:
        dict mapping pipeline parameter names (`model`, `model__C`, `prep__...__transformer`)
        to skopt `Categorical`/`Real` dimensions.
    """
    from skopt.space import Real, Categorical
    # imported locally so the function does not rely on an import made in another cell
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression(
        solver=solver,
        max_iter=max_iter,
        random_state=random_state
    )

    logistic_search_space = {
        'model': Categorical([model]),
        'model__C': Real(C[0], C[1], prior=C_prior),
        # these keys address the `prep` (numeric/non_numeric) steps of the tuning pipeline
        'prep__numeric__imputer__transformer': Categorical(build_imputers(imputer_strategies)),
        'prep__numeric__scaler__transformer': Categorical([
            None,
            MinMaxScaler(),
            StandardScaler()
        ]),
        'prep__non_numeric__encoder__transformer': Categorical([
            OneHotEncoder(),
            hlp.sklearn_pipeline.CustomOrdinalEncoder()
        ]),
    }
    return logistic_search_space

def search_space_param_name_mapping_classifier_pipeline():
    """Return short display names for the preprocessing parameters of the tuning pipeline."""
    # (ColumnTransformer branch, step name) pairs; the step name doubles as the display name
    branch_steps = (
        ('non_numeric', 'encoder'),
        ('numeric', 'imputer'),
        ('numeric', 'scaler'),
    )
    return {f'prep__{branch}__{step}__transformer': step for branch, step in branch_steps}

def search_space_param_name_mapping_classifier_logistic():
    """Return short display names for the LogisticRegression search-space parameters."""
    return dict(model__C='C')

def search_space_param_name_mapping_classifier_xgboost():
    """Return short display names for the XGBoost search-space parameters (plus `model`)."""
    tuned_params = ('max_depth', 'n_estimators', 'learning_rate', 'colsample_bytree', 'subsample')
    mapping = {f'model__{param}': param for param in tuned_params}
    mapping['model'] = 'model'
    return mapping





def build_classifier_search_space_xgboost(eval_metric='logloss',
                                          use_label_encoder=False,
                                          max_depth = (3, 10),
                                          n_estimators = (50, 1000),
                                          learning_rate = (0.01, 0.3),
                                          colsample_bytree = (0.01, 1),
                                          subsample = (0.1, 1),
                                          imputer_strategies=None,
                                          random_state=None):
    """Return the search space (dict of skopt dimensions) for XGBClassifier.

    `eval_metric`, `use_label_encoder` and `random_state` are passed to `XGBClassifier`. The
    remaining tuple arguments are `(low, high)` bounds: Integer dimensions for `max_depth` and
    `n_estimators`, Real dimensions for `learning_rate`, `colsample_bytree` and `subsample`.
    `imputer_strategies` is forwarded to `build_imputers`; no scaler is offered for this model.
    """
    from skopt.space import Real, Categorical, Integer
    from xgboost import XGBClassifier

    classifier = XGBClassifier(
        eval_metric=eval_metric,
        use_label_encoder=use_label_encoder,
        random_state=random_state,
    )

    # Suggested ranges from
    # https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663
    #   max_depth: 3–10
    #   n_estimators: 100 (lots of observations) to 1000 (few observations)
    #   learning_rate: 0.01–0.3
    #   colsample_bytree: 0.5–1
    #   subsample: 0.6–1
    # Focus on max_depth and n_estimators first; then try raising learning_rate to speed up the
    # model, and if performance holds, raise the number of estimators to try to improve it.
    search_space = {}

    # model and its hyper-parameters
    search_space['model'] = Categorical([classifier])
    search_space['model__max_depth'] = Integer(max_depth[0], max_depth[1])
    search_space['model__n_estimators'] = Integer(n_estimators[0], n_estimators[1])
    search_space['model__learning_rate'] = Real(learning_rate[0], learning_rate[1])
    search_space['model__colsample_bytree'] = Real(colsample_bytree[0], colsample_bytree[1])
    search_space['model__subsample'] = Real(subsample[0], subsample[1])

    # preprocessing choices; the keys address the `prep` steps of the tuning pipeline
    search_space['prep__numeric__imputer__transformer'] = Categorical(build_imputers(imputer_strategies))
    search_space['prep__numeric__scaler__transformer'] = Categorical([None])
    search_space['prep__non_numeric__encoder__transformer'] = Categorical([
        OneHotEncoder(),
        hlp.sklearn_pipeline.CustomOrdinalEncoder(),
    ])

    return search_space


In [169]:
# Inspect the LogisticRegression search space built with the default arguments.
build_classifier_search_space_logistic()

{'model': Categorical(categories=(LogisticRegression(max_iter=1000),), prior=None),
 'model__C': Real(low=1e-06, high=100.0, prior='log-uniform', transform='identity'),
 'prep__numeric__imputer__transformer': Categorical(categories=(SimpleImputer(), SimpleImputer(strategy='median'), SimpleImputer(strategy='most_frequent')), prior=None),
 'prep__numeric__scaler__transformer': Categorical(categories=(None, MinMaxScaler(), StandardScaler()), prior=None),
 'prep__non_numeric__encoder__transformer': Categorical(categories=(OneHotEncoder(), CustomOrdinalEncoder()), prior=None)}

In [170]:
# Inspect the XGBoost search space built with the default arguments.
build_classifier_search_space_xgboost()

{'model': Categorical(categories=(XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
               colsample_bynode=None, colsample_bytree=None,
               enable_categorical=False, eval_metric='logloss', gamma=None,
               gpu_id=None, importance_type=None, interaction_constraints=None,
               learning_rate=None, max_delta_step=None, max_depth=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               n_estimators=100, n_jobs=None, num_parallel_tree=None,
               predictor=None, random_state=None, reg_alpha=None,
               reg_lambda=None, scale_pos_weight=None, subsample=None,
               tree_method=None, use_label_encoder=False,
               validate_parameters=None, verbosity=None),), prior=None),
 'model__max_depth': Integer(low=3, high=10, prior='uniform', transform='identity'),
 'model__n_estimators': Integer(low=50, high=1000, prior='uniform', transform='identity'),
 'model__learnin

# Model

`XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded on Apple Silicon (ARM)`

https://github.com/dmlc/xgboost/issues/6909

```
pip install --upgrade --force-reinstall xgboost --no-binary xgboost -v
```

In [110]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# skopt.BayesSearchCV

[https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html](https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html)

In [245]:
# pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold

# Tune every model/search space against the shared pipeline template and time the whole search.
bayes_search = BayesSearchCV(
    estimator=search_space.pipeline(),
    search_spaces=search_space.search_spaces(),
    # 5 fold 2 repeat CV; seeded so the folds (and therefore the scores) are the same on every run
    cv=RepeatedKFold(n_splits=5, n_repeats=2, random_state=42),
    scoring='roc_auc',
    refit=False,  # required if passing in multiple scorers
    return_train_score=False,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

start_time = time.time()
bayes_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [253]:
# Report the wall-clock duration of the search in seconds and in minutes.
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")

Elapsed time to run BayesSearchCV: 9.462 seconds; 0.2 minutes


In [254]:
# Number of parameter sets evaluated across all search spaces.
len(bayes_search.cv_results_['params'])

13

In [255]:
# Raw cross-validation results dictionary (fit/score timings, parameters, scores per trial).
print(bayes_search.cv_results_)

{'mean_fit_time': array([0.06337442, 0.08440077, 0.03295195, 0.03410697, 0.04040511,
       0.0313798 , 0.03787649, 0.54374831, 0.81939776, 0.39988477,
       0.46086586, 0.19971857, 0.22375185]), 'std_fit_time': array([0.01938587, 0.02150791, 0.0053874 , 0.00714931, 0.01275391,
       0.01019581, 0.01145617, 0.08820914, 0.10021621, 0.073106  ,
       0.06117878, 0.01963555, 0.03032959]), 'mean_score_time': array([0.01242173, 0.00629814, 0.01317093, 0.01074312, 0.00674403,
       0.00502975, 0.00577927, 0.01492548, 0.01446817, 0.03257031,
       0.00977757, 0.01060796, 0.01688228]), 'std_score_time': array([0.00480727, 0.00338586, 0.00428794, 0.00095994, 0.00362272,
       0.00159773, 0.0015973 , 0.00627781, 0.00872167, 0.01393961,
       0.0033097 , 0.00556017, 0.00653129]), 'param_model': masked_array(data=[LogisticRegression(max_iter=1000),
                   LogisticRegression(max_iter=1000),
                   LogisticRegression(max_iter=1000),
                   LogisticRegressio

In [256]:
# Best score found by the search (scoring was set to 'roc_auc').
print(bayes_search.best_score_)

0.7639067281356418


In [257]:
# Parameter set that produced the best score.
print(bayes_search.best_params_)

OrderedDict([('model', LogisticRegression(max_iter=1000)), ('model__C', 1.5955669452672732), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer(strategy='most_frequent')), ('prep__numeric__scaler__transformer', MinMaxScaler())])


# Results

In [72]:
# Short display names for the search-space parameters: model hyper-parameters first,
# then the model itself and the preprocessing steps.
new_param_column_names = {
    f'model__{hyper_param}': hyper_param
    for hyper_param in ('max_depth', 'n_estimators', 'learning_rate', 'colsample_bytree', 'subsample')
}
new_param_column_names.update({
    'model': 'model',
    'prep__non_numeric__encoder__transformer': 'encoder',
    'prep__numeric__imputer__transformer': 'imputer',
    'prep__numeric__scaler__transformer': 'scaler',
})

In [258]:
# Wrap the fitted searcher in a helpsk MLExperimentResults object; roc_auc is a
# higher-is-better score. The optional name mapping is left disabled for now.
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better=True,
    # parameter_name_mappings=new_param_column_names,
)

In [259]:
# Persist the experiment results to a YAML file next to this notebook.
results.to_yaml_file(yaml_file_name = 'Run 1 - Multi-model - BayesSearchCV.yaml')

In [260]:
# Reload the results from the YAML file; the cells below work from this reloaded object.
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name = 'Run 1 - Multi-model - BayesSearchCV.yaml')

## Timings

In [261]:
# Average fit time of each trial (same values as `mean_fit_time` in cv_results_).
results.fit_time_averages

array([0.06337442, 0.08440077, 0.03295195, 0.03410697, 0.04040511,
       0.0313798 , 0.03787649, 0.54374831, 0.81939776, 0.39988477,
       0.46086586, 0.19971857, 0.22375185])

## Best Scores/Params

In [262]:
# Best value of the primary score across all trials.
results.best_primary_score

0.7639067281356418

In [263]:
# Parameters of the best-scoring trial (values are shown as strings/numbers).
results.best_primary_score_params

{'model': 'LogisticRegression(max_iter=1000)',
 'model__C': 1.5955669452672732,
 'prep__non_numeric__encoder__transformer': 'OneHotEncoder()',
 'prep__numeric__imputer__transformer': "SimpleImputer(strategy='most_frequent')",
 'prep__numeric__scaler__transformer': 'MinMaxScaler()'}

In [264]:
# Score mean and 95% CI bounds with the parameters of each trial.
results.to_formatted_dataframe(num_rows=40)

roc_auc Mean,roc_auc 95CI.LO,roc_auc 95CI.HI,model,model__C,prep__non_numeric__encoder__transformer,prep__numeric__imputer__transformer,prep__numeric__scaler__transformer,model__colsample_bytree,model__learning_rate,model__max_depth,model__n_estimators,model__subsample
0.764,0.736,0.792,LogisticRegression(max_iter=1000),1.596,OneHotEncoder(),SimpleImputer(strategy='most_frequent'),MinMaxScaler(),,,,,
0.762,0.735,0.789,LogisticRegression(max_iter=1000),22.376,OneHotEncoder(),SimpleImputer(strategy='median'),,,,,,
0.754,0.739,0.769,LogisticRegression(max_iter=1000),32.731,OneHotEncoder(),SimpleImputer(),StandardScaler(),,,,,
0.752,0.73,0.774,LogisticRegression(max_iter=1000),11.655,OneHotEncoder(),SimpleImputer(strategy='most_frequent'),StandardScaler(),,,,,
0.745,0.708,0.783,LogisticRegression(max_iter=1000),22.913,OneHotEncoder(),SimpleImputer(strategy='most_frequent'),MinMaxScaler(),,,,,
0.745,0.717,0.773,"XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,  colsample_bynode=None, colsample_bytree=None,  enable_categorical=False, eval_metric='logloss', gamma=None,  gpu_id=None, importance_type=None, interaction_constraints=None,  learning_rate=None, max_delta_step=None, max_depth=None,  min_child_weight=None, missing=nan, monotone_constraints=None,  n_estimators=100, n_jobs=None, num_parallel_tree=None,  predictor=None, random_state=None, reg_alpha=None,  reg_lambda=None, scale_pos_weight=None, subsample=None,  tree_method=None, use_label_encoder=False,  validate_parameters=None, verbosity=None)",,CustomOrdinalEncoder(),,,0.365,0.151,7.0,395.0,0.747
0.736,0.709,0.763,LogisticRegression(max_iter=1000),0.003,CustomOrdinalEncoder(),SimpleImputer(strategy='median'),StandardScaler(),,,,,
0.726,0.706,0.747,"XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,  colsample_bynode=None, colsample_bytree=None,  enable_categorical=False, eval_metric='logloss', gamma=None,  gpu_id=None, importance_type=None, interaction_constraints=None,  learning_rate=None, max_delta_step=None, max_depth=None,  min_child_weight=None, missing=nan, monotone_constraints=None,  n_estimators=100, n_jobs=None, num_parallel_tree=None,  predictor=None, random_state=None, reg_alpha=None,  reg_lambda=None, scale_pos_weight=None, subsample=None,  tree_method=None, use_label_encoder=False,  validate_parameters=None, verbosity=None)",,OneHotEncoder(),,,0.023,0.257,6.0,808.0,0.257
0.724,0.702,0.747,"XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,  colsample_bynode=None, colsample_bytree=None,  enable_categorical=False, eval_metric='logloss', gamma=None,  gpu_id=None, importance_type=None, interaction_constraints=None,  learning_rate=None, max_delta_step=None, max_depth=None,  min_child_weight=None, missing=nan, monotone_constraints=None,  n_estimators=100, n_jobs=None, num_parallel_tree=None,  predictor=None, random_state=None, reg_alpha=None,  reg_lambda=None, scale_pos_weight=None, subsample=None,  tree_method=None, use_label_encoder=False,  validate_parameters=None, verbosity=None)",,CustomOrdinalEncoder(),,,0.922,0.079,6.0,299.0,0.188
0.712,0.688,0.736,"XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,  colsample_bynode=None, colsample_bytree=None,  enable_categorical=False, eval_metric='logloss', gamma=None,  gpu_id=None, importance_type=None, interaction_constraints=None,  learning_rate=None, max_delta_step=None, max_depth=None,  min_child_weight=None, missing=nan, monotone_constraints=None,  n_estimators=100, n_jobs=None, num_parallel_tree=None,  predictor=None, random_state=None, reg_alpha=None,  reg_lambda=None, scale_pos_weight=None, subsample=None,  tree_method=None, use_label_encoder=False,  validate_parameters=None, verbosity=None)",,OneHotEncoder(),,,0.704,0.263,6.0,412.0,0.29


In [76]:
# gives the score rank for each index
# e.g. array([4, 2, 1, 3])
# the 1st iteration (i.e. set of params) was the worst
# the 3rd iteration was the best.
results.primary_score_trial_ranking

array([16,  7, 17, 13, 14, 15, 19, 18, 20, 12, 10,  1,  2,  3,  9, 11,  5,
        8,  6,  4])

In [77]:
# gives the trial indexes ordered from best score to worst
# e.g. results.primary_score_trial_ranking of array([4, 2, 1, 3])
# would return [2, 1, 3, 0] because index 2 (i.e. 3rd iteration) was the best, so it is the first index;
# and index 0 (i.e. first iteration) was the worst, so it is the last index
results.primary_score_best_indexes

array([11, 12, 13, 19, 16, 18,  1, 17, 14, 10, 15,  9,  3,  4,  5,  0,  2,
        7,  6,  8])

## BayesSearchCV Performance Over Time

In [97]:
# Parameter names recorded for trial 0.
bayes_search.cv_results_['params'][0].keys()

odict_keys(['model', 'model__colsample_bytree', 'model__learning_rate', 'model__max_depth', 'model__n_estimators', 'model__subsample', 'prep__non_numeric__encoder__transformer', 'prep__numeric__imputer__transformer', 'prep__numeric__scaler__transformer'])

In [96]:
# Parameter names recorded for trial 11; they differ from trial 0 when the trial belongs to another model.
bayes_search.cv_results_['params'][11].keys()

odict_keys(['model', 'model__C', 'prep__non_numeric__encoder__transformer', 'prep__numeric__imputer__transformer', 'prep__numeric__scaler__transformer'])

In [82]:
# Results as a DataFrame in trial order, plus the list of its column names.
labeled_dataframe = results.to_dataframe(sort_by_score=False)  # leave original trial order
columns = labeled_dataframe.columns.to_list()  # cache columns to move Iteration column to front
columns

['roc_auc Mean',
 'roc_auc 95CI.LO',
 'roc_auc 95CI.HI',
 'model',
 'model__colsample_bytree',
 'model__learning_rate',
 'model__max_depth',
 'model__n_estimators',
 'model__subsample',
 'prep__non_numeric__encoder__transformer',
 'prep__numeric__scaler__transformer',
 'model__C']

In [90]:
# Parameter names as stored on the results object (before any display-name mapping).
results.parameter_names_original

['model',
 'model__colsample_bytree',
 'model__learning_rate',
 'model__max_depth',
 'model__n_estimators',
 'model__subsample',
 'prep__non_numeric__encoder__transformer',
 'prep__numeric__imputer__transformer',
 'prep__numeric__scaler__transformer']

In [89]:
def create_hyper_param_labels(trial) -> list:
    """Build one "name: value" string per hyper-parameter of `trial`.

    Parameters are taken in the order of `results.parameter_names_original`. The display name
    comes from `results.parameter_names_mapping` when that mapping is set and contains the
    parameter; otherwise the original parameter name is used.
    """
    labels = []
    for original_name in results.parameter_names_original:
        name_mapping = results.parameter_names_mapping
        if name_mapping and original_name in name_mapping:
            display_name = name_mapping[original_name]
        else:
            display_name = original_name
        labels.append(f"{display_name}: {trial[original_name]}")
    return labels

def create_trial_label(trial) -> str:
        return f"{{{hstring.collapse(create_hyper_param_labels(trial), separate=', ')}}}"

create_hyper_param_labels(results.parameter_trials[0])

["model: XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,\n              colsample_bynode=None, colsample_bytree=None,\n              enable_categorical=False, eval_metric='logloss', gamma=None,\n              gpu_id=None, importance_type=None, interaction_constraints=None,\n              learning_rate=None, max_delta_step=None, max_depth=None,\n              min_child_weight=None, missing=nan, monotone_constraints=None,\n              n_estimators=100, n_jobs=None, num_parallel_tree=None,\n              predictor=None, random_state=42, reg_alpha=None, reg_lambda=None,\n              scale_pos_weight=None, subsample=None, tree_method=None,\n              use_label_encoder=False, validate_parameters=None,\n              verbosity=None)",
 'model__colsample_bytree: 0.8844821246070537',
 'model__learning_rate: 0.09798893186641074',
 'model__max_depth: 10',
 'model__n_estimators: 1735',
 'model__subsample: 0.15608164675966435',
 'prep__non_numeric__encoder__transformer:

In [84]:
results.parameter_trials[0]

{'model': "XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,\n              colsample_bynode=None, colsample_bytree=None,\n              enable_categorical=False, eval_metric='logloss', gamma=None,\n              gpu_id=None, importance_type=None, interaction_constraints=None,\n              learning_rate=None, max_delta_step=None, max_depth=None,\n              min_child_weight=None, missing=nan, monotone_constraints=None,\n              n_estimators=100, n_jobs=None, num_parallel_tree=None,\n              predictor=None, random_state=42, reg_alpha=None, reg_lambda=None,\n              scale_pos_weight=None, subsample=None, tree_method=None,\n              use_label_encoder=False, validate_parameters=None,\n              verbosity=None)",
 'model__colsample_bytree': 0.8844821246070537,
 'model__learning_rate': 0.09798893186641074,
 'model__max_depth': 10,
 'model__n_estimators': 1735,
 'model__subsample': 0.15608164675966435,
 'prep__non_numeric__encoder__transforme

In [None]:
# Reformat each trial label for display: the opening brace and every ", " separator become
# an HTML line break (`<br>`) and the closing brace is removed. Labels are requested in the
# original trial order (not best-to-worst).
[x.replace('{', '<br>').replace(', ', '<br>').replace('}', '')
                              for x in results.trial_labels(order_from_best_to_worst=False)]

In [79]:
# Step-by-step rebuild of the labeled frame, run against the notebook's `results` object.
# The cell previously referred to `self`, which is not defined by any visible cell, so it
# could only run on leftover kernel state and not after Restart & Run All.
labeled_dataframe = results.to_dataframe(sort_by_score=False)  # leave original trial order
columns = labeled_dataframe.columns.to_list()  # cache columns to move Trial Index column to front
labeled_dataframe['Trial Index'] = np.arange(1, results.number_of_trials + 1)
# explicit copy so the 'label' column below is added to an independent frame, not a slice
labeled_dataframe = labeled_dataframe[['Trial Index'] + columns].copy()
# create the labels that will be used in the plotly hover text
labeled_dataframe['label'] = [
    x.replace('{', '<br>').replace(', ', '<br>').replace('}', '')
    for x in results.trial_labels(order_from_best_to_worst=False)
]

KeyError: 'model__colsample_bytree'

In [78]:
results.plot_performance_across_trials().show()

KeyError: 'model__colsample_bytree'

In [None]:
results.plot_performance_across_trials(size='learning_rate', color='max_depth').show()

In [None]:
results.plot_performance_across_trials(size='learning_rate', color='encoder').show()

---

## Variable Performance Over Time

In [None]:
results.plot_parameter_values_across_trials().show()

---

## Scatter Matrix

In [None]:
results.plot_scatter_matrix(height=1000, width=1000 * hlp.plot.GOLDEN_RATIO).show()

---

## Variable Performance - Numeric

In [None]:
results.plot_performance_numeric_params(height=800)

In [None]:
results.plot_parallel_coordinates().show()

---

## Variable Performance - Non-Numeric

In [None]:
results.plot_performance_non_numeric_params()

---

## Individual Variable Performance

In [None]:
results.plot_score_vs_parameter(
    parameter='learning_rate',
    size='colsample_bytree',
    color='scaler'
)

---

In [None]:
results.plot_parameter_vs_parameter(parameter_x='colsample_bytree',
                                   parameter_y='learning_rate',
                                   size='max_depth'
                                  )

In [None]:
results.plot_parameter_vs_parameter(parameter_x='colsample_bytree',
                                   parameter_y='learning_rate',
                                   size='imputer')

# Regression on `roc_auc Mean`

In [None]:
score_variable = results.primary_score_name + ' Mean'

In [None]:
# Keep only the score column and the hyper-parameter columns; every other column is dropped.
retained_columns = [score_variable] + results.parameter_names
score_dataframe = results.to_dataframe()
unwanted_columns = [name for name in score_dataframe.columns if name not in retained_columns]
score_dataframe = score_dataframe.drop(columns=unwanted_columns)
score_dataframe.head()

In [None]:
# Map every column name to a cleaned version: spaces become underscores and any
# character that is neither alphanumeric nor an underscore is removed.
cleaned_column_names = {
    original: ''.join(
        char for char in original.replace(' ', '_')
        if char == '_' or char.isalnum()
    )
    for original in score_dataframe.columns.tolist()
}
cleaned_column_names

In [None]:
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)

In [None]:
import statsmodels.formula.api as smf

# Regress the score column on every other column of `score_dataframe`.
y_column = 'roc_auc_Mean'
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
X_columns = hlp.string.collapse(X_columns, separate=" + ", surround="")

formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data=score_dataframe)
# The fit gets its own name: assigning it to `results` would overwrite the search-results
# object that the plotting cells and the `score_variable` / `score_dataframe` cells read,
# so those cells would fail when re-run after this one.
ols_results = model.fit()
print(ols_results.summary())

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd

# Standardize the numeric columns of `score_dataframe` and pass the non-numeric columns
# through unchanged.
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)

print(numeric_columns)
print(non_numeric_columns)

numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])

transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])

# The transformer emits its outputs in the order the transformers are listed above,
# hence the column order below. The original row labels are kept so the transformed
# frame stays aligned with `score_dataframe`.
transformed_values = transformations_pipeline.fit_transform(score_dataframe)
score_dataframe_transformed = pd.DataFrame(transformed_values,
                                           columns=numeric_columns + non_numeric_columns,
                                           index=score_dataframe.index)
score_dataframe_transformed.head()

In [None]:
# Cast every numeric column back to float (presumably the transformed frame holds them as
# `object` because numeric and passthrough columns were combined into one array).
# Using `numeric_columns` instead of a hard-coded list keeps this correct when the search
# space contains numeric hyper-parameters of other models.
score_dataframe_transformed[numeric_columns] = (
    score_dataframe_transformed[numeric_columns].astype('float')
)

In [None]:
# Fit the same formula again, this time on the transformed (scaled) frame.
print(formula)
model = smf.ols(formula=formula,
                data = score_dataframe_transformed)
# NOTE(review): this rebinds `results`, the name the earlier cells use for the search-results
# object (e.g. `results.plot_...`, `results.primary_score_name`); those cells cannot be re-run
# after this one. The following cell reads `results.params` / `results.pvalues` from this fit,
# so any rename has to be made in both cells together.
results = model.fit()
print(results.summary())

In [None]:
# Significance level used to flag a coefficient as statistically significant.
SIGNIFICANCE_LEVEL = 0.05

# One row per regression term (intercept excluded) with its coefficient and p-value.
# Built as a single chain so 'Stat Sig' is added through `.assign` on a new frame rather
# than assigned onto the filtered result of `.query`, which raises SettingWithCopyWarning.
coefficients = (
    pd.DataFrame({
        'feature': results.params.index,
        'coefficient': results.params,
        'p_value': results.pvalues,
    })
    .query("feature != 'Intercept'")
    .assign(**{'Stat Sig': lambda frame: frame['p_value'] <= SIGNIFICANCE_LEVEL})
)
coefficients

In [None]:
score_variable

In [None]:
# Horizontal bar chart of the regression coefficients. Rows are reordered by ascending
# absolute coefficient value, and bars are colored by the 'Stat Sig' flag.
px.bar(
    data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
    y='feature',
    x='coefficient',
    color='Stat Sig',
    title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)

# Feature Importance

https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of each input column, computed on the training data with the
# best estimator found by the search.
estimator = bayes_search.best_estimator_

start_time = time.time()
# No `scoring` argument is passed here; 10 shuffles per feature, fixed seed for repeatability.
result = permutation_importance(
    estimator, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

# Label the mean importances with the column names, then order from most to least important.
# NOTE: `result.importances_std` is NOT reordered by this sort; it stays in column order.
feature_names = X_train.columns.to_list()
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)

In [None]:
import matplotlib.pyplot as plt

# `forest_importances` was sorted after it was built, while `result.importances_std` is still
# in the original column order. Label the std values with the feature names and reorder them
# to match, otherwise each error bar is drawn on the wrong feature's bar.
importances_std = pd.Series(result.importances_std, index=feature_names)
importances_std = importances_std.reindex(forest_importances.index)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=importances_std.to_numpy(), ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()

In [None]:
# Attach the target to a copy of the features and compute its mean per `foreign_worker` value.
temp = X_train.copy()
temp['default'] = y_train
# the string alias 'mean' replaces `np.mean`: passing NumPy callables to `agg` is deprecated
# in recent pandas and emits a FutureWarning; the result is the same
temp.groupby('foreign_worker').agg({'default': 'mean'})

In [None]:
# Box plot of `age` for each value of the `default` column that was added to `temp`.
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
fig.show()

NOTE: foreign worker seems like it should be important but is ranked last in feature importance.