This files contains an example of tuning a Random Forest model with BayesSearchCV

It saves the BayesSearchCV object to the `cross-validation.pkl`

In [1]:
import pickle
import helpsk as hlp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
#import seaborn as sns
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


%matplotlib inline

# Load Data

In [2]:
with open('../X_train.pkl', 'rb') as handle:
    X_train = pickle.load(handle)

with open('../y_train.pkl', 'rb') as handle:
    y_train = pickle.load(handle)

In [3]:
hlp.pandas.numeric_summary(X_train, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,760,40,5.0%,0,0.0%,21.0,11.7,0.6,1.0,0.6,4.0,9.0,12.0,18.0,24.0,36.0,60.0
credit_amount,800,0,0.0%,38,5.0%,3203.9,2932.3,0.9,1.9,3.9,0.0,753.9,1300.8,2236.5,3951.5,7394.6,18424.0
installment_commitment,800,0,0.0%,0,0.0%,3.0,1.1,0.4,-0.5,-1.2,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,800,0,0.0%,0,0.0%,2.9,1.1,0.4,-0.3,-1.4,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,800,0,0.0%,0,0.0%,35.6,11.4,0.3,1.0,0.7,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,800,0,0.0%,0,0.0%,1.4,0.6,0.4,1.3,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,800,0,0.0%,0,0.0%,1.1,0.3,0.3,2.0,2.1,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [4]:
hlp.pandas.non_numeric_summary(X_train, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
checking_status,763,37,4.6%,no checking,4,0.5%
credit_history,800,0,0.0%,existing paid,5,0.6%
purpose,800,0,0.0%,radio/tv,10,1.2%
savings_status,800,0,0.0%,<100,5,0.6%
employment,800,0,0.0%,1<=X<4,5,0.6%
personal_status,800,0,0.0%,male single,4,0.5%
other_parties,800,0,0.0%,none,3,0.4%
property_magnitude,800,0,0.0%,car,4,0.5%
other_payment_plans,800,0,0.0%,none,3,0.4%
housing,800,0,0.0%,own,3,0.4%


In [5]:
y_train[0:10]

array([0, 0, 1, 0, 1, 0, 1, 0, 0, 1])

In [6]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([241, 559]))

In [7]:
np.unique(y_train, return_counts=True)[1] / np.sum(np.unique(y_train, return_counts=True)[1])

array([0.30125, 0.69875])

# Transformation Pipeline

In [8]:
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [9]:
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [10]:
numeric_pipeline = Pipeline([
    #tune whether or not we want to impute or simply remove rows with missing values
    ('impute', hlp.sklearn_pipeline.TransformerChooser()),
    # this is here so that we can select between MinMax and Scaler
    # if this pipeline is ran in a context outside of tuning, no transformation will take place
    ('scaling', hlp.sklearn_pipeline.TransformerChooser()),
])

In [11]:
non_numeric_pipeline = Pipeline([
    ('encoder', hlp.sklearn_pipeline.TransformerChooser()),
])

In [12]:
from sklearn.compose import ColumnTransformer
transformations_pipeline = ColumnTransformer([
    ('numeric', numeric_pipeline, numeric_columns),
    ('non_numeric', non_numeric_pipeline, non_numeric_columns)
])

# Model

In [13]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(random_state=42)

In [14]:
full_pipeline = Pipeline([
    ('prep', transformations_pipeline),
    ('model', random_forest_model)
])

In [15]:
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps

{'prep': ColumnTransformer(transformers=[('numeric',
                                  Pipeline(steps=[('impute',
                                                   TransformerChooser()),
                                                  ('scaling',
                                                   TransformerChooser())]),
                                  ['duration', 'credit_amount',
                                   'installment_commitment', 'residence_since',
                                   'age', 'existing_credits',
                                   'num_dependents']),
                                 ('non_numeric',
                                  Pipeline(steps=[('encoder',
                                                   TransformerChooser())]),
                                  ['checking_status', 'credit_history',
                                   'purpose', 'savings_status', 'employment',
                                   'personal_status', 'other_parties',
     

# skopt.BayesSearchCV

[https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html](https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html)

In [16]:
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score  # , roc_auc_score
from sklearn.metrics import SCORERS

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
# https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve
scores = {
    # https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
    'ROC/AUC':  SCORERS['roc_auc'],
    'F1': make_scorer(f1_score, greater_is_better=True),
    'Pos. Pred. Val': make_scorer(precision_score, greater_is_better=True),
    'True Pos. Rate': make_scorer(recall_score, greater_is_better=True),
}

In [17]:
num_folds = 5
num_repeats = 2

In [18]:
# pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold

In [18]:
search_space = {
    'prep__numeric__impute__transformer': Categorical([SimpleImputer(strategy='mean')]),
    'prep__numeric__scaling__transformer': Categorical([MinMaxScaler(), StandardScaler()]),
    'prep__non_numeric__encoder__transformer': Categorical([
        OneHotEncoder(),
        hlp.sklearn_pipeline.CustomOrdinalEncoder()
    ]),
    'model__n_estimators': Integer(50, 5000),
    # If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.
    'model__max_features':  Real(.01, .99),
}

bayes_search = BayesSearchCV(
    estimator=full_pipeline,
    search_spaces=search_space,
    n_iter=30,
    cv=RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats),
    scoring='roc_auc',
    #return_train_score=True,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
bayes_search.fit(X_train, y_train)

del search_space

Fitting 10 folds for each of 1 candidates, totalling 10 fits


Exception in thread Thread-7:
Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/threading.py", line 973, in _bootstrap_inner
    self.run()
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 567, in run
    self.flag_executor_shutting_down()
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 756, in flag_executor_shutting_down
    self.kill_workers()
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 766, in kill_workers
    recursive_terminate(p)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/externals/loky/backend/utils.py", line 28, in recursive_terminate
    _recursive_terminate_without_psutil(proce

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 933, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 542, in wrap_future_result
    return future.result(timeout=timeout)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/concurrent/futures/_base.py", line 440, in result
    self._condition.wait(timeout)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/threading.py", line 312, in wait
    waiter.acquire()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    ex

TypeError: object of type 'NoneType' has no len()

In [28]:
print(bayes_search.cv_results_)

{'mean_fit_time': array([ 6.5865334 , 13.47546132, 10.29709206,  2.14906924,  4.97792709,
       13.40947664, 10.11771731, 11.48841   ,  8.72120063,  6.33547592,
       10.31825056,  6.09030178,  5.87965505, 14.28870032, 10.88587234,
        4.46232014,  9.93625956, 12.02282898,  7.92264071,  1.56687431,
        0.64846358,  2.99328976,  0.69492939,  8.40013292,  6.69268205,
        5.00700541,  1.06290247,  0.18697426,  8.05555727,  0.54039755]), 'std_fit_time': array([1.00305847, 2.04191455, 1.63977125, 0.33035957, 0.75385359,
       2.10628957, 1.55677016, 1.78987421, 1.35511693, 1.37328755,
       1.6329692 , 1.08359915, 1.1224078 , 2.36246211, 2.24279022,
       0.84034037, 1.48323344, 1.88737531, 1.57992727, 0.20516333,
       0.06270976, 0.79466014, 0.19335597, 1.28155405, 1.45478051,
       0.76354065, 0.18744979, 0.03613231, 1.35399453, 0.03933362]), 'mean_score_time': array([0.44569538, 0.39396822, 0.45252876, 0.09775076, 0.23296108,
       0.41823056, 0.41050224, 0.5188447 ,

## Results

In [60]:
new_param_column_names = {'model__max_features': 'max_features',
                          'model__n_estimators': 'n_estimators',
                          'prep__non_numeric__encoder__transformer': 'encoder',
                          'prep__numeric__impute__transformer': 'imputer',
                          'prep__numeric__scaling__transformer': 'scaler'}

In [61]:
parser = hlp.sklearn_eval.SearchCVParser(searcher=bayes_search,
                                         higher_score_is_better = True,
                                         parameter_name_mappings = new_param_column_names)

NameError: name 'bayes_search' is not defined

In [None]:
parser.to_yaml_file(yaml_file_name = 'Run 1 - Random Forest - BayesSearchCV.yaml')

In [21]:
parser = hlp.sklearn_eval.SearchCVParser.from_yaml_file(yaml_file_name = 'Run 1 - Random Forest - BayesSearchCV.yaml')

In [22]:
parser.to_formatted_dataframe()

roc_auc Mean,roc_auc 95CI.LO,roc_auc 95CI.HI,max_features,n_estimators,encoder,scaler
0.772,0.743,0.8,0.01,812,OneHotEncoder(),StandardScaler()
0.77,0.757,0.783,0.166,4052,CustomOrdinalEncoder(),StandardScaler()
0.77,0.748,0.791,0.014,4098,CustomOrdinalEncoder(),StandardScaler()
0.77,0.746,0.793,0.01,4370,OneHotEncoder(),StandardScaler()
0.767,0.739,0.795,0.216,509,CustomOrdinalEncoder(),MinMaxScaler()
0.766,0.734,0.798,0.01,4542,CustomOrdinalEncoder(),MinMaxScaler()
0.764,0.735,0.793,0.194,878,OneHotEncoder(),MinMaxScaler()
0.763,0.74,0.785,0.01,350,OneHotEncoder(),MinMaxScaler()
0.761,0.734,0.788,0.011,3963,CustomOrdinalEncoder(),StandardScaler()
0.76,0.726,0.794,0.211,4486,OneHotEncoder(),StandardScaler()


In [23]:
parser.number_of_splits

10

In [24]:
parser.number_of_iterations

30

In [25]:
parser.best_primary_score

0.771715122583868

In [26]:
hlp.plot.GOLDEN_RATIO

1.61803398875

In [50]:
score_df = parser.to_dataframe(sort_by_score=False)
score_df['labels'] = [x.replace('{', '<br>').replace(', ', '<br>').replace('}', '')
                      for x in parser.iteration_labels(order_from_best_to_worst=False)]

In [51]:
score_df.head()

Unnamed: 0,roc_auc Mean,roc_auc 95CI.LO,roc_auc 95CI.HI,max_features,n_estimators,encoder,imputer,scaler,labels
0,0.744822,0.715147,0.774497,0.411902,3652,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler(),<br>max_features: 0.4119018796762476<br>n_esti...
1,0.750214,0.73097,0.769457,0.830641,4422,OneHotEncoder(),SimpleImputer(),StandardScaler(),<br>max_features: 0.8306405884422187<br>n_esti...
2,0.754102,0.721462,0.786742,0.445936,4598,OneHotEncoder(),SimpleImputer(),MinMaxScaler(),<br>max_features: 0.4459358620551288<br>n_esti...
3,0.744361,0.723145,0.765578,0.806148,901,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler(),<br>max_features: 0.8061480685902161<br>n_esti...
4,0.739341,0.713878,0.764803,0.793562,2218,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler(),<br>max_features: 0.7935623727750296<br>n_esti...


In [52]:
fig = px.scatter(
    data_frame=score_df,
    x=np.arange(0, parser.number_of_iterations),
    y=parser.primary_score_name + " Mean",
    title='Average Cross-Validation Score across all iterations',
    trendline='lowess',
    labels={'x': 'Iteration'},
    custom_data=['labels'],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)

fig.update_traces(
    hovertemplate="<br>".join([
        "Iteration: %{x}",
        "roc_auc Mean: %{y}",
        "<br>Parameters: %{customdata[0]}",
    ])
)

fig.show()