This files contains an example of tuning a Random Forest model with BayesSearchCV

It saves the BayesSearchCV object to the `cross-validation.pkl`

In [1]:
import pickle

import helpsk as hlp
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

import plotly.io as pio
pio.renderers.default='notebook'

# Load Data

In [2]:
with open('../X_train.pkl', 'rb') as handle:
    X_train = pickle.load(handle)

with open('../y_train.pkl', 'rb') as handle:
    y_train = pickle.load(handle)

In [3]:
hlp.pandas.numeric_summary(X_train, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,760,40,5.0%,0,0.0%,21.0,11.7,0.6,1.0,0.6,4.0,9.0,12.0,18.0,24.0,36.0,60.0
credit_amount,800,0,0.0%,38,5.0%,3203.9,2932.3,0.9,1.9,3.9,0.0,753.9,1300.8,2236.5,3951.5,7394.6,18424.0
installment_commitment,800,0,0.0%,0,0.0%,3.0,1.1,0.4,-0.5,-1.2,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,800,0,0.0%,0,0.0%,2.9,1.1,0.4,-0.3,-1.4,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,800,0,0.0%,0,0.0%,35.6,11.4,0.3,1.0,0.7,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,800,0,0.0%,0,0.0%,1.4,0.6,0.4,1.3,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,800,0,0.0%,0,0.0%,1.1,0.3,0.3,2.0,2.1,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [4]:
hlp.pandas.non_numeric_summary(X_train, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
checking_status,763,37,4.6%,no checking,4,0.5%
credit_history,800,0,0.0%,existing paid,5,0.6%
purpose,800,0,0.0%,radio/tv,10,1.2%
savings_status,800,0,0.0%,<100,5,0.6%
employment,800,0,0.0%,1<=X<4,5,0.6%
personal_status,800,0,0.0%,male single,4,0.5%
other_parties,800,0,0.0%,none,3,0.4%
property_magnitude,800,0,0.0%,car,4,0.5%
other_payment_plans,800,0,0.0%,none,3,0.4%
housing,800,0,0.0%,own,3,0.4%


In [5]:
y_train[0:10]

array([0, 0, 1, 0, 1, 0, 1, 0, 0, 1])

In [6]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([241, 559]))

In [7]:
np.unique(y_train, return_counts=True)[1] / np.sum(np.unique(y_train, return_counts=True)[1])

array([0.30125, 0.69875])

# Transformation Pipeline

In [8]:
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [9]:
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [10]:
numeric_pipeline = Pipeline([
    #tune whether or not we want to impute or simply remove rows with missing values
    ('impute', hlp.sklearn_pipeline.TransformerChooser()),
    # this is here so that we can select between MinMax and Scaler
    # if this pipeline is ran in a context outside of tuning, no transformation will take place
    ('scaling', hlp.sklearn_pipeline.TransformerChooser()),
])

In [11]:
non_numeric_pipeline = Pipeline([
    ('encoder', hlp.sklearn_pipeline.TransformerChooser()),
])

In [12]:
from sklearn.compose import ColumnTransformer
transformations_pipeline = ColumnTransformer([
    ('numeric', numeric_pipeline, numeric_columns),
    ('non_numeric', non_numeric_pipeline, non_numeric_columns)
])

# Model

In [13]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(random_state=42)

In [14]:
full_pipeline = Pipeline([
    ('prep', transformations_pipeline),
    ('model', random_forest_model)
])

In [15]:
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps

{'prep': ColumnTransformer(transformers=[('numeric',
                                  Pipeline(steps=[('impute',
                                                   TransformerChooser()),
                                                  ('scaling',
                                                   TransformerChooser())]),
                                  ['duration', 'credit_amount',
                                   'installment_commitment', 'residence_since',
                                   'age', 'existing_credits',
                                   'num_dependents']),
                                 ('non_numeric',
                                  Pipeline(steps=[('encoder',
                                                   TransformerChooser())]),
                                  ['checking_status', 'credit_history',
                                   'purpose', 'savings_status', 'employment',
                                   'personal_status', 'other_parties',
     

# skopt.BayesSearchCV

[https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html](https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html)

In [16]:
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score  # , roc_auc_score
from sklearn.metrics import SCORERS

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
# https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve
scores = {
    # https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
    'ROC/AUC':  SCORERS['roc_auc'],
    'F1': make_scorer(f1_score, greater_is_better=True),
    'Pos. Pred. Val': make_scorer(precision_score, greater_is_better=True),
    'True Pos. Rate': make_scorer(recall_score, greater_is_better=True),
}

In [17]:
num_folds = 5
num_repeats = 2

In [18]:
# pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold

In [19]:
search_space = {
    'prep__numeric__impute__transformer': Categorical([SimpleImputer(strategy='mean')]),
    'prep__numeric__scaling__transformer': Categorical([MinMaxScaler(), StandardScaler()]),
    'prep__non_numeric__encoder__transformer': Categorical([
        OneHotEncoder(),
        hlp.sklearn_pipeline.CustomOrdinalEncoder()
    ]),
    'model__n_estimators': Integer(50, 5000),
    # If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.
    'model__max_features':  Real(.01, .99),
}

bayes_search = BayesSearchCV(
    estimator=full_pipeline,
    search_spaces=search_space,
    n_iter=30,
    cv=RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats),
    scoring='roc_auc',
    #return_train_score=True,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
bayes_search.fit(X_train, y_train)

del search_space

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac

In [20]:
print(bayes_search.cv_results_)

{'mean_fit_time': array([ 6.97221973, 12.69967868,  9.66322381,  2.02044966,  4.75362718,
       12.58117998,  9.53103747, 10.5482764 ,  8.10936279,  4.93289454,
        0.66682076,  1.00640917,  4.7512696 ,  0.2489856 ,  6.22093594,
       10.2782078 ,  3.86154358,  7.83754926,  0.85682049,  6.90920057,
        5.36261246,  3.56159992,  4.96366568,  2.43259306,  2.77215679,
        4.23899202,  0.31469512,  3.7309411 ,  3.62136981,  0.1612534 ]), 'std_fit_time': array([1.34584963, 1.76542077, 1.46287134, 0.29817295, 0.71954159,
       1.77158347, 1.29365293, 1.55321825, 1.10456069, 0.81359502,
       0.10899923, 0.15624438, 0.77338842, 0.04491065, 1.06082861,
       1.48985042, 0.73429287, 1.22952107, 0.16616703, 1.09705468,
       0.94032927, 0.59055745, 0.75528309, 0.38276541, 0.48115369,
       0.81171324, 0.05137436, 0.63859133, 0.63309367, 0.03048789]), 'mean_score_time': array([0.52486951, 0.37179453, 0.41650198, 0.08896842, 0.19460638,
       0.38579776, 0.33186147, 0.38362889,

# Results

In [21]:
new_param_column_names = {'model__max_features': 'max_features',
                          'model__n_estimators': 'n_estimators',
                          'prep__non_numeric__encoder__transformer': 'encoder',
                          'prep__numeric__impute__transformer': 'imputer',
                          'prep__numeric__scaling__transformer': 'scaler'}

In [22]:
parser = hlp.sklearn_eval.SearchCVParser(searcher=bayes_search,
                                         higher_score_is_better = True,
                                         parameter_name_mappings = new_param_column_names)

In [23]:
parser.to_yaml_file(yaml_file_name = 'Run 1.yaml')

In [24]:
#parser = hlp.sklearn_eval.SearchCVParser.from_yaml_file(yaml_file_name = 'Run 1.yaml')

## Best Scores/Params

In [25]:
parser.to_formatted_dataframe()

roc_auc Mean,roc_auc 95CI.LO,roc_auc 95CI.HI,max_features,n_estimators,encoder,scaler
0.773,0.75,0.796,0.01,2897,CustomOrdinalEncoder(),StandardScaler()
0.771,0.748,0.793,0.014,4098,CustomOrdinalEncoder(),StandardScaler()
0.769,0.748,0.791,0.01,4328,CustomOrdinalEncoder(),StandardScaler()
0.769,0.734,0.804,0.011,3108,CustomOrdinalEncoder(),StandardScaler()
0.769,0.731,0.806,0.011,2164,CustomOrdinalEncoder(),StandardScaler()
0.765,0.734,0.797,0.186,2512,CustomOrdinalEncoder(),StandardScaler()
0.765,0.733,0.797,0.01,3042,CustomOrdinalEncoder(),StandardScaler()
0.765,0.734,0.795,0.01,3914,OneHotEncoder(),StandardScaler()
0.764,0.745,0.784,0.01,3887,CustomOrdinalEncoder(),MinMaxScaler()
0.763,0.74,0.785,0.149,768,CustomOrdinalEncoder(),MinMaxScaler()


In [26]:
# gives the score rank for each index
# e.g. array([4, 2, 1, 3)
# the 1st iteration (i.e. set of params) was the worst
# the 3rd iteration was the best.
parser.primary_score_iteration_ranking

array([15, 21, 23, 28, 30, 25, 22, 26, 29,  2, 11, 10,  9, 14, 13, 24, 12,
       17, 18, 20,  3,  6,  8, 19,  5,  4, 27,  7,  1, 16])

In [27]:
# gives the 
# e.g. parser.primary_score_iteration_ranking of array([4, 2, 1, 3)
# would return [2, 1, 4, 0] because index 2 (i.e. 3rd iteration) was the best, so it is the first index;
# and index 0 (i.e. first iteration) was the was
parser.primary_score_best_indexes

array([28,  9, 20, 25, 24, 21, 27, 22, 12, 11, 10, 16, 14, 13,  0, 29, 17,
       18, 23, 19,  1,  6,  2, 15,  5,  7, 26,  3,  8,  4])

In [28]:
parser.best_primary_score

0.7727484873094479

In [29]:
parser.best_primary_score_params()

{'max_features': 0.01,
 'n_estimators': 2897,
 'encoder': 'CustomOrdinalEncoder()',
 'imputer': 'SimpleImputer()',
 'scaler': 'StandardScaler()'}

In [30]:
# create the labels that will be used in the plotly hover text
score_df = parser.to_dataframe(sort_by_score=False)
score_df['labels'] = [x.replace('{', '<br>').replace(', ', '<br>').replace('}', '')
                      for x in parser.iteration_labels(order_from_best_to_worst=False)]

In [31]:
score_df.head(1)

Unnamed: 0,roc_auc Mean,roc_auc 95CI.LO,roc_auc 95CI.HI,max_features,n_estimators,encoder,imputer,scaler,labels
0,0.759555,0.744368,0.774743,0.411902,3652,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler(),<br>max_features: 0.4119018796762476<br>n_esti...


## BayesSearchCV Performance Over Time

In [60]:
score_variable = parser.primary_score_name + " Mean"
size_variable = 'n_estimators'
color_variable = 'max_features'
fig = px.scatter(
    data_frame=score_df,
    x=np.arange(0, parser.number_of_iterations),
    y=score_variable,
    size=size_variable,
    color=color_variable,
    trendline='lowess',
    labels={
        'x': 'Iteration',
        score_variable: f"Average Cross Validation Score ({parser.primary_score_name})",
    },
    title=f"Bayesian Performance Over Time<br>" \
          f"<sup>Size of point corresponds to '{size_variable}'</sup>",
    custom_data=['labels'],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
fig.update_traces(
    hovertemplate="<br>".join([
        "Iteration: %{x}",
        "roc_auc Mean: %{y}",
        "<br>Parameters: %{customdata[0]}",
    ])
)
fig.show()

del size_variable

---

In [57]:
y_variable = 'max_features'
color_variable = score_variable
size_variable = 'n_estimators'
fig = px.scatter(
    data_frame=score_df,
    x=np.arange(0, parser.number_of_iterations),
    y=y_variable,
    size=size_variable,
    color='roc_auc Mean',
    trendline='lowess',
    labels={
        'x': 'Iteration',
    },
    title=f"<b>{y_variable}</b> Over Time<br><sup>Size changes depending on '{size_variable}'</sup>",
    custom_data=['labels'],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
fig.update_traces(
    hovertemplate="<br>".join([
        "Iteration: %{x}",
        "roc_auc Mean: %{y}",
        "<br>Parameters: %{customdata[0]}",
    ])
)
fig.show()

del y_variable
del color_variable
del size_variable

---

In [40]:
#scatter_trend_line = 'ols'
scatter_trend_line = 'lowess'
#feature_color = 'encoder'
feature_color = 'scaler'
fig = px.scatter(
    data_frame=score_df,
    x='max_features',
    y=parser.primary_score_name + " Mean",
    size='n_estimators',
    color=feature_color,
    title='max_features',
    trendline='lowess',
#    facet_col=[]
    #labels={'x': 'Iteration'},
    custom_data=['labels'],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)

fig.show()

In [34]:
#scatter_trend_line = 'ols'
scatter_trend_line = 'lowess'
#feature_color = 'encoder'
feature_color = 'scaler'
fig = px.scatter(
    data_frame=score_df,
    x='max_features',
    y=parser.primary_score_name + " Mean",
    size=,
    color=None,
    title='max_features',
    trendline=scatter_trend_line,
    #labels={'x': 'Iteration'},
    custom_data=['labels'],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)

fig.update_traces(
    hovertemplate="<br>".join([
        "Iteration: %{x}",
        "roc_auc Mean: %{y}",
        "<br>Parameters: %{customdata[0]}",
    ])
)

fig.show()

SyntaxError: invalid syntax (2595807486.py, line 9)

---

# Regression on `roc_auc Mean`

In [None]:
score_name = parser.primary_score_name + " Mean"
score_name

In [None]:
parser.parameter_names

In [None]:
parser.to_formatted_dataframe()

In [None]:
score_dataframe = parser.to_dataframe()
score_dataframe = score_dataframe.drop(columns=[x for x in score_dataframe.columns
                              if x not in [score_name] + parser.parameter_names])
score_dataframe.head()

In [None]:
cleaned_column_names = [''.join(e for e in x.replace(' ', '_') if e == '_' or e.isalnum()) for x in score_dataframe.columns.tolist()]
cleaned_column_names = dict(zip(score_dataframe.columns.tolist(), cleaned_column_names))
cleaned_column_names

In [None]:
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)

In [None]:
import statsmodels.formula.api as smf

y_column = 'roc_auc_Mean'
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
X_columns = hlp.string.collapse(X_columns, separate=" + ", surround="")

formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data = score_dataframe)
results = model.fit()
print(results.summary())

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd

scaler = StandardScaler()
#scaler.fit_transform(bayes_search_df)

numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)

print(numeric_columns)
print(non_numeric_columns)

numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])

transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])

score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
score_dataframe_transformed = pd.DataFrame(score_dataframe_transformed,
                                           columns= numeric_columns + non_numeric_columns)
score_dataframe_transformed

In [None]:
score_dataframe_transformed['roc_auc_Mean'] = score_dataframe_transformed['roc_auc_Mean'].astype('float')
score_dataframe_transformed['max_features'] = score_dataframe_transformed['max_features'].astype('float')
score_dataframe_transformed['n_estimators'] = score_dataframe_transformed['n_estimators'].astype('float')

In [None]:
print(formula)
model = smf.ols(formula=formula,
                data = score_dataframe_transformed)
results = model.fit()
print(results.summary())

In [None]:
coefficients = results.params[results.params.index != 'Intercept']
coefficients = coefficients.reindex(coefficients.abs().sort_values(ascending=False).index)

In [None]:
coefficients

In [None]:
coefficients = pd.DataFrame({'regression_coefficient': coefficients}).reset_index().rename(columns={'index': 'feature'})
coefficients

In [None]:
px.bar(data_frame=coefficients.reindex(coefficients['regression_coefficient'].abs().sort_values(ascending=True).index),
      y='feature',
      x='regression_coefficient')