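This file contains an example of tuning a stacking classifier (Logistic Regression, Random Forest, and XGBoost base models combined by an XGBoost final estimator) with BayesSearchCV.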

In [1]:
import pickle
import time

import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

import plotly.io as pio
pio.renderers.default='notebook'

# Load Data

In [2]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code; only load files you trust.
X_train = _read_pickle('../X_train.pkl')
y_train = _read_pickle('../y_train.pkl')

In [3]:
# Per-column summary (null/zero counts, moments, percentiles) of the numeric training features.
hlp.pandas.numeric_summary(X_train, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,760,40,5.0%,0,0.0%,21.0,11.7,0.6,1.0,0.6,4.0,9.0,12.0,18.0,24.0,36.0,60.0
credit_amount,800,0,0.0%,38,5.0%,3203.9,2932.3,0.9,1.9,3.9,0.0,753.9,1300.8,2236.5,3951.5,7394.6,18424.0
installment_commitment,800,0,0.0%,0,0.0%,3.0,1.1,0.4,-0.5,-1.2,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,800,0,0.0%,0,0.0%,2.9,1.1,0.4,-0.3,-1.4,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,800,0,0.0%,0,0.0%,35.6,11.4,0.3,1.0,0.7,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,800,0,0.0%,0,0.0%,1.4,0.6,0.4,1.3,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,800,0,0.0%,0,0.0%,1.1,0.3,0.3,2.0,2.1,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [4]:
# Per-column summary (null counts, most frequent value, cardinality) of the non-numeric training features.
hlp.pandas.non_numeric_summary(X_train, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
checking_status,763,37,4.6%,no checking,4,0.5%
credit_history,800,0,0.0%,existing paid,5,0.6%
purpose,800,0,0.0%,radio/tv,10,1.2%
savings_status,800,0,0.0%,<100,5,0.6%
employment,800,0,0.0%,1<=X<4,5,0.6%
personal_status,800,0,0.0%,male single,4,0.5%
other_parties,800,0,0.0%,none,3,0.4%
property_magnitude,800,0,0.0%,car,4,0.5%
other_payment_plans,800,0,0.0%,none,3,0.4%
housing,800,0,0.0%,own,3,0.4%


In [5]:
# Peek at the first ten target labels.
y_train[:10]

array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])

In [6]:
# Distinct target labels and the number of training rows carrying each label.
np.unique(y_train, return_counts=True)

(array([0, 1]), array([559, 241]))

In [7]:
# Share of training rows in each class (counts divided by the total row count).
class_counts = np.unique(y_train, return_counts=True)[1]
class_counts / class_counts.sum()

array([0.69875, 0.30125])

# Transformation Pipeline

In [8]:
# Quick look at what OrdinalEncoder produces for two of the categorical columns
# (each category is mapped to a float code, one output column per input column).
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [9]:
# Split the feature names by dtype; these lists drive the ColumnTransformer branches below.
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
print(numeric_columns, non_numeric_columns, sep='\n')

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [17]:
# Load the saved results of the earlier Logistic Regression search and show the
# hyper-parameters of its best-scoring iteration (reused for the base model below).
file_name = '../Logistic Regression/Run 1 - Logistic Regression - BayesSearchCV.yaml'
log_parser = hlp.sklearn_eval.SearchCVParser.from_yaml_file(yaml_file_name = file_name)
log_parser.best_primary_score_params

{'C': 0.025537280346410507,
 'encoder': 'OneHotEncoder()',
 'imputer': 'SimpleImputer()',
 'scaler': 'StandardScaler()'}

In [18]:
# Load the saved results of the earlier Random Forest search and show the
# hyper-parameters of its best-scoring iteration (reused for the base model below).
file_name = '../Random Forest/Run 1 - Random Forest - BayesSearchCV.yaml'
rf_parser = hlp.sklearn_eval.SearchCVParser.from_yaml_file(yaml_file_name = file_name)
rf_parser.best_primary_score_params

{'max_features': 0.11861727550540657,
 'n_estimators': 4994,
 'encoder': 'OneHotEncoder()',
 'imputer': 'SimpleImputer()',
 'scaler': 'MinMaxScaler()'}

In [19]:
# Load the saved results of the earlier XGBoost search and show the
# hyper-parameters of its best-scoring iteration (reused for the base model below).
file_name = '../XGBoost/Run 1 - XGBoost - BayesSearchCV.yaml'
xgb_parser = hlp.sklearn_eval.SearchCVParser.from_yaml_file(yaml_file_name = file_name)
xgb_parser.best_primary_score_params

{'colsample_bytree': 0.12514061156354356,
 'learning_rate': 0.01,
 'max_depth': 3,
 'n_estimators': 778,
 'subsample': 0.5788129146377768,
 'encoder': 'CustomOrdinalEncoder()',
 'imputer': 'SimpleImputer()',
 'scaler': 'MinMaxScaler()'}

In [20]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


def _make_preprocessor(scaler, encoder):
    """Build the column-wise preprocessing step shared by all base models.

    Numeric columns are mean-imputed and then passed through `scaler`;
    non-numeric columns are passed through `encoder`.

    Parameters
    ----------
    scaler : transformer applied to the numeric columns after imputation.
    encoder : transformer applied to the non-numeric columns.

    Returns
    -------
    An unfitted ColumnTransformer over `numeric_columns` and `non_numeric_columns`.
    """
    return ColumnTransformer([
        ('numeric', make_pipeline(SimpleImputer(strategy='mean'), scaler), numeric_columns),
        ('non_numeric', make_pipeline(encoder), non_numeric_columns),
    ])


# Best hyper-parameters found by each model's own earlier tuning run.
log_params = log_parser.best_primary_score_params
rf_params = rf_parser.best_primary_score_params
xgb_params = xgb_parser.best_primary_score_params

# The transformers mirror the 'scaler'/'encoder' choices reported by each run.
# handle_unknown='ignore': the stacking model is refit on many CV folds, and a rare
# category missing from a training fold would otherwise make OneHotEncoder raise
# when it shows up in the held-out fold.
logistic_estimator = make_pipeline(
    _make_preprocessor(StandardScaler(), OneHotEncoder(handle_unknown='ignore')),
    LogisticRegression(
        C=log_params['C'],
        solver='lbfgs',
        max_iter=1000,
        random_state=42,
    ),
)

rf_estimator = make_pipeline(
    _make_preprocessor(MinMaxScaler(), OneHotEncoder(handle_unknown='ignore')),
    RandomForestClassifier(
        max_features=rf_params['max_features'],
        n_estimators=rf_params['n_estimators'],
        random_state=42,
    ),
)

xgb_estimator = make_pipeline(
    _make_preprocessor(MinMaxScaler(), hlp.sklearn_pipeline.CustomOrdinalEncoder()),
    XGBClassifier(
        colsample_bytree=xgb_params['colsample_bytree'],
        learning_rate=xgb_params['learning_rate'],
        max_depth=xgb_params['max_depth'],
        n_estimators=xgb_params['n_estimators'],
        subsample=xgb_params['subsample'],
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
    ),
)

# (name, estimator) pairs in the form StackingClassifier expects.
estimators = [
    ('logistic', logistic_estimator),
    ('rf', rf_estimator),
    ('xgb', xgb_estimator),
]

# Model

`XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded on Apple Silicon (ARM)`

https://github.com/dmlc/xgboost/issues/6909

```
pip install --upgrade --force-reinstall xgboost --no-binary xgboost -v
```

In [21]:
from sklearn.ensemble import StackingClassifier

# Meta-learner that combines the base models' predictions; its hyper-parameters
# are the ones tuned by the search below.
final_estimator = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

stacking_model = StackingClassifier(
    final_estimator=final_estimator,
    estimators=estimators,
)

# skopt.BayesSearchCV

[https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html](https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html)

In [22]:
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score  # , roc_auc_score
from sklearn.metrics import get_scorer

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
# https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve
scores = {
    # `sklearn.metrics.SCORERS` is deprecated and removed in newer scikit-learn releases;
    # `get_scorer` returns the same built-in scorer by name.
    # https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
    'ROC/AUC': get_scorer('roc_auc'),
    'F1': make_scorer(f1_score, greater_is_better=True),
    'Pos. Pred. Val': make_scorer(precision_score, greater_is_better=True),
    'True Pos. Rate': make_scorer(recall_score, greater_is_better=True),
}

In [23]:
# Cross-validation layout used by the search: `num_folds` splits, repeated `num_repeats` times.
num_folds = 5
num_repeats = 2

In [24]:
# pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold

https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663

> max_depth: 3–10
> n_estimators: 100 (lots of observations) to 1000 (few observations)
> learning_rate: 0.01–0.3
> colsample_bytree: 0.5–1
> subsample: 0.6–1

> Then, you can focus on optimizing max_depth and n_estimators.
You can then play along with the learning_rate, and increase it to speed up the model without decreasing the performances. If it becomes faster without losing in performances, you can increase the number of estimators to try to increase the performances.

Find tuning options with:

```
bayes_search.get_params().keys()
```

Note that the param will be e.g. `final_estimator__max_depth` even though `bayes_search.get_params().keys()` returns `estimator__final_estimator__max_depth`

In [None]:
# Only the final (meta) estimator's XGBoost hyper-parameters are tuned; the base
# models keep the parameters found in their own earlier runs.
search_space = {
    'final_estimator__max_depth': Integer(3, 10),
    'final_estimator__n_estimators':  Integer(50, 2000),
    'final_estimator__learning_rate': Real(0.01, 0.3),
    'final_estimator__colsample_bytree': Real(0.01, 1),
    'final_estimator__subsample': Real(0.5, 1),
}

bayes_search = BayesSearchCV(
    estimator=stacking_model,
    search_spaces=search_space,
    n_iter=50,
    # Seed the splitter as well: every other component is seeded with 42, and an
    # unseeded RepeatedKFold would give different folds (and scores) on every run.
    cv=RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=42),
    scoring='roc_auc',
    # return_train_score=True,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Time the full search; `elapsed_time` is reported in a later cell.
start_time = time.time()
bayes_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time

del search_space

Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [None]:
# find tuning options with:
# bayes_search.get_params().keys()

In [None]:
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")

In [None]:
# `cv_results_` is a dict of per-iteration arrays; printing it raw floods the output.
# Show the first rows as a table instead.
pd.DataFrame(bayes_search.cv_results_).head()

In [None]:
print(bayes_search.best_score_)

In [None]:
print(bayes_search.best_params_)

# Results

In [None]:
# Map each tuned `final_estimator__<param>` key to the bare parameter name so the
# result tables and plots use short column names.
new_param_column_names = {
    f'final_estimator__{param_name}': param_name
    for param_name in (
        'max_depth',
        'n_estimators',
        'learning_rate',
        'colsample_bytree',
        'subsample',
    )
}

In [None]:
# Wrap the fitted search in helpsk's parser; the mapping shortens the
# `final_estimator__*` parameter names used in the search space.
parser = hlp.sklearn_eval.SearchCVParser(searcher=bayes_search,
                                         higher_score_is_better = True,
                                         parameter_name_mappings = new_param_column_names)

In [None]:
# Persist the parsed search results to YAML.
parser.to_yaml_file(yaml_file_name = 'Run 1 - Stacking - BayesSearchCV.yaml')

In [None]:
# Rebuild the parser from the saved YAML; the results cells below work from this saved copy.
parser = hlp.sklearn_eval.SearchCVParser.from_yaml_file(yaml_file_name = 'Run 1 - Stacking - BayesSearchCV.yaml')

## Timings

In [None]:
parser.fit_time_averages

## Best Scores/Params

In [None]:
parser.best_primary_score

In [None]:
parser.best_primary_score_params

In [None]:
parser.to_formatted_dataframe()

In [None]:
# gives the score rank for each index
# e.g. array([4, 2, 1, 3])
# the 1st iteration (i.e. set of params) was the worst
# the 3rd iteration was the best.
parser.primary_score_iteration_ranking

In [None]:
# gives the iteration indexes ordered from best score to worst score
# e.g. parser.primary_score_iteration_ranking of array([4, 2, 1, 3])
# would return [2, 1, 3, 0] because index 2 (i.e. 3rd iteration) was the best, so it is the first index;
# and index 0 (i.e. first iteration) was the worst, so it is the last index
parser.primary_score_best_indexes

In [None]:
def _to_hover_label(label):
    """Turn a '{a: 1, b: 2}'-style label into '<br>'-separated lines for plotly hover text."""
    return label.replace('{', '<br>').replace(', ', '<br>').replace('}', '')


# Results in iteration order, with a 1-based 'Iteration' column placed first.
score_df = parser.to_dataframe(sort_by_score=False)
original_columns = score_df.columns.to_list()
score_df['Iteration'] = np.arange(1, parser.number_of_iterations + 1)
score_df = score_df[['Iteration', *original_columns]]
del original_columns

# Hover text for the plots below, one label per iteration (same order as the rows).
score_df['labels'] = [
    _to_hover_label(label)
    for label in parser.iteration_labels(order_from_best_to_worst=False)
]

# Name of the column holding the mean cross-validation score.
score_variable = parser.primary_score_name + " Mean"

In [None]:
score_df.head(1)

## BayesSearchCV Performance Over Time

In [None]:
# Mean CV score per iteration; marker size and color encode two of the tuned parameters.
size_variable = 'learning_rate'
color_variable = 'colsample_bytree'
hover_lines = [
    "Iteration: %{x}",
    score_variable + ": " + "%{y}",
    "<br>Parameters: %{customdata[0]}",
]

fig = px.scatter(
    data_frame=score_df,
    x='Iteration',
    y=score_variable,
    size=size_variable,
    color=color_variable,
    trendline='lowess',
    labels={score_variable: f"Average Cross Validation Score ({parser.primary_score_name})"},
    title=(
        f"Bayesian Performance Over Time<br>"
        f"<sup>Size of point corresponds to '{size_variable}'</sup>"
    ),
    custom_data=['labels'],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO,
)
fig.update_traces(hovertemplate="<br>".join(hover_lines))
fig.show()

del size_variable, color_variable, hover_lines

---

## Variable Performance Over Time

In [None]:
# Long format: one row per (iteration, numeric parameter) so the parameters can be faceted.
score_df_long_numeric = score_df.melt(
    id_vars=['Iteration', score_variable, 'labels'],
    value_vars=parser.numeric_parameters,
)

In [None]:
# Value of each numeric parameter per iteration (one facet per parameter),
# colored by the mean CV score.
color_variable = score_variable
fig = px.scatter(
    data_frame=score_df_long_numeric,
    x='Iteration',
    y='value',
    color=color_variable,
    facet_col='variable',
    trendline='lowess',
    labels={
        'value': 'Parameter Value',
    },
    title="Variable Performance Over Time",
    custom_data=['labels', score_variable],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
fig.update_traces(
    hovertemplate="<br>".join([
        "Iteration: %{x}",
        "Parameter Value: %{y}",
        # use the actual score column name rather than a hard-coded "roc_auc Mean"
        score_variable + ": " + "%{customdata[1]}",
        "<br>Parameters: %{customdata[0]}",
    ])
)
# each parameter has its own scale, so the facets must not share a y-axis
fig.update_yaxes(matches=None, showticklabels=True)
fig.show()

del color_variable

---

## Variable Performance - Numeric

In [None]:
# Mean CV score against the value of each numeric parameter (one facet per parameter).
color_variable = score_variable
fig = px.scatter(
    data_frame=score_df_long_numeric,
    x='value',
    y=score_variable,
    color=color_variable,
    facet_col='variable',
    facet_col_wrap=2,
    trendline='lowess',
    labels={
        'value': 'Parameter Value',
    },
    title="Variable Performance<br><sup>Numeric Parameters</sup>",
    custom_data=['labels', score_variable],
    height=1000,
    width=1000*hlp.plot.GOLDEN_RATIO
)
fig.update_traces(
    hovertemplate="<br>".join([
        "Parameter Value: %{x}",
        # use the actual score column name rather than a hard-coded "roc_auc Mean"
        score_variable + ": " + "%{customdata[1]}",
        "<br>Parameters: %{customdata[0]}",
    ])
)
# each parameter has its own range, so the facets must not share an x-axis
fig.update_xaxes(matches=None, showticklabels=True)
fig.show()

del color_variable

---

## Variable Performance - Non-Numeric

**No non-numeric Variables**

## Individual Variable Performance

---

In [None]:
# Mean CV score against a single parameter; marker size encodes a second parameter.
x_variable = 'learning_rate'
size_variable = 'colsample_bytree'
hover_lines = [
    "Parameter Value: %{x}",
    score_variable + ": " + "%{y}",
    "<br>Parameters: %{customdata[0]}",
]

fig = px.scatter(
    data_frame=score_df,
    x=x_variable,
    y=score_variable,
    size=size_variable,
    trendline='lowess',
    labels={score_variable: f"Average Cross Validation Score ({parser.primary_score_name})"},
    title=(
        f"<b>{x_variable}</b> - Performance<br>"
        f"<sup>Size of point corresponds to '{size_variable}'</sup>"
    ),
    custom_data=['labels'],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO,
)
fig.update_traces(hovertemplate="<br>".join(hover_lines))
fig.show()

del x_variable, size_variable, hover_lines

In [None]:
# Two parameters plotted against each other; color and marker size both encode the score.
x_variable = 'learning_rate'
y_variable = 'colsample_bytree'
size_variable = score_variable
color_variable = score_variable

# Rescale the score to [0, 1] before using it as the marker size.
# (MinMaxScaler is already imported at the top of the notebook.)
scaled_size = MinMaxScaler().fit_transform(score_df[[size_variable]]).ravel().tolist()

fig = px.scatter(
    data_frame=score_df,
    x=x_variable,
    y=y_variable,
    size=scaled_size,
    color=color_variable,
    trendline='lowess',
    title=f"Performance of <b>{x_variable}</b> vs <b>{y_variable}</b><br>" \
          f"<sup>color and size corresponds to `{score_variable}`</sup>",
    custom_data=['labels', score_variable],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
fig.update_traces(
    hovertemplate="<br>".join([
        x_variable + ": %{x}",
        y_variable + ": %{y}",
        score_variable + ": " + "%{customdata[1]}",
        "<br>Parameters: %{customdata[0]}",
    ])
)
fig.show()

# Clean up every temporary, including `y_variable`, which was previously left behind.
del x_variable, y_variable, size_variable, scaled_size, color_variable

---

# Regression on `roc_auc Mean`

In [None]:
score_variable

In [None]:
# Keep only the mean-score column and the tuned-parameter columns for the regression below.
keep_columns = [score_variable] + parser.parameter_names
score_dataframe = parser.to_dataframe()
unwanted_columns = [name for name in score_dataframe.columns if name not in keep_columns]
score_dataframe = score_dataframe.drop(columns=unwanted_columns)
del keep_columns, unwanted_columns
score_dataframe.head()

In [None]:
def _clean_column_name(name):
    """Replace spaces with underscores, then drop every character that is not alphanumeric or '_'."""
    return ''.join(char for char in name.replace(' ', '_') if char == '_' or char.isalnum())


# original column name -> formula-safe column name
cleaned_column_names = {
    name: _clean_column_name(name)
    for name in score_dataframe.columns.tolist()
}
cleaned_column_names

In [None]:
# Apply the cleaned names so the columns can be referenced in the regression formula below.
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)

In [None]:
import statsmodels.formula.api as smf

# Look the response column up from the score name instead of hard-coding
# 'roc_auc_Mean', so the cell keeps working if the primary score changes.
y_column = cleaned_column_names[score_variable]
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
X_columns = hlp.string.collapse(X_columns, separate=" + ", surround="")

# Regress the mean CV score on the hyper-parameter columns.
formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data = score_dataframe)
results = model.fit()
print(results.summary())

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd

# Use dedicated names here: `numeric_columns` / `non_numeric_columns` hold the
# X_train feature lists used to build the model pipelines, and overwriting them
# would break those cells if they were re-run.
score_numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
score_non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)

print(score_numeric_columns)
print(score_non_numeric_columns)

numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])

# Standardize the numeric columns; pass any non-numeric columns through unchanged.
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, score_numeric_columns),
    ('non_numeric_pipeline', 'passthrough', score_non_numeric_columns)
])

# The transformer returns a bare array whose columns follow the transformer order
# (numeric first, then non-numeric), so restore the column names in that order.
score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
score_dataframe_transformed = pd.DataFrame(score_dataframe_transformed,
                                           columns=score_numeric_columns + score_non_numeric_columns)
score_dataframe_transformed.head()

In [None]:
# Make sure every numeric column is float after the ColumnTransformer round-trip.
# The column list is derived from `score_dataframe` rather than hard-coding the
# metric and parameter names, so it follows whatever was tuned.
score_dataframe_transformed = score_dataframe_transformed.astype(
    {column_name: 'float' for column_name in hlp.pandas.get_numeric_columns(score_dataframe)}
)

In [None]:
# Refit the same formula on the standardized data so the coefficients of the
# different hyper-parameters are on a comparable scale.
print(formula)
model = smf.ols(formula=formula,
                data = score_dataframe_transformed)
results = model.fit()
print(results.summary())

In [None]:
# One row per regression term (intercept excluded), flagged as significant at the 5% level.
# Built as a single chain: assigning a new column onto the result of `.query()`
# triggers pandas' SettingWithCopyWarning.
coefficients = (
    pd.DataFrame({
        'feature': results.params.index,
        'coefficient': results.params,
        'p_value': results.pvalues,
    })
    .query("feature != 'Intercept'")
    .assign(**{'Stat Sig': lambda frame: frame['p_value'] <= 0.05})
)
coefficients

In [None]:
score_variable

In [None]:
# Order the bars by absolute coefficient size (ascending).
bar_order = coefficients['coefficient'].abs().sort_values(ascending=True).index
px.bar(
    data_frame=coefficients.reindex(bar_order),
    x='coefficient',
    y='feature',
    color='Stat Sig',
    title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO,
)

# Feature Importance

https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

In [None]:
from sklearn.inspection import permutation_importance

# Best stacking model found by the search.
estimator = bayes_search.best_estimator_

# Permutation importance of each input column, computed on the training data.
# No `scoring` is passed, so the estimator's default scorer is presumably used — TODO confirm.
start_time = time.time()
result = permutation_importance(
    estimator, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

# Mean importance per original column, sorted from most to least important.
# NOTE: `result.importances_std` keeps the original (unsorted) column order.
feature_names = X_train.columns.to_list()
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)

In [None]:
import matplotlib.pyplot as plt

# `forest_importances` is sorted, but `result.importances_std` is still in the
# original column order; reorder it to match the bars so each error bar belongs
# to the feature it is drawn on.
importances_std = (
    pd.Series(result.importances_std, index=X_train.columns.to_list())
    .reindex(forest_importances.index)
)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=importances_std.to_numpy(), ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()

In [None]:
# Training features plus the target, to inspect the default rate per `foreign_worker` value.
# `temp` is reused by the box plot in the next cell.
temp = X_train.assign(default=y_train)
# Pass the aggregation by name: handing `np.mean` to `agg` raises a FutureWarning in recent pandas.
temp.groupby('foreign_worker').agg({'default': 'mean'})

In [None]:
# Distribution of `age` for each value of the target.
fig = px.box(
    data_frame=temp,
    x='default',
    y='age',
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO,
)
fig.show()

NOTE: `foreign_worker` seems like it should be important, but it is ranked last in the permutation feature importance above.