This files contains an example of tuning a Random Forest model with BayesSearchCV

It saves the BayesSearchCV object to the `cross-validation.pkl`

In [1]:
import pickle
import helpsk as hlp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

%matplotlib inline

# Load Data

In [2]:
with open('../X_train.pkl', 'rb') as handle:
    X_train = pickle.load(handle)

with open('../y_train.pkl', 'rb') as handle:
    y_train = pickle.load(handle)

In [3]:
hlp.pandas.numeric_summary(X_train, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,760,40,5.0%,0,0.0%,21.0,11.7,0.6,1.0,0.6,4.0,9.0,12.0,18.0,24.0,36.0,60.0
credit_amount,800,0,0.0%,38,5.0%,3203.9,2932.3,0.9,1.9,3.9,0.0,753.9,1300.8,2236.5,3951.5,7394.6,18424.0
installment_commitment,800,0,0.0%,0,0.0%,3.0,1.1,0.4,-0.5,-1.2,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,800,0,0.0%,0,0.0%,2.9,1.1,0.4,-0.3,-1.4,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,800,0,0.0%,0,0.0%,35.6,11.4,0.3,1.0,0.7,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,800,0,0.0%,0,0.0%,1.4,0.6,0.4,1.3,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,800,0,0.0%,0,0.0%,1.1,0.3,0.3,2.0,2.1,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [4]:
hlp.pandas.non_numeric_summary(X_train, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
checking_status,763,37,4.6%,no checking,4,0.5%
credit_history,800,0,0.0%,existing paid,5,0.6%
purpose,800,0,0.0%,radio/tv,10,1.2%
savings_status,800,0,0.0%,<100,5,0.6%
employment,800,0,0.0%,1<=X<4,5,0.6%
personal_status,800,0,0.0%,male single,4,0.5%
other_parties,800,0,0.0%,none,3,0.4%
property_magnitude,800,0,0.0%,car,4,0.5%
other_payment_plans,800,0,0.0%,none,3,0.4%
housing,800,0,0.0%,own,3,0.4%


In [5]:
y_train[0:10]

array([0, 0, 1, 0, 1, 0, 1, 0, 0, 1])

In [6]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([241, 559]))

In [7]:
np.unique(y_train, return_counts=True)[1] / np.sum(np.unique(y_train, return_counts=True)[1])

array([0.30125, 0.69875])

# Transformation Pipeline

In [8]:
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [9]:
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [10]:
numeric_pipeline = Pipeline([
    #tune whether or not we want to impute or simply remove rows with missing values
    ('imputer', hlp.sklearn_pipeline.TransformerChooser()),
    # this is here so that we can select between MinMax and Scaler
    # if this pipeline is ran in a context outside of tuning, no transformation will take place
    ('scaler', hlp.sklearn_pipeline.TransformerChooser()),
])

In [11]:
non_numeric_pipeline = Pipeline([
    ('encoder', hlp.sklearn_pipeline.TransformerChooser()),
])

In [12]:
from sklearn.compose import ColumnTransformer
transformations_pipeline = ColumnTransformer([
    ('numeric', numeric_pipeline, numeric_columns),
    ('non_numeric', non_numeric_pipeline, non_numeric_columns)
])

# Model

In [13]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(random_state=42)

In [14]:
full_pipeline = Pipeline([
    ('prep', transformations_pipeline),
    ('model', random_forest_model)
])

In [15]:
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps

{'prep': ColumnTransformer(transformers=[('numeric',
                                  Pipeline(steps=[('imputer',
                                                   TransformerChooser()),
                                                  ('scaler',
                                                   TransformerChooser())]),
                                  ['duration', 'credit_amount',
                                   'installment_commitment', 'residence_since',
                                   'age', 'existing_credits',
                                   'num_dependents']),
                                 ('non_numeric',
                                  Pipeline(steps=[('encoder',
                                                   TransformerChooser())]),
                                  ['checking_status', 'credit_history',
                                   'purpose', 'savings_status', 'employment',
                                   'personal_status', 'other_parties',
     

# GridSearchCV

In [16]:
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score  # , roc_auc_score
from sklearn.metrics import SCORERS

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
# https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve
scores = {
    # https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
    'ROC/AUC':  SCORERS['roc_auc'],
    'F1': make_scorer(f1_score, greater_is_better=True),
    'Pos. Pred. Val': make_scorer(precision_score, greater_is_better=True),
    'True Pos. Rate': make_scorer(recall_score, greater_is_better=True),
}

In [17]:
param_grad = [
    {
        'prep__numeric__imputer__transformer': [SimpleImputer(strategy='mean')],
        'prep__numeric__scaler__transformer': [MinMaxScaler(), StandardScaler()],
        'prep__non_numeric__encoder__transformer': [OneHotEncoder(),
                                                    hlp.sklearn_pipeline.CustomOrdinalEncoder()],
        'model__max_features': [2, 10, 'auto'],
        'model__n_estimators': [50, 100, 500]
    },
]

In [18]:
num_folds = 5
num_repeats = 2

In [19]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    full_pipeline,
    param_grid=param_grad,
    cv=RepeatedKFold(n_splits=num_folds,
                     n_repeats=num_repeats,
                     random_state=2),
    scoring=scores,
    refit='ROC/AUC',
    #scoring='roc_auc',
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


GridSearchCV(cv=RepeatedKFold(n_repeats=2, n_splits=5, random_state=2),
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('imputer',
                                                                                          TransformerChooser()),
                                                                                         ('scaler',
                                                                                          TransformerChooser())]),
                                                                         ['duration',
                                                                          'credit_amount',
                                                                          'installment_commitment',
                                                                          'residence_since

In [20]:
print(grid_search.cv_results_)

{'mean_fit_time': array([0.14046199, 0.12743397, 0.12822218, 0.12122686, 0.25952916,
       0.18465269, 0.21112926, 0.39394457, 1.72847202, 1.62511282,
       1.67913516, 1.87692618, 0.25920825, 0.26472392, 0.33946106,
       0.36135995, 0.45499737, 0.36260474, 0.45696957, 0.48179107,
       1.67243195, 1.0939472 , 1.28612204, 1.2672277 , 0.11389604,
       0.11791267, 0.13693194, 0.133323  , 0.21684937, 0.21404488,
       0.22742326, 0.22854972, 0.98877707, 1.02522588, 1.29601572,
       1.64507847]), 'std_fit_time': array([0.01429017, 0.02280932, 0.01401216, 0.01832419, 0.04128028,
       0.02126495, 0.01716543, 0.06960993, 0.08717147, 0.05220621,
       0.06465464, 0.14758432, 0.04208279, 0.05075626, 0.04655266,
       0.03042743, 0.04335428, 0.01279633, 0.01941942, 0.02405408,
       0.28651735, 0.02293258, 0.02786752, 0.02101591, 0.00918684,
       0.01068164, 0.01278457, 0.01017386, 0.01319073, 0.00969614,
       0.01666115, 0.01386957, 0.03606388, 0.14986186, 0.28835701,
       

In [21]:
grid_search.cv

RepeatedKFold(n_repeats=2, n_splits=5, random_state=2)

## Results

In [22]:
print(grid_search.cv.n_repeats)
print(grid_search.cv.cvargs['n_splits'])

2
5


In [23]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.771524602897316
{'model__max_features': 2, 'model__n_estimators': 500, 'prep__non_numeric__encoder__transformer': OneHotEncoder(), 'prep__numeric__imputer__transformer': SimpleImputer(), 'prep__numeric__scaler__transformer': MinMaxScaler()}


In [25]:
new_param_column_names = {'model__max_features': 'max_features',
                          'model__n_estimators': 'n_estimators',
                          'prep__non_numeric__encoder__transformer': 'encoder',
                          'prep__numeric__imputer__transformer': 'imputer',
                          'prep__numeric__scaler__transformer': 'scaler'}
parser = hlp.sklearn_eval.SearchCVParser(searcher=grid_search,
                                         higher_score_is_better = True,
                                         parameter_name_mappings = new_param_column_names)

In [26]:
parser.to_yaml_file(yaml_file_name = 'Run 2 - Random Forest - GridSearchCV.yaml')

In [27]:
parser = hlp.sklearn_eval.SearchCVParser.from_yaml_file(yaml_file_name = 'Run 2 - Random Forest - GridSearchCV.yaml')

In [28]:
parser.best_primary_score

0.771524602897316

In [29]:
parser.best_primary_score_params

{'max_features': 2,
 'n_estimators': 500,
 'encoder': 'OneHotEncoder()',
 'imputer': 'SimpleImputer()',
 'scaler': 'MinMaxScaler()'}

In [30]:
parser.to_formatted_dataframe()

ROC/AUC Mean,ROC/AUC 95CI.LO,ROC/AUC 95CI.HI,F1 Mean,F1 95CI.LO,F1 95CI.HI,Pos. Pred. Val Mean,Pos. Pred. Val 95CI.LO,Pos. Pred. Val 95CI.HI,True Pos. Rate Mean,True Pos. Rate 95CI.LO,True Pos. Rate 95CI.HI,max_features,n_estimators,encoder,scaler
0.772,0.747,0.796,0.841,0.826,0.857,0.749,0.722,0.775,0.962,0.951,0.973,2,500,OneHotEncoder(),MinMaxScaler()
0.771,0.746,0.796,0.842,0.826,0.857,0.749,0.722,0.775,0.964,0.953,0.974,2,500,OneHotEncoder(),StandardScaler()
0.77,0.745,0.796,0.836,0.824,0.849,0.757,0.732,0.782,0.936,0.916,0.957,2,500,CustomOrdinalEncoder(),MinMaxScaler()
0.77,0.744,0.795,0.837,0.824,0.849,0.758,0.734,0.782,0.935,0.915,0.956,2,500,CustomOrdinalEncoder(),StandardScaler()
0.769,0.743,0.794,0.836,0.824,0.848,0.77,0.746,0.794,0.918,0.89,0.946,auto,500,OneHotEncoder(),MinMaxScaler()
0.768,0.742,0.794,0.835,0.822,0.847,0.769,0.744,0.794,0.915,0.887,0.944,auto,500,OneHotEncoder(),StandardScaler()
0.767,0.737,0.797,0.837,0.821,0.852,0.775,0.747,0.802,0.912,0.892,0.932,10,500,OneHotEncoder(),MinMaxScaler()
0.767,0.736,0.798,0.838,0.822,0.853,0.775,0.748,0.801,0.914,0.891,0.937,10,500,OneHotEncoder(),StandardScaler()
0.767,0.748,0.785,0.834,0.821,0.847,0.759,0.733,0.785,0.928,0.911,0.945,2,100,CustomOrdinalEncoder(),MinMaxScaler()
0.766,0.747,0.784,0.83,0.817,0.843,0.757,0.731,0.782,0.921,0.904,0.938,2,100,CustomOrdinalEncoder(),StandardScaler()
