This file contains an example of tuning a Random Forest model with BayesSearchCV.

It saves the BayesSearchCV object to `cross-validation.pkl`.

In [1]:
import pickle

import helpsk as hlp
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

import plotly.io as pio
# Use plotly's 'notebook' renderer as the default for every figure shown in this notebook.
pio.renderers.default='notebook'

# Load Data

In [2]:
# Load the pickled training features and labels from the parent directory.
# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
with open('../X_train.pkl', 'rb') as features_file:
    X_train = pickle.load(features_file)

with open('../y_train.pkl', 'rb') as labels_file:
    y_train = pickle.load(labels_file)

In [3]:
# Summary statistics (nulls, zeros, moments, percentiles) for the numeric feature columns.
hlp.pandas.numeric_summary(X_train, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,760,40,5.0%,0,0.0%,21.0,11.7,0.6,1.0,0.6,4.0,9.0,12.0,18.0,24.0,36.0,60.0
credit_amount,800,0,0.0%,38,5.0%,3203.9,2932.3,0.9,1.9,3.9,0.0,753.9,1300.8,2236.5,3951.5,7394.6,18424.0
installment_commitment,800,0,0.0%,0,0.0%,3.0,1.1,0.4,-0.5,-1.2,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,800,0,0.0%,0,0.0%,2.9,1.1,0.4,-0.3,-1.4,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,800,0,0.0%,0,0.0%,35.6,11.4,0.3,1.0,0.7,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,800,0,0.0%,0,0.0%,1.4,0.6,0.4,1.3,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,800,0,0.0%,0,0.0%,1.1,0.3,0.3,2.0,2.1,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [4]:
# Summary (nulls, most frequent value, cardinality) for the non-numeric feature columns.
hlp.pandas.non_numeric_summary(X_train, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
checking_status,763,37,4.6%,no checking,4,0.5%
credit_history,800,0,0.0%,existing paid,5,0.6%
purpose,800,0,0.0%,radio/tv,10,1.2%
savings_status,800,0,0.0%,<100,5,0.6%
employment,800,0,0.0%,1<=X<4,5,0.6%
personal_status,800,0,0.0%,male single,4,0.5%
other_parties,800,0,0.0%,none,3,0.4%
property_magnitude,800,0,0.0%,car,4,0.5%
other_payment_plans,800,0,0.0%,none,3,0.4%
housing,800,0,0.0%,own,3,0.4%


In [5]:
# Peek at the first ten training labels.
y_train[:10]

array([0, 0, 1, 0, 1, 0, 1, 0, 0, 1])

In [6]:
# Count how many training labels fall in each class.
np.unique(y_train, return_counts=True)

(array([0, 1]), array([241, 559]))

In [7]:
# Class proportions: per-class counts divided by the total number of labels.
# The counts always sum to the number of labels, so np.size(y_train) replaces
# the second (redundant) np.unique call.
np.unique(y_train, return_counts=True)[1] / np.size(y_train)

array([0.30125, 0.69875])

# Transformation Pipeline

In [8]:
# Exploratory check: ordinal-encode two categorical columns to inspect the numeric codes produced.
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [9]:
# Partition the feature names into numeric and non-numeric groups and list both.
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
for column_group in (numeric_columns, non_numeric_columns):
    print(column_group)

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [10]:
# Both steps are TransformerChooser placeholders; the concrete transformer for
# each one is supplied through the hyper-parameter search space.
numeric_pipeline = Pipeline(steps=[
    # placeholder so the search can tune how missing numeric values are handled
    ('impute', hlp.sklearn_pipeline.TransformerChooser()),
    # placeholder so the search can select between MinMaxScaler and StandardScaler;
    # if this pipeline is run in a context outside of tuning, no transformation will take place
    ('scaling', hlp.sklearn_pipeline.TransformerChooser()),
])

In [11]:
# Categorical branch: a single TransformerChooser placeholder whose encoder is
# selected by the hyper-parameter search.
non_numeric_pipeline = Pipeline(steps=[('encoder', hlp.sklearn_pipeline.TransformerChooser())])

In [12]:
from sklearn.compose import ColumnTransformer

# Send numeric and non-numeric columns through their respective sub-pipelines.
transformations_pipeline = ColumnTransformer(transformers=[
    ('numeric', numeric_pipeline, numeric_columns),
    ('non_numeric', non_numeric_pipeline, non_numeric_columns),
])

# Model

In [13]:
from sklearn.ensemble import RandomForestClassifier
# Fixed random_state so that the forest is built the same way on every run.
random_forest_model = RandomForestClassifier(random_state=42)

In [14]:
# Chain preprocessing and the classifier into one estimator so both are tuned together.
full_pipeline = Pipeline(steps=[
    ('prep', transformations_pipeline),
    ('model', random_forest_model),
])

In [15]:
# Show the levels of pipelines/transformers/model
# (this nesting determines the `step__substep__parameter` names used in the search space)
full_pipeline.named_steps

{'prep': ColumnTransformer(transformers=[('numeric',
                                  Pipeline(steps=[('impute',
                                                   TransformerChooser()),
                                                  ('scaling',
                                                   TransformerChooser())]),
                                  ['duration', 'credit_amount',
                                   'installment_commitment', 'residence_since',
                                   'age', 'existing_credits',
                                   'num_dependents']),
                                 ('non_numeric',
                                  Pipeline(steps=[('encoder',
                                                   TransformerChooser())]),
                                  ['checking_status', 'credit_history',
                                   'purpose', 'savings_status', 'employment',
                                   'personal_status', 'other_parties',
     

# skopt.BayesSearchCV

[https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html](https://scikit-optimize.github.io/stable/auto_examples/sklearn-gridsearchcv-replacement.html)

In [16]:
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score  # , roc_auc_score
from sklearn.metrics import get_scorer

# Scorers keyed by the display name of each metric.
# NOTE(review): the search cell passes scoring='roc_auc' directly; confirm whether
# this dictionary is still needed.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
# https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve
scores = {
    # The `SCORERS` dictionary was deprecated in scikit-learn 1.1 and removed in 1.3;
    # get_scorer('roc_auc') looks up the same built-in scorer and works on old and new versions.
    # https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
    'ROC/AUC': get_scorer('roc_auc'),
    'F1': make_scorer(f1_score, greater_is_better=True),
    'Pos. Pred. Val': make_scorer(precision_score, greater_is_better=True),
    'True Pos. Rate': make_scorer(recall_score, greater_is_better=True),
}

In [17]:
# Repeated k-fold configuration: 5 folds, repeated twice (10 fits per candidate).
num_folds, num_repeats = 5, 2

In [18]:
# pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold

In [19]:
# Hyper-parameter space explored by the Bayesian search. Keys follow the
# `step__substep__parameter` path through `full_pipeline`.
search_space = {
    'prep__numeric__impute__transformer': Categorical([SimpleImputer(strategy='mean')]),
    'prep__numeric__scaling__transformer': Categorical([MinMaxScaler(), StandardScaler()]),
    'prep__non_numeric__encoder__transformer': Categorical([
        OneHotEncoder(),
        hlp.sklearn_pipeline.CustomOrdinalEncoder()
    ]),
    'model__n_estimators': Integer(50, 5000),
    # If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.
    'model__max_features':  Real(.01, .99),
}

bayes_search = BayesSearchCV(
    estimator=full_pipeline,
    search_spaces=search_space,
    n_iter=30,
    # Seed the splitter as well: without a random_state every run draws different
    # folds, so scores are not reproducible even though the search itself is seeded.
    cv=RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=42),
    scoring='roc_auc',
    #return_train_score=True,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
bayes_search.fit(X_train, y_train)

# The dictionary is only needed to construct the search; drop the notebook-level name.
del search_space

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for eac


The objective has been evaluated at this point before.



Fitting 10 folds for each of 1 candidates, totalling 10 fits



The objective has been evaluated at this point before.



Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [58]:
# Dump the raw cross-validation results dictionary collected by the search.
print(bayes_search.cv_results_)

{'mean_fit_time': array([ 6.9287313 , 13.3954437 ,  9.85841575,  2.08140197,  4.88015075,
       14.30460865, 10.4182595 , 11.34313741,  8.92161362,  5.36251166,
        0.11719007,  3.88611736,  8.73690135,  5.31223896,  4.36354735,
        0.36009874,  6.89398377,  0.97612298,  7.13056157,  0.09046202,
        6.92607887,  0.14542806,  7.62072718,  9.43346775,  6.95095599,
        0.137483  ,  0.15315557,  6.806496  ,  6.69049768, 14.31849637]), 'std_fit_time': array([1.30960849, 2.04164898, 1.5191195 , 0.32175999, 0.71599639,
       2.55535142, 1.21175031, 1.93901972, 1.46448234, 0.95420639,
       0.01979477, 0.69598568, 1.46271719, 0.96063588, 0.95183809,
       0.06285323, 1.17604263, 0.16839452, 1.38349075, 0.01222121,
       1.29757492, 0.01924485, 1.48018274, 1.85250516, 1.18614437,
       0.0208033 , 0.02301242, 1.29193307, 1.20473863, 2.59283311]), 'mean_score_time': array([0.65251803, 0.41953478, 0.40765047, 0.096065  , 0.19943168,
       0.62848754, 0.36679211, 0.4374136 ,

# Results

In [21]:
# Short display names for the search-space parameter paths.
new_param_column_names = dict(
    model__max_features='max_features',
    model__n_estimators='n_estimators',
    prep__non_numeric__encoder__transformer='encoder',
    prep__numeric__impute__transformer='imputer',
    prep__numeric__scaling__transformer='scaler',
)

In [22]:
# Wrap the fitted search so its results can be summarised using the short parameter names.
parser = hlp.sklearn_eval.SearchCVParser(
    searcher=bayes_search,
    parameter_name_mappings=new_param_column_names,
    higher_score_is_better=True,
)

In [59]:
# Write the parsed search results to a YAML file.
parser.to_yaml_file(yaml_file_name='Run 1.yaml')

In [73]:
#parser = hlp.sklearn_eval.SearchCVParser.from_yaml_file(yaml_file_name = 'Run 1.yaml')

## Best Scores/Params

In [57]:
# Formatted summary of the cross-validation scores alongside the tuned parameters.
parser.to_formatted_dataframe()

roc_auc Mean,roc_auc 95CI.LO,roc_auc 95CI.HI,max_features,n_estimators,encoder,scaler
0.776,0.748,0.803,0.071,3976,CustomOrdinalEncoder(),StandardScaler()
0.772,0.757,0.786,0.059,4999,OneHotEncoder(),StandardScaler()
0.77,0.749,0.791,0.01,5000,OneHotEncoder(),StandardScaler()
0.766,0.746,0.787,0.092,5000,CustomOrdinalEncoder(),StandardScaler()
0.766,0.736,0.796,0.048,229,OneHotEncoder(),MinMaxScaler()
0.765,0.748,0.783,0.01,5000,CustomOrdinalEncoder(),MinMaxScaler()
0.765,0.742,0.789,0.014,4098,CustomOrdinalEncoder(),StandardScaler()
0.765,0.743,0.787,0.176,4994,OneHotEncoder(),MinMaxScaler()
0.764,0.734,0.794,0.062,5000,CustomOrdinalEncoder(),MinMaxScaler()
0.763,0.737,0.789,0.01,3025,OneHotEncoder(),MinMaxScaler()


In [54]:
# gives the score rank for each iteration index (rank 1 is the best score)
# e.g. array([4, 2, 1, 3]) means that
# the 1st iteration (i.e. set of params) was the worst
# and the 3rd iteration was the best.
parser.primary_score_iteration_ranking

array([15, 30, 14, 29, 27, 28, 21, 19, 22,  7, 20, 10, 16,  1, 12,  5,  2,
       24,  9, 18,  4, 25,  3,  8,  6, 17, 23, 13, 11, 26])

In [76]:
# gives the iteration indexes ordered from the best primary score to the worst
# e.g. a parser.primary_score_iteration_ranking of array([4, 2, 1, 3])
# would return [2, 1, 3, 0] because index 2 (i.e. 3rd iteration) was the best, so it is the first index;
# and index 0 (i.e. first iteration) was the worst, so it is the last index
parser.primary_score_best_indexes

array([13, 16, 22, 20, 15, 24,  9, 23, 18, 11, 28, 14, 27,  2,  0, 12, 25,
       19,  7, 10,  6,  8, 26, 17, 21, 29,  4,  5,  3,  1])

In [79]:
# Best mean cross-validation score found by the search.
parser.best_primary_score

0.7755301871079988

In [81]:
# Parameter values (keyed by the short names) of the best-scoring iteration.
parser.best_primary_score_params()

{'max_features': 0.07051320077253395,
 'n_estimators': 3976,
 'encoder': 'CustomOrdinalEncoder()',
 'imputer': 'SimpleImputer()',
 'scaler': 'StandardScaler()'}

In [84]:
# Build the per-iteration hover text for the plotly charts: the braces are
# stripped from each label and every parameter is placed on its own line.
score_df = parser.to_dataframe(sort_by_score=False)
iteration_labels = parser.iteration_labels(order_from_best_to_worst=False)
score_df['labels'] = [
    label.replace('{', '<br>').replace(', ', '<br>').replace('}', '')
    for label in iteration_labels
]

In [86]:
# Inspect the first row, including the newly added `labels` column.
score_df.head(1)

Unnamed: 0,roc_auc Mean,roc_auc 95CI.LO,roc_auc 95CI.HI,max_features,n_estimators,encoder,imputer,scaler,labels
0,0.753552,0.738644,0.768461,0.411902,3652,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler(),<br>max_features: 0.4119018796762476<br>n_esti...


## BayesSearchCV Performance Over Time

In [95]:
# Mean cross-validation score for each search iteration, in evaluation order,
# with a LOWESS trend line. Marker size encodes max_features, colour n_estimators.
y_variable = f"{parser.primary_score_name} Mean"
iteration_numbers = np.arange(0, parser.number_of_iterations)
axis_labels = {
    'x': 'Iteration',
    y_variable: f"Average Cross Validation Score ({parser.primary_score_name})",
}
hover_lines = [
    "Iteration: %{x}",
    "roc_auc Mean: %{y}",
    "<br>Parameters: %{customdata[0]}",
]

fig = px.scatter(
    data_frame=score_df,
    x=iteration_numbers,
    y=y_variable,
    size='max_features',
    color='n_estimators',
    trendline='lowess',
    labels=axis_labels,
    title="Bayesian Performance Over Time",
    custom_data=['labels'],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO,
)
fig.update_traces(hovertemplate="<br>".join(hover_lines))
fig.show()

---

In [70]:
# Mean cross-validation score against max_features, one point per search iteration.
# (Unused variables and commented-out arguments from earlier experiments were removed.)
fig = px.scatter(
    data_frame=score_df,
    x='max_features',
    y=parser.primary_score_name + " Mean",
    title='max_features',
    custom_data=['labels'],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)

fig.show()

In [72]:
# Mean cross-validation score against max_features with a trend line.
#scatter_trend_line = 'ols'
scatter_trend_line = 'lowess'
fig = px.scatter(
    data_frame=score_df,
    x='max_features',
    y=parser.primary_score_name + " Mean",
    # `size` and `color` are left at their defaults (no encoding); the original
    # `size=,` was a syntax error that prevented the cell from running.
    title='max_features',
    trendline=scatter_trend_line,
    custom_data=['labels'],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)

fig.update_traces(
    hovertemplate="<br>".join([
        # the x-axis of this chart is max_features, not the iteration number
        "max_features: %{x}",
        "roc_auc Mean: %{y}",
        "<br>Parameters: %{customdata[0]}",
    ])
)

fig.show()

---

# Regression on `roc_auc Mean`

In [34]:
# Name of the column holding the mean cross-validation score for the primary metric.
score_name = f"{parser.primary_score_name} Mean"
score_name

'roc_auc Mean'

In [35]:
# Names of the tuned parameters as exposed by the parser.
parser.parameter_names

['max_features', 'n_estimators', 'encoder', 'imputer', 'scaler']

In [36]:
# Formatted summary of scores and parameters, shown again as the starting point for the regression.
parser.to_formatted_dataframe()

roc_auc Mean,roc_auc 95CI.LO,roc_auc 95CI.HI,max_features,n_estimators,encoder,scaler
0.776,0.748,0.803,0.071,3976,CustomOrdinalEncoder(),StandardScaler()
0.772,0.757,0.786,0.059,4999,OneHotEncoder(),StandardScaler()
0.77,0.749,0.791,0.01,5000,OneHotEncoder(),StandardScaler()
0.766,0.746,0.787,0.092,5000,CustomOrdinalEncoder(),StandardScaler()
0.766,0.736,0.796,0.048,229,OneHotEncoder(),MinMaxScaler()
0.765,0.748,0.783,0.01,5000,CustomOrdinalEncoder(),MinMaxScaler()
0.765,0.742,0.789,0.014,4098,CustomOrdinalEncoder(),StandardScaler()
0.765,0.743,0.787,0.176,4994,OneHotEncoder(),MinMaxScaler()
0.764,0.734,0.794,0.062,5000,CustomOrdinalEncoder(),MinMaxScaler()
0.763,0.737,0.789,0.01,3025,OneHotEncoder(),MinMaxScaler()


In [37]:
# Keep only the mean-score column and the tuned parameters; column order is preserved.
score_dataframe = parser.to_dataframe()
columns_to_keep = [score_name] + parser.parameter_names
score_dataframe = score_dataframe[[name for name in score_dataframe.columns
                                   if name in columns_to_keep]]
score_dataframe.head()

Unnamed: 0,roc_auc Mean,max_features,n_estimators,encoder,imputer,scaler
13,0.77553,0.070513,3976,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
16,0.771707,0.059044,4999,OneHotEncoder(),SimpleImputer(),StandardScaler()
22,0.769783,0.01,5000,OneHotEncoder(),SimpleImputer(),StandardScaler()
20,0.766235,0.092116,5000,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
15,0.76579,0.047582,229,OneHotEncoder(),SimpleImputer(),MinMaxScaler()


In [38]:
# Map each column name to a formula-safe version: spaces become underscores and
# every character that is neither alphanumeric nor an underscore is dropped.
cleaned_column_names = {
    original: ''.join(ch for ch in original.replace(' ', '_') if ch == '_' or ch.isalnum())
    for original in score_dataframe.columns.tolist()
}
cleaned_column_names

{'roc_auc Mean': 'roc_auc_Mean',
 'max_features': 'max_features',
 'n_estimators': 'n_estimators',
 'encoder': 'encoder',
 'imputer': 'imputer',
 'scaler': 'scaler'}

In [39]:
# Apply the formula-safe column names (e.g. 'roc_auc Mean' -> 'roc_auc_Mean').
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)

In [40]:
import statsmodels.formula.api as smf

# Regress the mean score on every other column of score_dataframe.
y_column = 'roc_auc_Mean'
predictor_names = score_dataframe.columns.tolist()
predictor_names.remove(y_column)
X_columns = hlp.string.collapse(predictor_names, separate=" + ", surround="")

formula = "{} ~ {}".format(y_column, X_columns)
print(formula)
model = smf.ols(formula=formula, data=score_dataframe)
results = model.fit()
print(results.summary())

roc_auc_Mean ~ max_features + n_estimators + encoder + imputer + scaler
                            OLS Regression Results                            
Dep. Variable:           roc_auc_Mean   R-squared:                       0.746
Model:                            OLS   Adj. R-squared:                  0.705
Method:                 Least Squares   F-statistic:                     18.31
Date:                Fri, 24 Dec 2021   Prob (F-statistic):           3.84e-07
Time:                        16:14:53   Log-Likelihood:                 113.24
No. Observations:                  30   AIC:                            -216.5
Df Residuals:                      25   BIC:                            -209.5
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------

In [41]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd

# Standardise the numeric columns of score_dataframe (the score included) and
# pass the non-numeric columns through unchanged.
# NOTE: this cell rebinds numeric_columns, non_numeric_columns, numeric_pipeline and
# transformations_pipeline, names that earlier cells used to build the model pipeline.
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)

print(numeric_columns)
print(non_numeric_columns)

numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])

transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])

score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
# Combining scaled floats with passthrough string columns yields an object array,
# so every column of the rebuilt frame would be object-typed; restore float
# dtype on the scaled columns straight away.
score_dataframe_transformed = pd.DataFrame(
    score_dataframe_transformed,
    columns=numeric_columns + non_numeric_columns,
).astype({column: 'float' for column in numeric_columns})
score_dataframe_transformed

['roc_auc_Mean', 'max_features', 'n_estimators']
['encoder', 'imputer', 'scaler']


Unnamed: 0,roc_auc_Mean,max_features,n_estimators,encoder,imputer,scaler
0,1.879784,-0.776309,0.388126,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
1,1.532361,-0.809794,0.913677,OneHotEncoder(),SimpleImputer(),StandardScaler()
2,1.357587,-0.952983,0.914191,OneHotEncoder(),SimpleImputer(),StandardScaler()
3,1.035151,-0.713238,0.914191,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
4,0.994802,-0.843258,-1.536837,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
5,0.957896,-0.952983,0.914191,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
6,0.937773,-0.942596,0.450802,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
7,0.896958,-0.467696,0.911108,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
8,0.822061,-0.800766,0.914191,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
9,0.771292,-0.95295,-0.100435,OneHotEncoder(),SimpleImputer(),MinMaxScaler()


In [42]:
# Make sure the three numeric columns are float-typed before they are used in the regression.
for column_name in ('roc_auc_Mean', 'max_features', 'n_estimators'):
    score_dataframe_transformed[column_name] = score_dataframe_transformed[column_name].astype('float')

In [43]:
# Refit the same formula on the standardised frame.
print(formula)
model = smf.ols(data=score_dataframe_transformed, formula=formula)
results = model.fit()
print(results.summary())

roc_auc_Mean ~ max_features + n_estimators + encoder + imputer + scaler
                            OLS Regression Results                            
Dep. Variable:           roc_auc_Mean   R-squared:                       0.746
Model:                            OLS   Adj. R-squared:                  0.705
Method:                 Least Squares   F-statistic:                     18.31
Date:                Fri, 24 Dec 2021   Prob (F-statistic):           3.84e-07
Time:                        16:14:53   Log-Likelihood:                -22.040
No. Observations:                  30   AIC:                             54.08
Df Residuals:                      25   BIC:                             61.09
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------

In [44]:
# Drop the intercept, then order the remaining coefficients by decreasing absolute value.
is_predictor = results.params.index != 'Intercept'
coefficients = results.params[is_predictor]
order_by_magnitude = coefficients.abs().sort_values(ascending=False).index
coefficients = coefficients.reindex(order_by_magnitude)

In [45]:
# Coefficients ordered by absolute magnitude, intercept excluded.
coefficients

max_features                 -0.785146
n_estimators                  0.267050
encoder[T.OneHotEncoder()]   -0.253625
scaler[T.StandardScaler()]   -0.031195
dtype: float64

In [46]:
# Turn the coefficient Series into a two-column frame (feature, regression_coefficient).
# NOTE: `coefficients` is rebound from a Series to a DataFrame here, so this cell
# expects the Series produced by the preceding cells as its input.
coefficients = (
    coefficients
    .to_frame(name='regression_coefficient')
    .reset_index()
    .rename(columns={'index': 'feature'})
)
coefficients

Unnamed: 0,feature,regression_coefficient
0,max_features,-0.785146
1,n_estimators,0.26705
2,encoder[T.OneHotEncoder()],-0.253625
3,scaler[T.StandardScaler()],-0.031195


In [47]:
# Bar chart of the coefficients, rows ordered by increasing absolute magnitude.
ascending_magnitude = coefficients['regression_coefficient'].abs().sort_values(ascending=True).index
px.bar(
    data_frame=coefficients.reindex(ascending_magnitude),
    x='regression_coefficient',
    y='feature',
)