In [1]:
import helpsk as hlp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

# Load Data

In [2]:
#https://www.openml.org/d/31
credit_g = fetch_openml('credit-g', version=1)
credit_data = credit_g['data']
credit_data['target'] = credit_g['target']
credit_data.shape

## Create Missing Values

credit_data['duration'].iloc[0:50] = np.nan
credit_data['checking_status'].iloc[25:75] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [3]:
hlp.pandas.numeric_summary(credit_data)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,950,50,0.05,0,0.0,20.87,11.96,0.57,1.1,0.99,4.0,9.0,12.0,18.0,24.0,36.0,72.0
credit_amount,1000,0,0.0,0,0.0,3271.26,2822.74,0.86,1.95,4.29,250.0,932.0,1365.5,2319.5,3972.25,7179.4,18424.0
installment_commitment,1000,0,0.0,0,0.0,2.97,1.12,0.38,-0.53,-1.21,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,1000,0,0.0,0,0.0,2.85,1.1,0.39,-0.27,-1.38,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,1000,0,0.0,0,0.0,35.55,11.38,0.32,1.02,0.6,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,1000,0,0.0,0,0.0,1.41,0.58,0.41,1.27,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,1000,0,0.0,0,0.0,1.16,0.36,0.31,1.91,1.65,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [4]:
hlp.pandas.non_numeric_summary(credit_data)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Null,Most Freq. Value,# of Unique,% Unique
checking_status,950,50,0.05,no checking,4,0.004
credit_history,1000,0,0.0,existing paid,5,0.005
purpose,1000,0,0.0,radio/tv,10,0.01
savings_status,1000,0,0.0,<100,5,0.005
employment,1000,0,0.0,1<=X<4,5,0.005
personal_status,1000,0,0.0,male single,4,0.004
other_parties,1000,0,0.0,none,3,0.003
property_magnitude,1000,0,0.0,car,4,0.004
other_payment_plans,1000,0,0.0,none,3,0.003
housing,1000,0,0.0,own,3,0.003


In [5]:
hlp.pandas.numeric_summary(credit_data). \
    pipe(hlp.pandas_style.format)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,950,50,0.05,0,0.0,20.87,11.96,0.57,1.1,0.99,4.0,9.0,12.0,18.0,24.0,36.0,72.0
credit_amount,1000,0,0.0,0,0.0,3271.26,2822.74,0.86,1.95,4.29,250.0,932.0,1365.5,2319.5,3972.25,7179.4,18424.0
installment_commitment,1000,0,0.0,0,0.0,2.97,1.12,0.38,-0.53,-1.21,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,1000,0,0.0,0,0.0,2.85,1.1,0.39,-0.27,-1.38,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,1000,0,0.0,0,0.0,35.55,11.38,0.32,1.02,0.6,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,1000,0,0.0,0,0.0,1.41,0.58,0.41,1.27,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,1000,0,0.0,0,0.0,1.16,0.36,0.31,1.91,1.65,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [15]:
columns_to_format = [x for x in credit_data.columns if x not in ['% Nulls', '% Zeros']]
columns_to_format

['checking_status',
 'duration',
 'credit_history',
 'purpose',
 'credit_amount',
 'savings_status',
 'employment',
 'installment_commitment',
 'personal_status',
 'other_parties',
 'residence_since',
 'property_magnitude',
 'age',
 'other_payment_plans',
 'housing',
 'existing_credits',
 'job',
 'num_dependents',
 'own_telephone',
 'foreign_worker',
 'target']

In [73]:
numeric_summary = hlp.pandas.numeric_summary(credit_data)
columns_to_format = [x for x in numeric_summary.columns if x not in ['% Nulls', '% Zeros']]
import math
hlp.pandas.numeric_summary(credit_data).style. \
    format({
    '% Nulls': '{:,.0%}'.format,
    '% Zeros': '{:,.0%}'.format,
    }).\
    pipe(hlp.pandas_style.format, subset=columns_to_format, round_to=0).\
    highlight_between(left=0.00000001, right=math.inf, subset=['# of Nulls', '% Nulls', '# of Zeros', '% Zeros'],
                      color=hlp.color.Colors.TULIP_TREE.value)



Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,950,50,5%,0,0%,21,12,1,1,1,4,9,12,18,24,36,72
credit_amount,1000,0,0%,0,0%,3271,2823,1,2,4,250,932,1366,2320,3972,7179,18424
installment_commitment,1000,0,0%,0,0%,3,1,0,-1,-1,1,1,2,3,4,4,4
residence_since,1000,0,0%,0,0%,3,1,0,0,-1,1,1,2,3,4,4,4
age,1000,0,0%,0,0%,36,11,0,1,1,19,23,27,33,42,52,75
existing_credits,1000,0,0%,0,0%,1,1,0,1,2,1,1,1,1,2,2,4
num_dependents,1000,0,0%,0,0%,1,0,0,2,2,1,1,1,1,1,2,2


# Training and Test Data

In [36]:
y_full = credit_data['target']
X_full = credit_data.drop(columns='target')

In [37]:
hlp.pandas.value_frequency(series=y_full)

Unnamed: 0,Frequency,Percent
good,700,0.7
bad,300,0.3


In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

In [39]:
del y_full, X_full

In [40]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(800, 20)
(800,)
(200, 20)
(200,)


In [41]:
hlp.pandas.value_frequency(series=y_train)

Unnamed: 0,Frequency,Percent
good,559,0.69875
bad,241,0.30125


In [42]:
hlp.pandas.value_frequency(series=y_test)

Unnamed: 0,Frequency,Percent
good,141,0.705
bad,59,0.295


# Transformation Pipeline

In [43]:
class TransformerChooser(BaseEstimator, TransformerMixin):
    """Transformer that wraps another Transformer. This allows different transformer objects to be tuned.
    """
    def __init__(self, base_transformer=None):
        """
        Args:
            base_transformer:
                Transformer object (e.g. StandardScaler, MinMaxScaler)
        """
        self.base_transformer = base_transformer

    def fit(self, X, y=None):
        if self.base_transformer is None:
            return self

        return self.base_transformer.fit(X, y)

    def transform(self, X):
        if self.base_transformer is None:
            return X

        return self.base_transformer.transform(X)

In [44]:
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [45]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    # this is here so that we can select between MinMax and Scaler
    # if this pipeline is ran in a context outside of tuning, no transformation will take place
    ('scaling_chooser', TransformerChooser()),
])

In [46]:
non_numeric_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder()),
])

In [47]:
temp = non_numeric_pipeline.fit_transform(X_train[non_numeric_columns])

In [48]:
print(type(temp))
print(temp.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(800, 55)


In [49]:
temp.toarray()[0:10, 0:10]

array([[0., 0., 0., 0., 1., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])

In [50]:
non_numeric_pipeline.steps[0][1].categories_

[array(['0<=X<200', '<0', '>=200', 'no checking', nan], dtype=object),
 array(['all paid', 'critical/other existing credit', 'delayed previously',
        'existing paid', 'no credits/all paid'], dtype=object),
 array(['business', 'domestic appliance', 'education',
        'furniture/equipment', 'new car', 'other', 'radio/tv', 'repairs',
        'retraining', 'used car'], dtype=object),
 array(['100<=X<500', '500<=X<1000', '<100', '>=1000', 'no known savings'],
       dtype=object),
 array(['1<=X<4', '4<=X<7', '<1', '>=7', 'unemployed'], dtype=object),
 array(['female div/dep/mar', 'male div/sep', 'male mar/wid',
        'male single'], dtype=object),
 array(['co applicant', 'guarantor', 'none'], dtype=object),
 array(['car', 'life insurance', 'no known property', 'real estate'],
       dtype=object),
 array(['bank', 'none', 'stores'], dtype=object),
 array(['for free', 'own', 'rent'], dtype=object),
 array(['high qualif/self emp/mgmt', 'skilled', 'unemp/unskilled non res',
        'un

In [51]:
from sklearn.compose import ColumnTransformer
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', non_numeric_pipeline, non_numeric_columns)
])

In [52]:
temp = transformations_pipeline.fit_transform(X_train)

In [53]:
temp.shape

(800, 62)

In [54]:
pd.DataFrame(temp)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,21.042328,6836.0,3.0,4.0,63.0,2.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,21.000000,2319.0,2.0,1.0,33.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,6.000000,1236.0,2.0,4.0,50.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,21.000000,5003.0,1.0,4.0,29.0,2.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,12.000000,886.0,4.0,2.0,21.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,18.000000,6458.0,2.0,4.0,39.0,2.0,2.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
796,18.000000,2662.0,4.0,3.0,32.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
797,24.000000,5804.0,4.0,2.0,27.0,2.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
798,12.000000,1484.0,2.0,1.0,25.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [55]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier()

In [56]:
full_pipeline = Pipeline([
    ('preparation', transformations_pipeline),
    #('pca_chooser', ChooserTransform()),  # PCA option lost; didn't include
    #('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('model', random_forest_model)
])

In [57]:
full_pipeline.n_features_in_

20

In [58]:
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps

{'preparation': ColumnTransformer(transformers=[('numeric_pipeline',
                                  Pipeline(steps=[('imputer', SimpleImputer()),
                                                  ('scaling_chooser',
                                                   TransformerChooser())]),
                                  ['duration', 'credit_amount',
                                   'installment_commitment', 'residence_since',
                                   'age', 'existing_credits',
                                   'num_dependents']),
                                 ('non_numeric_pipeline',
                                  Pipeline(steps=[('one_hot_encoder',
                                                   OneHotEncoder())]),
                                  ['checking_status', 'credit_history',
                                   'purpose', 'savings_status', 'employment',
                                   'personal_status', 'other_parties',
                        

In [59]:
param_grad = [
    {'preparation__numeric_pipeline__scaling_chooser__base_transformer': [MinMaxScaler(), StandardScaler()],
     'model__max_features': [2, 10, 40, 'auto'],
     'model__n_estimators': [50, 100, 500, 1000]}
]

In [60]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(full_pipeline, param_grid=param_grad, cv=5, scoring='roc_auc', return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preparation',
                                        ColumnTransformer(transformers=[('numeric_pipeline',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaling_chooser',
                                                                                          TransformerChooser())]),
                                                                         ['duration',
                                                                          'credit_amount',
                                                                          'installment_commitment',
                                                                          'residence_since',
                              

In [61]:
grid_search.cv_results_

{'mean_fit_time': array([0.05445275, 0.0499763 , 0.09070339, 0.08820686, 0.41469302,
        0.41524677, 0.82719188, 0.8297647 , 0.060536  , 0.0586504 ,
        0.11130276, 0.10960736, 0.51763201, 0.51367922, 1.05345945,
        1.03576398, 0.09419198, 0.09428439, 0.18090019, 0.18149648,
        0.87896848, 0.87886381, 1.78329711, 1.74966669, 0.05428829,
        0.05431247, 0.10140734, 0.10142183, 0.47797055, 0.47814574,
        0.94588103, 0.94822397]),
 'std_fit_time': array([0.00871451, 0.00152864, 0.00206254, 0.00015489, 0.00330533,
        0.00247436, 0.00551285, 0.00225941, 0.00241258, 0.00117577,
        0.00181889, 0.00079349, 0.00353289, 0.00251294, 0.0283833 ,
        0.01371228, 0.00058091, 0.00119818, 0.00102546, 0.00106104,
        0.00451927, 0.00708051, 0.03029788, 0.01279424, 0.00032356,
        0.00039032, 0.00021681, 0.00035504, 0.00183661, 0.00146126,
        0.00310212, 0.0057655 ]),
 'mean_score_time': array([0.00743341, 0.00699739, 0.00990639, 0.00924864, 0.050814

In [62]:
cvres = grid_search.cv_results_
results_df = pd.concat([pd.DataFrame({'mean_score': cvres["mean_test_score"], 'st_dev_score': cvres["std_test_score"]}),
          pd.DataFrame(cvres["params"])],
          axis=1)
results_df = results_df.sort_values(by=['mean_score'], ascending=False)
results_df

Unnamed: 0,mean_score,st_dev_score,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer
30,0.783366,0.033512,auto,1000,MinMaxScaler()
12,0.782291,0.029653,10,500,MinMaxScaler()
28,0.781576,0.032722,auto,500,MinMaxScaler()
14,0.781379,0.030222,10,1000,MinMaxScaler()
29,0.780154,0.028014,auto,500,StandardScaler()
31,0.779451,0.02894,auto,1000,StandardScaler()
15,0.778029,0.033659,10,1000,StandardScaler()
13,0.777263,0.026116,10,500,StandardScaler()
26,0.774953,0.030384,auto,100,MinMaxScaler()
20,0.772369,0.02811,40,500,MinMaxScaler()


In [63]:
#results_df.style.\
#    format(precision=3, na_rep='<Missing>', thousands=",")

In [64]:
results_df.style.format(precision=3, na_rep='<Missing>', thousands=",")

Unnamed: 0,mean_score,st_dev_score,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer
30,0.783,0.034,auto,1000,MinMaxScaler()
12,0.782,0.03,10,500,MinMaxScaler()
28,0.782,0.033,auto,500,MinMaxScaler()
14,0.781,0.03,10,1000,MinMaxScaler()
29,0.78,0.028,auto,500,StandardScaler()
31,0.779,0.029,auto,1000,StandardScaler()
15,0.778,0.034,10,1000,StandardScaler()
13,0.777,0.026,10,500,StandardScaler()
26,0.775,0.03,auto,100,MinMaxScaler()
20,0.772,0.028,40,500,MinMaxScaler()


In [65]:
results_df.style.\
    format(precision=3, na_rep='<Missing>', thousands=",").apply(lambda row: \
     row.apply(lambda col: \
     'background: lightgreen' if row.name == 30 else ''), \
     axis=1).hide_index()

mean_score,st_dev_score,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer
0.783,0.034,auto,1000,MinMaxScaler()
0.782,0.03,10,500,MinMaxScaler()
0.782,0.033,auto,500,MinMaxScaler()
0.781,0.03,10,1000,MinMaxScaler()
0.78,0.028,auto,500,StandardScaler()
0.779,0.029,auto,1000,StandardScaler()
0.778,0.034,10,1000,StandardScaler()
0.777,0.026,10,500,StandardScaler()
0.775,0.03,auto,100,MinMaxScaler()
0.772,0.028,40,500,MinMaxScaler()


In [66]:
results_df['min'] = results_df['mean_score'] - (2 * results_df['st_dev_score'])
results_df['max'] = results_df['mean_score'] + (2 * results_df['st_dev_score'])

In [68]:
# results_df.style.format(precision=3, na_rep='<Missing>', thousands=",").\
#     bar(subset=['mean_score', 'max'], color='grey').\
#     bar(subset=['min'], color='grey', align='left').

In [69]:

results_mod = results_df.copy()
results_mod.drop(columns=['min', 'max'], inplace = True)
results_mod.insert(1, 'mean*-2SD', results_df['mean_score'] - (2 * results_df['st_dev_score']))
results_mod.insert(2, 'mean*+2SD', results_df['mean_score'] + (2 * results_df['st_dev_score']))
results_mod.drop(columns=['st_dev_score'], inplace = True)
results_mod

Unnamed: 0,mean_score,mean*-2SD,mean*+2SD,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer
30,0.783366,0.716341,0.850391,auto,1000,MinMaxScaler()
12,0.782291,0.722984,0.841598,10,500,MinMaxScaler()
28,0.781576,0.716132,0.84702,auto,500,MinMaxScaler()
14,0.781379,0.720934,0.841823,10,1000,MinMaxScaler()
29,0.780154,0.724125,0.836182,auto,500,StandardScaler()
31,0.779451,0.721572,0.837331,auto,1000,StandardScaler()
15,0.778029,0.710711,0.845348,10,1000,StandardScaler()
13,0.777263,0.72503,0.829496,10,500,StandardScaler()
26,0.774953,0.714185,0.835722,auto,100,MinMaxScaler()
20,0.772369,0.716149,0.828588,40,500,MinMaxScaler()


In [71]:
results_mod.style.format(precision=3, na_rep='<Missing>', thousands=",").\
    bar(subset=['mean_score'], color='#5fba7d').\
    bar(subset=['mean*+2SD'], color='gray').\
    pipe(hlp.pandas_style.bar_inverse, subset=['mean*-2SD'], color='gray')

#    highlight_between(subset='min', color='yellow', left=0.7, right=.8).

Unnamed: 0,mean_score,mean*-2SD,mean*+2SD,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer
30,0.783,0.716,0.85,auto,1000,MinMaxScaler()
12,0.782,0.723,0.842,10,500,MinMaxScaler()
28,0.782,0.716,0.847,auto,500,MinMaxScaler()
14,0.781,0.721,0.842,10,1000,MinMaxScaler()
29,0.78,0.724,0.836,auto,500,StandardScaler()
31,0.779,0.722,0.837,auto,1000,StandardScaler()
15,0.778,0.711,0.845,10,1000,StandardScaler()
13,0.777,0.725,0.829,10,500,StandardScaler()
26,0.775,0.714,0.836,auto,100,MinMaxScaler()
20,0.772,0.716,0.829,40,500,MinMaxScaler()


In [None]:
import pandas as pd
import numpy as np

df = pd.DataFrame([[38.0, 2.0, 18.0, 22.0, 21, np.nan],[19, 439, 6, 452, 226,232]],
                  index=pd.Index(['Tumour (Positive)', 'Non-Tumour (Negative)'], name='Actual Label:'),
                  columns=pd.MultiIndex.from_product([['Decision Tree', 'Regression', 'Random'],['Tumour', 'Non-Tumour']], names=['Model:', 'Predicted:']))
df.style

In [None]:
s = df.style.format('{:.0f}').hide_columns([('Random', 'Tumour'), ('Random', 'Non-Tumour')])
s

In [None]:
s.set_table_styles([  # create internal CSS classes
    {'selector': '.true', 'props': 'background-color: #e6ffe6;'},
    {'selector': '.false', 'props': 'background-color: #ffe6e6;'},
], overwrite=False)
cell_color = pd.DataFrame([['true ', 'false ', 'true ', 'false '],
                           ['false ', 'true ', 'false ', 'true ']],
                          index=df.index,
                          columns=df.columns[:4])
s.set_td_classes(cell_color)

In [None]:
cell_color

In [None]:
grid_search.best_estimator_._final_estimator.feature_importances_

In [None]:
grid_search.best_estimator_._final_estimator.feature_importances_.shape

# TODO

- decide between imputing missing values and removing missing data; via tuning parameter(s)

- get feature importance for model that has various transformations
    - https://towardsdatascience.com/how-to-get-feature-importances-from-any-sklearn-pipeline-167a19f1214
    - https://stackoverflow.com/questions/38787612/how-to-extract-feature-importances-from-an-sklearn-pipeline

- future importance
    - https://www.kaggle.com/general/175075
        - LOFO (Leave one feature out) for feature importance.
    - https://explained.ai/rf-importance/