In [171]:
import helpsk as hlp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

# Load Data

In [3]:
#https://www.openml.org/d/31
credit_g = fetch_openml('credit-g', version=1)
credit_data = credit_g['data']
credit_data['target'] = credit_g['target']
credit_data.shape

## Create Missing Values

credit_data['duration'].iloc[0:50] = np.nan
credit_data['checking_status'].iloc[25:75] = np.nan

In [4]:
hlp.pandas.numeric_summary(credit_data)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,950,50,0.05,0,0.0,20.874,11.957,0.573,1.102,0.987,4.0,9.0,12.0,18.0,24.0,36.0,72.0
credit_amount,1000,0,0.0,0,0.0,3271.258,2822.737,0.863,1.95,4.293,250.0,932.0,1365.5,2319.5,3972.25,7179.4,18424.0
installment_commitment,1000,0,0.0,0,0.0,2.973,1.119,0.376,-0.531,-1.21,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,1000,0,0.0,0,0.0,2.845,1.104,0.388,-0.273,-1.381,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,1000,0,0.0,0,0.0,35.546,11.375,0.32,1.021,0.596,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,1000,0,0.0,0,0.0,1.407,0.578,0.411,1.273,1.604,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,1000,0,0.0,0,0.0,1.155,0.362,0.313,1.909,1.649,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [5]:
hlp.pandas.non_numeric_summary(credit_data)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Null,Most Freq. Value,# of Unique,% Unique
checking_status,950,50,0.05,no checking,4,0.004
credit_history,1000,0,0.0,existing paid,5,0.005
purpose,1000,0,0.0,radio/tv,10,0.01
savings_status,1000,0,0.0,<100,5,0.005
employment,1000,0,0.0,1<=X<4,5,0.005
personal_status,1000,0,0.0,male single,4,0.004
other_parties,1000,0,0.0,none,3,0.003
property_magnitude,1000,0,0.0,car,4,0.004
other_payment_plans,1000,0,0.0,none,3,0.003
housing,1000,0,0.0,own,3,0.003


# Training and Test Data

In [13]:
y_full = credit_data['target']
X_full = credit_data.drop(columns='target')

In [14]:
hlp.pandas.value_frequency(series=y_full)

Unnamed: 0,Frequency,Percent
good,700,0.7
bad,300,0.3


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

In [16]:
del y_full, X_full

In [17]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(800, 20)
(800,)
(200, 20)
(200,)


In [19]:
hlp.pandas.value_frequency(series=y_train)

Unnamed: 0,Frequency,Percent
good,559,0.69875
bad,241,0.30125


In [18]:
hlp.pandas.value_frequency(series=y_test)

Unnamed: 0,Frequency,Percent
good,141,0.705
bad,59,0.295


# Transformation Pipeline

In [172]:
class TransformerChooser(BaseEstimator, TransformerMixin):
    """Transformer that wraps another Transformer. This allows different transformer objects to be tuned.
    """
    def __init__(self, base_transformer=None):
        """
        Args:
            base_transformer:
                Transformer object (e.g. StandardScaler, MinMaxScaler)
        """
        self.base_transformer = base_transformer

    def fit(self, X, y=None):
        if self.base_transformer is None:
            return self

        return self.base_transformer.fit(X, y)

    def transform(self, X):
        if self.base_transformer is None:
            return X

        return self.base_transformer.transform(X)

In [67]:
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [181]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    # this is here so that we can select between MinMax and Scaler
    # if this pipeline is ran in a context outside of tuning, no transformation will take place
    ('scaling_chooser', TransformerChooser()),
])

In [182]:
non_numeric_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder()),
])

In [183]:
temp = non_numeric_pipeline.fit_transform(X_train[non_numeric_columns])

In [184]:
print(type(temp))
print(temp.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(800, 55)


In [185]:
temp.toarray()[0:10, 0:10]

array([[0., 0., 0., 0., 1., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])

In [186]:
non_numeric_pipeline.steps[0][1].categories_

[array(['0<=X<200', '<0', '>=200', 'no checking', nan], dtype=object),
 array(['all paid', 'critical/other existing credit', 'delayed previously',
        'existing paid', 'no credits/all paid'], dtype=object),
 array(['business', 'domestic appliance', 'education',
        'furniture/equipment', 'new car', 'other', 'radio/tv', 'repairs',
        'retraining', 'used car'], dtype=object),
 array(['100<=X<500', '500<=X<1000', '<100', '>=1000', 'no known savings'],
       dtype=object),
 array(['1<=X<4', '4<=X<7', '<1', '>=7', 'unemployed'], dtype=object),
 array(['female div/dep/mar', 'male div/sep', 'male mar/wid',
        'male single'], dtype=object),
 array(['co applicant', 'guarantor', 'none'], dtype=object),
 array(['car', 'life insurance', 'no known property', 'real estate'],
       dtype=object),
 array(['bank', 'none', 'stores'], dtype=object),
 array(['for free', 'own', 'rent'], dtype=object),
 array(['high qualif/self emp/mgmt', 'skilled', 'unemp/unskilled non res',
        'un

In [187]:
from sklearn.compose import ColumnTransformer
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', non_numeric_pipeline, non_numeric_columns)
])

In [188]:
temp = transformations_pipeline.fit_transform(X_train)

In [189]:
temp.shape

(800, 62)

In [190]:
pd.DataFrame(temp)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,21.042328,6836.0,3.0,4.0,63.0,2.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,21.000000,2319.0,2.0,1.0,33.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,6.000000,1236.0,2.0,4.0,50.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,21.000000,5003.0,1.0,4.0,29.0,2.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,12.000000,886.0,4.0,2.0,21.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,18.000000,6458.0,2.0,4.0,39.0,2.0,2.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
796,18.000000,2662.0,4.0,3.0,32.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
797,24.000000,5804.0,4.0,2.0,27.0,2.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
798,12.000000,1484.0,2.0,1.0,25.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [191]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier()

In [192]:
full_pipeline = Pipeline([
    ('preparation', transformations_pipeline),
    #('pca_chooser', ChooserTransform()),  # PCA option lost; didn't include
    #('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('model', random_forest_model)
])

In [197]:
full_pipeline.n_features_in_

20

In [193]:
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps

{'preparation': ColumnTransformer(transformers=[('numeric_pipeline',
                                  Pipeline(steps=[('imputer', SimpleImputer()),
                                                  ('scaling_chooser',
                                                   TransformerChooser())]),
                                  ['duration', 'credit_amount',
                                   'installment_commitment', 'residence_since',
                                   'age', 'existing_credits',
                                   'num_dependents']),
                                 ('non_numeric_pipeline',
                                  Pipeline(steps=[('one_hot_encoder',
                                                   OneHotEncoder())]),
                                  ['checking_status', 'credit_history',
                                   'purpose', 'savings_status', 'employment',
                                   'personal_status', 'other_parties',
                        

In [201]:
param_grad = [
    {'preparation__numeric_pipeline__scaling_chooser__base_transformer': [MinMaxScaler(), StandardScaler()],
     'model__max_features': [2, 10, 40, 'auto'],
     'model__n_estimators': [50, 100, 500, 1000]}
]

In [202]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(full_pipeline, param_grid=param_grad, cv=5, scoring='roc_auc', return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preparation',
                                        ColumnTransformer(transformers=[('numeric_pipeline',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaling_chooser',
                                                                                          TransformerChooser())]),
                                                                         ['duration',
                                                                          'credit_amount',
                                                                          'installment_commitment',
                                                                          'residence_since',
                              

In [203]:
grid_search.cv_results_

{'mean_fit_time': array([0.05393319, 0.04734941, 0.08767157, 0.08831797, 0.41350212,
        0.41034684, 0.8081985 , 0.80824528, 0.05753675, 0.05762162,
        0.10839467, 0.10923963, 0.51170635, 0.51205945, 1.01819744,
        1.06179476, 0.09513259, 0.09405384, 0.18023758, 0.18072257,
        0.88252993, 0.88345237, 1.78990273, 1.75986023, 0.05422626,
        0.05397892, 0.1007915 , 0.10130258, 0.47492442, 0.47893977,
        0.94863734, 0.94363842]),
 'std_fit_time': array([1.29960916e-02, 1.12364671e-04, 4.04834309e-04, 1.69143597e-04,
        2.87486367e-03, 1.51637021e-03, 1.98863839e-03, 1.16238682e-03,
        4.82321024e-04, 3.55602621e-04, 1.29116680e-03, 2.30747645e-03,
        2.37298319e-03, 1.74552521e-03, 3.31124453e-03, 5.58765579e-02,
        2.55340854e-03, 4.29034100e-04, 1.28398054e-03, 2.06677847e-03,
        5.98091800e-03, 1.05006389e-02, 2.16356929e-02, 1.57928419e-02,
        3.59574349e-04, 7.19431766e-05, 7.50274705e-04, 6.65464607e-04,
        1.77783741e-0

In [205]:
cvres = grid_search.cv_results_
results_df = pd.concat([pd.DataFrame({'mean_score': cvres["mean_test_score"], 'st_dev_score': cvres["std_test_score"]}),
          pd.DataFrame(cvres["params"])],
          axis=1)
results_df = results_df.sort_values(by=['mean_score'], ascending=False)
results_df

Unnamed: 0,mean_score,st_dev_score,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer
30,0.782921,0.032585,auto,1000,MinMaxScaler()
29,0.781764,0.031528,auto,500,StandardScaler()
12,0.779853,0.03133,10,500,MinMaxScaler()
31,0.779775,0.03123,auto,1000,StandardScaler()
14,0.779194,0.031771,10,1000,MinMaxScaler()
28,0.778779,0.031283,auto,500,MinMaxScaler()
15,0.777747,0.030652,10,1000,StandardScaler()
13,0.775357,0.029209,10,500,StandardScaler()
21,0.773321,0.030321,40,500,StandardScaler()
22,0.771717,0.032159,40,1000,MinMaxScaler()


In [207]:
#results_df.style.\
#    format(precision=3, na_rep='<Missing>', thousands=",")

In [208]:
results_df.style.\
    format(precision=3, na_rep='<Missing>', thousands=",").apply(lambda row: \
     row.apply(lambda col: \
     'background: lightgreen' if row.name == 30 else ''), \
     axis=1).hide_index()

mean_score,st_dev_score,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer
0.783,0.033,auto,1000,MinMaxScaler()
0.782,0.032,auto,500,StandardScaler()
0.78,0.031,10,500,MinMaxScaler()
0.78,0.031,auto,1000,StandardScaler()
0.779,0.032,10,1000,MinMaxScaler()
0.779,0.031,auto,500,MinMaxScaler()
0.778,0.031,10,1000,StandardScaler()
0.775,0.029,10,500,StandardScaler()
0.773,0.03,40,500,StandardScaler()
0.772,0.032,40,1000,MinMaxScaler()


In [231]:
results_df['min'] = results_df['mean_score'] - (2 * results_df['st_dev_score'])
results_df['max'] = results_df['mean_score'] + (2 * results_df['st_dev_score'])

In [234]:
results_df.style.format(precision=3, na_rep='<Missing>', thousands=",").\
    bar(subset=['mean_score', 'max'], color='grey').\
    bar(subset=['min'], color='grey', align='left').

Unnamed: 0,mean_score,st_dev_score,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer,min,max
30,0.783,0.033,auto,1000,MinMaxScaler(),0.718,0.848
29,0.782,0.032,auto,500,StandardScaler(),0.719,0.845
12,0.78,0.031,10,500,MinMaxScaler(),0.717,0.843
31,0.78,0.031,auto,1000,StandardScaler(),0.717,0.842
14,0.779,0.032,10,1000,MinMaxScaler(),0.716,0.843
28,0.779,0.031,auto,500,MinMaxScaler(),0.716,0.841
15,0.778,0.031,10,1000,StandardScaler(),0.716,0.839
13,0.775,0.029,10,500,StandardScaler(),0.717,0.834
21,0.773,0.03,40,500,StandardScaler(),0.713,0.834
22,0.772,0.032,40,1000,MinMaxScaler(),0.707,0.836


In [289]:
#CODE MODIFIED FROM https://github.com/pandas-dev/pandas/blob/v1.3.2/pandas/io/formats/style.py#L2178-L2258

from pandas.io.formats.style import Styler
from pandas.io.formats.style_render import Subset
from pandas._typing import Axis
def _bar_inverse(
        s,
        align: str,
        colors: list[str],
        width: float = 100,
        vmin: float = None,
        vmax: float = None,
    ):
        """
        Draw bar chart in dataframe cells.
        """
        # Get input value range.
        smin = np.nanmin(s.to_numpy()) if vmin is None else vmin
        smax = np.nanmax(s.to_numpy()) if vmax is None else vmax
        if align == "mid":
            smin = min(0, smin)
            smax = max(0, smax)
        elif align == "zero":
            # For "zero" mode, we want the range to be symmetrical around zero.
            smax = max(abs(smin), abs(smax))
            smin = -smax
        # Transform to percent-range of linear-gradient
        normed = width * (s.to_numpy(dtype=float) - smin) / (smax - smin + 1e-12)
        zero = -width * smin / (smax - smin + 1e-12)

        def css_bar(start: float, end: float, color: str) -> str:
            """
            Generate CSS code to draw a bar from start to end.
            """
            css = "width: 10em; height: 80%;"
            if end > start:
                css += "background: linear-gradient(90deg,"
                if start > 0:
                    css += f" {color} {start:.1f}%, transparent {start:.1f}%, "
                e = min(end, width)
                css += f"transparent {e:.1f}%, {color} {e:.1f}%)"
            return css

        def css(x):
            if pd.isna(x):
                return ""

            # avoid deprecated indexing `colors[x > zero]`
            color = colors[1] if x > zero else colors[0]

            if align == "left":
                return css_bar(0, x, color)
            else:
                return css_bar(min(x, zero), max(x, zero), color)

        if s.ndim == 1:
            print(css(normed[10]))
            
            return [css(x) for x in normed]
        else:
            print('asdeeee')
            return DataFrame(
                [[css(x) for x in row] for row in normed],
                index=s.index,
                columns=s.columns,
            )
from pandas.api.types import is_list_like
def bar_inverse(
        styler,
        subset: Subset = None,
        axis: Axis = 0,
        color="#d65f5f",
        width: float = 100,
        align: str = "left",
        vmin: float = None,
        vmax: float = None,
    ) -> Styler:
    """
    Draw bar chart in the cell backgrounds.
    Parameters
    ----------
    subset : label, array-like, IndexSlice, optional
        A valid 2d input to `DataFrame.loc[<subset>]`, or, in the case of a 1d input
        or single key, to `DataFrame.loc[:, <subset>]` where the columns are
        prioritised, to limit ``data`` to *before* applying the function.
    axis : {0 or 'index', 1 or 'columns', None}, default 0
        Apply to each column (``axis=0`` or ``'index'``), to each row
        (``axis=1`` or ``'columns'``), or to the entire DataFrame at once
        with ``axis=None``.
    color : str or 2-tuple/list
        If a str is passed, the color is the same for both
        negative and positive numbers. If 2-tuple/list is used, the
        first element is the color_negative and the second is the
        color_positive (eg: ['#d65f5f', '#5fba7d']).
    width : float, default 100
        A number between 0 or 100. The largest value will cover `width`
        percent of the cell's width.
    align : {'left', 'zero',' mid'}, default 'left'
        How to align the bars with the cells.
        - 'left' : the min value starts at the left of the cell.
        - 'zero' : a value of zero is located at the center of the cell.
        - 'mid' : the center of the cell is at (max-min)/2, or
          if values are all negative (positive) the zero is aligned
          at the right (left) of the cell.
    vmin : float, optional
        Minimum bar value, defining the left hand limit
        of the bar drawing range, lower values are clipped to `vmin`.
        When None (default): the minimum value of the data will be used.
    vmax : float, optional
        Maximum bar value, defining the right hand limit
        of the bar drawing range, higher values are clipped to `vmax`.
        When None (default): the maximum value of the data will be used.
    Returns
    -------
    styler
    """
    if align not in ("left", "zero", "mid"):
        raise ValueError("`align` must be one of {'left', 'zero',' mid'}")

    if not (is_list_like(color)):
        color = [color, color]
    elif len(color) == 1:
        color = [color[0], color[0]]
    elif len(color) > 2:
        raise ValueError(
            "`color` must be string or a list-like "
            "of length 2: [`color_neg`, `color_pos`] "
            "(eg: color=['#d65f5f', '#5fba7d'])"
        )

    if subset is None:
        subset = styler.data.select_dtypes(include=np.number).columns

    styler.apply(
        _bar_inverse,
        subset=subset,
        axis=axis,
        align=align,
        colors=color,
        width=width,
        vmin=vmin,
        vmax=vmax,
    )

    return styler

# bar_inverse(
#     results_df.style,
#     subset=['min'],
#  #   axis=0,
#  #   align='left',
#     color='gray',
#  #   width=100,
# #     vmin=0,
# #     vmax=1,
#         )

In [303]:

results_mod = results_df.copy()
results_mod.drop(columns=['min', 'max'], inplace = True)
results_mod.insert(1, 'mean*-2SD', results_df['mean_score'] - (2 * results_df['st_dev_score']))
results_mod.insert(2, 'mean*+2SD', results_df['mean_score'] + (2 * results_df['st_dev_score']))
results_mod.drop(columns=['st_dev_score'], inplace = True)
results_mod

Unnamed: 0,mean_score,mean*-2SD,mean*+2SD,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer
30,0.782921,0.717751,0.84809,auto,1000,MinMaxScaler()
29,0.781764,0.718709,0.844819,auto,500,StandardScaler()
12,0.779853,0.717192,0.842513,10,500,MinMaxScaler()
31,0.779775,0.717315,0.842235,auto,1000,StandardScaler()
14,0.779194,0.715653,0.842735,10,1000,MinMaxScaler()
28,0.778779,0.716214,0.841344,auto,500,MinMaxScaler()
15,0.777747,0.716443,0.839051,10,1000,StandardScaler()
13,0.775357,0.716938,0.833776,10,500,StandardScaler()
21,0.773321,0.71268,0.833963,40,500,StandardScaler()
22,0.771717,0.707398,0.836035,40,1000,MinMaxScaler()


In [305]:
results_mod.style.format(precision=3, na_rep='<Missing>', thousands=",").\
    bar(subset=['mean_score'], color='#5fba7d').\
    bar(subset=['mean*+2SD'], color='gray').\
    pipe(bar_inverse, subset=['mean*-2SD'], color='gray')

#    highlight_between(subset='min', color='yellow', left=0.7, right=.8).

width: 10em; height: 80%;background: linear-gradient(90deg,transparent 68.4%, gray 68.4%)


Unnamed: 0,mean_score,mean*-2SD,mean*+2SD,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer
30,0.783,0.718,0.848,auto,1000,MinMaxScaler()
29,0.782,0.719,0.845,auto,500,StandardScaler()
12,0.78,0.717,0.843,10,500,MinMaxScaler()
31,0.78,0.717,0.842,auto,1000,StandardScaler()
14,0.779,0.716,0.843,10,1000,MinMaxScaler()
28,0.779,0.716,0.841,auto,500,MinMaxScaler()
15,0.778,0.716,0.839,10,1000,StandardScaler()
13,0.775,0.717,0.834,10,500,StandardScaler()
21,0.773,0.713,0.834,40,500,StandardScaler()
22,0.772,0.707,0.836,40,1000,MinMaxScaler()


In [134]:
import pandas as pd
import numpy as np

df = pd.DataFrame([[38.0, 2.0, 18.0, 22.0, 21, np.nan],[19, 439, 6, 452, 226,232]],
                  index=pd.Index(['Tumour (Positive)', 'Non-Tumour (Negative)'], name='Actual Label:'),
                  columns=pd.MultiIndex.from_product([['Decision Tree', 'Regression', 'Random'],['Tumour', 'Non-Tumour']], names=['Model:', 'Predicted:']))
df.style

Model:,Decision Tree,Decision Tree,Regression,Regression,Random,Random
Predicted:,Tumour,Non-Tumour,Tumour,Non-Tumour,Tumour,Non-Tumour
Actual Label:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Tumour (Positive),38.0,2.0,18.0,22.0,21,
Non-Tumour (Negative),19.0,439.0,6.0,452.0,226,232.0


In [135]:
s = df.style.format('{:.0f}').hide_columns([('Random', 'Tumour'), ('Random', 'Non-Tumour')])
s

Model:,Decision Tree,Decision Tree,Regression,Regression
Predicted:,Tumour,Non-Tumour,Tumour,Non-Tumour
Actual Label:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Tumour (Positive),38,2,18,22
Non-Tumour (Negative),19,439,6,452


In [137]:
s.set_table_styles([  # create internal CSS classes
    {'selector': '.true', 'props': 'background-color: #e6ffe6;'},
    {'selector': '.false', 'props': 'background-color: #ffe6e6;'},
], overwrite=False)
cell_color = pd.DataFrame([['true ', 'false ', 'true ', 'false '],
                           ['false ', 'true ', 'false ', 'true ']],
                          index=df.index,
                          columns=df.columns[:4])
s.set_td_classes(cell_color)

Model:,Decision Tree,Decision Tree,Regression,Regression
Predicted:,Tumour,Non-Tumour,Tumour,Non-Tumour
Actual Label:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Tumour (Positive),38,2,18,22
Non-Tumour (Negative),19,439,6,452


In [138]:
cell_color

Model:,Decision Tree,Decision Tree,Regression,Regression
Predicted:,Tumour,Non-Tumour,Tumour,Non-Tumour
Actual Label:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Tumour (Positive),True,False,True,False
Non-Tumour (Negative),False,True,False,True


Unnamed: 0,mean_score,st_dev_score,model__max_features
1,0.773,0.027,10
3,0.768,0.032,auto
2,0.766,0.028,40
0,0.759,0.036,2


In [119]:
grid_search.best_estimator_._final_estimator.feature_importances_

array([0.0754755 , 0.10930985, 0.03194036, 0.03501169, 0.09137456,
       0.01725489, 0.00843814, 0.01694168, 0.03699194, 0.00519236,
       0.04190846, 0.00586453, 0.00897108, 0.02222291, 0.00895831,
       0.01341363, 0.01557162, 0.01043077, 0.00248649, 0.01105474,
       0.01361007, 0.0193251 , 0.00170111, 0.01521   , 0.00436435,
       0.00093963, 0.0078134 , 0.00762163, 0.00604044, 0.01788119,
       0.00411842, 0.01164173, 0.01523977, 0.01113989, 0.01360795,
       0.01218306, 0.00995284, 0.01375158, 0.00675196, 0.007414  ,
       0.01506316, 0.00495152, 0.00663783, 0.01006701, 0.01553278,
       0.01272542, 0.01291412, 0.01389387, 0.01395413, 0.0160065 ,
       0.00517882, 0.00761786, 0.01749944, 0.01055352, 0.01163606,
       0.01406286, 0.00222072, 0.00978983, 0.0128646 , 0.01309963,
       0.00232712, 0.00228156])

In [120]:
grid_search.best_estimator_._final_estimator.feature_importances_.shape

(62,)

# TODO

- decide between imputing missing values and removing missing data; via tuning parameter(s)

- get feature importance for model that has various transformations
    - https://towardsdatascience.com/how-to-get-feature-importances-from-any-sklearn-pipeline-167a19f1214
    - https://stackoverflow.com/questions/38787612/how-to-extract-feature-importances-from-an-sklearn-pipeline

- future importance
    - https://www.kaggle.com/general/175075
        - LOFO (Leave one feature out) for feature importance.
    - https://explained.ai/rf-importance/