In [1]:
import helpsk as hlp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

# Load Data

In [2]:
#https://www.openml.org/d/31
credit_g = fetch_openml('credit-g', version=1)
credit_data = credit_g['data']
credit_data['target'] = credit_g['target']
credit_data.shape

(1000, 21)

In [3]:
## Create Missing Values

credit_data['duration'].iloc[0:46] = np.nan
credit_data['checking_status'].iloc[25:75] = np.nan
credit_data['credit_amount'].iloc[10:54] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [5]:
hlp.pandas.numeric_summary(credit_data, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,954,46,5.0%,0,0.0%,20.9,12.0,0.6,1.1,1.0,4.0,9.0,12.0,18.0,24.0,36.0,72.0
credit_amount,1000,0,0.0%,44,4.0%,3132.9,2853.4,0.9,1.9,4.3,0.0,740.0,1287.8,2224.0,3873.5,7119.8,18424.0
installment_commitment,1000,0,0.0%,0,0.0%,3.0,1.1,0.4,-0.5,-1.2,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,1000,0,0.0%,0,0.0%,2.9,1.1,0.4,-0.3,-1.4,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,1000,0,0.0%,0,0.0%,35.5,11.4,0.3,1.0,0.6,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,1000,0,0.0%,0,0.0%,1.4,0.6,0.4,1.3,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,1000,0,0.0%,0,0.0%,1.2,0.4,0.3,1.9,1.6,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [7]:
hlp.pandas.non_numeric_summary(credit_data, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
checking_status,950,50,0.05,no checking,4,0.4%
credit_history,1000,0,0.0,existing paid,5,0.5%
purpose,1000,0,0.0,radio/tv,10,1.0%
savings_status,1000,0,0.0,<100,5,0.5%
employment,1000,0,0.0,1<=X<4,5,0.5%
personal_status,1000,0,0.0,male single,4,0.4%
other_parties,1000,0,0.0,none,3,0.3%
property_magnitude,1000,0,0.0,car,4,0.4%
other_payment_plans,1000,0,0.0,none,3,0.3%
housing,1000,0,0.0,own,3,0.3%


# Training and Test Data

In [15]:
y_full = credit_data['target']
X_full = credit_data.drop(columns='target')

In [16]:
hlp.pandas.value_frequency(series=y_full)

Unnamed: 0,Frequency,Percent
good,700,0.7
bad,300,0.3


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

In [18]:
del y_full, X_full

In [19]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(800, 20)
(800,)
(200, 20)
(200,)


In [20]:
hlp.pandas.value_frequency(series=y_train)

Unnamed: 0,Frequency,Percent
good,559,0.69875
bad,241,0.30125


In [21]:
hlp.pandas.value_frequency(series=y_test)

Unnamed: 0,Frequency,Percent
good,141,0.705
bad,59,0.295


# Transformation Pipeline

In [22]:
class TransformerChooser(BaseEstimator, TransformerMixin):
    """Transformer that wraps another Transformer. This allows different transformer objects to be tuned.
    """
    def __init__(self, base_transformer=None):
        """
        Args:
            base_transformer:
                Transformer object (e.g. StandardScaler, MinMaxScaler)
        """
        self.base_transformer = base_transformer

    def fit(self, X, y=None):
        if self.base_transformer is None:
            return self

        return self.base_transformer.fit(X, y)

    def transform(self, X):
        if self.base_transformer is None:
            return X

        return self.base_transformer.transform(X)

In [23]:
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [24]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    # this is here so that we can select between MinMax and Scaler
    # if this pipeline is ran in a context outside of tuning, no transformation will take place
    ('scaling_chooser', TransformerChooser()),
])

In [25]:
non_numeric_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder()),
])

In [26]:
temp = non_numeric_pipeline.fit_transform(X_train[non_numeric_columns])

In [27]:
print(type(temp))
print(temp.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(800, 55)


In [28]:
temp.toarray()[0:10, 0:10]

array([[0., 0., 0., 0., 1., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])

In [29]:
non_numeric_pipeline.steps[0][1].categories_

[array(['0<=X<200', '<0', '>=200', 'no checking', nan], dtype=object),
 array(['all paid', 'critical/other existing credit', 'delayed previously',
        'existing paid', 'no credits/all paid'], dtype=object),
 array(['business', 'domestic appliance', 'education',
        'furniture/equipment', 'new car', 'other', 'radio/tv', 'repairs',
        'retraining', 'used car'], dtype=object),
 array(['100<=X<500', '500<=X<1000', '<100', '>=1000', 'no known savings'],
       dtype=object),
 array(['1<=X<4', '4<=X<7', '<1', '>=7', 'unemployed'], dtype=object),
 array(['female div/dep/mar', 'male div/sep', 'male mar/wid',
        'male single'], dtype=object),
 array(['co applicant', 'guarantor', 'none'], dtype=object),
 array(['car', 'life insurance', 'no known property', 'real estate'],
       dtype=object),
 array(['bank', 'none', 'stores'], dtype=object),
 array(['for free', 'own', 'rent'], dtype=object),
 array(['high qualif/self emp/mgmt', 'skilled', 'unemp/unskilled non res',
        'un

In [30]:
from sklearn.compose import ColumnTransformer
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', non_numeric_pipeline, non_numeric_columns)
])

In [31]:
temp = transformations_pipeline.fit_transform(X_train)

In [32]:
temp.shape

(800, 62)

In [33]:
pd.DataFrame(temp)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,21.017105,0.0,3.0,4.0,63.0,2.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,21.000000,2319.0,2.0,1.0,33.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,6.000000,1236.0,2.0,4.0,50.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,21.000000,5003.0,1.0,4.0,29.0,2.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,12.000000,886.0,4.0,2.0,21.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,18.000000,6458.0,2.0,4.0,39.0,2.0,2.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
796,18.000000,2662.0,4.0,3.0,32.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
797,24.000000,5804.0,4.0,2.0,27.0,2.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
798,12.000000,1484.0,2.0,1.0,25.0,1.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [34]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier()

In [35]:
full_pipeline = Pipeline([
    ('preparation', transformations_pipeline),
    #('pca_chooser', ChooserTransform()),  # PCA option lost; didn't include
    #('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('model', random_forest_model)
])

In [36]:
full_pipeline.n_features_in_

20

In [37]:
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps

{'preparation': ColumnTransformer(transformers=[('numeric_pipeline',
                                  Pipeline(steps=[('imputer', SimpleImputer()),
                                                  ('scaling_chooser',
                                                   TransformerChooser())]),
                                  ['duration', 'credit_amount',
                                   'installment_commitment', 'residence_since',
                                   'age', 'existing_credits',
                                   'num_dependents']),
                                 ('non_numeric_pipeline',
                                  Pipeline(steps=[('one_hot_encoder',
                                                   OneHotEncoder())]),
                                  ['checking_status', 'credit_history',
                                   'purpose', 'savings_status', 'employment',
                                   'personal_status', 'other_parties',
                        

In [38]:
param_grad = [
    {'preparation__numeric_pipeline__scaling_chooser__base_transformer': [MinMaxScaler(), StandardScaler()],
     'model__max_features': [2, 10, 40, 'auto'],
     'model__n_estimators': [50, 100, 500, 1000]}
]

In [39]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(full_pipeline, param_grid=param_grad, cv=5, scoring='roc_auc', return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preparation',
                                        ColumnTransformer(transformers=[('numeric_pipeline',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaling_chooser',
                                                                                          TransformerChooser())]),
                                                                         ['duration',
                                                                          'credit_amount',
                                                                          'installment_commitment',
                                                                          'residence_since',
                              

In [40]:
grid_search.cv_results_

{'mean_fit_time': array([0.0544281 , 0.04794173, 0.08855882, 0.08830881, 0.41427388,
        0.42141395, 0.81490946, 0.8130579 , 0.05799007, 0.05789714,
        0.10878429, 0.10885267, 0.51358538, 0.51474023, 1.03005104,
        1.02129736, 0.0944798 , 0.09460478, 0.18220453, 0.18197255,
        0.88914995, 0.88246827, 1.75043201, 1.75268998, 0.05444045,
        0.05463872, 0.10183825, 0.10227795, 0.48109493, 0.53407741,
        0.95673103, 0.96274214]),
 'std_fit_time': array([1.03157425e-02, 7.26098528e-05, 4.09603302e-04, 2.27235396e-04,
        3.35174427e-03, 2.05098023e-02, 8.82084586e-04, 1.68775802e-03,
        5.81267332e-04, 1.90390120e-04, 7.09885425e-04, 5.42900462e-04,
        1.92643679e-03, 1.60760671e-03, 1.73124020e-02, 4.10003414e-03,
        6.47469451e-04, 4.93950141e-04, 9.85661685e-04, 7.32713855e-04,
        1.56092350e-02, 3.30587947e-03, 8.47111629e-03, 7.81814050e-03,
        2.13653986e-04, 2.13335346e-04, 2.92743011e-04, 4.47078319e-04,
        3.26451709e-0

In [41]:
cvres = grid_search.cv_results_
results_df = pd.concat([pd.DataFrame({'mean_score': cvres["mean_test_score"], 'st_dev_score': cvres["std_test_score"]}),
          pd.DataFrame(cvres["params"])],
          axis=1)
results_df = results_df.sort_values(by=['mean_score'], ascending=False)
results_df

Unnamed: 0,mean_score,st_dev_score,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer
31,0.782204,0.032646,auto,1000,StandardScaler()
26,0.780373,0.027692,auto,100,MinMaxScaler()
28,0.777945,0.030656,auto,500,MinMaxScaler()
15,0.777631,0.031153,10,1000,StandardScaler()
30,0.776872,0.030788,auto,1000,MinMaxScaler()
29,0.776609,0.027129,auto,500,StandardScaler()
27,0.776428,0.031546,auto,100,StandardScaler()
13,0.775982,0.02913,10,500,StandardScaler()
12,0.774849,0.031672,10,500,MinMaxScaler()
14,0.774315,0.032466,10,1000,MinMaxScaler()


In [44]:
# results_df.style.\
#     format(precision=3, na_rep='<Missing>', thousands=",").apply(lambda row: \
#      row.apply(lambda col: \
#      'background: lightgreen' if row.name == 30 else ''), \
#      axis=1).hide_index()

In [None]:
# results_df.style.format(precision=3, na_rep='<Missing>', thousands=",").\
#     bar(subset=['mean_score', 'max'], color='grey').\
#     bar(subset=['min'], color='grey', align='left').

In [None]:
#results_df['min'] = results_df['mean_score'] - (2 * results_df['st_dev_score'])
#results_df['max'] = results_df['mean_score'] + (2 * results_df['st_dev_score'])

In [46]:

results_mod = results_df.copy()
#results_mod.drop(columns=['min', 'max'], inplace = True)
results_mod.insert(1, 'mean*-2SD', results_df['mean_score'] - (2 * results_df['st_dev_score']))
results_mod.insert(2, 'mean*+2SD', results_df['mean_score'] + (2 * results_df['st_dev_score']))
results_mod.drop(columns=['st_dev_score'], inplace = True)
results_mod

Unnamed: 0,mean_score,mean*-2SD,mean*+2SD,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer
31,0.782204,0.716912,0.847497,auto,1000,StandardScaler()
26,0.780373,0.72499,0.835756,auto,100,MinMaxScaler()
28,0.777945,0.716632,0.839258,auto,500,MinMaxScaler()
15,0.777631,0.715326,0.839937,10,1000,StandardScaler()
30,0.776872,0.715297,0.838447,auto,1000,MinMaxScaler()
29,0.776609,0.722351,0.830867,auto,500,StandardScaler()
27,0.776428,0.713335,0.83952,auto,100,StandardScaler()
13,0.775982,0.717721,0.834242,10,500,StandardScaler()
12,0.774849,0.711506,0.838193,10,500,MinMaxScaler()
14,0.774315,0.709382,0.839248,10,1000,MinMaxScaler()


In [48]:
results_mod.\
    pipe(hlp.pandas_style.format,
         subset=['mean_score', 'mean*-2SD', 'mean*+2SD'],
         round_by=3,
         hide_index=True). \
    bar(subset=['mean_score'], color='#5fba7d').\
    bar(subset=['mean*+2SD'], color='gray').\
    pipe(hlp.pandas_style.bar_inverse, subset=['mean*-2SD'], color='gray')

mean_score,mean*-2SD,mean*+2SD,model__max_features,model__n_estimators,preparation__numeric_pipeline__scaling_chooser__base_transformer
0.782,0.717,0.847,auto,1000,StandardScaler()
0.78,0.725,0.836,auto,100,MinMaxScaler()
0.778,0.717,0.839,auto,500,MinMaxScaler()
0.778,0.715,0.84,10,1000,StandardScaler()
0.777,0.715,0.838,auto,1000,MinMaxScaler()
0.777,0.722,0.831,auto,500,StandardScaler()
0.776,0.713,0.84,auto,100,StandardScaler()
0.776,0.718,0.834,10,500,StandardScaler()
0.775,0.712,0.838,10,500,MinMaxScaler()
0.774,0.709,0.839,10,1000,MinMaxScaler()


In [49]:
# results_mod.style.format(precision=3, na_rep='<Missing>', thousands=",").\
#     bar(subset=['mean_score'], color='#5fba7d').\
#     bar(subset=['mean*+2SD'], color='gray').\
#     pipe(hlp.pandas_style.bar_inverse, subset=['mean*-2SD'], color='gray').\
#     hide_index().\
#     highlight_null(null_color=hlp.color.Colors.AVOCADO)

#    highlight_between(subset='min', color='yellow', left=0.7, right=.8).

# TODO

- decide between imputing missing values and removing missing data; via tuning parameter(s)

- get feature importance for model that has various transformations
    - https://towardsdatascience.com/how-to-get-feature-importances-from-any-sklearn-pipeline-167a19f1214
    - https://stackoverflow.com/questions/38787612/how-to-extract-feature-importances-from-an-sklearn-pipeline
   
```
grid_search.best_estimator_._final_estimator.feature_importances_
grid_search.best_estimator_._final_estimator.feature_importances_.shape
```

- future importance
    - https://www.kaggle.com/general/175075
        - LOFO (Leave one feature out) for feature importance.
    - https://explained.ai/rf-importance/