In [62]:
import helpsk as hlp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

%matplotlib inline

# Load Data

In [3]:
#https://www.openml.org/d/31
credit_g = fetch_openml('credit-g', version=1)
credit_data = credit_g['data']
credit_data['target'] = credit_g['target']
credit_data.shape

## Create Missing Values

credit_data['duration'].iloc[0:50] = np.nan
credit_data['checking_status'].iloc[25:75] = np.nan

In [4]:
hlp.pandas.numeric_summary(credit_data)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,950,50,0.05,0,0.0,20.874,11.957,0.573,1.102,0.987,4.0,9.0,12.0,18.0,24.0,36.0,72.0
credit_amount,1000,0,0.0,0,0.0,3271.258,2822.737,0.863,1.95,4.293,250.0,932.0,1365.5,2319.5,3972.25,7179.4,18424.0
installment_commitment,1000,0,0.0,0,0.0,2.973,1.119,0.376,-0.531,-1.21,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,1000,0,0.0,0,0.0,2.845,1.104,0.388,-0.273,-1.381,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,1000,0,0.0,0,0.0,35.546,11.375,0.32,1.021,0.596,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,1000,0,0.0,0,0.0,1.407,0.578,0.411,1.273,1.604,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,1000,0,0.0,0,0.0,1.155,0.362,0.313,1.909,1.649,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [5]:
hlp.pandas.non_numeric_summary(credit_data)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Null,Most Freq. Value,# of Unique,% Unique
checking_status,950,50,0.05,no checking,4,0.004
credit_history,1000,0,0.0,existing paid,5,0.005
purpose,1000,0,0.0,radio/tv,10,0.01
savings_status,1000,0,0.0,<100,5,0.005
employment,1000,0,0.0,1<=X<4,5,0.005
personal_status,1000,0,0.0,male single,4,0.004
other_parties,1000,0,0.0,none,3,0.003
property_magnitude,1000,0,0.0,car,4,0.004
other_payment_plans,1000,0,0.0,none,3,0.003
housing,1000,0,0.0,own,3,0.003


# Training and Test Data

In [13]:
y_full = credit_data['target']
X_full = credit_data.drop(columns='target')

In [14]:
hlp.pandas.value_frequency(series=y_full)

Unnamed: 0,Frequency,Percent
good,700,0.7
bad,300,0.3


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

In [16]:
del y_full, X_full

In [17]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(800, 20)
(800,)
(200, 20)
(200,)


In [19]:
hlp.pandas.value_frequency(series=y_train)

Unnamed: 0,Frequency,Percent
good,559,0.69875
bad,241,0.30125


In [18]:
hlp.pandas.value_frequency(series=y_test)

Unnamed: 0,Frequency,Percent
good,141,0.705
bad,59,0.295


# Transformation Pipeline

In [67]:
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [68]:
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('standard_scaler', StandardScaler()),
])

In [69]:
temp = numeric_pipeline.fit_transform(X_train[numeric_columns])
print(type(temp))
print(temp.shape)

<class 'numpy.ndarray'>
(800, 7)


In [70]:
np.mean(temp, axis=0).round(3)

array([-0., -0.,  0.,  0., -0.,  0., -0.])

In [71]:
np.std(temp, axis=0)

array([1., 1., 1., 1., 1., 1., 1.])

In [72]:
non_numeric_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder()),
])

In [73]:
temp = non_numeric_pipeline.fit_transform(X_train[non_numeric_columns])

In [77]:
print(type(temp))
print(temp.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(800, 55)


In [80]:
temp.toarray()[0:10, 0:10]

array([[0., 0., 0., 0., 1., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])

In [87]:
non_numeric_pipeline.steps[0][1].categories_

[array(['0<=X<200', '<0', '>=200', 'no checking', nan], dtype=object),
 array(['all paid', 'critical/other existing credit', 'delayed previously',
        'existing paid', 'no credits/all paid'], dtype=object),
 array(['business', 'domestic appliance', 'education',
        'furniture/equipment', 'new car', 'other', 'radio/tv', 'repairs',
        'retraining', 'used car'], dtype=object),
 array(['100<=X<500', '500<=X<1000', '<100', '>=1000', 'no known savings'],
       dtype=object),
 array(['1<=X<4', '4<=X<7', '<1', '>=7', 'unemployed'], dtype=object),
 array(['female div/dep/mar', 'male div/sep', 'male mar/wid',
        'male single'], dtype=object),
 array(['co applicant', 'guarantor', 'none'], dtype=object),
 array(['car', 'life insurance', 'no known property', 'real estate'],
       dtype=object),
 array(['bank', 'none', 'stores'], dtype=object),
 array(['for free', 'own', 'rent'], dtype=object),
 array(['high qualif/self emp/mgmt', 'skilled', 'unemp/unskilled non res',
        'un

In [103]:
from sklearn.compose import ColumnTransformer
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', non_numeric_pipeline, non_numeric_columns)
])

In [104]:
temp = transformations_pipeline.fit_transform(X_train)

In [105]:
temp.shape

(800, 62)

In [106]:
pd.DataFrame(temp)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,-3.129740e-16,1.199912,0.031196,1.044509,2.406187,1.017777,-0.409736,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,-3.728861e-03,-0.359630,-0.860109,-1.671440,-0.224364,-0.710931,-0.409736,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,-1.325144e+00,-0.733547,-0.860109,1.044509,1.266282,-0.710931,-0.409736,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,-3.728861e-03,0.567050,-1.751413,1.044509,-0.575104,1.017777,-0.409736,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,-7.965779e-01,-0.854388,0.922500,-0.766124,-1.276585,-0.710931,-0.409736,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,-2.680119e-01,1.069404,-0.860109,1.044509,0.301746,1.017777,2.440599,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
796,-2.680119e-01,-0.241206,0.922500,0.139192,-0.312049,-0.710931,-0.409736,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
797,2.605542e-01,0.843603,0.922500,-0.766124,-0.750474,1.017777,-0.409736,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
798,-7.965779e-01,-0.647923,-0.860109,-1.671440,-0.925844,-0.710931,-0.409736,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [98]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier()

In [107]:
full_pipeline = Pipeline([
    ('preparation', transformations_pipeline),
    #('pca_chooser', ChooserTransform()),  # PCA option lost; didn't include
    #('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('model', random_forest_model)
])

In [108]:
full_pipeline.named_steps

{'preparation': ColumnTransformer(transformers=[('numeric_pipeline',
                                  Pipeline(steps=[('imputer', SimpleImputer()),
                                                  ('standard_scaler',
                                                   StandardScaler())]),
                                  ['duration', 'credit_amount',
                                   'installment_commitment', 'residence_since',
                                   'age', 'existing_credits',
                                   'num_dependents']),
                                 ('non_numeric_pipeline',
                                  Pipeline(steps=[('one_hot_encoder',
                                                   OneHotEncoder())]),
                                  ['checking_status', 'credit_history',
                                   'purpose', 'savings_status', 'employment',
                                   'personal_status', 'other_parties',
                            

In [111]:
param_grad = [
    {'model__max_features': [2, 10, 40, 'auto']}
]

In [113]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(full_pipeline, param_grid=param_grad, cv=5, scoring='roc_auc', return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preparation',
                                        ColumnTransformer(transformers=[('numeric_pipeline',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('standard_scaler',
                                                                                          StandardScaler())]),
                                                                         ['duration',
                                                                          'credit_amount',
                                                                          'installment_commitment',
                                                                          'residence_since',
                                  

In [116]:
grid_search.cv_results_

{'mean_fit_time': array([0.09921227, 0.10989456, 0.18177648, 0.10143366]),
 'std_fit_time': array([0.01923734, 0.00116743, 0.00103342, 0.000498  ]),
 'mean_score_time': array([0.01016941, 0.00937796, 0.00900736, 0.00913258]),
 'std_score_time': array([1.00247187e-03, 2.16596948e-04, 4.84119126e-05, 3.45611230e-05]),
 'param_model__max_features': masked_array(data=[2, 10, 40, 'auto'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'model__max_features': 2},
  {'model__max_features': 10},
  {'model__max_features': 40},
  {'model__max_features': 'auto'}],
 'split0_test_score': array([0.76098548, 0.77477477, 0.77973892, 0.76594962]),
 'split1_test_score': array([0.73465402, 0.75176711, 0.72265625, 0.73614211]),
 'split2_test_score': array([0.71912202, 0.75223214, 0.76376488, 0.75604539]),
 'split3_test_score': array([0.82421875, 0.8249628 , 0.80831473, 0.82933408]),
 'split4_test_score': array([0.7546503 , 0.75948661, 0.757

In [117]:
cvres = grid_search.cv_results_
results_df = pd.concat([pd.DataFrame({'mean_score': cvres["mean_test_score"], 'st_dev_score': cvres["std_test_score"]}),
          pd.DataFrame(cvres["params"])],
          axis=1)
results_df.sort_values(by=['mean_score'], ascending=False).head(10)

Unnamed: 0,mean_score,st_dev_score,model__max_features
1,0.772645,0.02745,10
3,0.767643,0.032314,auto
2,0.766457,0.028032,40
0,0.758726,0.035941,2


In [119]:
grid_search.best_estimator_._final_estimator.feature_importances_

array([0.0754755 , 0.10930985, 0.03194036, 0.03501169, 0.09137456,
       0.01725489, 0.00843814, 0.01694168, 0.03699194, 0.00519236,
       0.04190846, 0.00586453, 0.00897108, 0.02222291, 0.00895831,
       0.01341363, 0.01557162, 0.01043077, 0.00248649, 0.01105474,
       0.01361007, 0.0193251 , 0.00170111, 0.01521   , 0.00436435,
       0.00093963, 0.0078134 , 0.00762163, 0.00604044, 0.01788119,
       0.00411842, 0.01164173, 0.01523977, 0.01113989, 0.01360795,
       0.01218306, 0.00995284, 0.01375158, 0.00675196, 0.007414  ,
       0.01506316, 0.00495152, 0.00663783, 0.01006701, 0.01553278,
       0.01272542, 0.01291412, 0.01389387, 0.01395413, 0.0160065 ,
       0.00517882, 0.00761786, 0.01749944, 0.01055352, 0.01163606,
       0.01406286, 0.00222072, 0.00978983, 0.0128646 , 0.01309963,
       0.00232712, 0.00228156])

In [120]:
grid_search.best_estimator_._final_estimator.feature_importances_.shape

(62,)

# TODO

- decide between imputing missing values and removing missing data; via tuning parameter(s)

- get feature importance for model that has various transformations
    - https://towardsdatascience.com/how-to-get-feature-importances-from-any-sklearn-pipeline-167a19f1214
    - https://stackoverflow.com/questions/38787612/how-to-extract-feature-importances-from-an-sklearn-pipeline

- future importance
    - https://www.kaggle.com/general/175075
        - LOFO (Leave one feature out) for feature importance.
    - https://explained.ai/rf-importance/