In [125]:
import helpsk as hlp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

# Load Data

In [2]:
#https://www.openml.org/d/31
credit_g = fetch_openml('credit-g', version=1)
credit_data = credit_g['data']
credit_data['target'] = credit_g['target']
credit_data.shape

(1000, 21)

In [3]:
## Create Missing Values

credit_data['duration'].iloc[0:46] = np.nan
credit_data['checking_status'].iloc[25:75] = np.nan
credit_data['credit_amount'].iloc[10:54] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [5]:
hlp.pandas.numeric_summary(credit_data, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,954,46,5.0%,0,0.0%,20.9,12.0,0.6,1.1,1.0,4.0,9.0,12.0,18.0,24.0,36.0,72.0
credit_amount,1000,0,0.0%,44,4.0%,3132.9,2853.4,0.9,1.9,4.3,0.0,740.0,1287.8,2224.0,3873.5,7119.8,18424.0
installment_commitment,1000,0,0.0%,0,0.0%,3.0,1.1,0.4,-0.5,-1.2,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,1000,0,0.0%,0,0.0%,2.9,1.1,0.4,-0.3,-1.4,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,1000,0,0.0%,0,0.0%,35.5,11.4,0.3,1.0,0.6,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,1000,0,0.0%,0,0.0%,1.4,0.6,0.4,1.3,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,1000,0,0.0%,0,0.0%,1.2,0.4,0.3,1.9,1.6,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [7]:
hlp.pandas.non_numeric_summary(credit_data, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
checking_status,950,50,0.05,no checking,4,0.4%
credit_history,1000,0,0.0,existing paid,5,0.5%
purpose,1000,0,0.0,radio/tv,10,1.0%
savings_status,1000,0,0.0,<100,5,0.5%
employment,1000,0,0.0,1<=X<4,5,0.5%
personal_status,1000,0,0.0,male single,4,0.4%
other_parties,1000,0,0.0,none,3,0.3%
property_magnitude,1000,0,0.0,car,4,0.4%
other_payment_plans,1000,0,0.0,none,3,0.3%
housing,1000,0,0.0,own,3,0.3%


# Training and Test Data

In [210]:
y_full = credit_data['target']
X_full = credit_data.drop(columns='target')

In [215]:
X_full.head(30)


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,<0,,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,4.0,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes
1,0<=X<200,,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,2.0,real estate,22.0,none,own,1.0,skilled,1.0,none,yes
2,no checking,,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,3.0,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes
3,<0,,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,4.0,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes
4,<0,,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,4.0,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes
5,no checking,,existing paid,education,9055.0,no known savings,1<=X<4,2.0,male single,none,4.0,no known property,35.0,none,for free,1.0,unskilled resident,2.0,yes,yes
6,no checking,,existing paid,furniture/equipment,2835.0,500<=X<1000,>=7,3.0,male single,none,4.0,life insurance,53.0,none,own,1.0,skilled,1.0,none,yes
7,0<=X<200,,existing paid,used car,6948.0,<100,1<=X<4,2.0,male single,none,2.0,car,35.0,none,rent,1.0,high qualif/self emp/mgmt,1.0,yes,yes
8,no checking,,existing paid,radio/tv,3059.0,>=1000,4<=X<7,2.0,male div/sep,none,4.0,real estate,61.0,none,own,1.0,unskilled resident,1.0,none,yes
9,0<=X<200,,critical/other existing credit,new car,5234.0,<100,unemployed,4.0,male mar/wid,none,2.0,car,28.0,none,own,2.0,high qualif/self emp/mgmt,1.0,none,yes


In [212]:
hlp.pandas.value_frequency(series=y_full)

Unnamed: 0,Frequency,Percent
good,700,0.7
bad,300,0.3


In [216]:
X_full.head(30)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,<0,,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,4.0,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes
1,0<=X<200,,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,2.0,real estate,22.0,none,own,1.0,skilled,1.0,none,yes
2,no checking,,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,3.0,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes
3,<0,,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,4.0,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes
4,<0,,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,4.0,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes
5,no checking,,existing paid,education,9055.0,no known savings,1<=X<4,2.0,male single,none,4.0,no known property,35.0,none,for free,1.0,unskilled resident,2.0,yes,yes
6,no checking,,existing paid,furniture/equipment,2835.0,500<=X<1000,>=7,3.0,male single,none,4.0,life insurance,53.0,none,own,1.0,skilled,1.0,none,yes
7,0<=X<200,,existing paid,used car,6948.0,<100,1<=X<4,2.0,male single,none,2.0,car,35.0,none,rent,1.0,high qualif/self emp/mgmt,1.0,yes,yes
8,no checking,,existing paid,radio/tv,3059.0,>=1000,4<=X<7,2.0,male div/sep,none,4.0,real estate,61.0,none,own,1.0,unskilled resident,1.0,none,yes
9,0<=X<200,,critical/other existing credit,new car,5234.0,<100,unemployed,4.0,male mar/wid,none,2.0,car,28.0,none,own,2.0,high qualif/self emp/mgmt,1.0,none,yes


In [217]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

In [218]:
X_train

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
29,,,delayed previously,business,0.0,<100,>=7,3.0,male single,none,4.0,no known property,63.0,none,own,2.0,skilled,1.0,yes,yes
535,>=200,21.0,critical/other existing credit,education,2319.0,<100,<1,2.0,male div/sep,none,1.0,car,33.0,none,rent,1.0,skilled,1.0,none,yes
695,no checking,6.0,existing paid,used car,1236.0,500<=X<1000,1<=X<4,2.0,male single,none,4.0,life insurance,50.0,none,rent,1.0,skilled,1.0,none,yes
557,no checking,21.0,no credits/all paid,new car,5003.0,no known savings,1<=X<4,1.0,female div/dep/mar,none,4.0,life insurance,29.0,bank,own,2.0,skilled,1.0,yes,yes
836,no checking,12.0,existing paid,radio/tv,886.0,no known savings,1<=X<4,4.0,female div/dep/mar,none,2.0,car,21.0,none,own,1.0,skilled,1.0,none,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,no checking,18.0,all paid,new car,6458.0,<100,>=7,2.0,male single,none,4.0,no known property,39.0,bank,own,2.0,high qualif/self emp/mgmt,2.0,yes,yes
270,no checking,18.0,existing paid,new car,2662.0,no known savings,4<=X<7,4.0,male single,none,3.0,life insurance,32.0,none,own,1.0,skilled,1.0,none,no
860,no checking,24.0,critical/other existing credit,used car,5804.0,>=1000,1<=X<4,4.0,male single,none,2.0,real estate,27.0,none,own,2.0,skilled,1.0,none,yes
435,0<=X<200,12.0,existing paid,radio/tv,1484.0,no known savings,1<=X<4,2.0,male mar/wid,none,1.0,real estate,25.0,none,own,1.0,skilled,1.0,yes,yes


In [204]:
del y_full, X_full

In [205]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

(800, 20)
(800,)
(200, 20)
(200,)


In [206]:
hlp.pandas.value_frequency(series=y_train)

Unnamed: 0,Frequency,Percent
good,559,0.69875
bad,241,0.30125


In [207]:
hlp.pandas.value_frequency(series=y_test)

Unnamed: 0,Frequency,Percent
good,141,0.705
bad,59,0.295


# Transformation Pipeline

In [22]:
class TransformerChooser(BaseEstimator, TransformerMixin):
    """Transformer that wraps another Transformer. This allows different transformer objects to be tuned.
    """
    def __init__(self, base_transformer=None):
        """
        Args:
            base_transformer:
                Transformer object (e.g. StandardScaler, MinMaxScaler)
        """
        self.base_transformer = base_transformer

    def fit(self, X, y=None):
        if self.base_transformer is None:
            return self

        return self.base_transformer.fit(X, y)

    def transform(self, X):
        if self.base_transformer is None:
            return X

        return self.base_transformer.transform(X)

In [77]:
class DropMissingValuesTransformer(BaseEstimator, TransformerMixin):
    """Any row with missing values will be dropped.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.dropna(axis=0)

In [104]:
class DoNothingTransformer(BaseEstimator, TransformerMixin):
    """Dummy Transformer that doesn't do anything
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

In [199]:
X_train[['purpose', 'savings_status']].shape

(800, 2)

In [219]:
X_train[['purpose', 'savings_status']]

Unnamed: 0,purpose,savings_status
29,business,<100
535,education,<100
695,used car,500<=X<1000
557,new car,no known savings
836,radio/tv,no known savings
...,...,...
106,new car,<100
270,new car,no known savings
860,used car,>=1000
435,radio/tv,no known savings


In [220]:
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [223]:
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [224]:
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [225]:
numeric_pipeline = Pipeline([
    #tune whether or not we want to impute or simply remove rows with missing values
    ('impute_chooser', TransformerChooser()),
    # this is here so that we can select between MinMax and Scaler
    # if this pipeline is ran in a context outside of tuning, no transformation will take place
    ('scaling_chooser', TransformerChooser()),
])

In [226]:
non_numeric_pipeline = Pipeline([
    ('encoder_chooser', TransformerChooser()),
])

In [227]:
#temp = non_numeric_pipeline.fit_transform(X_train[non_numeric_columns])

In [228]:
#print(type(temp))
#print(temp.shape)

In [229]:
#temp.toarray()[0:10, 0:10]

In [230]:
#non_numeric_pipeline.steps[0][1].categories_

In [231]:
from sklearn.compose import ColumnTransformer
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', non_numeric_pipeline, non_numeric_columns)
])

In [232]:
temp = transformations_pipeline.fit_transform(X_train)

In [233]:
temp.shape

(800, 20)

In [234]:
#pd.DataFrame(temp)

In [235]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier()

In [236]:
full_pipeline = Pipeline([
    ('preparation', transformations_pipeline),
    #('pca_chooser', ChooserTransform()),  # PCA option lost; didn't include
    #('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('model', random_forest_model)
])

In [237]:
full_pipeline.n_features_in_

20

In [238]:
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps

{'preparation': ColumnTransformer(transformers=[('numeric_pipeline',
                                  Pipeline(steps=[('impute_chooser',
                                                   TransformerChooser()),
                                                  ('scaling_chooser',
                                                   TransformerChooser())]),
                                  ['duration', 'credit_amount',
                                   'installment_commitment', 'residence_since',
                                   'age', 'existing_credits',
                                   'num_dependents']),
                                 ('non_numeric_pipeline',
                                  Pipeline(steps=[('encoder_chooser',
                                                   TransformerChooser())]),
                                  ['checking_status', 'credit_history',
                                   'purpose', 'savings_status', 'employment',
                           

In [251]:
temp = X_train.copy()
for column in temp.columns.values:
    if temp[column].dtype.name == 'category':
        print(column)

checking_status
credit_history
purpose
savings_status
employment
personal_status
other_parties
property_magnitude
other_payment_plans
housing
job
own_telephone
foreign_worker


In [268]:
class CustomOrdinalEncoder(BaseEstimator, TransformerMixin):
    """First replaces missing values with '<missing>' then applies OrdinalEncoder
        
    """
    def __init__(self):
        self._ordinal_encoder = OrdinalEncoder()#unknown_value=-1,
                                               #handle_unknown='use_encoded_value')
        self._missing_value = '<missing>'
            
    def _fill_na(self, X):
        for column in X.columns.values:
            if X[column].dtype.name == 'category':
                if self._missing_value not in X[column].cat.categories:
                    X[column] = X[column].cat.add_categories(self._missing_value)
                X[column] = X[column].fillna(self._missing_value)
        
        return X

        
    def fit(self, X, y=None):
        X = self._fill_na(X)
        self._ordinal_encoder.fit(X)
        return self

    def transform(self, X):
        X = self._fill_na(X)
        return self._ordinal_encoder.transform(X)

In [271]:
param_grad = [
    {
        #DropMissingValuesTransformer(), 
        'preparation__numeric_pipeline__impute_chooser__base_transformer': [SimpleImputer(strategy='mean')],
        'preparation__numeric_pipeline__scaling_chooser__base_transformer': [MinMaxScaler(), StandardScaler()],
        'preparation__non_numeric_pipeline__encoder_chooser__base_transformer': [OneHotEncoder(),
                                                                                 CustomOrdinalEncoder()],
        'model__max_features': [2, 10, 'auto'],
        'model__n_estimators': [50, 100, 500, 1000]
    },
]

In [272]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(full_pipeline, param_grid=param_grad, cv=5, scoring='roc_auc', return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preparation',
                                        ColumnTransformer(transformers=[('numeric_pipeline',
                                                                         Pipeline(steps=[('impute_chooser',
                                                                                          TransformerChooser()),
                                                                                         ('scaling_chooser',
                                                                                          TransformerChooser())]),
                                                                         ['duration',
                                                                          'credit_amount',
                                                                          'installment_commitment',
                                                                          'residence_since',
                  

In [273]:
grid_search.cv_results_

{'mean_fit_time': array([0.05571294, 0.04909682, 0.06110787, 0.06047802, 0.08872647,
        0.08873916, 0.10713396, 0.10677881, 0.42167897, 0.41602435,
        0.44601216, 0.43912101, 0.81960592, 0.83747644, 0.86440868,
        0.86501675, 0.05827146, 0.05814276, 0.08451805, 0.08180518,
        0.11070642, 0.11168942, 0.1433362 , 0.14701667, 0.53438759,
        0.51972175, 0.63543591, 0.63613725, 1.02429876, 1.03293066,
        1.24858289, 1.25509009, 0.05583868, 0.05605483, 0.06564336,
        0.06633439, 0.10385666, 0.11213517, 0.15702887, 0.13273101,
        0.55518775, 0.50882902, 0.50461068, 0.50223494, 1.04716072,
        0.9779304 , 0.99283528, 1.0109827 ]),
 'std_fit_time': array([0.01233404, 0.00012755, 0.00020192, 0.00060995, 0.0001979 ,
        0.0002779 , 0.00445144, 0.00177757, 0.00363962, 0.00407208,
        0.00197253, 0.00296356, 0.00791725, 0.01238066, 0.01137415,
        0.00671611, 0.00016554, 0.0002091 , 0.0006224 , 0.00148924,
        0.0010395 , 0.00062878, 0.001

In [274]:
cvres = grid_search.cv_results_
results_df = pd.concat([pd.DataFrame({'mean_score': cvres["mean_test_score"], 'st_dev_score': cvres["std_test_score"]}),
          pd.DataFrame(cvres["params"])],
          axis=1)
results_df = results_df.sort_values(by=['mean_score'], ascending=False)

In [275]:

results_mod = results_df.copy()
#results_mod.drop(columns=['min', 'max'], inplace = True)
results_mod.insert(1, 'mean*-2SD', results_df['mean_score'] - (2 * results_df['st_dev_score']))
results_mod.insert(2, 'mean*+2SD', results_df['mean_score'] + (2 * results_df['st_dev_score']))
results_mod.drop(columns=['st_dev_score'], inplace = True)

In [276]:
results_mod.\
    pipe(hlp.pandas_style.format,
         subset=['mean_score', 'mean*-2SD', 'mean*+2SD'],
         round_by=3,
         hide_index=True). \
    bar(subset=['mean_score'], color='#5fba7d').\
    bar(subset=['mean*+2SD'], color='gray').\
    pipe(hlp.pandas_style.bar_inverse, subset=['mean*-2SD'], color='gray')

mean_score,mean*-2SD,mean*+2SD,model__max_features,model__n_estimators,preparation__non_numeric_pipeline__encoder_chooser__base_transformer,preparation__numeric_pipeline__impute_chooser__base_transformer,preparation__numeric_pipeline__scaling_chooser__base_transformer
0.78,0.72,0.841,10,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.779,0.717,0.841,auto,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
0.779,0.722,0.836,2,500,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
0.777,0.72,0.835,10,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
0.777,0.716,0.838,auto,500,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
0.777,0.716,0.838,auto,100,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
0.777,0.706,0.847,auto,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.777,0.71,0.843,10,500,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.777,0.715,0.838,auto,1000,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
0.776,0.712,0.841,2,1000,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()


# TODO

- decide between imputing missing values and removing missing data; via tuning parameter(s)

- get feature importance for model that has various transformations
    - https://towardsdatascience.com/how-to-get-feature-importances-from-any-sklearn-pipeline-167a19f1214
    - https://stackoverflow.com/questions/38787612/how-to-extract-feature-importances-from-an-sklearn-pipeline
   
```
grid_search.best_estimator_._final_estimator.feature_importances_
grid_search.best_estimator_._final_estimator.feature_importances_.shape
```

- future importance
    - https://www.kaggle.com/general/175075
        - LOFO (Leave one feature out) for feature importance.
    - https://explained.ai/rf-importance/