In [1]:
import helpsk as hlp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

# Load Data

In [2]:
#https://www.openml.org/d/31
credit_g = fetch_openml('credit-g', version=1)
credit_data = credit_g['data']
credit_data['target'] = credit_g['target']
credit_data.shape

(1000, 21)

In [3]:
## Create Missing Values

credit_data['duration'].iloc[0:46] = np.nan
credit_data['checking_status'].iloc[25:75] = np.nan
credit_data['credit_amount'].iloc[10:54] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [4]:
hlp.pandas.numeric_summary(credit_data, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,954,46,5.0%,0,0.0%,20.9,12.0,0.6,1.1,1.0,4.0,9.0,12.0,18.0,24.0,36.0,72.0
credit_amount,1000,0,0.0%,44,4.0%,3132.9,2853.4,0.9,1.9,4.3,0.0,740.0,1287.8,2224.0,3873.5,7119.8,18424.0
installment_commitment,1000,0,0.0%,0,0.0%,3.0,1.1,0.4,-0.5,-1.2,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,1000,0,0.0%,0,0.0%,2.9,1.1,0.4,-0.3,-1.4,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,1000,0,0.0%,0,0.0%,35.5,11.4,0.3,1.0,0.6,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,1000,0,0.0%,0,0.0%,1.4,0.6,0.4,1.3,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,1000,0,0.0%,0,0.0%,1.2,0.4,0.3,1.9,1.6,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [5]:
hlp.pandas.non_numeric_summary(credit_data, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
checking_status,950,50,0.05,no checking,4,0.4%
credit_history,1000,0,0.0,existing paid,5,0.5%
purpose,1000,0,0.0,radio/tv,10,1.0%
savings_status,1000,0,0.0,<100,5,0.5%
employment,1000,0,0.0,1<=X<4,5,0.5%
personal_status,1000,0,0.0,male single,4,0.4%
other_parties,1000,0,0.0,none,3,0.3%
property_magnitude,1000,0,0.0,car,4,0.4%
other_payment_plans,1000,0,0.0,none,3,0.3%
housing,1000,0,0.0,own,3,0.3%


# Training and Test Data

In [6]:
y_full = credit_data['target']
X_full = credit_data.drop(columns='target')

In [7]:
hlp.pandas.value_frequency(series=y_full)

Unnamed: 0,Frequency,Percent
good,700,0.7
bad,300,0.3


In [8]:
y_full[0:10]

0    good
1     bad
2    good
3    good
4     bad
5    good
6    good
7    good
8    good
9     bad
Name: target, dtype: category
Categories (2, object): ['good', 'bad']

In [9]:
from sklearn.preprocessing import label_binarize
#y_full = label_binarize(y_full, classes=['bad', 'good']).flatten()
y_full = label_binarize(y_full, classes=['good', 'bad']).flatten()


In [10]:
y_full[0:10]

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 1])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

In [12]:
del y_full, X_full

In [13]:
print(X_train.shape)
print(len(y_train))

print(X_test.shape)
print(len(y_test))

(800, 20)
800
(200, 20)
200


In [14]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([559, 241]))

In [15]:
np.unique(y_train, return_counts=True)[1] / np.sum(np.unique(y_train, return_counts=True)[1])

array([0.69875, 0.30125])

In [16]:
np.unique(y_test, return_counts=True)[1] / np.sum(np.unique(y_test, return_counts=True)[1])

array([0.705, 0.295])

# Transformation Pipeline

In [17]:
class TransformerChooser(BaseEstimator, TransformerMixin):
    """Transformer that wraps another Transformer. This allows different transformer objects to be tuned.
    """
    def __init__(self, base_transformer=None):
        """
        Args:
            base_transformer:
                Transformer object (e.g. StandardScaler, MinMaxScaler)
        """
        self.base_transformer = base_transformer

    def fit(self, X, y=None):
        if self.base_transformer is None:
            return self

        return self.base_transformer.fit(X, y)

    def transform(self, X):
        if self.base_transformer is None:
            return X

        return self.base_transformer.transform(X)

In [18]:
class DropMissingValuesTransformer(BaseEstimator, TransformerMixin):
    """Any row with missing values will be dropped.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.dropna(axis=0)

In [19]:
class DoNothingTransformer(BaseEstimator, TransformerMixin):
    """Dummy Transformer that doesn't do anything
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

In [20]:
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [21]:
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [22]:
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [23]:
numeric_pipeline = Pipeline([
    #tune whether or not we want to impute or simply remove rows with missing values
    ('impute_chooser', TransformerChooser()),
    # this is here so that we can select between MinMax and Scaler
    # if this pipeline is ran in a context outside of tuning, no transformation will take place
    ('scaling_chooser', TransformerChooser()),
])

In [24]:
non_numeric_pipeline = Pipeline([
    ('encoder_chooser', TransformerChooser()),
])

In [25]:
#temp = non_numeric_pipeline.fit_transform(X_train[non_numeric_columns])

In [26]:
#print(type(temp))
#print(temp.shape)

In [27]:
#temp.toarray()[0:10, 0:10]

In [28]:
#non_numeric_pipeline.steps[0][1].categories_

In [29]:
from sklearn.compose import ColumnTransformer
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', non_numeric_pipeline, non_numeric_columns)
])

In [30]:
temp = transformations_pipeline.fit_transform(X_train)

In [31]:
temp.shape

(800, 20)

In [32]:
#pd.DataFrame(temp)

# Model

In [33]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier()

In [34]:
full_pipeline = Pipeline([
    ('preparation', transformations_pipeline),
    #('pca_chooser', ChooserTransform()),  # PCA option lost; didn't include
    #('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('model', random_forest_model)
])

In [35]:
full_pipeline.n_features_in_

20

In [36]:
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps

{'preparation': ColumnTransformer(transformers=[('numeric_pipeline',
                                  Pipeline(steps=[('impute_chooser',
                                                   TransformerChooser()),
                                                  ('scaling_chooser',
                                                   TransformerChooser())]),
                                  ['duration', 'credit_amount',
                                   'installment_commitment', 'residence_since',
                                   'age', 'existing_credits',
                                   'num_dependents']),
                                 ('non_numeric_pipeline',
                                  Pipeline(steps=[('encoder_chooser',
                                                   TransformerChooser())]),
                                  ['checking_status', 'credit_history',
                                   'purpose', 'savings_status', 'employment',
                           

In [37]:
class CustomOrdinalEncoder(BaseEstimator, TransformerMixin):
    """First replaces missing values with '<missing>' then applies OrdinalEncoder
        
    """
    def __init__(self):
        self._ordinal_encoder = OrdinalEncoder()#unknown_value=-1,
                                               #handle_unknown='use_encoded_value')
        self._missing_value = '<missing>'
            
    def _fill_na(self, X):
        for column in X.columns.values:
            if X[column].dtype.name == 'category':
                if self._missing_value not in X[column].cat.categories:
                    X[column] = X[column].cat.add_categories(self._missing_value)
                X[column] = X[column].fillna(self._missing_value)
        
        return X

        
    def fit(self, X, y=None):
        X = self._fill_na(X)
        self._ordinal_encoder.fit(X)
        return self

    def transform(self, X):
        X = self._fill_na(X)
        return self._ordinal_encoder.transform(X)

In [38]:
param_grad = [
    {
        #DropMissingValuesTransformer(), 
        'preparation__numeric_pipeline__impute_chooser__base_transformer': [SimpleImputer(strategy='mean')],
        'preparation__numeric_pipeline__scaling_chooser__base_transformer': [MinMaxScaler(), StandardScaler()],
        'preparation__non_numeric_pipeline__encoder_chooser__base_transformer': [OneHotEncoder(),
                                                                                 CustomOrdinalEncoder()],
        'model__max_features': [2, 10, 60, 'auto'],
        'model__n_estimators': [50, 100, 500, 1000]
    },
]

In [126]:
from sklearn.metrics import make_scorer, roc_auc_score, f1_score, precision_score, recall_score

from sklearn.metrics import SCORERS
# https://github.com/scikit-learn/scikit-learn/blob/2beed55847ee70d363bdbfe14ee4401438fba057/sklearn/metrics/_scorer.py#L702
  

def make_scorer_threshold(score_func):
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    # https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve

    # https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
    # according to this link they use
    # roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
    # needs_threshold=True)

    return make_scorer(score_func,
                       #needs_proba=True,
                       needs_threshold=True,
                       greater_is_better=True)


def make_scorer_proba(score_func):
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    # https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve

    # https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
    # according to this link they use
    # roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
    # needs_threshold=True)

    return make_scorer(score_func,
                       needs_proba=True,
                       #needs_threshold=True,
                       greater_is_better=True)

scores = {
    'ROC/AUC':  SCORERS['roc_auc'],
    'F1': make_scorer(f1_score, greater_is_better=True),
    'Pos. Pred. Val': make_scorer(precision_score, greater_is_better=True),
    'True Pos. Rate': make_scorer(recall_score, greater_is_better=True),
}

In [127]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(full_pipeline,
                           param_grid=param_grad,
                           cv=5,
                           scoring=scores,
                           refit='ROC/AUC',
                           #scoring='roc_auc',
                           return_train_score=True
                          )
grid_search.fit(X_train, y_train)

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

 0.76624832 0.77293764 0.77103199 0.7683481  0.77804722 0.77442566
 0.76786318 0.76847543 0.77959291 0.77837191 0.75618601 0.76838925
 0.75507044 0.75701495 0.77238499 0.7630612  0.77320811 0.76431796
 0.77582844 0.77465613 0.76244987 0.77108168 0.77600367 0.77670556
 0.76739757 0.76750853 0.75671489 0.75990215        nan        nan
 0.75402849 0.75298093        nan        nan 0.76409187 0.76333353
        nan        nan 0.76437779 0.76523473        nan        nan
 0.75784841 0.77457634 0.76682603 0.76660023 0.77529561 0.77344755
 0.76703338 0.77641218 0.77547307 0.77699787 0.77647265 0.77288693
 0.77790236 0.7785529  0.78152622 0.77585731]
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1. nan nan
  1.  1. nan nan  1.  1. nan nan  1.  1. nan nan  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
 0.42379904 0.39514359 0.37274413 0.35919733 0.41438699 0.42218771
 0.36807671 0.3701261  0.43437928 0.43020264 0.44724952 0.46963474
 0.46974158 0.4824786  0.4419

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preparation',
                                        ColumnTransformer(transformers=[('numeric_pipeline',
                                                                         Pipeline(steps=[('impute_chooser',
                                                                                          TransformerChooser()),
                                                                                         ('scaling_chooser',
                                                                                          TransformerChooser())]),
                                                                         ['duration',
                                                                          'credit_amount',
                                                                          'installment_commitment',
                                                                          'residence_since',
                  

In [128]:
grid_search.cv_results_

{'mean_fit_time': array([0.05417862, 0.04730601, 0.05951829, 0.06055212, 0.08858666,
        0.08740463, 0.10061545, 0.10149546, 0.41295357, 0.41482377,
        0.43992028, 0.43732347, 0.80558186, 0.82225037, 0.87537103,
        0.85850058, 0.05778885, 0.05813098, 0.07880325, 0.07931814,
        0.10785804, 0.10745845, 0.14003377, 0.14013319, 0.51836042,
        0.51843901, 0.63242459, 0.6341712 , 1.02893505, 1.03213573,
        1.2565908 , 1.25159216, 0.11926546, 0.1194653 , 0.02710581,
        0.02769065, 0.23304896, 0.23309107, 0.03717809, 0.03975143,
        1.13864684, 1.13762836, 0.11430955, 0.11491432, 2.2616878 ,
        2.23937316, 0.21141067, 0.21183147, 0.05502357, 0.05483274,
        0.06517086, 0.06511121, 0.1025754 , 0.10271163, 0.11292019,
        0.11270566, 0.48131695, 0.48152752, 0.49456053, 0.49407301,
        0.95155287, 0.95632529, 0.96862602, 0.96605473]),
 'std_fit_time': array([1.30855062e-02, 1.84353558e-04, 1.74571984e-03, 1.36840538e-03,
        1.14923755e-0

In [139]:
cvres = grid_search.cv_results_
results_df = pd.concat([pd.DataFrame({
                            'mean_test_ROC/AUC': cvres["mean_test_ROC/AUC"], 
                            'std_test_ROC/AUC': cvres["std_test_ROC/AUC"],
                            'mean_test_F1': cvres["mean_test_F1"], 
                            'std_test_F1': cvres["std_test_F1"],
                            'mean_test_Pos. Pred. Val': cvres["mean_test_Pos. Pred. Val"], 
                            'std_test_Pos. Pred. Val': cvres["std_test_Pos. Pred. Val"],
                            'mean_test_True Pos. Rate': cvres["mean_test_True Pos. Rate"], 
                            'std_test_True Pos. Rate': cvres["std_test_True Pos. Rate"],
                        }),
                        pd.DataFrame(cvres["params"])],
          axis=1)
results_df = results_df.sort_values(by=['mean_test_ROC/AUC'], ascending=False)

In [140]:
results_mod = results_df.copy()
# #results_mod.drop(columns=['min', 'max'], inplace = True)
# results_mod.insert(1, 'mean*-2SD', results_df['mean_score'] - (2 * results_df['st_dev_score']))
# results_mod.insert(2, 'mean*+2SD', results_df['mean_score'] + (2 * results_df['st_dev_score']))
# results_mod.drop(columns=['st_dev_score'], inplace = True)

In [141]:
results_mod.style.bar(subset=['mean_test_ROC/AUC'], color='#5fba7d')

Unnamed: 0,mean_test_ROC/AUC,std_test_ROC/AUC,mean_test_F1,std_test_F1,mean_test_Pos. Pred. Val,std_test_Pos. Pred. Val,mean_test_True Pos. Rate,std_test_True Pos. Rate,model__max_features,model__n_estimators,preparation__non_numeric_pipeline__encoder_chooser__base_transformer,preparation__numeric_pipeline__impute_chooser__base_transformer,preparation__numeric_pipeline__scaling_chooser__base_transformer
62,0.781526,0.029707,0.451747,0.080471,0.608677,0.083518,0.361139,0.076942,auto,1000,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
14,0.779593,0.032943,0.434379,0.063282,0.681441,0.090023,0.319643,0.051087,2,1000,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
61,0.778553,0.029926,0.47145,0.063143,0.662951,0.074631,0.369558,0.063594,auto,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
15,0.778372,0.033577,0.430203,0.077605,0.68204,0.096766,0.315561,0.064566,2,1000,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
10,0.778047,0.035598,0.414387,0.084916,0.664453,0.09624,0.303061,0.073424,2,500,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
60,0.777902,0.033052,0.462901,0.050889,0.650373,0.057835,0.361224,0.049952,auto,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
57,0.776998,0.030356,0.472816,0.058701,0.676039,0.057878,0.365391,0.056571,auto,500,OneHotEncoder(),SimpleImputer(),StandardScaler()
29,0.776706,0.031863,0.47272,0.075256,0.643417,0.065442,0.377806,0.07813,10,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
58,0.776473,0.027602,0.450758,0.064807,0.605789,0.05573,0.361224,0.066369,auto,500,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
55,0.776412,0.030838,0.486815,0.052397,0.649468,0.066742,0.390306,0.046317,auto,100,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()


In [57]:
cvres = grid_search.cv_results_
results_df = pd.concat([pd.DataFrame({'mean_score': cvres["mean_test_score"], 
                                      'st_dev_score': cvres["std_test_score"]}),
                        pd.DataFrame(cvres["params"])],
          axis=1)
results_df = results_df.sort_values(by=['mean_score'], ascending=False)

In [58]:
results_mod = results_df.copy()
#results_mod.drop(columns=['min', 'max'], inplace = True)
results_mod.insert(1, 'mean*-2SD', results_df['mean_score'] - (2 * results_df['st_dev_score']))
results_mod.insert(2, 'mean*+2SD', results_df['mean_score'] + (2 * results_df['st_dev_score']))
results_mod.drop(columns=['st_dev_score'], inplace = True)

In [59]:
results_mod.\
    pipe(hlp.pandas_style.format,
         subset=['mean_score', 'mean*-2SD', 'mean*+2SD'],
         round_by=3,
         hide_index=True). \
    bar(subset=['mean_score'], color='#5fba7d').\
    bar(subset=['mean*+2SD'], color='gray').\
    pipe(hlp.pandas_style.bar_inverse, subset=['mean*-2SD'], color='gray')

mean_score,mean*-2SD,mean*+2SD,model__max_features,model__n_estimators,preparation__non_numeric_pipeline__encoder_chooser__base_transformer,preparation__numeric_pipeline__impute_chooser__base_transformer,preparation__numeric_pipeline__scaling_chooser__base_transformer
0.78,0.718,0.841,10,500,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.779,0.722,0.837,2,1000,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
0.779,0.718,0.839,2,1000,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
0.778,0.716,0.841,auto,500,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
0.777,0.713,0.841,auto,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
0.777,0.707,0.847,auto,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.777,0.711,0.843,10,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.777,0.693,0.861,10,50,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
0.777,0.713,0.841,auto,500,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.776,0.711,0.841,2,500,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()


# Random Search

In [None]:
from scipy.stats import randint, uniform, expon

In [None]:
np.random.seed(42)
s = uniform(.2, .79).rvs(size=1000)
print(min(s))
print(max(s))
plt.hist(s)

In [None]:
np.random.seed(42)
s = randint(2, 100).rvs(size=1000)
print(min(s))
print(max(s))
plt.hist(s)

In [None]:
s = randint(2, 20)
s.

In [None]:
If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.

In [None]:
model_param_dict = [
    {
        #DropMissingValuesTransformer(), 
        'preparation__numeric_pipeline__impute_chooser__base_transformer': [SimpleImputer(strategy='mean')],
        'preparation__numeric_pipeline__scaling_chooser__base_transformer': [MinMaxScaler(), StandardScaler()],
        'preparation__non_numeric_pipeline__encoder_chooser__base_transformer': [OneHotEncoder(),
                                                                                 CustomOrdinalEncoder()],
        'model__n_estimators': randint(50, 5000),
        'model__max_features':  uniform(.01, .99),
        #'model__max_depth': randint(2, 50),
    },
]

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold
random_search = RandomizedSearchCV(estimator=full_pipeline,
                                 param_distributions=model_param_dict,
                                 n_iter=1000,
                                 cv=RepeatedKFold(n_splits=5, n_repeats=1),
                                 #scoring=scorer,
                                 #return_train_score=True,
                                 n_jobs=-1,
                                 #verbose=2,
                                )
random_search.fit(X_train, y_train)

In [None]:
cvres = random_search.cv_results_
results_df = pd.concat([pd.DataFrame({'mean_score': cvres["mean_test_score"], 'st_dev_score': cvres["std_test_score"]}),
          pd.DataFrame(cvres["params"])],
          axis=1)
results_df = results_df.sort_values(by=['mean_score'], ascending=False)

In [None]:

results_mod = results_df.copy()
#results_mod.drop(columns=['min', 'max'], inplace = True)
results_mod.insert(1, 'mean*-2SD', results_df['mean_score'] - (2 * results_df['st_dev_score']))
results_mod.insert(2, 'mean*+2SD', results_df['mean_score'] + (2 * results_df['st_dev_score']))
results_mod.drop(columns=['st_dev_score'], inplace = True)

In [None]:
results_mod. \
    head(30). \
    pipe(hlp.pandas_style.format,
         subset=['mean_score', 'mean*-2SD', 'mean*+2SD'],
         round_by=3,
         hide_index=True). \
    bar(subset=['mean_score'], color='#5fba7d').\
    bar(subset=['mean*+2SD'], color='gray').\
    pipe(hlp.pandas_style.bar_inverse, subset=['mean*-2SD'], color='gray')

# TODO

- decide between imputing missing values and removing missing data; via tuning parameter(s)

- get feature importance for model that has various transformations
    - https://towardsdatascience.com/how-to-get-feature-importances-from-any-sklearn-pipeline-167a19f1214
    - https://stackoverflow.com/questions/38787612/how-to-extract-feature-importances-from-an-sklearn-pipeline
   
```
grid_search.best_estimator_._final_estimator.feature_importances_
grid_search.best_estimator_._final_estimator.feature_importances_.shape
```

- future importance
    - https://www.kaggle.com/general/175075
        - LOFO (Leave one feature out) for feature importance.
    - https://explained.ai/rf-importance/