In [1]:
import helpsk as hlp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

# Load Data

In [2]:
#https://www.openml.org/d/31
credit_g = fetch_openml('credit-g', version=1)
credit_data = credit_g['data']
credit_data['target'] = credit_g['target']
credit_data.shape

(1000, 21)

In [3]:
## Create Missing Values

credit_data['duration'].iloc[0:46] = np.nan
credit_data['checking_status'].iloc[25:75] = np.nan
credit_data['credit_amount'].iloc[10:54] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [4]:
hlp.pandas.numeric_summary(credit_data, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,954,46,5.0%,0,0.0%,20.9,12.0,0.6,1.1,1.0,4.0,9.0,12.0,18.0,24.0,36.0,72.0
credit_amount,1000,0,0.0%,44,4.0%,3132.9,2853.4,0.9,1.9,4.3,0.0,740.0,1287.8,2224.0,3873.5,7119.8,18424.0
installment_commitment,1000,0,0.0%,0,0.0%,3.0,1.1,0.4,-0.5,-1.2,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,1000,0,0.0%,0,0.0%,2.9,1.1,0.4,-0.3,-1.4,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,1000,0,0.0%,0,0.0%,35.5,11.4,0.3,1.0,0.6,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,1000,0,0.0%,0,0.0%,1.4,0.6,0.4,1.3,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,1000,0,0.0%,0,0.0%,1.2,0.4,0.3,1.9,1.6,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [5]:
hlp.pandas.non_numeric_summary(credit_data, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
checking_status,950,50,0.05,no checking,4,0.4%
credit_history,1000,0,0.0,existing paid,5,0.5%
purpose,1000,0,0.0,radio/tv,10,1.0%
savings_status,1000,0,0.0,<100,5,0.5%
employment,1000,0,0.0,1<=X<4,5,0.5%
personal_status,1000,0,0.0,male single,4,0.4%
other_parties,1000,0,0.0,none,3,0.3%
property_magnitude,1000,0,0.0,car,4,0.4%
other_payment_plans,1000,0,0.0,none,3,0.3%
housing,1000,0,0.0,own,3,0.3%


# Training and Test Data

In [6]:
y_full = credit_data['target']
X_full = credit_data.drop(columns='target')

In [7]:
hlp.pandas.value_frequency(series=y_full)

Unnamed: 0,Frequency,Percent
good,700,0.7
bad,300,0.3


In [8]:
y_full[0:10]

0    good
1     bad
2    good
3    good
4     bad
5    good
6    good
7    good
8    good
9     bad
Name: target, dtype: category
Categories (2, object): ['good', 'bad']

In [9]:
from sklearn.preprocessing import label_binarize
y_full = label_binarize(y_full, classes=['bad', 'good']).flatten()

In [10]:
y_full[0:10]

array([1, 0, 1, 1, 0, 1, 1, 1, 1, 0])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

In [12]:
del y_full, X_full

In [13]:
print(X_train.shape)
print(len(y_train))

print(X_test.shape)
print(len(y_test))

(800, 20)
800
(200, 20)
200


In [14]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([241, 559]))

In [15]:
np.unique(y_train, return_counts=True)[1] / np.sum(np.unique(y_train, return_counts=True)[1])

array([0.30125, 0.69875])

In [16]:
np.unique(y_test, return_counts=True)[1] / np.sum(np.unique(y_test, return_counts=True)[1])

array([0.295, 0.705])

# Transformation Pipeline

In [17]:
class TransformerChooser(BaseEstimator, TransformerMixin):
    """Transformer that wraps another Transformer. This allows different transformer objects to be tuned.
    """
    def __init__(self, base_transformer=None):
        """
        Args:
            base_transformer:
                Transformer object (e.g. StandardScaler, MinMaxScaler)
        """
        self.base_transformer = base_transformer

    def fit(self, X, y=None):
        if self.base_transformer is None:
            return self

        return self.base_transformer.fit(X, y)

    def transform(self, X):
        if self.base_transformer is None:
            return X

        return self.base_transformer.transform(X)

In [18]:
class DropMissingValuesTransformer(BaseEstimator, TransformerMixin):
    """Any row with missing values will be dropped.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.dropna(axis=0)

In [19]:
class DoNothingTransformer(BaseEstimator, TransformerMixin):
    """Dummy Transformer that doesn't do anything
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

In [20]:
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [21]:
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [22]:
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [23]:
numeric_pipeline = Pipeline([
    #tune whether or not we want to impute or simply remove rows with missing values
    ('impute_chooser', TransformerChooser()),
    # this is here so that we can select between MinMax and Scaler
    # if this pipeline is ran in a context outside of tuning, no transformation will take place
    ('scaling_chooser', TransformerChooser()),
])

In [24]:
non_numeric_pipeline = Pipeline([
    ('encoder_chooser', TransformerChooser()),
])

In [25]:
#temp = non_numeric_pipeline.fit_transform(X_train[non_numeric_columns])

In [26]:
#print(type(temp))
#print(temp.shape)

In [27]:
#temp.toarray()[0:10, 0:10]

In [28]:
#non_numeric_pipeline.steps[0][1].categories_

In [29]:
from sklearn.compose import ColumnTransformer
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', non_numeric_pipeline, non_numeric_columns)
])

In [30]:
temp = transformations_pipeline.fit_transform(X_train)

In [31]:
temp.shape

(800, 20)

In [32]:
#pd.DataFrame(temp)

# Model

In [33]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier()

In [34]:
full_pipeline = Pipeline([
    ('preparation', transformations_pipeline),
    #('pca_chooser', ChooserTransform()),  # PCA option lost; didn't include
    #('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('model', random_forest_model)
])

In [35]:
full_pipeline.n_features_in_

20

In [36]:
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps

{'preparation': ColumnTransformer(transformers=[('numeric_pipeline',
                                  Pipeline(steps=[('impute_chooser',
                                                   TransformerChooser()),
                                                  ('scaling_chooser',
                                                   TransformerChooser())]),
                                  ['duration', 'credit_amount',
                                   'installment_commitment', 'residence_since',
                                   'age', 'existing_credits',
                                   'num_dependents']),
                                 ('non_numeric_pipeline',
                                  Pipeline(steps=[('encoder_chooser',
                                                   TransformerChooser())]),
                                  ['checking_status', 'credit_history',
                                   'purpose', 'savings_status', 'employment',
                           

In [37]:
class CustomOrdinalEncoder(BaseEstimator, TransformerMixin):
    """First replaces missing values with '<missing>' then applies OrdinalEncoder
        
    """
    def __init__(self):
        self._ordinal_encoder = OrdinalEncoder()#unknown_value=-1,
                                               #handle_unknown='use_encoded_value')
        self._missing_value = '<missing>'
            
    def _fill_na(self, X):
        for column in X.columns.values:
            if X[column].dtype.name == 'category':
                if self._missing_value not in X[column].cat.categories:
                    X[column] = X[column].cat.add_categories(self._missing_value)
                X[column] = X[column].fillna(self._missing_value)
        
        return X

        
    def fit(self, X, y=None):
        X = self._fill_na(X)
        self._ordinal_encoder.fit(X)
        return self

    def transform(self, X):
        X = self._fill_na(X)
        return self._ordinal_encoder.transform(X)

In [38]:
param_grad = [
    {
        #DropMissingValuesTransformer(), 
        'preparation__numeric_pipeline__impute_chooser__base_transformer': [SimpleImputer(strategy='mean')],
        'preparation__numeric_pipeline__scaling_chooser__base_transformer': [MinMaxScaler(), StandardScaler()],
        'preparation__non_numeric_pipeline__encoder_chooser__base_transformer': [OneHotEncoder(),
                                                                                 CustomOrdinalEncoder()],
        'model__max_features': [2, 10, 60, 'auto'],
        'model__n_estimators': [50, 100, 500, 1000]
    },
]

In [39]:
from sklearn.metrics import make_scorer, roc_auc_score, f1_score, precision_score, recall_score

from sklearn.metrics import SCORERS
# https://github.com/scikit-learn/scikit-learn/blob/2beed55847ee70d363bdbfe14ee4401438fba057/sklearn/metrics/_scorer.py#L702
  

def make_scorer_threshold(score_func):
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    # https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve

    # https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
    # according to this link they use
    # roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
    # needs_threshold=True)

    return make_scorer(score_func,
                       #needs_proba=True,
                       needs_threshold=True,
                       greater_is_better=True)


def make_scorer_proba(score_func):
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    # https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve

    # https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
    # according to this link they use
    # roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
    # needs_threshold=True)

    return make_scorer(score_func,
                       needs_proba=True,
                       #needs_threshold=True,
                       greater_is_better=True)

scores = {
    'ROC/AUC':  SCORERS['roc_auc'],
    'F1': make_scorer(f1_score, greater_is_better=True),
    'Pos. Pred. Val': make_scorer(precision_score, greater_is_better=True),
    'True Pos. Rate': make_scorer(recall_score, greater_is_better=True),
}

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
grid_search = GridSearchCV(full_pipeline,
                           param_grid=param_grad,
                           cv=RepeatedKFold(n_splits=5, n_repeats=2),
                           scoring=scores,
                           refit='ROC/AUC',
                           #scoring='roc_auc',
                           return_train_score=True
                          )
grid_search.fit(X_train, y_train)

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

GridSearchCV(cv=RepeatedKFold(n_repeats=2, n_splits=5, random_state=None),
             estimator=Pipeline(steps=[('preparation',
                                        ColumnTransformer(transformers=[('numeric_pipeline',
                                                                         Pipeline(steps=[('impute_chooser',
                                                                                          TransformerChooser()),
                                                                                         ('scaling_chooser',
                                                                                          TransformerChooser())]),
                                                                         ['duration',
                                                                          'credit_amount',
                                                                          'installment_commitment',
                                                       

In [41]:
grid_search.cv_results_

{'mean_fit_time': array([0.0503278 , 0.04718738, 0.05851831, 0.05855215, 0.08780353,
        0.08806345, 0.10052717, 0.10094283, 0.40781307, 0.40744395,
        0.43609669, 0.43865523, 0.84609838, 0.86276217, 0.90428438,
        0.89234385, 0.05974331, 0.05963202, 0.08396769, 0.08085501,
        0.1138972 , 0.11738369, 0.15493419, 0.14316213, 0.53581908,
        0.52168987, 0.64075081, 0.68698103, 1.10078876, 1.10274651,
        1.30949874, 1.31282151, 0.12015216, 0.11898913, 0.02718678,
        0.02905054, 0.23366721, 0.24464824, 0.03866918, 0.03752899,
        1.20044541, 1.19207342, 0.11952128, 0.1170985 , 2.40584743,
        2.28872433, 0.21686745, 0.25590432, 0.06040251, 0.05863564,
        0.06831241, 0.0676949 , 0.10728581, 0.11427393, 0.1183847 ,
        0.11644735, 0.49110305, 0.49190423, 0.50017993, 0.50364716,
        1.02185373, 0.99673316, 1.05210485, 0.97442636]),
 'std_fit_time': array([0.00724321, 0.00016527, 0.00016809, 0.00015877, 0.00187793,
        0.00105837, 0.000

In [45]:
results_df = None
for score in scores.keys():
    print(score)

    results_df = pd.concat([
        results_df,
        pd.DataFrame({
            score + " Mean": cvres['mean_test_' + score], 
            score + " St. Dev": cvres['std_test_' + score],
        })
    ], axis=1)
    
results_df = pd.concat([
        results_df,
        pd.DataFrame(cvres["params"]),
    ], axis=1)


#scores.keys()[0]
results_df = results_df.sort_values(by=str(list(scores.keys())[0]) + ' Mean', ascending=False)

ROC/AUC
F1
Pos. Pred. Val
True Pos. Rate


In [47]:
modified_results = results_df.head(20)
#modified_results

In [48]:
import math
import scipy.stats as st

sample_size = 10 # 5 fold * 2 repeats
modified_results = results_df

for score in scores.keys():
    print(score)
    mean_key = score + ' Mean'
    st_dev_key = score + ' St. Dev'
    
    score_means = modified_results[mean_key]
    score_standard_errors = modified_results[st_dev_key] / math.sqrt(sample_size)
    
    confidence_intervals = st.t.interval(alpha=0.95,  # confidence interval
                                         df=sample_size-1,  # degrees of fredom
                                         loc=score_means,
                                         scale=score_standard_errors)
    
    modified_results = modified_results.drop(columns=st_dev_key)
    
    insertion_index = modified_results.columns.get_loc(mean_key) + 1
    modified_results.insert(loc=insertion_index,
                            column=score + ' 95CI.HI',
                            value=confidence_intervals[1])
    modified_results.insert(loc=insertion_index,
                            column=score + ' 95CI.LO',
                            value=confidence_intervals[0])
    
    
    
#modified_results = modified_results.style.bar(subset=['Mean ROC/AUC'], color='#5fba7d')

ROC/AUC
F1
Pos. Pred. Val
True Pos. Rate


In [51]:
modified_style = modified_results.style

for score in scores.keys():
    mean_key = score + ' Mean'
    ci_low_key = score + ' 95CI.LO'
    ci_high_key = score + ' 95CI.HI'
    
    modified_style.\
        bar(subset=[mean_key], color=hlp.color.Colors.PIGMENT_GREEN.value).\
        bar(subset=[ci_high_key], color=hlp.color.GRAY).\
        pipe(hlp.pandas_style.bar_inverse, subset=[ci_low_key], color=hlp.color.GRAY)

modified_style = hlp.pandas_style.format(styler=modified_style, round_by=3, hide_index=True)
    
modified_style

ROC/AUC Mean,ROC/AUC 95CI.LO,ROC/AUC 95CI.HI,F1 Mean,F1 95CI.LO,F1 95CI.HI,Pos. Pred. Val Mean,Pos. Pred. Val 95CI.LO,Pos. Pred. Val 95CI.HI,True Pos. Rate Mean,True Pos. Rate 95CI.LO,True Pos. Rate 95CI.HI,model__max_features,model__n_estimators,preparation__non_numeric_pipeline__encoder_chooser__base_transformer,preparation__numeric_pipeline__impute_chooser__base_transformer,preparation__numeric_pipeline__scaling_chooser__base_transformer
0.771,0.754,0.788,0.84,0.828,0.852,0.767,0.745,0.789,0.929,0.918,0.94,auto,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.771,0.76,0.781,0.837,0.823,0.851,0.758,0.735,0.781,0.935,0.921,0.949,2,1000,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
0.77,0.751,0.789,0.84,0.826,0.855,0.744,0.719,0.769,0.966,0.956,0.976,2,500,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.77,0.752,0.789,0.836,0.822,0.849,0.764,0.74,0.787,0.923,0.913,0.934,auto,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
0.77,0.752,0.789,0.834,0.82,0.849,0.772,0.746,0.797,0.91,0.899,0.92,10,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
0.77,0.753,0.787,0.839,0.827,0.851,0.768,0.744,0.792,0.925,0.911,0.94,auto,500,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.77,0.752,0.788,0.84,0.827,0.853,0.745,0.721,0.769,0.964,0.953,0.974,2,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.77,0.757,0.782,0.835,0.82,0.85,0.755,0.731,0.779,0.934,0.922,0.946,2,500,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
0.77,0.752,0.788,0.842,0.828,0.856,0.748,0.725,0.77,0.965,0.954,0.976,2,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
0.769,0.756,0.782,0.835,0.821,0.85,0.758,0.733,0.782,0.933,0.917,0.948,2,1000,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()


In [None]:
cvres = grid_search.cv_results_
results_df = pd.concat([pd.DataFrame({'mean_score': cvres["mean_test_score"], 
                                      'st_dev_score': cvres["std_test_score"]}),
                        pd.DataFrame(cvres["params"])],
          axis=1)
results_df = results_df.sort_values(by=['mean_score'], ascending=False)

In [None]:
results_mod = results_df.copy()
#results_mod.drop(columns=['min', 'max'], inplace = True)
results_mod.insert(1, 'mean*-2SD', results_df['mean_score'] - (2 * results_df['st_dev_score']))
results_mod.insert(2, 'mean*+2SD', results_df['mean_score'] + (2 * results_df['st_dev_score']))
results_mod.drop(columns=['st_dev_score'], inplace = True)

In [None]:
results_mod.\
    pipe(hlp.pandas_style.format,
         subset=['mean_score', 'mean*-2SD', 'mean*+2SD'],
         round_by=3,
         hide_index=True). \
    bar(subset=['mean_score'], color='#5fba7d').\
    bar(subset=['mean*+2SD'], color='gray').\
    pipe(hlp.pandas_style.bar_inverse, subset=['mean*-2SD'], color='gray')

# Random Search

In [None]:
from scipy.stats import randint, uniform, expon

In [None]:
np.random.seed(42)
s = uniform(.2, .79).rvs(size=1000)
print(min(s))
print(max(s))
plt.hist(s)

In [None]:
np.random.seed(42)
s = randint(2, 100).rvs(size=1000)
print(min(s))
print(max(s))
plt.hist(s)

In [None]:
s = randint(2, 20)
s.

In [None]:
If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.

In [None]:
model_param_dict = [
    {
        #DropMissingValuesTransformer(), 
        'preparation__numeric_pipeline__impute_chooser__base_transformer': [SimpleImputer(strategy='mean')],
        'preparation__numeric_pipeline__scaling_chooser__base_transformer': [MinMaxScaler(), StandardScaler()],
        'preparation__non_numeric_pipeline__encoder_chooser__base_transformer': [OneHotEncoder(),
                                                                                 CustomOrdinalEncoder()],
        'model__n_estimators': randint(50, 5000),
        'model__max_features':  uniform(.01, .99),
        #'model__max_depth': randint(2, 50),
    },
]

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold
random_search = RandomizedSearchCV(estimator=full_pipeline,
                                 param_distributions=model_param_dict,
                                 n_iter=1000,
                                 cv=RepeatedKFold(n_splits=5, n_repeats=1),
                                 #scoring=scorer,
                                 #return_train_score=True,
                                 n_jobs=-1,
                                 #verbose=2,
                                )
random_search.fit(X_train, y_train)

In [None]:
cvres = random_search.cv_results_
results_df = pd.concat([pd.DataFrame({'mean_score': cvres["mean_test_score"], 'st_dev_score': cvres["std_test_score"]}),
          pd.DataFrame(cvres["params"])],
          axis=1)
results_df = results_df.sort_values(by=['mean_score'], ascending=False)

In [None]:

results_mod = results_df.copy()
#results_mod.drop(columns=['min', 'max'], inplace = True)
results_mod.insert(1, 'mean*-2SD', results_df['mean_score'] - (2 * results_df['st_dev_score']))
results_mod.insert(2, 'mean*+2SD', results_df['mean_score'] + (2 * results_df['st_dev_score']))
results_mod.drop(columns=['st_dev_score'], inplace = True)

In [None]:
results_mod. \
    head(30). \
    pipe(hlp.pandas_style.format,
         subset=['mean_score', 'mean*-2SD', 'mean*+2SD'],
         round_by=3,
         hide_index=True). \
    bar(subset=['mean_score'], color='#5fba7d').\
    bar(subset=['mean*+2SD'], color='gray').\
    pipe(hlp.pandas_style.bar_inverse, subset=['mean*-2SD'], color='gray')

# TODO

- decide between imputing missing values and removing missing data; via tuning parameter(s)

- get feature importance for model that has various transformations
    - https://towardsdatascience.com/how-to-get-feature-importances-from-any-sklearn-pipeline-167a19f1214
    - https://stackoverflow.com/questions/38787612/how-to-extract-feature-importances-from-an-sklearn-pipeline
   
```
grid_search.best_estimator_._final_estimator.feature_importances_
grid_search.best_estimator_._final_estimator.feature_importances_.shape
```

- future importance
    - https://www.kaggle.com/general/175075
        - LOFO (Leave one feature out) for feature importance.
    - https://explained.ai/rf-importance/