In [1]:
import helpsk as hlp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

# Load Data

In [2]:
#https://www.openml.org/d/31
credit_g = fetch_openml('credit-g', version=1)
credit_data = credit_g['data']
credit_data['target'] = credit_g['target']
credit_data.shape

(1000, 21)

In [3]:
## Create Missing Values

credit_data['duration'].iloc[0:46] = np.nan
credit_data['checking_status'].iloc[25:75] = np.nan
credit_data['credit_amount'].iloc[10:54] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [4]:
hlp.pandas.numeric_summary(credit_data, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,# of Zeros,% Zeros,Mean,St Dev.,Coef of Var,Skewness,Kurtosis,Min,10%,25%,50%,75%,90%,Max
duration,954,46,5.0%,0,0.0%,20.9,12.0,0.6,1.1,1.0,4.0,9.0,12.0,18.0,24.0,36.0,72.0
credit_amount,1000,0,0.0%,44,4.0%,3132.9,2853.4,0.9,1.9,4.3,0.0,740.0,1287.8,2224.0,3873.5,7119.8,18424.0
installment_commitment,1000,0,0.0%,0,0.0%,3.0,1.1,0.4,-0.5,-1.2,1.0,1.0,2.0,3.0,4.0,4.0,4.0
residence_since,1000,0,0.0%,0,0.0%,2.9,1.1,0.4,-0.3,-1.4,1.0,1.0,2.0,3.0,4.0,4.0,4.0
age,1000,0,0.0%,0,0.0%,35.5,11.4,0.3,1.0,0.6,19.0,23.0,27.0,33.0,42.0,52.0,75.0
existing_credits,1000,0,0.0%,0,0.0%,1.4,0.6,0.4,1.3,1.6,1.0,1.0,1.0,1.0,2.0,2.0,4.0
num_dependents,1000,0,0.0%,0,0.0%,1.2,0.4,0.3,1.9,1.6,1.0,1.0,1.0,1.0,1.0,2.0,2.0


In [5]:
hlp.pandas.non_numeric_summary(credit_data, return_style=True)

Unnamed: 0,# of Non-Nulls,# of Nulls,% Nulls,Most Freq. Value,# of Unique,% Unique
checking_status,950,50,0.05,no checking,4,0.4%
credit_history,1000,0,0.0,existing paid,5,0.5%
purpose,1000,0,0.0,radio/tv,10,1.0%
savings_status,1000,0,0.0,<100,5,0.5%
employment,1000,0,0.0,1<=X<4,5,0.5%
personal_status,1000,0,0.0,male single,4,0.4%
other_parties,1000,0,0.0,none,3,0.3%
property_magnitude,1000,0,0.0,car,4,0.4%
other_payment_plans,1000,0,0.0,none,3,0.3%
housing,1000,0,0.0,own,3,0.3%


# Training and Test Data

In [6]:
y_full = credit_data['target']
X_full = credit_data.drop(columns='target')

In [7]:
hlp.pandas.value_frequency(series=y_full)

Unnamed: 0,Frequency,Percent
good,700,0.7
bad,300,0.3


In [8]:
y_full[0:10]

0    good
1     bad
2    good
3    good
4     bad
5    good
6    good
7    good
8    good
9     bad
Name: target, dtype: category
Categories (2, object): ['good', 'bad']

In [9]:
from sklearn.preprocessing import label_binarize
#y_full = label_binarize(y_full, classes=['bad', 'good']).flatten()
y_full = label_binarize(y_full, classes=['good', 'bad']).flatten()


In [10]:
y_full[0:10]

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 1])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

In [12]:
del y_full, X_full

In [13]:
print(X_train.shape)
print(len(y_train))

print(X_test.shape)
print(len(y_test))

(800, 20)
800
(200, 20)
200


In [14]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([559, 241]))

In [15]:
np.unique(y_train, return_counts=True)[1] / np.sum(np.unique(y_train, return_counts=True)[1])

array([0.69875, 0.30125])

In [16]:
np.unique(y_test, return_counts=True)[1] / np.sum(np.unique(y_test, return_counts=True)[1])

array([0.705, 0.295])

# Transformation Pipeline

In [17]:
class TransformerChooser(BaseEstimator, TransformerMixin):
    """Transformer that wraps another Transformer. This allows different transformer objects to be tuned.
    """
    def __init__(self, base_transformer=None):
        """
        Args:
            base_transformer:
                Transformer object (e.g. StandardScaler, MinMaxScaler)
        """
        self.base_transformer = base_transformer

    def fit(self, X, y=None):
        if self.base_transformer is None:
            return self

        return self.base_transformer.fit(X, y)

    def transform(self, X):
        if self.base_transformer is None:
            return X

        return self.base_transformer.transform(X)

In [18]:
class DropMissingValuesTransformer(BaseEstimator, TransformerMixin):
    """Any row with missing values will be dropped.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.dropna(axis=0)

In [19]:
class DoNothingTransformer(BaseEstimator, TransformerMixin):
    """Dummy Transformer that doesn't do anything
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

In [20]:
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [21]:
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])

array([[0., 2.],
       [2., 2.],
       [9., 1.],
       ...,
       [9., 3.],
       [6., 4.],
       [6., 2.]])

In [22]:
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']


In [23]:
numeric_pipeline = Pipeline([
    #tune whether or not we want to impute or simply remove rows with missing values
    ('impute_chooser', TransformerChooser()),
    # this is here so that we can select between MinMax and Scaler
    # if this pipeline is ran in a context outside of tuning, no transformation will take place
    ('scaling_chooser', TransformerChooser()),
])

In [24]:
non_numeric_pipeline = Pipeline([
    ('encoder_chooser', TransformerChooser()),
])

In [25]:
#temp = non_numeric_pipeline.fit_transform(X_train[non_numeric_columns])

In [26]:
#print(type(temp))
#print(temp.shape)

In [27]:
#temp.toarray()[0:10, 0:10]

In [28]:
#non_numeric_pipeline.steps[0][1].categories_

In [29]:
from sklearn.compose import ColumnTransformer
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', non_numeric_pipeline, non_numeric_columns)
])

In [30]:
temp = transformations_pipeline.fit_transform(X_train)

In [31]:
temp.shape

(800, 20)

In [32]:
#pd.DataFrame(temp)

# Model

In [33]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier()

In [34]:
full_pipeline = Pipeline([
    ('preparation', transformations_pipeline),
    #('pca_chooser', ChooserTransform()),  # PCA option lost; didn't include
    #('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('model', random_forest_model)
])

In [35]:
full_pipeline.n_features_in_

20

In [36]:
# Show the levels of pipelines/transformers/model
full_pipeline.named_steps

{'preparation': ColumnTransformer(transformers=[('numeric_pipeline',
                                  Pipeline(steps=[('impute_chooser',
                                                   TransformerChooser()),
                                                  ('scaling_chooser',
                                                   TransformerChooser())]),
                                  ['duration', 'credit_amount',
                                   'installment_commitment', 'residence_since',
                                   'age', 'existing_credits',
                                   'num_dependents']),
                                 ('non_numeric_pipeline',
                                  Pipeline(steps=[('encoder_chooser',
                                                   TransformerChooser())]),
                                  ['checking_status', 'credit_history',
                                   'purpose', 'savings_status', 'employment',
                           

In [37]:
class CustomOrdinalEncoder(BaseEstimator, TransformerMixin):
    """First replaces missing values with '<missing>' then applies OrdinalEncoder
        
    """
    def __init__(self):
        self._ordinal_encoder = OrdinalEncoder()#unknown_value=-1,
                                               #handle_unknown='use_encoded_value')
        self._missing_value = '<missing>'
            
    def _fill_na(self, X):
        for column in X.columns.values:
            if X[column].dtype.name == 'category':
                if self._missing_value not in X[column].cat.categories:
                    X[column] = X[column].cat.add_categories(self._missing_value)
                X[column] = X[column].fillna(self._missing_value)
        
        return X

        
    def fit(self, X, y=None):
        X = self._fill_na(X)
        self._ordinal_encoder.fit(X)
        return self

    def transform(self, X):
        X = self._fill_na(X)
        return self._ordinal_encoder.transform(X)

In [38]:
param_grad = [
    {
        #DropMissingValuesTransformer(), 
        'preparation__numeric_pipeline__impute_chooser__base_transformer': [SimpleImputer(strategy='mean')],
        'preparation__numeric_pipeline__scaling_chooser__base_transformer': [MinMaxScaler(), StandardScaler()],
        'preparation__non_numeric_pipeline__encoder_chooser__base_transformer': [OneHotEncoder(),
                                                                                 CustomOrdinalEncoder()],
        'model__max_features': [2, 10, 60, 'auto'],
        'model__n_estimators': [50, 100, 500, 1000]
    },
]

In [126]:
from sklearn.metrics import make_scorer, roc_auc_score, f1_score, precision_score, recall_score

from sklearn.metrics import SCORERS
# https://github.com/scikit-learn/scikit-learn/blob/2beed55847ee70d363bdbfe14ee4401438fba057/sklearn/metrics/_scorer.py#L702
  

def make_scorer_threshold(score_func):
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    # https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve

    # https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
    # according to this link they use
    # roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
    # needs_threshold=True)

    return make_scorer(score_func,
                       #needs_proba=True,
                       needs_threshold=True,
                       greater_is_better=True)


def make_scorer_proba(score_func):
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    # https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve

    # https://github.com/scikit-learn/scikit-learn/blob/2beed5584/sklearn/metrics/_scorer.py#L537
    # according to this link they use
    # roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
    # needs_threshold=True)

    return make_scorer(score_func,
                       needs_proba=True,
                       #needs_threshold=True,
                       greater_is_better=True)

scores = {
    'ROC/AUC':  SCORERS['roc_auc'],
    'F1': make_scorer(f1_score, greater_is_better=True),
    'Pos. Pred. Val': make_scorer(precision_score, greater_is_better=True),
    'True Pos. Rate': make_scorer(recall_score, greater_is_better=True),
}

In [151]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
grid_search = GridSearchCV(full_pipeline,
                           param_grid=param_grad,
                           cv=RepeatedKFold(n_splits=5, n_repeats=2),
                           scoring=scores,
                           refit='ROC/AUC',
                           #scoring='roc_auc',
                           return_train_score=True
                          )
grid_search.fit(X_train, y_train)

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

Traceback (most recent call last):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 387, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 1041, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File

GridSearchCV(cv=RepeatedKFold(n_repeats=2, n_splits=5, random_state=None),
             estimator=Pipeline(steps=[('preparation',
                                        ColumnTransformer(transformers=[('numeric_pipeline',
                                                                         Pipeline(steps=[('impute_chooser',
                                                                                          TransformerChooser()),
                                                                                         ('scaling_chooser',
                                                                                          TransformerChooser())]),
                                                                         ['duration',
                                                                          'credit_amount',
                                                                          'installment_commitment',
                                                       

In [152]:
grid_search.cv_results_

{'mean_fit_time': array([0.0538981 , 0.0480804 , 0.06008992, 0.05911622, 0.08709493,
        0.08808928, 0.10027394, 0.10061822, 0.41125367, 0.41278768,
        0.4373507 , 0.43292458, 0.80480864, 0.81705515, 0.85099714,
        0.85320811, 0.05718503, 0.05871933, 0.07964287, 0.07883091,
        0.10900896, 0.10857005, 0.14005299, 0.1403142 , 0.5119993 ,
        0.5171401 , 0.63138666, 0.62670639, 1.01909382, 1.01544223,
        1.24854369, 1.24029272, 0.11682971, 0.11765678, 0.02715237,
        0.02855678, 0.23036098, 0.2271944 , 0.0367389 , 0.03719406,
        1.12322297, 1.12120464, 0.11374381, 0.1143414 , 2.23552644,
        2.24588547, 0.21101947, 0.21125915, 0.05435288, 0.05371087,
        0.06378839, 0.0651639 , 0.10222626, 0.10223691, 0.11077135,
        0.11067657, 0.47374125, 0.47644222, 0.49689734, 0.49272063,
        0.94743998, 0.94868028, 0.96349466, 0.970801  ]),
 'std_fit_time': array([0.01636928, 0.00158217, 0.00318611, 0.00085013, 0.00033629,
        0.000735  , 0.000

In [153]:
cvres = grid_search.cv_results_
results_df = pd.concat([pd.DataFrame({
                            'mean_test_ROC/AUC': cvres["mean_test_ROC/AUC"], 
                            'std_test_ROC/AUC': cvres["std_test_ROC/AUC"],
                            'mean_test_F1': cvres["mean_test_F1"], 
                            'std_test_F1': cvres["std_test_F1"],
                            'mean_test_Pos. Pred. Val': cvres["mean_test_Pos. Pred. Val"], 
                            'std_test_Pos. Pred. Val': cvres["std_test_Pos. Pred. Val"],
                            'mean_test_True Pos. Rate': cvres["mean_test_True Pos. Rate"], 
                            'std_test_True Pos. Rate': cvres["std_test_True Pos. Rate"],
                        }),
                        pd.DataFrame(cvres["params"])],
          axis=1)
results_df = results_df.sort_values(by=['mean_test_ROC/AUC'], ascending=False)

In [154]:
results_mod = results_df.copy()
# #results_mod.drop(columns=['min', 'max'], inplace = True)
# results_mod.insert(1, 'mean*-2SD', results_df['mean_score'] - (2 * results_df['st_dev_score']))
# results_mod.insert(2, 'mean*+2SD', results_df['mean_score'] + (2 * results_df['st_dev_score']))
# results_mod.drop(columns=['st_dev_score'], inplace = True)

In [155]:
results_mod.style.\
    bar(subset=['mean_test_ROC/AUC'], color='#5fba7d').\
    bar(subset=['mean_test_F1'], color='#5fba7d').\
    bar(subset=['mean_test_Pos. Pred. Val'], color='#5fba7d').\
    bar(subset=['mean_test_True Pos. Rate'], color='#5fba7d')

Unnamed: 0,mean_test_ROC/AUC,std_test_ROC/AUC,mean_test_F1,std_test_F1,mean_test_Pos. Pred. Val,std_test_Pos. Pred. Val,mean_test_True Pos. Rate,std_test_True Pos. Rate,model__max_features,model__n_estimators,preparation__non_numeric_pipeline__encoder_chooser__base_transformer,preparation__numeric_pipeline__impute_chooser__base_transformer,preparation__numeric_pipeline__scaling_chooser__base_transformer
14,0.777478,0.032883,0.429156,0.092883,0.669493,0.0927,0.323622,0.09615,2,1000,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
10,0.777281,0.031635,0.413401,0.077724,0.657728,0.114795,0.309128,0.077837,2,500,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
12,0.776882,0.039351,0.353514,0.117559,0.729248,0.121961,0.242006,0.099534,2,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
61,0.776002,0.034158,0.468734,0.077327,0.666539,0.070259,0.371702,0.099663,auto,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
13,0.77545,0.037313,0.344131,0.114806,0.731158,0.092156,0.235954,0.099811,2,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
15,0.775359,0.034299,0.413759,0.097139,0.654934,0.091919,0.312478,0.10101,2,1000,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
57,0.775061,0.038267,0.454619,0.094288,0.644071,0.098786,0.361431,0.103709,auto,500,OneHotEncoder(),SimpleImputer(),StandardScaler()
56,0.775049,0.037916,0.463749,0.09665,0.642543,0.068948,0.372285,0.111453,auto,500,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
60,0.774458,0.037415,0.461719,0.096128,0.648157,0.066098,0.372135,0.121432,auto,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
29,0.772882,0.03739,0.471775,0.086942,0.632713,0.070553,0.388067,0.110103,10,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()


In [156]:
scores.keys()

dict_keys(['ROC/AUC', 'F1', 'Pos. Pred. Val', 'True Pos. Rate'])

In [257]:
results_df = None
for score in scores.keys():
    print(score)

    results_df = pd.concat([
        results_df,
        pd.DataFrame({
            score + " Mean": cvres['mean_test_' + score], 
            score + " St. Dev": cvres['std_test_' + score],
        })
    ], axis=1)
    
results_df = pd.concat([
        results_df,
        pd.DataFrame(cvres["params"]),
    ], axis=1)


#scores.keys()[0]
results_df = results_df.sort_values(by=str(list(scores.keys())[0]) + ' Mean', ascending=False)

ROC/AUC
F1
Pos. Pred. Val
True Pos. Rate


In [258]:
modified_results = results_df.head(20)
modified_results

Unnamed: 0,ROC/AUC Mean,ROC/AUC St. Dev,F1 Mean,F1 St. Dev,Pos. Pred. Val Mean,Pos. Pred. Val St. Dev,True Pos. Rate Mean,True Pos. Rate St. Dev,model__max_features,model__n_estimators,preparation__non_numeric_pipeline__encoder_chooser__base_transformer,preparation__numeric_pipeline__impute_chooser__base_transformer,preparation__numeric_pipeline__scaling_chooser__base_transformer
14,0.777478,0.032883,0.429156,0.092883,0.669493,0.0927,0.323622,0.09615,2,1000,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
10,0.777281,0.031635,0.413401,0.077724,0.657728,0.114795,0.309128,0.077837,2,500,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
12,0.776882,0.039351,0.353514,0.117559,0.729248,0.121961,0.242006,0.099534,2,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
61,0.776002,0.034158,0.468734,0.077327,0.666539,0.070259,0.371702,0.099663,auto,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
13,0.77545,0.037313,0.344131,0.114806,0.731158,0.092156,0.235954,0.099811,2,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
15,0.775359,0.034299,0.413759,0.097139,0.654934,0.091919,0.312478,0.10101,2,1000,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
57,0.775061,0.038267,0.454619,0.094288,0.644071,0.098786,0.361431,0.103709,auto,500,OneHotEncoder(),SimpleImputer(),StandardScaler()
56,0.775049,0.037916,0.463749,0.09665,0.642543,0.068948,0.372285,0.111453,auto,500,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
60,0.774458,0.037415,0.461719,0.096128,0.648157,0.066098,0.372135,0.121432,auto,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
29,0.772882,0.03739,0.471775,0.086942,0.632713,0.070553,0.388067,0.110103,10,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()


In [190]:
a = range(10,14)
list(a)

[10, 11, 12, 13]

In [192]:
data = [3, 4, 4, 5, 7, 8, 12, 14, 14, 15, 17, 19, 22, 24, 24, 24, 25, 28, 28, 29]
from scipy.stats import sem, tstd
sem(data)

2.0014468450808804

In [193]:
tstd(data)

8.950742397906678

In [None]:
import numpy as np
import scipy.stats as st
st.t.interval(0.95, len(a)-1, loc=np.mean(a), scale=st.sem(a))

In [199]:
temp_m = modified_results['Mean ROC/AUC'].iloc[0]
temp_s = modified_results['St. Dev ROC/AUC'].iloc[0]
print(temp_m)
print(temp_s)

0.7774784530118133
0.032882977485774456


In [202]:
temp_se = temp_s / math.sqrt(sample_size)
temp_se

0.010398510510308434

In [212]:
temp_m - 2*temp_se

0.7566814319911964

In [213]:
temp_m + 2*temp_se

0.7982754740324302

In [214]:
import scipy.stats as st

In [215]:
low_95, high_95 = st.t.interval(0.95, sample_size-1, loc=temp_m, scale=temp_se)

In [209]:
print(low_95)
print(high_95)

0.7539553879790816
0.801001518044545


In [243]:
sample_size = 10
import math
means = modified_results['Mean ROC/AUC']
standard_errors = modified_results['St. Dev ROC/AUC'] / math.sqrt(sample_size)
standard_errors

14    0.010399
10    0.010004
12    0.012444
61    0.010802
13    0.011799
15    0.010846
57    0.012101
56    0.011990
60    0.011832
29    0.011824
52    0.012356
11    0.009537
53    0.011557
28    0.011228
24    0.011239
8     0.011715
25    0.012020
9     0.011527
62    0.009947
59    0.009548
Name: St. Dev ROC/AUC, dtype: float64

In [244]:
confidence_intervals = st.t.interval(0.95, sample_size-1, loc=means, scale=standard_errors)
confidence_intervals

(array([0.75395539, 0.75465021, 0.74873233, 0.75156661, 0.74875832,
        0.75082349, 0.74768673, 0.74792526, 0.74769234, 0.7461347 ,
        0.74481226, 0.75112839, 0.7464955 , 0.74715975, 0.74670297,
        0.74554677, 0.74354653, 0.74457557, 0.74605307, 0.74658069]),
 array([0.80100152, 0.79991083, 0.80503242, 0.8004364 , 0.80214224,
        0.79989501, 0.80243608, 0.80217222, 0.80122315, 0.79962876,
        0.80071385, 0.79427685, 0.79878382, 0.79795821, 0.79754966,
        0.79854898, 0.79792864, 0.79672571, 0.79105665, 0.78977913]))

In [245]:
modified_results = modified_results.drop(columns='St. Dev ROC/AUC')

In [246]:
loc_index = modified_results.columns.get_loc('Mean ROC/AUC') + 1
modified_results.insert(loc=loc_index,
                       column='ROC/AUC 95CI.HI',
                       value=confidence_intervals[1])

In [247]:
modified_results.insert(loc=loc_index,
                       column='ROC/AUC 95CI.LOW',
                       value=confidence_intervals[0])

In [248]:
modified_results

Unnamed: 0,Mean ROC/AUC,ROC/AUC 95CI.LOW,ROC/AUC 95CI.HI,Mean F1,St. Dev F1,Mean Pos. Pred. Val,St. Dev Pos. Pred. Val,Mean True Pos. Rate,St. Dev True Pos. Rate,model__max_features,model__n_estimators,preparation__non_numeric_pipeline__encoder_chooser__base_transformer,preparation__numeric_pipeline__impute_chooser__base_transformer,preparation__numeric_pipeline__scaling_chooser__base_transformer
14,0.777478,0.753955,0.801002,0.429156,0.092883,0.669493,0.0927,0.323622,0.09615,2,1000,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
10,0.777281,0.75465,0.799911,0.413401,0.077724,0.657728,0.114795,0.309128,0.077837,2,500,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
12,0.776882,0.748732,0.805032,0.353514,0.117559,0.729248,0.121961,0.242006,0.099534,2,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
61,0.776002,0.751567,0.800436,0.468734,0.077327,0.666539,0.070259,0.371702,0.099663,auto,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
13,0.77545,0.748758,0.802142,0.344131,0.114806,0.731158,0.092156,0.235954,0.099811,2,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
15,0.775359,0.750823,0.799895,0.413759,0.097139,0.654934,0.091919,0.312478,0.10101,2,1000,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
57,0.775061,0.747687,0.802436,0.454619,0.094288,0.644071,0.098786,0.361431,0.103709,auto,500,OneHotEncoder(),SimpleImputer(),StandardScaler()
56,0.775049,0.747925,0.802172,0.463749,0.09665,0.642543,0.068948,0.372285,0.111453,auto,500,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
60,0.774458,0.747692,0.801223,0.461719,0.096128,0.648157,0.066098,0.372135,0.121432,auto,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
29,0.772882,0.746135,0.799629,0.471775,0.086942,0.632713,0.070553,0.388067,0.110103,10,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()


In [179]:
sample_size = 10 # 5 fold * 2 repeats
import math


for score in scores.keys():
    mean_key = 'Mean ' + score
    st_dev_key = 'St. Dev ' + score
    
    score_means = modified_results
    score_st_errors = modified_results[st_dev_key] / 
    
    
    means = modified_results['Mean ROC/AUC']
    standard_errors = modified_results['St. Dev ROC/AUC'] / math.sqrt(sample_size)
    standard_errors
    
    
    
    print(mean_key)
    print(st_dev_key)
    
    modified_results = modified_results
    
    
    #modified_results = modified_results.style.bar(subset=['Mean ROC/AUC'], color='#5fba7d')

Mean ROC/AUC
St. Dev ROC/AUC
Mean F1
St. Dev F1
Mean Pos. Pred. Val
St. Dev Pos. Pred. Val
Mean True Pos. Rate
St. Dev True Pos. Rate


In [57]:
cvres = grid_search.cv_results_
results_df = pd.concat([pd.DataFrame({'mean_score': cvres["mean_test_score"], 
                                      'st_dev_score': cvres["std_test_score"]}),
                        pd.DataFrame(cvres["params"])],
          axis=1)
results_df = results_df.sort_values(by=['mean_score'], ascending=False)

In [58]:
results_mod = results_df.copy()
#results_mod.drop(columns=['min', 'max'], inplace = True)
results_mod.insert(1, 'mean*-2SD', results_df['mean_score'] - (2 * results_df['st_dev_score']))
results_mod.insert(2, 'mean*+2SD', results_df['mean_score'] + (2 * results_df['st_dev_score']))
results_mod.drop(columns=['st_dev_score'], inplace = True)

In [59]:
results_mod.\
    pipe(hlp.pandas_style.format,
         subset=['mean_score', 'mean*-2SD', 'mean*+2SD'],
         round_by=3,
         hide_index=True). \
    bar(subset=['mean_score'], color='#5fba7d').\
    bar(subset=['mean*+2SD'], color='gray').\
    pipe(hlp.pandas_style.bar_inverse, subset=['mean*-2SD'], color='gray')

mean_score,mean*-2SD,mean*+2SD,model__max_features,model__n_estimators,preparation__non_numeric_pipeline__encoder_chooser__base_transformer,preparation__numeric_pipeline__impute_chooser__base_transformer,preparation__numeric_pipeline__scaling_chooser__base_transformer
0.78,0.718,0.841,10,500,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.779,0.722,0.837,2,1000,CustomOrdinalEncoder(),SimpleImputer(),MinMaxScaler()
0.779,0.718,0.839,2,1000,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()
0.778,0.716,0.841,auto,500,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
0.777,0.713,0.841,auto,1000,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
0.777,0.707,0.847,auto,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.777,0.711,0.843,10,1000,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.777,0.693,0.861,10,50,OneHotEncoder(),SimpleImputer(),MinMaxScaler()
0.777,0.713,0.841,auto,500,OneHotEncoder(),SimpleImputer(),StandardScaler()
0.776,0.711,0.841,2,500,CustomOrdinalEncoder(),SimpleImputer(),StandardScaler()


# Random Search

In [None]:
from scipy.stats import randint, uniform, expon

In [None]:
np.random.seed(42)
s = uniform(.2, .79).rvs(size=1000)
print(min(s))
print(max(s))
plt.hist(s)

In [None]:
np.random.seed(42)
s = randint(2, 100).rvs(size=1000)
print(min(s))
print(max(s))
plt.hist(s)

In [None]:
s = randint(2, 20)
s.

In [None]:
If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.

In [None]:
model_param_dict = [
    {
        #DropMissingValuesTransformer(), 
        'preparation__numeric_pipeline__impute_chooser__base_transformer': [SimpleImputer(strategy='mean')],
        'preparation__numeric_pipeline__scaling_chooser__base_transformer': [MinMaxScaler(), StandardScaler()],
        'preparation__non_numeric_pipeline__encoder_chooser__base_transformer': [OneHotEncoder(),
                                                                                 CustomOrdinalEncoder()],
        'model__n_estimators': randint(50, 5000),
        'model__max_features':  uniform(.01, .99),
        #'model__max_depth': randint(2, 50),
    },
]

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold
random_search = RandomizedSearchCV(estimator=full_pipeline,
                                 param_distributions=model_param_dict,
                                 n_iter=1000,
                                 cv=RepeatedKFold(n_splits=5, n_repeats=1),
                                 #scoring=scorer,
                                 #return_train_score=True,
                                 n_jobs=-1,
                                 #verbose=2,
                                )
random_search.fit(X_train, y_train)

In [None]:
cvres = random_search.cv_results_
results_df = pd.concat([pd.DataFrame({'mean_score': cvres["mean_test_score"], 'st_dev_score': cvres["std_test_score"]}),
          pd.DataFrame(cvres["params"])],
          axis=1)
results_df = results_df.sort_values(by=['mean_score'], ascending=False)

In [None]:

results_mod = results_df.copy()
#results_mod.drop(columns=['min', 'max'], inplace = True)
results_mod.insert(1, 'mean*-2SD', results_df['mean_score'] - (2 * results_df['st_dev_score']))
results_mod.insert(2, 'mean*+2SD', results_df['mean_score'] + (2 * results_df['st_dev_score']))
results_mod.drop(columns=['st_dev_score'], inplace = True)

In [None]:
results_mod. \
    head(30). \
    pipe(hlp.pandas_style.format,
         subset=['mean_score', 'mean*-2SD', 'mean*+2SD'],
         round_by=3,
         hide_index=True). \
    bar(subset=['mean_score'], color='#5fba7d').\
    bar(subset=['mean*+2SD'], color='gray').\
    pipe(hlp.pandas_style.bar_inverse, subset=['mean*-2SD'], color='gray')

# TODO

- decide between imputing missing values and removing missing data; via tuning parameter(s)

- get feature importance for model that has various transformations
    - https://towardsdatascience.com/how-to-get-feature-importances-from-any-sklearn-pipeline-167a19f1214
    - https://stackoverflow.com/questions/38787612/how-to-extract-feature-importances-from-an-sklearn-pipeline
   
```
grid_search.best_estimator_._final_estimator.feature_importances_
grid_search.best_estimator_._final_estimator.feature_importances_.shape
```

- future importance
    - https://www.kaggle.com/general/175075
        - LOFO (Leave one feature out) for feature importance.
    - https://explained.ai/rf-importance/