# Pipelines and Model Fits

In this notebook we create some `pipelines` and do some model fits.

## Importing Packages

In [1]:
import pandas as pd
import numpy as np

## Reading-In Data

In [2]:
df_default = pd.read_csv("data_processed/01_binary_training.csv", low_memory=False)
df_default.head().T

Unnamed: 0,0,1,2,3,4
funded_amnt,30000.0,7850.0,25000.0,23000.0,12000.0
addr_state,IL,IN,AZ,CO,TX
annual_inc,70000.0,95000.0,115000.0,177000.0,98000.0
application_type,Individual,Individual,Individual,Individual,Individual
dti,22.78,13.97,23.27,13.91,22.3
earliest_cr_line,Apr-1996,Apr-1990,Dec-2001,Aug-2001,Sep-2000
emp_length,< 1 year,4 years,9 years,< 1 year,10+ years
emp_title,Surgical Clinical Reviewer,Technician,VP of Operations,Property Manager,HEAVY DUTY DRIVER
fico_range_high,729.0,679.0,704.0,704.0,699.0
fico_range_low,725.0,675.0,700.0,700.0,695.0


In [3]:
df_default

Unnamed: 0,funded_amnt,addr_state,annual_inc,application_type,dti,earliest_cr_line,emp_length,emp_title,fico_range_high,fico_range_low,...,zip_code,last_pymnt_amnt,num_actv_rev_tl,mo_sin_rcnt_rev_tl_op,mo_sin_old_rev_tl_op,bc_util,bc_open_to_buy,avg_cur_bal,acc_open_past_24mths,charged_off
0,30000.0,IL,70000.0,Individual,22.78,Apr-1996,< 1 year,Surgical Clinical Reviewer,729.0,725.0,...,606xx,941.74,5.0,11.0,237.0,46.7,37957.0,5302.0,3.0,False
1,7850.0,IN,95000.0,Individual,13.97,Apr-1990,4 years,Technician,679.0,675.0,...,463xx,1594.23,5.0,2.0,297.0,73.7,947.0,4954.0,9.0,False
2,25000.0,AZ,115000.0,Individual,23.27,Dec-2001,9 years,VP of Operations,704.0,700.0,...,853xx,22209.46,7.0,7.0,158.0,60.1,17049.0,8770.0,7.0,False
3,23000.0,CO,177000.0,Individual,13.91,Aug-2001,< 1 year,Property Manager,704.0,700.0,...,805xx,11497.09,9.0,35.0,163.0,70.4,8341.0,8395.0,0.0,False
4,12000.0,TX,98000.0,Individual,22.30,Sep-2000,10+ years,HEAVY DUTY DRIVER,699.0,695.0,...,770xx,392.81,7.0,6.0,190.0,64.3,3316.0,6460.0,7.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942091,32000.0,HI,138000.0,Individual,15.51,Aug-1999,10+ years,human resources,724.0,720.0,...,967xx,744.26,6.0,0.0,179.0,37.9,33858.0,54002.0,8.0,True
942092,30000.0,TX,103000.0,Individual,11.95,Nov-2002,10+ years,General Superintendent,664.0,660.0,...,770xx,9740.59,4.0,9.0,130.0,85.5,5696.0,7356.0,5.0,False
942093,8000.0,MN,78000.0,Individual,18.72,Apr-2004,10+ years,"table gaming, dual supervisor",689.0,685.0,...,553xx,307.39,2.0,4.0,119.0,17.3,248.0,24778.0,3.0,True
942094,4325.0,NJ,109000.0,Individual,20.22,Jun-2001,10+ years,Director Materials Management,699.0,695.0,...,080xx,3144.54,3.0,86.0,181.0,90.3,2140.0,40658.0,1.0,False


## Experimenting with Encodings of Individual Categorical Features

The purpose of this section is to experiment with the different encoders on the various categorical features.  Basically, I want to see what kind of output to expect from each one.

In the subsequent sections I start putting them together in a pipeline.

### `term`

In [4]:
df_default['term'] = df_default['term'].str.strip()

In [5]:
df_default['term'].value_counts()

term
36 months    714669
60 months    227427
Name: count, dtype: int64

In [6]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False,drop='first')
enc.fit(df_default[['term']])
enc.categories_

[array(['36 months', '60 months'], dtype=object)]

In [7]:
enc.transform(df_default[['term']])

array([[0.],
       [0.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

### `grade`

In [8]:
df_default['grade'].value_counts()

grade
B    275040
C    267146
A    164855
D    140609
E     65485
F     22497
G      6464
Name: count, dtype: int64

In [9]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories=[['A', 'B', 'C', 'D', 'E', 'F', 'G']])
enc.fit(df_default[['grade']])
enc.categories_

[array(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype=object)]

In [10]:
enc.transform(df_default[['grade']])

array([[0.],
       [2.],
       [2.],
       ...,
       [4.],
       [2.],
       [2.]])

In [11]:
pd.DataFrame({'original':df_default['grade'], 'encoded':enc.transform(df_default[['grade']]).ravel()})

Unnamed: 0,original,encoded
0,A,0.0
1,C,2.0
2,C,2.0
3,A,0.0
4,B,1.0
...,...,...
942091,C,2.0
942092,C,2.0
942093,E,4.0
942094,C,2.0


In [12]:
# Average encoded value per original grade: each grade should map to a single code.
# `.mean()` is used instead of `.agg(np.mean)` because pandas 2.1+ emits a
# FutureWarning when a NumPy callable is passed to a groupby aggregation.
grade_codes = pd.DataFrame({
    'original': df_default['grade'],
    'encoded': enc.transform(df_default[['grade']]).ravel(),
})
grade_codes.groupby(['original'])['encoded'].mean()

original
A    0.0
B    1.0
C    2.0
D    3.0
E    4.0
F    5.0
G    6.0
Name: encoded, dtype: float64

### `sub_grade`

In [13]:
list(df_default['sub_grade'].value_counts().sort_index().index)

['A1',
 'A2',
 'A3',
 'A4',
 'A5',
 'B1',
 'B2',
 'B3',
 'B4',
 'B5',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'E1',
 'E2',
 'E3',
 'E4',
 'E5',
 'F1',
 'F2',
 'F3',
 'F4',
 'F5',
 'G1',
 'G2',
 'G3',
 'G4',
 'G5']

In [14]:
ordering = list(df_default['sub_grade'].value_counts().sort_index().index)
enc = OrdinalEncoder(categories=[ordering])
enc.fit(df_default[['sub_grade']])
enc.categories_

[array(['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1',
        'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2',
        'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3',
        'G4', 'G5'], dtype=object)]

In [15]:
enc.transform(df_default[['sub_grade']])

array([[ 4.],
       [14.],
       [12.],
       ...,
       [24.],
       [11.],
       [14.]])

In [16]:
# Average encoded value per original sub-grade: each sub-grade should map to a single code.
# `.mean()` is used instead of `.agg(np.mean)` because pandas 2.1+ emits a
# FutureWarning when a NumPy callable is passed to a groupby aggregation.
sub_grade_codes = pd.DataFrame({
    'original': df_default['sub_grade'],
    'encoded': enc.transform(df_default[['sub_grade']]).ravel(),
})
sub_grade_codes.groupby(['original'])[['encoded']].mean()

Unnamed: 0_level_0,encoded
original,Unnamed: 1_level_1
A1,0.0
A2,1.0
A3,2.0
A4,3.0
A5,4.0
B1,5.0
B2,6.0
B3,7.0
B4,8.0
B5,9.0


### `home_ownership`

In [17]:
df_default['home_ownership'].value_counts()

home_ownership
MORTGAGE    465753
RENT        374452
OWN         101553
ANY            199
OTHER          110
NONE            29
Name: count, dtype: int64

In [18]:
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
enc.fit(df_default[['home_ownership']])
enc.categories_

[array(['ANY', 'MORTGAGE', 'NONE', 'OTHER', 'OWN', 'RENT'], dtype=object)]

In [19]:
enc.transform(df_default[['home_ownership']])

array([[0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])

### `verification_status`

In [20]:
df_default['verification_status'].value_counts()

verification_status
Source Verified    364751
Verified           292897
Not Verified       284448
Name: count, dtype: int64

In [21]:
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
enc.fit(df_default[['verification_status']])
enc.categories_

[array(['Not Verified', 'Source Verified', 'Verified'], dtype=object)]

In [22]:
enc.transform(df_default[['verification_status']])

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 0.],
       [0., 1.]])

### `purpose`

In [23]:
df_default['purpose'].value_counts()

purpose
debt_consolidation    546460
credit_card           206767
home_improvement       61414
other                  54458
major_purchase         20589
medical                10906
small_business         10842
car                    10108
moving                  6653
vacation                6278
house                   5113
wedding                 1595
renewable_energy         661
educational              252
Name: count, dtype: int64

In [24]:
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
enc.fit(df_default[['purpose']])
enc.categories_

[array(['car', 'credit_card', 'debt_consolidation', 'educational',
        'home_improvement', 'house', 'major_purchase', 'medical', 'moving',
        'other', 'renewable_energy', 'small_business', 'vacation',
        'wedding'], dtype=object)]

In [25]:
enc.transform(df_default[['purpose']])

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

### `addr_state`

In [26]:
#df_default['addr_state'].value_counts()

In [27]:
# enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
# enc.fit(df_default[['addr_state']])
# enc.categories_

In [28]:
#enc.transform(df_default[['addr_state']])

## Creating the `FeatureSelector` Column Transformer

Creating the custom feature selector, which will be the first step in all the pipelines.

In [29]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a chosen set of columns.

    Parameters
    ----------
    columns : column label or list of column labels
        Stored as-is and used to index the input in ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; present so the class can be used as a pipeline step.

        Returns
        -------
        self
        """
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by ``self.columns`` (``y`` is ignored)."""
        selected = X[self.columns]
        return selected

## Creating a Pipeline with a Single Feature `grade`

Let's build a model with a single categorical feature `grade`.  This will be a proof of concept just to see if I can get things to run.

In [30]:
features = ['grade']

Let's double check that our `FeatureSelector` is working properly.

In [31]:
FeatureSelector(features).fit_transform(df_default)

Unnamed: 0,grade
0,A
1,C
2,C
3,A
4,B
...,...
942091,C
942092,C
942093,E
942094,C


Next, we'll construct our column transformer.

In [32]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [33]:
categorical_transformer_1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[['A', 'B', 'C', 'D', 'E', 'F', 'G']]))
])

Let's test the output of our column transformer.

In [34]:
testing_output = Pipeline(steps=[
    ('feature_selector', FeatureSelector(features)),
    ('preprocessor', categorical_transformer_1)
])
pd.DataFrame(testing_output.fit_transform(df_default))

Unnamed: 0,0
0,0.0
1,2.0
2,2.0
3,0.0
4,1.0
...,...
942091,2.0
942092,2.0
942093,4.0
942094,2.0


Now we can construct our model pipeline and fit our model.

In [35]:
from sklearn.linear_model import LogisticRegression

model = Pipeline(steps=[
    ('feature_selector', FeatureSelector(features)),
    ('preprocessor',categorical_transformer_1),
    ('logistic_regression', LogisticRegression())
])

In [36]:
X = df_default.drop(columns=(['charged_off']))
y = df_default['charged_off']

In [37]:
model.fit(X, y)

Our in-sample accuracy doesn't really beat the baseline predictor of always predicting non-default.

In [38]:
model.score(X, y)

0.7979802482974134

The actual percentage of defaults is almost exactly 20%.

In [39]:
y.mean()

0.1998023555985802

Notice that our model is greatly underpredicting the percentage of defaults.  This is typical for imbalanced datasets; we will address it with threshold tuning in a subsequent notebook.

In [41]:
model.predict(X).mean()

0.030741028515140708

Let's check the `roc_auc_score`.  This may be misleadingly good because we have an imbalanced data set.

In [42]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y, model.predict_proba(X)[:,1])

0.6796798890986282

Let's now examine precision and recall.

In [43]:
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y, model.predict_proba(X)[:,1])

In [44]:
print(precision)
print(recall)

[0.19980236 0.22925836 0.28118423 0.34557444 0.40852974 0.46393426
 0.4998453  1.        ]
[1.         0.9466406  0.75019258 0.43153432 0.20498    0.07137962
 0.0171649  0.        ]


In [45]:
from sklearn.metrics import auc
auc(recall, precision)

0.3441472492163303

Let's combine precision and recall into an f1-score.

In [46]:
from sklearn.metrics import f1_score
f1_score(y, model.predict(X))

0.1237234914408317

In [47]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Predict once and reuse the result: every model.predict(X) call re-runs the
# whole pipeline over X, and the three metrics all need the same predictions.
y_pred = model.predict(X)
print(precision_score(y, y_pred))
print(recall_score(y, y_pred))
print(f1_score(y, y_pred))

0.4639342564137979
0.07137961993911801
0.1237234914408317


Let's test for out-of-sample metrics with 5-fold cross-validation.

In [48]:
# Import cross_validate explicitly. A bare `import sklearn` is not guaranteed to
# load the `model_selection` submodule, so `sklearn.model_selection...` may only
# resolve because an earlier import happened to pull it in.
from sklearn.model_selection import cross_validate

# 5-fold cross-validation, scoring accuracy, F1 and ROC AUC on each held-out fold.
cv_results = cross_validate(model, X, y, cv=5, scoring=['accuracy', 'f1', 'roc_auc'], n_jobs=-1)
cv_results

{'fit_time': array([0.76141763, 0.76526022, 0.78134847, 0.66590762, 0.63896704]),
 'score_time': array([0.16536617, 0.21954656, 0.14680219, 0.13975787, 0.13697481]),
 'test_accuracy': array([0.79794608, 0.79752573, 0.79805646, 0.79822099, 0.79815199]),
 'test_f1': array([0.12631096, 0.1220197 , 0.12242262, 0.12678288, 0.12105385]),
 'test_roc_auc': array([0.68187847, 0.68136712, 0.67889363, 0.67894696, 0.67731115])}

In [49]:
cv_results['test_f1'].mean()

0.12371800224574765

## Creating a Pipeline with Two Ordinal Encoded Features

Next, we'll up our game a little bit and have two different ordinal features in our model.  Since there are two different ordinal transformations going on we'll need to use a ColumnTransformer object. 

In [50]:
# features = \
# ['term',
#  'grade',
#  'sub_grade',
#  'home_ownership',
#  'verification_status',
#  'purpose',
#  'addr_state']

features = \
['grade',
 'sub_grade']

This code creates a list that defines the order of the ordinal encoding:

In [51]:
print(list(df_default['sub_grade'].value_counts().sort_index().index))

['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']


Now we'll create the two column transformers for our two categorical variables.

In [52]:
categorical_transformer_1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[['A', 'B', 'C', 'D', 'E', 'F', 'G']]))
])

categorical_transformer_2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[list(df_default['sub_grade'].value_counts().sort_index().index)]))
])

In [53]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(transformers=[
    ('categorical_1', categorical_transformer_1, ['grade']),
    ('categorical_2', categorical_transformer_2, ['sub_grade'])],
    remainder = 'passthrough',
)

Let's create our model pipeline, and fit our model.

In [54]:
model = Pipeline(steps=[
    ('feature_selector', FeatureSelector(features)),
    ('preprocessor', preprocessor),
    ('logistic_regression', LogisticRegression())
])

In [55]:
X = df_default.drop(columns=(['charged_off']))
y = df_default['charged_off']

In [56]:
model.fit(X, y)

This model doesn't beat the baseline of always guessing non-default.

In [57]:
model.score(X, y)

0.799089477080892

Again, we can see that our model greatly underpredicts defaults.

In [58]:
model.predict(X).mean()

0.023386151729760024

Next, let's check the areas under the ROC curve, and the precision-recall curve.

In [59]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y, model.predict_proba(X)[:,1])

0.6887038167023508

In [60]:
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y, model.predict_proba(X)[:,1])

In [61]:
from sklearn.metrics import auc
auc(recall, precision)

0.34009860050242463

Let's combine precision and recall into an f1-score.

In [62]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Predict once and reuse the result: every model.predict(X) call re-runs the
# whole pipeline over X, and the three metrics all need the same predictions.
y_pred = model.predict(X)
print(precision_score(y, y_pred))
print(recall_score(y, y_pred))
print(f1_score(y, y_pred))

0.47630718954248363
0.05575005445378865
0.09981689772430029


Finally, let's check out of sample metrics with 5-fold cross-validation.

In [63]:
# Import cross_validate explicitly. A bare `import sklearn` is not guaranteed to
# load the `model_selection` submodule, so `sklearn.model_selection...` may only
# resolve because an earlier import happened to pull it in.
from sklearn.model_selection import cross_validate

# 5-fold cross-validation, scoring accuracy, F1 and ROC AUC on each held-out fold.
cv_results = cross_validate(model, X, y, cv=5, scoring=['accuracy', 'f1', 'roc_auc'], n_jobs=-1)

Interestingly, our f1-score went down, perhaps because these features contain basically the same information.

In [64]:
cv_results['test_f1'].mean()

0.09981259206311024

## Creating a Pipeline with All Categorical Variables

Let's now create a model that incorporates all our categorical variables.

In [65]:
# features = \
# ['term',
#  'grade',
#  'sub_grade',
#  'home_ownership',
#  'verification_status',
#  'purpose',
#  'addr_state']

features = [
    'grade',
    'home_ownership',
    'purpose',
    'sub_grade',
    'term',
    'verification_status',
]

In [66]:
from sklearn.preprocessing import StandardScaler

# `grade` recipe: impute the most frequent value, encode in the explicit A..G
# order, then standardize the resulting codes.
categorical_transformer_1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[['A', 'B', 'C', 'D', 'E', 'F', 'G']])),
    ('scaler', StandardScaler())
])

# `sub_grade` recipe: same steps; the category order is the sorted list of
# sub-grades observed in df_default.
categorical_transformer_2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[list(df_default['sub_grade'].value_counts().sort_index().index)])),
    ('scaler', StandardScaler())
])

# One-hot recipe: impute, one-hot encode (dropping the first level of each
# feature), then standardize. The encoder step is named 'onehot' so that the
# step name matches the OneHotEncoder it wraps.
categorical_transformer_3 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')),
    ('scaler', StandardScaler(with_mean=True))
])

In [67]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(transformers=[
    ('categorical_1', categorical_transformer_1, ['grade']),
    ('categorical_2', categorical_transformer_2, ['sub_grade']),
    ('categorical_3', categorical_transformer_3, ['term', 'home_ownership', 'verification_status', 'purpose'])],
    remainder = 'passthrough',
)

In [68]:
model = Pipeline(steps=[
    ('feature_selector', FeatureSelector(features)),
    ('preprocessor', preprocessor),
    ('logistic_regression', LogisticRegression())
])

In [69]:
model.fit(X, y)

In [70]:
model.score(X, y)

0.8000373634958645

In [71]:
model.predict(X).mean()

0.02948850223331805

In [72]:
#roc_auc_score(y, model.predict_proba(X)[:,1])

In [73]:
# from sklearn.metrics import precision_recall_curve
# precision, recall, thresholds = precision_recall_curve(y, model.predict_proba(X)[:,1])

In [74]:
# from sklearn.metrics import auc
# auc(recall, precision)

In [75]:
# from sklearn.metrics import f1_score, precision_score, recall_score
# print(precision_score(y, model.predict(X)))
# print(recall_score(y, model.predict(X)))
# print(f1_score(y, model.predict(X)))

Let's check out-of-sample accuracy with 5-fold cross-validation.

In [76]:
# import sklearn
# cv_results = sklearn.model_selection.cross_validate(model, X, y, cv=5, scoring=['accuracy', 'f1', 'roc_auc'], n_jobs=-1)
# cv_results

Our mean f1-score increased slightly compared with the model that used only the single variable `grade`.

In [77]:
# cv_results['test_f1'].mean()

## Creating a Pipeline with All Variables (Categorical and Numeric) - Logistic Regression

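Now let's include all variables (categorical and numeric) - the results seem too good to be true.  One likely reason is target leakage: `last_pymnt_amnt` is recorded after the loan has been issued, so it may encode the outcome we are trying to predict (this should be verified before trusting these numbers).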

In [78]:
# Hand-picked feature list for the combined model: six categorical columns
# followed by nine numeric columns. (A previous assignment that took every
# non-target column was immediately overwritten by this list, so it was removed.)
features = [
    'grade',
    'home_ownership',
    'purpose',
    'sub_grade',
    'term',
    'verification_status',
    "funded_amnt",
    "last_pymnt_amnt",
    "int_rate",
    "loan_amnt",
    "installment",
    "acc_open_past_24mths",
    "dti",
    "fico_range_low",
    "mort_acc",
]

In [79]:
# numeric_features = df_default.drop(columns='charged_off').columns[df_default.drop(columns='charged_off').dtypes!='object'].to_list()
numeric_features = [
    "funded_amnt",
    "last_pymnt_amnt",
    "int_rate",
    "loan_amnt",
    "installment",
    "acc_open_past_24mths",
    "dti",
    "fico_range_low",
    "mort_acc",
]

In [80]:
from sklearn.preprocessing import StandardScaler

# `grade` recipe: impute the most frequent value, encode in the explicit A..G
# order, then standardize the resulting codes.
categorical_transformer_1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[['A', 'B', 'C', 'D', 'E', 'F', 'G']])),
    ('scaler', StandardScaler())
])

# `sub_grade` recipe: same steps; the category order is the sorted list of
# sub-grades observed in df_default.
categorical_transformer_2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[list(df_default['sub_grade'].value_counts().sort_index().index)])),
    ('scaler', StandardScaler())
])

# One-hot recipe: impute, one-hot encode (dropping the first level of each
# feature), then standardize. The encoder step is named 'onehot' so that the
# step name matches the OneHotEncoder it wraps.
categorical_transformer_3 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')),
    ('scaler', StandardScaler(with_mean=True))
])

# Numeric recipe: fill missing values with the column mean, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler(with_mean=True))
])

In [81]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(transformers=[
    ('categorical_1', categorical_transformer_1, ['grade']),
    ('categorical_2', categorical_transformer_2, ['sub_grade']),
    ('categorical_3', categorical_transformer_3, ['term', 'home_ownership', 'verification_status', 'purpose']),
    ('numerical', numerical_transformer, numeric_features),],
    remainder = 'passthrough',
)

In [82]:
model = Pipeline(steps=[
    ('feature_selector', FeatureSelector(features)),
    ('preprocessor', preprocessor),
    ('logistic_regression', LogisticRegression(max_iter=1000))
])

In [83]:
model.fit(X, y)

In [84]:
model.score(X, y)

0.865826837180075

This model is doing quite a bit better in terms of the total number of defaults it is predicting.

In [85]:
model.predict(X).mean()

0.1494125864030842

In [86]:
roc_auc_score(y, model.predict_proba(X)[:,1])

0.9254217422065676

In [87]:
from sklearn.metrics import precision_recall_curve
precision, recall, _ = precision_recall_curve(y, model.predict_proba(X)[:,1])
from sklearn.metrics import auc
auc(recall, precision)

0.7105173265272959

In [88]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Predict once and reuse the result: every model.predict(X) call re-runs the
# whole pipeline over X, and the three metrics all need the same predictions.
y_pred = model.predict(X)
print(precision_score(y, y_pred))
print(recall_score(y, y_pred))
print(f1_score(y, y_pred))

0.7196240435916198
0.5381362460354986
0.6157863061332426


Let's check out-of-sample performance with 5-fold cross-validation.

In [89]:
# Import cross_validate explicitly. A bare `import sklearn` is not guaranteed to
# load the `model_selection` submodule, so `sklearn.model_selection...` may only
# resolve because an earlier import happened to pull it in.
from sklearn.model_selection import cross_validate

# 5-fold cross-validation, scoring accuracy, F1 and ROC AUC on each held-out fold.
cv_results = cross_validate(model, X, y, cv=5, scoring=['accuracy', 'f1', 'roc_auc'], n_jobs=-1)
cv_results

{'fit_time': array([6.53152037, 5.25809789, 5.49176764, 6.58697534, 5.5583446 ]),
 'score_time': array([1.45423603, 1.04589677, 1.20316339, 0.83496547, 0.92988563]),
 'test_accuracy': array([0.86680819, 0.86549127, 0.86618653, 0.86594239, 0.8645678 ]),
 'test_f1': array([0.61848586, 0.61469229, 0.61694596, 0.61580348, 0.61280044]),
 'test_roc_auc': array([0.92606727, 0.9251406 , 0.92578739, 0.92563019, 0.92434397])}

In [90]:
cv_results['test_f1'].mean()

0.6157456061189999

## All Variables Random Forest

Let's try a crude Random Forest model to see what performance looks like.

In [91]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest, and every metric computed from it below, is
# reproducible from one run of the notebook to the next.
RF_RANDOM_STATE = 42

# Same feature selection and preprocessing as the logistic-regression model,
# with a small, depth-limited random forest as the final estimator.
model = Pipeline(steps=[
    ('feature_selector', FeatureSelector(features)),
    ('preprocessor', preprocessor),
    ('random_forest', RandomForestClassifier(
        n_estimators=50,
        max_depth=10,
        n_jobs=-1,
        random_state=RF_RANDOM_STATE,
    ))
])

In [92]:
model.fit(X, y)

In [93]:
model.score(X, y)

0.8802616718466059

In [95]:
model.predict(X).mean()

0.13520915065980538

In [96]:
from sklearn.metrics import precision_recall_curve
precision, recall, _ = precision_recall_curve(y, model.predict_proba(X)[:,1])
from sklearn.metrics import auc
auc(recall, precision)

0.7920399394814472

In [97]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Predict once and reuse the result: every model.predict(X) call re-runs the
# whole pipeline over X, and the three metrics all need the same predictions.
y_pred = model.predict(X)
print(precision_score(y, y_pred))
print(recall_score(y, y_pred))
print(f1_score(y, y_pred))

0.7960747370073795
0.5387153155929088
0.6425844309328196


In [98]:
# Import cross_validate explicitly. A bare `import sklearn` is not guaranteed to
# load the `model_selection` submodule, so `sklearn.model_selection...` may only
# resolve because an earlier import happened to pull it in.
from sklearn.model_selection import cross_validate

# 3-fold cross-validation, scoring accuracy, F1 and ROC AUC on each held-out fold.
cv_results = cross_validate(model, X, y, cv=3, scoring=['accuracy', 'f1', 'roc_auc'], n_jobs=-1)
cv_results

{'fit_time': array([44.62666607, 51.23487353, 53.76212358]),
 'score_time': array([8.75273752, 4.08979273, 1.87983847]),
 'test_accuracy': array([0.87915881, 0.87638521, 0.87736282]),
 'test_f1': array([0.63603928, 0.62478856, 0.63324699]),
 'test_roc_auc': array([0.93942512, 0.93844253, 0.93827238])}

In [None]:
cv_results['test_f1'].mean()