In [None]:
! pip install dtreeviz -q
! pip install dalex -q

Let's use a decision tree as a baseline, then use Random Forest for the main model to have a decent performance without a lot of hyperparameter tuning. GBM variants usually take longer to get to optimal performance, so I'd stick with a Random Forest for good performance with less time.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import dalex as dx
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from dtreeviz.trees import dtreeviz
from functools import partial
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.inspection import permutation_importance

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the application table from the Kaggle input mount.
train = pd.read_csv("/kaggle/input/home-credit-default-risk/application_train.csv")
# Preview the first rows.
train.head()

In [None]:
# Sequential (unshuffled) split by row position: train / validation / hold-out test.
train_end, val_end = 200000, 250000
X_train = train.iloc[:train_end]
X_val = train.iloc[train_end:val_end]
X_test = train.iloc[val_end:]

In [None]:
def split_data(df):
    """Separate the label column from the features.

    Returns a ``(features, target)`` tuple: ``df`` without its ``TARGET``
    column, and the ``TARGET`` column itself. ``df`` is not modified.
    """
    features = df.drop(columns="TARGET")
    target = df["TARGET"]
    return features, target
# Split each row slice into features and labels.
# NOTE: X_* are rebound to the feature-only frames, so re-running this cell on
# its own raises KeyError (TARGET has already been dropped).
X_train, y_train = split_data(X_train)
X_val, y_val = split_data(X_val)
X_test, y_test = split_data(X_test)

Let's start by preprocessing the data we have by:
1. Creating is_null columns
2. Filling in nulls in the column with the median
3. Converting strings to categories

In [None]:
def fill_in_nulls_train(df):
    """Impute missing numeric values with the column median, in place.

    For every numeric column of ``df`` that contains at least one null, a
    boolean ``<col>_na`` indicator column is appended and the nulls are
    replaced by that column's median.

    Columns are processed in their order of appearance in ``df``. This keeps
    the order of the appended ``<col>_na`` columns (and of the returned dict's
    keys) identical on every run; iterating over a ``set`` of column names
    would make that order depend on string-hash randomisation, and therefore
    change the feature order seen by the models between kernel restarts.

    Parameters
    ----------
    df : pandas.DataFrame
        Training features. Modified in place and also returned.

    Returns
    -------
    (pandas.DataFrame, dict)
        The same ``df`` object and a ``{column: median}`` mapping, to be
        reused on the other splits via ``fill_in_nulls_test``.
    """
    numeric_cols = df.select_dtypes(include=np.number).columns
    # Build the list before the loop starts: the loop appends columns to ``df``.
    cols_to_fill = [col for col in numeric_cols if df[col].isna().any()]
    null_dict = {}
    for col in cols_to_fill:
        # Record where values were missing before they are overwritten.
        df["{}_na".format(col)] = df[col].isna()
        median = df[col].median()
        df[col] = df[col].fillna(median)
        null_dict[col] = median
    return df, null_dict
# Impute the training features and keep the per-column medians (`nas`) for the other splits.
X_train, nas = fill_in_nulls_train(X_train)

In [None]:
def fill_in_nulls_test(df, nas):
    """Apply the training-time null handling to another split, in place.

    ``nas`` is a ``{column: fill value}`` mapping. Each listed column gets a
    boolean ``<column>_na`` flag marking the rows that were null, and its
    nulls are then replaced by the stored value. Columns that are not keys of
    ``nas`` are left untouched, including any nulls they contain.

    Returns the same ``df`` object.
    """
    for column in nas:
        was_null = df[column].isna()
        df[f"{column}_na"] = was_null
        df[column] = df[column].fillna(nas[column])
    return df
# Reuse the training medians so validation and hold-out are imputed with the same values as train.
X_val = fill_in_nulls_test(X_val, nas)
X_test = fill_in_nulls_test(X_test, nas)

In [None]:
def convert_to_cats_train(df):
    """Integer-encode every object-dtype column of ``df``, in place.

    Each object column is cast to a pandas categorical and replaced by its
    category codes (pandas assigns code -1 to nulls).

    Returns the same ``df`` object together with a
    ``{column: {category: code}}`` mapping so the identical encoding can be
    applied to other splits.
    """
    cat_codes = {}
    object_cols = df.select_dtypes("object").columns
    for name in object_cols:
        as_category = df[name].astype("category")
        cat_codes[name] = {
            label: code for code, label in enumerate(as_category.cat.categories)
        }
        df[name] = as_category.cat.codes
    return df, cat_codes
# Encode the training categoricals; `cats` holds the per-column category -> code mapping.
X_train, cats = convert_to_cats_train(X_train)

In [None]:
def convert_to_cats_test(df, cats):
    """Apply the training category encoding to another split, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns named in ``cats`` still hold the raw category
        values. Modified in place and also returned.
    cats : dict
        ``{column: {category: code}}`` as produced by ``convert_to_cats_train``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with every column in ``cats`` replaced by
        integer codes. Nulls and categories absent from the mapping become -1.
    """
    for col, mapping in cats.items():
        # Values missing from the mapping come back as NaN, which silently turns
        # the column into floats. Encode them as -1 (the code nulls receive at
        # training time) and cast back so the column is integral, like the
        # training codes.
        df[col] = df[col].map(mapping).fillna(-1).astype("int64")
    return df
# Apply the training encoding to the validation and hold-out splits.
X_val = convert_to_cats_test(X_val, cats)
X_test = convert_to_cats_test(X_test, cats)

Now that we have gotten the basic preprocessing out of the way, let's start by looking at a depth 3 decision tree to have a baseline

In [None]:
# Shallow, class-weighted tree as an interpretable baseline.
# random_state pins the tie-breaking between equally good splits so this cell
# gives the same tree on every run (same seed as the tuned forests further down).
clf = tree.DecisionTreeClassifier(max_depth=3, class_weight="balanced", random_state=0)  # limit depth of tree
clf.fit(X_train, y_train)

viz = dtreeviz(clf,
               X_train,
               y_train,
               target_name='target',
               feature_names=X_train.columns)

viz

Here we can see that the most indicative features are, unsurprisingly, the external credit scores; of these, external scores 2 and 3 appear to be the most useful.

Now, let's see how good the ROC/AUC is with this decision tree.

In [None]:
def metric_checker(clf, metric_fn, X_train, y_train, X_val, y_val, proba=True):
    """Score a fitted classifier on the training and validation splits.

    When ``proba`` is true the metric receives column 1 of
    ``clf.predict_proba``; otherwise it receives the output of ``clf.predict``.

    Returns ``(train_score, val_score)``, each computed as
    ``metric_fn(y_true, predictions)``.
    """
    def _score_inputs(features):
        # Column 1 of predict_proba when probabilities are requested, else labels.
        if not proba:
            return clf.predict(features)
        return clf.predict_proba(features)[:, 1]

    train_preds, val_preds = _score_inputs(X_train), _score_inputs(X_val)
    train_score = metric_fn(y_train, train_preds)
    val_score = metric_fn(y_val, val_preds)
    return train_score, val_score
# Bind the metric and the current splits once so a model can be scored with a single call.
# NOTE: the X_train / X_val objects are captured as they are at this point, so a
# model fitted on a reduced column set cannot be scored through this helper.
roc_auc_checker = partial(metric_checker, metric_fn=roc_auc_score,  X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)
roc_auc_checker(clf)

In [None]:
def breakeven_interest(default_rate):
    """Convert a default rate into the break-even interest rate used in the reports.

    Computed as ``1 / (1 - default_rate) - 1``; for example a default rate of
    0.5 gives 1.0 (100%). A plain Python ``default_rate`` of exactly 1 raises
    ``ZeroDivisionError``.
    """
    repaid_share = 1 - default_rate
    return 1 / repaid_share - 1

def print_extended_classification_report(clf, X, y):
    """Print sklearn's classification report plus lending-oriented summaries.

    Rows predicted as class 1 count as no longer eligible. The "new" default
    rate is the share of actual class-1 rows among those predicted as class 0.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``.
    X : features to score with ``clf``.
    y : pandas.Series of true labels aligned with ``X``; assumed to be 0/1.

    Returns
    -------
    None. Everything is written to stdout.
    """
    # Predict once and reuse the result for both the report and the matrix.
    preds = clf.predict(X)
    print(classification_report(y, preds))
    # labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking also works
    # when only one class occurs in y and preds.
    tn, fp, fn, tp = confusion_matrix(y, preds, labels=[0, 1]).ravel()
    rejected_share = (tp+fp)/y.shape[0]
    orig_default = y.sum()/y.count()
    new_default = fn/(tn + fn)
    # Relative change in default rate (absolute value, so direction is not shown).
    diff = abs(new_default - orig_default)/orig_default
    print("original default rate: {:.2%}".format(orig_default) )
    print("new default rate: {:.2%}".format(new_default))
    print("difference in default: {:.2%}".format(diff) )
    print("decrease in eligibility: {:.2%}".format(rejected_share))
    print("decrease in default/decrease in eligibility: {:.5}".format(diff/rejected_share))
    print("old breakeven interest: {:.2%}".format(breakeven_interest(orig_default)))
    print("new breakeven interest: {:.2%}".format(breakeven_interest(new_default)))

# Extended report for the baseline tree on the validation split.
print_extended_classification_report(clf, X_val, y_val)

With a quick baseline, we can see that we can have a model that reduces the default rate from 8.51% to 4.51% making the breakeven interest rate 4.72%. However, we decrease the eligible customers by 36.88% for a decrease in default of 44.27%, which is a good tradeoff. The good news here is that lower interest rates can attract more customers, but this can backfire if people keep getting rejected when applying for a loan.

Now that we have a decent baseline, let's start by seeing what we can do with a Random Forest model.

In [None]:
# First, untuned forest; max_samples caps the rows drawn for each tree at 50,000.
# random_state makes the reported AUC reproducible, matching the seeded forests below.
clf = RandomForestClassifier(n_jobs=-1, max_samples=50000, class_weight="balanced",
                             min_samples_leaf=25, random_state=0)
clf.fit(X_train, y_train)
roc_auc_checker(clf)

Let's set up gridsearch to find a good set of parameters given "rule of thumb" values

In [None]:
# res = []
# for leaves in [1, 3, 5, 10, 25, 100]:
#      for feats in [None, "sqrt", "log2", 0.5]:
#             for max_samples in [0.25, 0.5, 0.623, None]:
#                 clf = RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=42, n_estimators=50,
#                                              min_samples_leaf=leaves, max_features=feats, max_samples=max_samples)
#                 clf.fit(X_train, y_train)
#                 train_auc, val_auc = roc_auc_checker(clf)
#                 res.append({
#                     "min_samples_leaf": leaves,
#                     "max_features": feats,
#                     "max_samples": max_samples,
#                     "train": train_auc,
#                     "val": val_auc
#                 })
#                 print(val_auc)
# res_df = pd.DataFrame.from_records(res)
# res_df

In [None]:
# res_df.sort_values("val", ascending=False)

The best performing model appears to be min_samples_leaf = 100, max_features = "sqrt" and no max_samples limit. The grid-search code is commented out for now to prevent rerunning it. Now let's use those settings in our model and understand the features better.

In [None]:
# Forest with the settings picked from the grid search; scored on train and validation.
clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=100,
    max_features="sqrt",
    class_weight="balanced",
    random_state=0,
    n_jobs=-1,
).fit(X_train, y_train)
roc_auc_checker(clf)

In [None]:
# Extended report for the tuned forest on the validation split.
print_extended_classification_report(clf, X_val, y_val)

Now it becomes quite interesting. We have a model that performs better based on the classification report with a higher AUC, but the breakeven interest is now ~6%. However, we also see that we reduce eligibility by ~15%, which is a better outcome. Compared to decreasing default by roughly 1.2% for every percentage point we lose in eligibility for the decision tree, this new model can decrease default by ~2% for every percentage point we lose in eligibility. 

Next, let's see if we can further improve the model by understanding the features and trimming ones that can be potential sources of leakage

In [None]:
# df_tv = pd.concat([X_train.copy().assign(target=0), X_val.copy().assign(target=1)])
# X_tv = df_tv.drop("target", axis=1)
# y_tv = df_tv["target"]
# X_train_tv, X_test_tv, y_train_tv, y_test_tv = train_test_split(X_tv, y_tv, test_size=0.33, random_state=42)

In [None]:
# clf_tv = RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=0, n_estimators=100,
#                                              min_samples_leaf=100, max_features='sqrt')

# clf_tv.fit(X_train_tv, y_train_tv)
# roc_auc_score(y_test_tv, clf_tv.predict_proba(X_test_tv)[:,1])

In [None]:
# importances_df = pd.DataFrame({"features": X_train_tv.columns, "importances": clf_tv.feature_importances_}).sort_values("importances", ascending=False)
# cols_to_drop = importances_df.head(10).features.tolist()

In [None]:
# clf_tv = RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=0, n_estimators=100,
#                                              min_samples_leaf=100, max_features='sqrt')

# clf_tv.fit(X_train_tv.drop(cols_to_drop, axis=1), y_train_tv)
# roc_auc_score(y_test_tv, clf_tv.predict_proba(X_test_tv.drop(cols_to_drop, axis=1))[:,1])

In [None]:
# importances_df = pd.DataFrame({"features": X_train_tv.drop(cols_to_drop, axis=1).columns, "importances": clf_tv.feature_importances_}).sort_values("importances", ascending=False)
# importances_df.head(20)

In [None]:
# cols_to_drop +=  importances_df.head(3).features.tolist()
# clf_tv = RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=0, n_estimators=100,
#                                              min_samples_leaf=100, max_features='sqrt')

# clf_tv.fit(X_train_tv.drop(cols_to_drop, axis=1), y_train_tv)
# roc_auc_score(y_test_tv, clf_tv.predict_proba(X_test_tv.drop(cols_to_drop, axis=1))[:,1])

In [None]:
# importances_df = pd.DataFrame({"features": X_train_tv.drop(cols_to_drop, axis=1).columns, "importances": clf_tv.feature_importances_}).sort_values("importances", ascending=False)
# importances_df.head(20)

In [None]:
# cols_to_drop +=  importances_df.head(4).features.tolist()
# clf_tv = RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=0, n_estimators=100,
#                                              min_samples_leaf=100, max_features='sqrt')

# clf_tv.fit(X_train_tv.drop(cols_to_drop, axis=1), y_train_tv)
# roc_auc_score(y_test_tv, clf_tv.predict_proba(X_test_tv.drop(cols_to_drop, axis=1))[:,1])

Alright, we've now taken out the features that are most likely to ruin the model through leakage. Let's see how the model performs now.

In [None]:
# Columns excluded from the feature set of the models fitted below.
cols_to_drop = [
    "NAME_INCOME_TYPE",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "NAME_HOUSING_TYPE",
    "CODE_GENDER",
    "ORGANIZATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_CONTRACT_TYPE",
    "SK_ID_CURR",
    "WEEKDAY_APPR_PROCESS_START",
    "NAME_TYPE_SUITE",
    "NAME_EDUCATION_TYPE",
    "OCCUPATION_TYPE",
    "EMERGENCYSTATE_MODE",
    "HOUSETYPE_MODE",
    "WALLSMATERIAL_MODE",
    "FONDKAPREMONT_MODE",
]

In [None]:
# Refit the tuned forest without cols_to_drop and report the validation AUC.
clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=100,
    max_features="sqrt",
    class_weight="balanced",
    random_state=0,
    n_jobs=-1,
).fit(X_train.drop(columns=cols_to_drop), y_train)
roc_auc_score(y_val, clf.predict_proba(X_val.drop(columns=cols_to_drop))[:, 1])

In [None]:
# Validation report for the forest trained without cols_to_drop.
print_extended_classification_report(clf, X_val.drop(cols_to_drop, axis=1), y_val)

For a slight drop in AUC, we should now be safe against leakage. The tradeoff doesn't look quite as good, but this can be managed by changing the cutoffs later on.

In [None]:
# Impurity-based importances of the current forest, largest first.
kept_features = X_train.drop(columns=cols_to_drop).columns
importances_df = (
    pd.DataFrame({"features": kept_features, "importances": clf.feature_importances_})
    .sort_values("importances", ascending=False)
)
importances_df.head(20)

It seems like our most important features are the external credit scores (not surprising). Let's revisit possible improvements from exploring their interactions later on; for now, let's take a look at how big our potential market is, as well as at test set performance.

In [None]:
# AUC of the current forest on the hold-out rows (X_test / y_test).
roc_auc_score(y_test, clf.predict_proba(X_test.drop(cols_to_drop, axis=1))[:,1])

In [None]:
# Extended report for the current forest on the hold-out rows.
print_extended_classification_report(clf, X_test.drop(cols_to_drop, axis=1), y_test)

In [None]:
# Load the separate application_test.csv file ...
test = pd.read_csv("/kaggle/input/home-credit-default-risk/application_test.csv")

# ... and apply the training-time fill values and category encodings to it.
test = fill_in_nulls_test(test, nas)
test = convert_to_cats_test(test, cats)

In [None]:
# Hard 0/1 predictions on the application set; rows predicted 0 count as eligible.
test_preds = clf.predict(test.drop(columns=cols_to_drop))
eligible_users = (test_preds == 0).sum()
eligible_pct = eligible_users / len(test_preds)
print("We will have {0} customers the test set or {1:.2%} of the test set will be eligible".format(eligible_users, eligible_pct))

While we can stop here in the meantime, we can also try to find the tradeoff in eligibility for different interest rates. Let's assume that a reasonable breakeven interest rate would be between 3-5%, can we find the cutoffs and the resulting eligible base?

For this exercise, we will be inverting the scale by making 1 a good payer and 0 a bad payer.

In [None]:
# Rank the hold-out rows from best to worst "good payer" score (1 - P(class 1))
# and track the default rate among the top-k rows.
good_payer_score = 1 - clf.predict_proba(X_test.drop(cols_to_drop, axis=1))[:, 1]
X_test_preds_df = (
    pd.DataFrame({"prediction": good_payer_score, "target": y_test})
    .sort_values("prediction", ascending=False)
    .reset_index(drop=True)
)
X_test_preds_df["target_cumsum"] = X_test_preds_df["target"].cumsum()
# index + 1 is the number of rows admitted so far.
X_test_preds_df["default_rate"] = X_test_preds_df["target_cumsum"] / (X_test_preds_df.index + 1)
X_test_preds_df

In [None]:
# Score the application set once: the scores do not depend on the interest rate.
test_preds = 1 - clf.predict_proba(test.drop(cols_to_drop, axis=1))[:,1]
for interest_rate in [0.03, 0.035, 0.04, 0.045, 0.05]:
    # breakeven_interest(d) = 1/(1-d) - 1, so a break-even rate r tolerates a
    # default rate of at most r/(1+r), which is slightly below r itself.
    max_default_rate = interest_rate / (1 + interest_rate)
    # Lowest score at which the cumulative default rate is still within the limit.
    cutoff = X_test_preds_df.loc[X_test_preds_df["default_rate"] <= max_default_rate, "prediction"].min()
    eligible_users = (test_preds > cutoff).sum()
    eligible_pct = eligible_users/test_preds.shape[0]
    print("With a cutoff of {0:.2}, we will have {1} customers the test set or {2:.2%} of the test set will be eligible for a breakeven interest rate of {3:.1%}".format(
        cutoff, eligible_users, eligible_pct, interest_rate))

Now let's look into the fairness of the model. For now, we will assume a model is fair if it does not discriminate on the basis of race, gender or religion. Since we only have gender in this data set, let's check whether the model discriminates on the basis of gender using the 4/5 rule (a group's rate should be at least four-fifths of the privileged group's).

In [None]:
# Show the category -> integer code mapping learned for CODE_GENDER on the training split.
cats["CODE_GENDER"]

In [None]:
# Fairness audit of the current model on the hold-out rows, grouped by encoded gender.
exp = dx.Explainer(clf, X_test.drop(cols_to_drop, axis=1), y_test)
# CODE_GENDER is excluded from the model's features but still present in X_test.
protected = X_test["CODE_GENDER"]
# Encoded gender value used as the reference group; see the mapping displayed in the previous cell.
privileged = 1
fobject = exp.model_fairness(protected = protected, privileged=privileged)
# NOTE(review): epsilon=0.8 presumably encodes the 4/5 rule mentioned in the text - confirm against the dalex docs.
fobject.fairness_check(epsilon = 0.8)

Oh no, looks like we have features that are likely causing bias. Let's find those features and try to take them out.

In [None]:
# Gender-probe data set: predict the encoded gender (codes 0 and 1 only) from the
# features the default model uses.
gender_mask = X_train["CODE_GENDER"].isin([0, 1])
X_f = X_train[gender_mask].drop(cols_to_drop, axis=1)
y_f = X_train.loc[gender_mask, "CODE_GENDER"]
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_f, y_f, test_size=0.33, random_state=42
)

In [None]:
# Fit the gender-probe forest; the AUC shows how well gender can be recovered from the features.
clf_f = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=100,
    max_features="sqrt",
    class_weight="balanced",
    random_state=0,
    n_jobs=-1,
).fit(X_train_f, y_train_f)
roc_auc_score(y_test_f, clf_f.predict_proba(X_test_f)[:, 1])

Looks like some features are indicative of whether a user is male or female, so let's try to weed them out.

In [None]:
# Importances of the gender-probe forest, largest first.
importances_df = (
    pd.DataFrame({"features": X_train_f.columns, "importances": clf_f.feature_importances_})
    .sort_values("importances", ascending=False)
)
importances_df.head(10)

The first 5 features look significantly higher than the rest. Let's take them out for now.

In [None]:
# Additional columns removed as likely gender proxies.
cols_to_drop_f = [
    "OWN_CAR_AGE_na",
    "OWN_CAR_AGE",
    "AMT_INCOME_TOTAL",
    "FLAG_DOCUMENT_8",
    "EXT_SOURCE_1",
]

In [None]:
# Refit the gender-probe forest without the suspected proxy columns; the AUC
# below shows how well the encoded gender can still be predicted.
clf_f = RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=0, n_estimators=100,
                                             min_samples_leaf=100, max_features='sqrt')

clf_f.fit(X_train_f.drop(cols_to_drop_f, axis=1), y_train_f)
roc_auc_score(y_test_f, clf_f.predict_proba(X_test_f.drop(cols_to_drop_f, axis=1))[:,1])

In [None]:
# Refit the default model without cols_to_drop and cols_to_drop_f, then report validation AUC.
# From here on `clf` refers to this model.
clf = RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=0, n_estimators=100,
                                             min_samples_leaf=100, max_features='sqrt')

clf.fit(X_train.drop(cols_to_drop + cols_to_drop_f, axis=1), y_train)
roc_auc_score(y_val, clf.predict_proba(X_val.drop(cols_to_drop + cols_to_drop_f, axis=1))[:,1])

In [None]:
# Validation report for the model trained without cols_to_drop and cols_to_drop_f.
print_extended_classification_report(clf, X_val.drop(cols_to_drop + cols_to_drop_f, axis=1), y_val)

In [None]:
# Repeat the fairness audit for the model refitted without cols_to_drop_f.
exp = dx.Explainer(clf, X_test.drop(cols_to_drop + cols_to_drop_f, axis=1), y_test)
protected = X_test["CODE_GENDER"]
# Same encoded reference group as in the first fairness check.
privileged = 1
fobject = exp.model_fairness(protected = protected, privileged=privileged)
# NOTE(review): epsilon=0.8 presumably encodes the 4/5 rule mentioned in the text - confirm against the dalex docs.
fobject.fairness_check(epsilon = 0.8)

We now have a fair model that doesn't discriminate based on gender. However, since EXT_SOURCE_1 is one of the top features that increases bias, the external model behind that score likely carries gender bias of its own.

For now, let's check what the test set has to say about our final model.

In [None]:
# Hold-out AUC and extended report for the final model.
X_test_final = X_test.drop(columns=cols_to_drop + cols_to_drop_f)
print(roc_auc_score(y_test, clf.predict_proba(X_test_final)[:, 1]))
print_extended_classification_report(clf, X_test_final, y_test)

In [None]:
# Hard 0/1 predictions of the final model on the application set; rows predicted 0 count as eligible.
test_preds = clf.predict(test.drop(columns=cols_to_drop + cols_to_drop_f))
eligible_users = (test_preds == 0).sum()
eligible_pct = eligible_users / len(test_preds)
print("We will have {0} customers the test set or {1:.2%} of the test set will be eligible".format(eligible_users, eligible_pct))

In [None]:
# Rebuild the cumulative default-rate table for the final model: hold-out rows
# ranked from best to worst "good payer" score (1 - P(class 1)).
final_drop = cols_to_drop + cols_to_drop_f
X_test_preds_df = pd.DataFrame({"prediction": 1 - clf.predict_proba(X_test.drop(final_drop, axis=1))[:,1],
                                "target": y_test}).sort_values("prediction", ascending=False).reset_index(drop=True)
X_test_preds_df["target_cumsum"] = X_test_preds_df["target"].cumsum()
# index + 1 is the number of rows admitted so far.
X_test_preds_df["default_rate"] = X_test_preds_df["target_cumsum"]/(X_test_preds_df.index+1)

# Score the application set once: the scores do not depend on the interest rate.
test_preds = 1 - clf.predict_proba(test.drop(final_drop, axis=1))[:,1]
for interest_rate in [0.03, 0.035, 0.04, 0.045, 0.05]:
    # breakeven_interest(d) = 1/(1-d) - 1, so a break-even rate r tolerates a
    # default rate of at most r/(1+r), which is slightly below r itself.
    max_default_rate = interest_rate / (1 + interest_rate)
    # Lowest score at which the cumulative default rate is still within the limit.
    cutoff = X_test_preds_df.loc[X_test_preds_df["default_rate"] <= max_default_rate, "prediction"].min()
    eligible_users = (test_preds > cutoff).sum()
    eligible_pct = eligible_users/test_preds.shape[0]
    print("With a cutoff of {0:.2}, we will have {1} customers the test set or {2:.2%} of the test set will be eligible for a breakeven interest rate of {3:.1%}".format(
        cutoff, eligible_users, eligible_pct, interest_rate))

It looks like we lose roughly 1-2k users by reducing the bias in our model, but at least we're fairly confident that we're treating customers fairly. Now let's understand how our features work.

In [None]:
import shap  # package used to calculate Shap values

# Create object that can calculate shap values.
# `clf` is the most recently fitted model, i.e. the forest trained without
# cols_to_drop and cols_to_drop_f.
explainer = shap.TreeExplainer(clf)

# Calculate shap values. This is what we will plot.
# Calculate shap_values for many rows of X_val rather than a single row, to have more data for the plot.
# shap_values = explainer.shap_values(X_val[:10000])

In [None]:
# SHAP values for the validation rows, using the same columns the model was fitted on.
# NOTE(review): approximate=True presumably trades exactness for speed - confirm against the shap docs.
shap_values = explainer.shap_values(X_val.drop(cols_to_drop + cols_to_drop_f, axis=1), approximate=True)

In [None]:
# NOTE(review): [0] takes the first element of shap_values - presumably the array
# for class 0 when shap returns one array per class; other shap versions may
# return a single array, in which case [0] would select one row - confirm.
shap.summary_plot(shap_values[0], X_val.drop(cols_to_drop + cols_to_drop_f, axis=1))

No surprise, but the top features for the decision tree also appear to be our top features for the random forest. Looks like leveraging external credit scores is a good way to have a good model without exerting too much effort.

In [None]:
# shap.plots.beeswarm(shap_values[0], max_display=20)

In [None]:
# shap.summary_plot(shap_values, X_val)

In [None]:
# result = permutation_importance(clf, X_train, y_train, n_repeats=10,
#                                 random_state=42)
# perm_sorted_idx = result.importances_mean.argsort()

# tree_importance_sorted_idx = np.argsort(clf.feature_importances_)
# tree_indices = np.arange(0, len(clf.feature_importances_)) + 0.5

# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
# ax1.barh(tree_indices,
#          clf.feature_importances_[tree_importance_sorted_idx], height=0.7)
# ax1.set_yticks(tree_indices)
# ax1.set_yticklabels(data.feature_names[tree_importance_sorted_idx])
# ax1.set_ylim((0, len(clf.feature_importances_)))
# ax2.boxplot(result.importances[perm_sorted_idx].T, vert=False,
#             labels=data.feature_names[perm_sorted_idx])
# fig.tight_layout()
# plt.show()