Author: Shan Qu, Delft University of Technology

First created: 18 April 2021

Product: credit modeling --- ensemble + adaboost learning based on the models:

Logistic regression / CART tree / Random forest / SVM

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score
from sklearn.feature_selection import SelectFromModel


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier


%run helpers.ipynb


pd.set_option('display.max_columns', 20)
seed = 143

import warnings
warnings.filterwarnings("ignore")

ERROR:root:File `'helpers.ipynb.py'` not found.


In [2]:
df_X_train = pd.read_csv('Data/df_X_train.csv')
df_X_val = pd.read_csv('Data/df_X_val.csv')
df_y_train = pd.read_csv('Data/df_y_train.csv')
df_y_val = pd.read_csv('Data/df_y_val.csv')
print(df_X_train.info())
print(df_y_train.info())

# drop the unnamed ID column
df_X_train.drop('Unnamed: 0', axis=1, inplace=True)
df_X_val.drop('Unnamed: 0', axis=1, inplace=True)
df_y_train.drop('Unnamed: 0', axis=1, inplace=True)
df_y_val.drop('Unnamed: 0', axis=1, inplace=True)

df_X_train.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         506 non-null    int64  
 1   duration           506 non-null    float64
 2   credit_amount      506 non-null    float64
 3   installment_rate   506 non-null    float64
 4   residence_length   506 non-null    float64
 5   age                506 non-null    float64
 6   existing_credits   506 non-null    float64
 7   liable_people      506 non-null    float64
 8   checking_account   506 non-null    float64
 9   credit_history     506 non-null    float64
 10  purpose            506 non-null    float64
 11  savings_account    506 non-null    float64
 12  employment_length  506 non-null    float64
 13  status_sex         506 non-null    float64
 14  other_debtors      506 non-null    float64
 15  property           506 non-null    float64
 16  installment_plan   506 non

Unnamed: 0,duration,credit_amount,installment_rate,residence_length,age,existing_credits,liable_people,checking_account,credit_history,purpose,savings_account,employment_length,status_sex,other_debtors,property,installment_plan,housing,job,telephone,foreign_worker
0,0.294118,0.269616,1.0,1.0,0.25,0.0,0.0,0.0,0.0,0.636409,1.0,0.157661,0.0,0.436055,0.424937,0.0,0.0,0.235904,0.0,1.0
1,0.647059,0.687741,1.0,0.0,0.321429,0.0,0.0,0.0,0.324712,0.335297,0.211197,0.0,0.0,0.436055,0.424937,0.0,0.0,1.0,0.0,1.0
2,0.470588,0.155662,1.0,1.0,0.303571,0.0,0.0,0.0,0.326222,0.818572,0.212304,0.45202,0.0,0.436055,0.0,0.0,0.0,0.235904,1.0,1.0
3,0.073529,0.048861,1.0,1.0,0.107143,0.333333,0.0,1.0,0.0,0.335297,1.0,0.45202,0.0,0.436055,0.0,0.0,0.0,0.0,1.0,1.0
4,0.088235,0.035435,1.0,0.666667,0.375,0.0,0.0,0.0,0.326222,0.0,0.212304,0.0,0.637758,0.436055,0.419563,0.0,0.0,0.235904,0.0,1.0


In [22]:
lr = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)
lr_ada = AdaBoostClassifier(base_estimator=lr, n_estimators=100, random_state=1)

cart = DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
cart_ada = AdaBoostClassifier(base_estimator=cart, n_estimators=100, random_state=1)

rf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
rf_ada = AdaBoostClassifier(base_estimator=rf, n_estimators=100, random_state=1)

svm = SVC(C=1000.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
svm_ada = AdaBoostClassifier(base_estimator=svm, n_estimators=100, random_state=1)

classifiers = [('lr_ada', lr_ada), ('cart_ada', cart_ada), ('rf_ada', rf_ada), ('svm_ada', svm_ada)]

for clf_name, clf in classifiers:   
  clf.fit(df_X_train, df_y_train)    
  
  #preds = clf.predict_proba(df_X_val)
  #df_preds = pd.DataFrame(preds[:,1], columns = ['prob_default'])
  #threshold = 0.5
  #df_preds['Risk_pred'] = df_preds['prob_default'].apply(lambda x: 1 if x > threshold else 0)
  
  preds = clf.predict(df_X_val)
  df_preds = pd.DataFrame(preds, columns = ['Risk_pred'])

  auc = roc_auc_score(df_y_val, df_preds['Risk_pred']) 
  print('{:s} : {:.3f}'.format(clf_name, auc))
  print(confusion_matrix(df_y_val, df_preds['Risk_pred']))



lr_ada : 0.724
[[111  42]
 [ 13  34]]
cart_ada : 0.704
[[108  45]
 [ 14  33]]
rf_ada : 0.750
[[109  44]
 [ 10  37]]
svm_ada : 0.631
[[76 77]
 [11 36]]


In [23]:
vc = VotingClassifier(estimators=classifiers)     
vc.fit(df_X_train, df_y_train)   
preds = vc.predict(df_X_val)

df_preds = pd.DataFrame(preds, columns = ['Risk_pred'])
#threshold = 0.5
#df_preds['Risk_pred'] = df_preds['prob_default'].apply(lambda x: 1 if x > threshold else 0)

auc = roc_auc_score(df_y_val, df_preds['Risk_pred']) 
print('{:s} : {:.3f}'.format('vc', auc))
print(confusion_matrix(df_y_val, df_preds['Risk_pred']))

vc : 0.741
[[116  37]
 [ 13  34]]
