# Credit Default Prediction Using (Random Forest)

OBJECTIVE.
1) Predicting customer default or not.
2) Tune the model for optimum hyperparameters.

In [1]:
# Packages importing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read data file.
df=pd.read_csv("credit-card-default.csv")
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,defaulted
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [4]:
df.info()  ## all are integers(NUmeric format)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
ID           30000 non-null int64
LIMIT_BAL    30000 non-null int64
SEX          30000 non-null int64
EDUCATION    30000 non-null int64
MARRIAGE     30000 non-null int64
AGE          30000 non-null int64
PAY_0        30000 non-null int64
PAY_2        30000 non-null int64
PAY_3        30000 non-null int64
PAY_4        30000 non-null int64
PAY_5        30000 non-null int64
PAY_6        30000 non-null int64
BILL_AMT1    30000 non-null int64
BILL_AMT2    30000 non-null int64
BILL_AMT3    30000 non-null int64
BILL_AMT4    30000 non-null int64
BILL_AMT5    30000 non-null int64
BILL_AMT6    30000 non-null int64
PAY_AMT1     30000 non-null int64
PAY_AMT2     30000 non-null int64
PAY_AMT3     30000 non-null int64
PAY_AMT4     30000 non-null int64
PAY_AMT5     30000 non-null int64
PAY_AMT6     30000 non-null int64
defaulted    30000 non-null int64
dtypes: int64(25)
memory usage: 5.7 MB


In [5]:
df.isnull().sum()   ## No null Values.

ID           0
LIMIT_BAL    0
SEX          0
EDUCATION    0
MARRIAGE     0
AGE          0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
defaulted    0
dtype: int64

Seems No Data Quality Issues. So we move directly with Modelling.

# MODELLING

In [8]:
from sklearn.model_selection import train_test_split

In [11]:
X = df.drop("defaulted",axis=1)
Y = df["defaulted"]
# Splitting Data into train and test.
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=10)

In [12]:
# Default Hyper-parameters,
from sklearn.ensemble import RandomForestClassifier

In [13]:
model_default = RandomForestClassifier()
model_default.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
# Making predictions
predict_default = model_default.predict(x_test)

In [15]:
# Evaluation metrics
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report



In [18]:
print(classification_report(y_test,predict_default))
print("\nConfusion Matrix\n",confusion_matrix(y_test,predict_default))
print("\nAccuracy is",accuracy_score(y_test,predict_default))

             precision    recall  f1-score   support

          0       0.82      0.95      0.88      6982
          1       0.62      0.30      0.40      2018

avg / total       0.78      0.80      0.77      9000


Confusion Matrix
 [[6607  375]
 [1415  603]]
Accuracy is 0.8011111111111111


In [19]:
#  All the Hyper-Parameters which we used for Decision tree algorithms can also be tuned here, Like min sample split, min sample leaf,
#  Max Depth of tree, Max leaf nodes, etc.

#  Apart from that we are going to tune Decision tree hyper-parameters. number of trees,
#  Number of attribute to be used at nodes for splitting.
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


In [21]:

model_1 = RandomForestClassifier()
parameters = {"max_depth":range(4,20,5)}
nFolds = 5

model_rf = GridSearchCV(estimator=model_1,param_grid=parameters,scoring="accuracy",cv=nFolds,verbose=1)
model_rf.fit(x_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    9.1s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(4, 20, 5)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='accuracy',
       verbose=1)

In [22]:
scores = model_rf.cv_results_
pd.DataFrame(scores).head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.19568,0.007396,0.812143,0.813786,4,{'max_depth': 4},3,0.809331,0.814572,0.812902,...,0.81381,0.815238,0.814241,0.812035,0.810431,0.813047,0.007622,0.0007998943,0.001928,0.001132
1,0.362979,0.008995,0.816762,0.850179,9,{'max_depth': 9},1,0.81671,0.849277,0.817186,...,0.814762,0.849524,0.820672,0.849771,0.81448,0.850485,0.0073,8.449576e-07,0.002221,0.000922
2,0.500695,0.011194,0.812857,0.899643,14,{'max_depth': 14},2,0.818377,0.896422,0.811235,...,0.81381,0.903512,0.808526,0.901077,0.812336,0.900185,0.011774,0.0004000909,0.003257,0.00263
3,0.58864,0.013792,0.809476,0.943536,19,{'max_depth': 19},4,0.810521,0.938151,0.810045,...,0.809286,0.94375,0.810431,0.946789,0.807097,0.944408,0.009731,0.0003999009,0.001267,0.00288


In [26]:
model_rf.best_score_
model_rf.best_params_

{'max_depth': 9}

Tuning on other parameters.

In [None]:
model_1 = RandomForestClassifier(random_state=20)
parameters = {"max_depth":range(5,20,5),
              "min_samples_split":range(50,150,50),
              "min_samples_leaf":range(50,150,50),
              "criterion":["entropy","gini"],
              "max_features":range(4,17,6),
              "n_estimators":range(100,1001,500)}
nFolds = 5

model_rf = GridSearchCV(estimator=model_1,param_grid=parameters,scoring="accuracy",cv=nFolds,verbose=1)
model_rf.fit(x_train,y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
