# Working with Loan Classification Problem :: RF, GBC, XGB
*Sreehari P S*
***

**Problem Statment ::**<br>
Given a loan dataset of a Finance Company (FC). This dataset contain loan information of various customer of the FC. Installment paid from months Jan-Dec for an year is recorded in m1-m12 columns.<br>
**Purpose ::**<br>
The FC wants an automated system to predict whether a given customer will pay loan installment on the 13th month.

In [1]:
import numpy as np
import pandas as pd

##### Step 1 >> Load Dataset
***

In [2]:
data = pd.read_csv('./datasets/loan_dataset.csv')

##### Step 2 >> Feature Description
***

In [3]:
data.shape

(116058, 29)

In [4]:
data.head()

Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,origination_date,first_payment_date,loan_to_value,number_of_borrowers,...,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13
0,268055008619,Z,"Turner, Baldwin and Rhodes",4.25,214000,360,2012-03-01,05/2012,95,1.0,...,0,0,0,1,0,0,0,0,0,1
1,672831657627,Y,"Swanson, Newton and Miller",4.875,144000,360,2012-01-01,03/2012,72,1.0,...,0,0,0,0,0,0,0,1,0,1
2,742515242108,Z,Thornton-Davis,3.25,366000,180,2012-01-01,03/2012,49,1.0,...,0,0,0,0,0,0,0,0,0,1
3,601385667462,X,OTHER,4.75,135000,360,2012-02-01,04/2012,46,2.0,...,0,0,0,0,0,1,1,1,1,1
4,273870029961,X,OTHER,4.75,124000,360,2012-02-01,04/2012,80,1.0,...,3,4,5,6,7,8,9,10,11,1


In [5]:
data.tail()

Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,origination_date,first_payment_date,loan_to_value,number_of_borrowers,...,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13
116053,382119962287,Y,Browning-Hart,4.125,153000,360,2012-02-01,04/2012,88,2.0,...,0,0,0,0,0,0,0,0,0,0
116054,582803915466,Z,OTHER,3.0,150000,120,2012-01-01,03/2012,35,1.0,...,0,0,0,0,0,0,0,0,0,0
116055,837922316947,X,OTHER,3.875,166000,360,2012-02-01,04/2012,58,2.0,...,0,0,0,0,0,0,0,0,0,0
116056,477343182138,X,OTHER,4.25,169000,360,2012-02-01,04/2012,74,2.0,...,0,0,0,0,0,0,0,0,0,0
116057,763308490661,Z,Thornton-Davis,3.375,252000,180,2012-01-01,03/2012,29,2.0,...,0,0,0,0,0,0,0,0,0,0


##### Step 3 >> Handling Missing Values
***

In [6]:
data.isna().sum()

loan_id                     0
source                      0
financial_institution       0
interest_rate               0
unpaid_principal_bal        0
loan_term                   0
origination_date            0
first_payment_date          0
loan_to_value               0
number_of_borrowers         0
debt_to_income_ratio        0
borrower_credit_score       0
loan_purpose                0
insurance_percent           0
co-borrower_credit_score    0
insurance_type              0
m1                          0
m2                          0
m3                          0
m4                          0
m5                          0
m6                          0
m7                          0
m8                          0
m9                          0
m10                         0
m11                         0
m12                         0
m13                         0
dtype: int64

There are no missing values in this data set

##### Step 4 >> Preprocessing
***

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116058 entries, 0 to 116057
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   loan_id                   116058 non-null  int64  
 1   source                    116058 non-null  object 
 2   financial_institution     116058 non-null  object 
 3   interest_rate             116058 non-null  float64
 4   unpaid_principal_bal      116058 non-null  int64  
 5   loan_term                 116058 non-null  int64  
 6   origination_date          116058 non-null  object 
 7   first_payment_date        116058 non-null  object 
 8   loan_to_value             116058 non-null  int64  
 9   number_of_borrowers       116058 non-null  float64
 10  debt_to_income_ratio      116058 non-null  float64
 11  borrower_credit_score     116058 non-null  float64
 12  loan_purpose              116058 non-null  object 
 13  insurance_percent         116058 non-null  f

In [8]:
data.columns

Index(['loan_id', 'source', 'financial_institution', 'interest_rate',
       'unpaid_principal_bal', 'loan_term', 'origination_date',
       'first_payment_date', 'loan_to_value', 'number_of_borrowers',
       'debt_to_income_ratio', 'borrower_credit_score', 'loan_purpose',
       'insurance_percent', 'co-borrower_credit_score', 'insurance_type', 'm1',
       'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11', 'm12',
       'm13'],
      dtype='object')

In [9]:
data['origination_date'].value_counts() # no use

2012-02-01    52334
2012-01-01    49093
2012-03-01    14631
Name: origination_date, dtype: int64

In [10]:
data['first_payment_date'].value_counts() # no use

04/2012    52840
03/2012    47680
05/2012    15014
02/2012      524
Name: first_payment_date, dtype: int64

In [11]:
data['insurance_percent'].value_counts() # no use until we know how many EMI is paid in loan term

0.0     102455
30.0      5113
25.0      4885
12.0      2493
6.0        701
35.0       372
16.0        18
18.0        12
20.0         3
17.0         2
22.0         1
15.0         1
40.0         1
39.0         1
Name: insurance_percent, dtype: int64

In [12]:
data['financial_institution'].value_counts() # Needs encoding

OTHER                          49699
Browning-Hart                  31852
Swanson, Newton and Miller      6874
Edwards-Hoffman                 4857
Martinez, Duffy and Bird        4715
Miller, Mcclure and Allen       3158
Nicholson Group                 2116
Turner, Baldwin and Rhodes      1846
Suarez Inc                      1790
Cole, Brooks and Vincent        1642
Richards-Walters                1459
Taylor, Hunt and Rodriguez      1259
Sanchez-Robinson                1193
Sanchez, Hays and Wilkerson      853
Romero, Woods and Johnson        750
Thornton-Davis                   651
Anderson-Taylor                  483
Richardson Ltd                   473
Chapman-Mcmahon                  388
Name: financial_institution, dtype: int64

In [13]:
data['source'].value_counts() # Needs encoding

X    63858
Y    37554
Z    14646
Name: source, dtype: int64

In [14]:
data['loan_purpose'].value_counts() # Needs encoding

A23    58462
B12    29383
C86    28213
Name: loan_purpose, dtype: int64

**Droping unwanted features**

In [15]:
data.columns

Index(['loan_id', 'source', 'financial_institution', 'interest_rate',
       'unpaid_principal_bal', 'loan_term', 'origination_date',
       'first_payment_date', 'loan_to_value', 'number_of_borrowers',
       'debt_to_income_ratio', 'borrower_credit_score', 'loan_purpose',
       'insurance_percent', 'co-borrower_credit_score', 'insurance_type', 'm1',
       'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11', 'm12',
       'm13'],
      dtype='object')

In [16]:
# Feature set
x = data.drop(columns=['loan_id', 'origination_date', 'first_payment_date', 'insurance_percent','insurance_type', 'm13'], axis = 1)
# Target set
y = pd.DataFrame(data['m13'])

In [17]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116058 entries, 0 to 116057
Data columns (total 23 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   source                    116058 non-null  object 
 1   financial_institution     116058 non-null  object 
 2   interest_rate             116058 non-null  float64
 3   unpaid_principal_bal      116058 non-null  int64  
 4   loan_term                 116058 non-null  int64  
 5   loan_to_value             116058 non-null  int64  
 6   number_of_borrowers       116058 non-null  float64
 7   debt_to_income_ratio      116058 non-null  float64
 8   borrower_credit_score     116058 non-null  float64
 9   loan_purpose              116058 non-null  object 
 10  co-borrower_credit_score  116058 non-null  float64
 11  m1                        116058 non-null  int64  
 12  m2                        116058 non-null  int64  
 13  m3                        116058 non-null  i

In [18]:
# Label Encode 'Financial Institution'
from sklearn.preprocessing import LabelEncoder
lbl_encoder = LabelEncoder()
x['financial_institution'] = lbl_encoder.fit_transform(x['financial_institution'])

In [19]:
x.financial_institution.value_counts().sort_index(ascending=True)

0       483
1     31852
2       388
3      1642
4      4857
5      4715
6      3158
7      2116
8     49699
9      1459
10      473
11      750
12      853
13     1193
14     1790
15     6874
16     1259
17      651
18     1846
Name: financial_institution, dtype: int64

In [20]:
# One hot encode 'source' and 'loan purpose'
x = pd.get_dummies(data = x, columns=['source', 'loan_purpose'])

In [21]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116058 entries, 0 to 116057
Data columns (total 27 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   financial_institution     116058 non-null  int32  
 1   interest_rate             116058 non-null  float64
 2   unpaid_principal_bal      116058 non-null  int64  
 3   loan_term                 116058 non-null  int64  
 4   loan_to_value             116058 non-null  int64  
 5   number_of_borrowers       116058 non-null  float64
 6   debt_to_income_ratio      116058 non-null  float64
 7   borrower_credit_score     116058 non-null  float64
 8   co-borrower_credit_score  116058 non-null  float64
 9   m1                        116058 non-null  int64  
 10  m2                        116058 non-null  int64  
 11  m3                        116058 non-null  int64  
 12  m4                        116058 non-null  int64  
 13  m5                        116058 non-null  i

In [22]:
corr =data.corr()

In [23]:
corr[corr>0.4].count()

loan_id                      1
interest_rate                2
unpaid_principal_bal         1
loan_term                    2
loan_to_value                2
number_of_borrowers          2
debt_to_income_ratio         1
borrower_credit_score        1
insurance_percent            2
co-borrower_credit_score     2
insurance_type               1
m1                           1
m2                           4
m3                           6
m4                          11
m5                          11
m6                          10
m7                          10
m8                           9
m9                           9
m10                          9
m11                         10
m12                         10
m13                          3
dtype: int64

##### Step 5 >> Feature Engineering
***

In [24]:
x.columns

Index(['financial_institution', 'interest_rate', 'unpaid_principal_bal',
       'loan_term', 'loan_to_value', 'number_of_borrowers',
       'debt_to_income_ratio', 'borrower_credit_score',
       'co-borrower_credit_score', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7',
       'm8', 'm9', 'm10', 'm11', 'm12', 'source_X', 'source_Y', 'source_Z',
       'loan_purpose_A23', 'loan_purpose_B12', 'loan_purpose_C86'],
      dtype='object')

In [25]:
# Convert credit_scores to one feature
x['credit_score'] = x['borrower_credit_score'] + x['co-borrower_credit_score']
x.drop(['borrower_credit_score','co-borrower_credit_score'], inplace=True, axis = 1)

In [26]:
x.columns

Index(['financial_institution', 'interest_rate', 'unpaid_principal_bal',
       'loan_term', 'loan_to_value', 'number_of_borrowers',
       'debt_to_income_ratio', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8',
       'm9', 'm10', 'm11', 'm12', 'source_X', 'source_Y', 'source_Z',
       'loan_purpose_A23', 'loan_purpose_B12', 'loan_purpose_C86',
       'credit_score'],
      dtype='object')

In [27]:
# create mean, sum, skew and kurtosis through m1-m12
months =['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8','m9', 'm10', 'm11', 'm12']
x['m_mean'] = x[months].mean(axis = 1)
x['m_sum'] = x[months].sum(axis = 1)
x['m_skew'] = x[months].skew(axis = 1)
x['m_kurtosis'] = x[months].kurt(axis = 1)

In [28]:
x.columns

Index(['financial_institution', 'interest_rate', 'unpaid_principal_bal',
       'loan_term', 'loan_to_value', 'number_of_borrowers',
       'debt_to_income_ratio', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8',
       'm9', 'm10', 'm11', 'm12', 'source_X', 'source_Y', 'source_Z',
       'loan_purpose_A23', 'loan_purpose_B12', 'loan_purpose_C86',
       'credit_score', 'm_mean', 'm_sum', 'm_skew', 'm_kurtosis'],
      dtype='object')

##### Step 6 >> Build Model
***

## 1. Random Forest Algorithm
***

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=.3)

In [31]:
print(x_train.shape)
print(x_test.shape)

(81240, 30)
(34818, 30)


In [32]:
import warnings
warnings.filterwarnings("ignore") 

In [33]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)

##### Step 7 >> Check Performance of Model
***

In [34]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix

In [35]:
def check_model_metrices(y_test, y_pred):
    print('Model Accuracy = ', accuracy_score(y_test, y_pred))
    print('Model Precision = ', precision_score(y_test, y_pred))
    print('Model Recall = ', recall_score(y_test, y_pred))
    print('Model F1 Score = ', f1_score(y_test, y_pred))
    print('Confusion Matrix = \n', confusion_matrix(y_test, y_pred))

In [36]:
check_model_metrices(y_test, rf_pred)

Model Accuracy =  0.9958929289447986
Model Precision =  0.8214285714285714
Model Recall =  0.350253807106599
Model F1 Score =  0.491103202846975
Confusion Matrix = 
 [[34606    15]
 [  128    69]]


##### Step 8 >> Feature Importance
***

In [37]:
importance = pd.Series(rf.feature_importances_, index= x.columns).sort_values(ascending=False)*100
importance

m12                      13.911885
credit_score             12.039407
unpaid_principal_bal     11.410344
debt_to_income_ratio      8.483461
loan_to_value             7.853812
m_sum                     6.933960
interest_rate             6.878973
m_mean                    4.769622
financial_institution     3.915719
m11                       3.087498
m_kurtosis                2.366514
m_skew                    2.108148
loan_term                 1.896146
m9                        1.852801
m10                       1.426120
m7                        1.306763
source_X                  1.109537
m8                        1.025287
source_Y                  0.965905
loan_purpose_B12          0.954798
number_of_borrowers       0.849820
loan_purpose_C86          0.843769
loan_purpose_A23          0.816533
m4                        0.679113
m6                        0.611018
m5                        0.562958
source_Z                  0.546998
m3                        0.285799
m1                  

In [38]:
rf.fit(x_train.drop(['m2', 'm3', 'm1', 'm4','source_Z'], axis = True), y_train)
rf_pred = rf.predict(x_test.drop(['m2', 'm3', 'm1', 'm4','source_Z'], axis = True))

In [39]:
check_model_metrices(y_test, rf_pred)

Model Accuracy =  0.9957493250617496
Model Precision =  0.7692307692307693
Model Recall =  0.3553299492385787
Model F1 Score =  0.48611111111111116
Confusion Matrix = 
 [[34600    21]
 [  127    70]]


Slight increase in F1_Score

##### Step 9 >>Distribution of Prediction Probability
***

In [40]:
threshold = 0.22
rf.fit(x_train, y_train)
y_pred_proba = rf.predict_proba(x_test)[:,1]
rf_pred = (y_pred_proba>threshold).astype(int)

In [41]:
check_model_metrices(y_test, rf_pred)

Model Accuracy =  0.9947440978804067
Model Precision =  0.5402298850574713
Model Recall =  0.47715736040609136
Model F1 Score =  0.5067385444743935
Confusion Matrix = 
 [[34541    80]
 [  103    94]]


F1 score of the model has increased to .53

##### Step 10 >> Fine tune hyper parameters
***

In [42]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [43]:
rf = RandomForestClassifier(n_estimators=500, random_state=42, criterion='entropy', max_depth= 10)

In [44]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [45]:
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)

In [46]:
check_model_metrices(y_test, rf_pred)

Model Accuracy =  0.9959503704980183
Model Precision =  0.8589743589743589
Model Recall =  0.3401015228426396
Model F1 Score =  0.4872727272727272
Confusion Matrix = 
 [[34610    11]
 [  130    67]]


## 2. Gradiant Boosting Algorithm
***

In [47]:
from sklearn.ensemble import GradientBoostingClassifier

In [48]:
gbc = GradientBoostingClassifier()
gbc.fit(x_train, y_train)
gbc_pred = gbc.predict(x_test)

In [49]:
check_model_metrices(y_test, gbc_pred)

Model Accuracy =  0.9941696823482107
Model Precision =  0.4482758620689655
Model Recall =  0.1319796954314721
Model F1 Score =  0.203921568627451
Confusion Matrix = 
 [[34589    32]
 [  171    26]]


203 misclassifications with f1_score of 0.20

## 3. Extreme Gradiant Boosting (XGBoost) Algorithm
***

In [50]:
from xgboost import XGBClassifier

In [51]:
xgb = XGBClassifier(verbosity = 0)
xgb.fit(x_train, y_train)
xgb_pred = xgb.predict(x_test)

In [52]:
check_model_metrices(y_test, xgb_pred)

Model Accuracy =  0.9954621172956517
Model Precision =  0.7142857142857143
Model Recall =  0.3299492385786802
Model F1 Score =  0.45138888888888895
Confusion Matrix = 
 [[34595    26]
 [  132    65]]


158 misclassifications with f1_score of 0.45