In [None]:
Case Study

Domain – Banking/Loan

focus – Lower NPA (Non Performing Asset)

Business challenge/requirement
PeerLoanKart is an NBFC (Non-Banking Financial Company) which facilitates peer to peer loan.

It connects people who need money (borrowers) with people who have money (investors). As an investor, you would want to invest in people who showed a profile of having a high probability of paying you back.

You as a ML expert create a model that will help predict whether a borrower will pay the loan or not.

Key issues
Ensure NPAs are lower – meaning PeerLoanKart wants to be very diligent in giving loans to borrower

Considerations
NONE

Data volume
- Approx 9578 records – file loan_borowwer_data.csv
Fields in Data
• credit.policy: 1 if the customer meets the credit underwriting criteria of PeerLoanKart, and 0 otherwise
• purpose: The purpose of the loan (takes values "credit_card", "debt_consolidation", "educational", "major_purchase", "small_business", and "all_other")
• int.rate: The interest rate of the loan, as a proportion (a rate of 11% would be stored as 0.11). Borrowers judged by PeerLoanKart to be more risky are assigned higher interest rates
• installment: The monthly installments owed by the borrower if the loan is funded
• log.annual.inc: The natural log of the self-reported annual income of the borrower
• dti: The debt-to-income ratio of the borrower (amount of debt divided by annual income)
• fico: The FICO credit score of the borrower
• days.with.cr.line: The number of days the borrower has had a credit line
• revol.bal: The borrower's revolving balance (amount unpaid at the end of the credit card billing cycle)
• revol.util: The borrower's revolving line utilization rate (the amount of the credit line used relative to total credit available)
• inq.last.6mths: The borrower's number of inquiries by creditors in the last 6 months
• delinq.2yrs: The number of times the borrower had been 30+ days past due on a payment in the past 2 years
• pub.rec: The borrower's number of derogatory public records (bankruptcy filings, tax liens, or judgments)
• not.fully.paid: This is the output field. Please note that 1 means borrower is not going to pay the loan completely

In [1]:
import numpy as np
import pandas as pd

data = pd.read_csv('loan_borowwer_data.csv')
data.shape

(9578, 14)

In [2]:
data

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.10,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.000000,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.000000,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.10,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.000000,4740,39.5,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,all_other,0.1461,344.76,12.180755,10.39,672,10474.000000,215372,82.1,2,0,0,1
9574,0,all_other,0.1253,257.70,11.141862,0.21,722,4380.000000,184,1.1,5,0,0,1
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687,3450.041667,10036,82.9,8,0,0,1
9576,0,home_improvement,0.1600,351.58,10.819778,19.18,692,1800.000000,0,3.2,5,0,0,1


In [3]:
# label encode the purpose column

from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
data['purpose']=LE.fit_transform(data['purpose'])

In [4]:
data

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,2,0.1189,829.10,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,1,0.1071,228.22,11.082143,14.29,707,2760.000000,33623,76.7,0,0,0,0
2,1,2,0.1357,366.86,10.373491,11.63,682,4710.000000,3511,25.6,1,0,0,0
3,1,2,0.1008,162.34,11.350407,8.10,712,2699.958333,33667,73.2,1,0,0,0
4,1,1,0.1426,102.92,11.299732,14.97,667,4066.000000,4740,39.5,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,0,0.1461,344.76,12.180755,10.39,672,10474.000000,215372,82.1,2,0,0,1
9574,0,0,0.1253,257.70,11.141862,0.21,722,4380.000000,184,1.1,5,0,0,1
9575,0,2,0.1071,97.81,10.596635,13.09,687,3450.041667,10036,82.9,8,0,0,1
9576,0,4,0.1600,351.58,10.819778,19.18,692,1800.000000,0,3.2,5,0,0,1


In [None]:
#You will use the dataset loan_borowwer_data.csv in this assignment, creating a model that will help predict whether a borrower will pay the loan or not. You will achieve the same using Decision Tree or Random Foreset model depending on their accuracy.

In [30]:
from sklearn.tree import DecisionTreeClassifier
#print(data.columns.drop('not.fully.paid'))
XCols = data.columns
XCols = XCols.drop('not.fully.paid')
print(XCols)

# DecisionTreeClassifier
model = DecisionTreeClassifier() #(criterion='entropy',max_depth=5)
model.fit(data[XCols],data[['not.fully.paid']])
predictions = model.predict(data[XCols])

from sklearn import metrics

print("Accuracy score",metrics.accuracy_score(data['not.fully.paid'],predictions))

print("Confusion Matrix",metrics.confusion_matrix(data['not.fully.paid'],predictions))

Index(['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc',
       'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec'],
      dtype='object')
Accuracy score 1.0
Confusion Matrix [[8045    0]
 [   0 1533]]


In [32]:
#RandomForsetClassifier

from sklearn.ensemble import RandomForestClassifier

model2 = RandomForestClassifier()
model2.fit(data[XCols],data[['not.fully.paid']])
predictions2 = model2.predict(data[XCols])

print("Accuracy score",metrics.accuracy_score(data['not.fully.paid'],predictions2))

print("Confusion Matrix",metrics.confusion_matrix(data['not.fully.paid'],predictions2))

  model2.fit(data[XCols],data[['not.fully.paid']])


Accuracy score 1.0
Confusion Matrix [[8045    0]
 [   0 1533]]
