In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly

Reading in cleaned data

In [2]:
loans = pd.read_csv("../clean_data/loans_data.csv")

In [3]:
loans

Unnamed: 0,loan_amnt,term,int_rate,installment,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,...,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_util,pub_rec_bankruptcies,state_name,grade,fico_range,credit_score
0,5000.0,36 months,10.65,162.87,10+ years,RENT,24000.0,Verified,2011,Fully Paid,...,0,1.0,3.0,0,83.7,0.0,Arizona,B,737.0,Good
1,5000.0,36 months,7.90,156.46,2 - 4 years,RENT,36000.0,Source Verified,2011,Fully Paid,...,0,3.0,9.0,0,28.3,0.0,Arizona,A,732.0,Good
2,6500.0,60 months,14.65,153.45,5 - 9 years,OWN,72000.0,Not Verified,2011,Fully Paid,...,0,2.0,14.0,0,20.6,0.0,Arizona,C,697.0,Good
3,6000.0,36 months,12.42,200.50,< 2 years,RENT,36852.0,Source Verified,2011,Fully Paid,...,0,1.0,7.0,0,66.5,0.0,Arizona,B,697.0,Good
4,12000.0,36 months,16.29,423.61,5 - 9 years,RENT,88365.0,Verified,2011,Fully Paid,...,0,0.0,6.0,0,96.4,0.0,Arizona,D,682.0,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38279,9600.0,36 months,10.08,310.13,2 - 4 years,MORTGAGE,60000.0,Not Verified,2008,Fully Paid,...,0,0.0,8.0,0,30.9,0.0,Idaho,B,732.0,Good
38280,25000.0,36 months,10.08,807.62,< 2 years,MORTGAGE,145000.0,Not Verified,2008,Fully Paid,...,0,1.0,9.0,0,14.1,0.0,Idaho,B,762.0,Very Good
38281,6000.0,36 months,12.92,201.94,< 2 years,MORTGAGE,40800.0,Not Verified,2008,Fully Paid,...,2+,2.0,11.0,0,47.3,0.0,Idaho,D,667.0,Fair
38282,6150.0,36 months,11.03,201.44,< 2 years,RENT,35713.6,Not Verified,2008,Fully Paid,...,0,0.0,4.0,0,64.3,0.0,Idaho,C,687.0,Good


### Summary of Statistics

In [4]:
loans.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,issue_d,dti,inq_last_6mths,open_acc,revol_util,pub_rec_bankruptcies,fico_range
count,38284.0,38284.0,38284.0,38284.0,38284.0,38284.0,38284.0,38284.0,38284.0,38284.0,38284.0
mean,11156.807021,12.000834,323.266224,68853.03,2010.369998,13.346091,0.869188,9.290121,48.850521,0.043674,717.060469
std,7408.188582,3.726761,208.28041,63470.98,0.807165,6.669822,1.068543,4.38074,28.317101,0.205009,35.820829
min,500.0,5.42,16.08,4000.0,2007.0,0.0,0.0,2.0,0.0,0.0,662.0
25%,5500.0,8.94,166.63,40320.0,2010.0,8.21,0.0,6.0,25.4,0.0,687.0
50%,10000.0,11.86,278.97,59000.0,2011.0,13.44,1.0,9.0,49.3,0.0,712.0
75%,15000.0,14.59,427.18,82000.0,2011.0,18.62,1.0,12.0,72.4,0.0,742.0
max,35000.0,24.59,1305.19,6000000.0,2011.0,29.99,8.0,44.0,99.9,2.0,827.0


Checking there aren't any missing values.

In [5]:
loans.isnull().any()

loan_amnt               False
term                    False
int_rate                False
installment             False
emp_length              False
home_ownership          False
annual_inc              False
verification_status     False
issue_d                 False
loan_status             False
purpose                 False
dti                     False
delinq_2yrs             False
inq_last_6mths          False
open_acc                False
pub_rec                 False
revol_util              False
pub_rec_bankruptcies    False
state_name              False
grade                   False
fico_range              False
credit_score            False
dtype: bool

## Pandas Profiling

In [6]:
import pandas_profiling
pandas_profiling.ProfileReport(loans)

Summarize dataset:   0%|          | 0/35 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Dropping credit_score as this was made for analysis purposes and won't be used in the model. Issue date is also information which is specific to applications and does not add value to future predictions. States are being excluded from the model as this could lead to bias.

In [7]:
loans = loans.drop(["credit_score", "issue_d", "state_name"], axis = 1).copy()

The Panda's Profile shows "installment" and "loan_amnt" are highly correlated so only one of these variables will be required to avoid overfitting.

In [None]:
loans = loans.drop(["installment"], axis = 1)

Converting loan_status to numerical data for modelling.

In [8]:
loan_status_replace = {
    "loan_status" : {
        "Fully Paid": 1,
        "Charged Off": 0,
    }
}
loans = loans.replace(loan_status_replace)

In [9]:
loans.dtypes

loan_amnt               float64
term                     object
int_rate                float64
installment             float64
emp_length               object
home_ownership           object
annual_inc              float64
verification_status      object
loan_status               int64
purpose                  object
dti                     float64
delinq_2yrs              object
inq_last_6mths          float64
open_acc                float64
pub_rec                  object
revol_util              float64
pub_rec_bankruptcies    float64
grade                    object
fico_range              float64
dtype: object

### Dummy Variables

Now generating dummy variables for categorical variables.

In [10]:
loans = pd.get_dummies(loans, drop_first = True)

loans.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,loan_status,dti,inq_last_6mths,open_acc,revol_util,pub_rec_bankruptcies,...,purpose_wedding,delinq_2yrs_1,delinq_2yrs_2+,pub_rec_1+,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G
0,5000.0,10.65,162.87,24000.0,1,27.65,1.0,3.0,83.7,0.0,...,0,0,0,0,1,0,0,0,0,0
1,5000.0,7.9,156.46,36000.0,1,11.2,3.0,9.0,28.3,0.0,...,1,0,0,0,0,0,0,0,0,0
2,6500.0,14.65,153.45,72000.0,1,16.12,2.0,14.0,20.6,0.0,...,0,0,0,0,0,1,0,0,0,0
3,6000.0,12.42,200.5,36852.0,1,10.62,1.0,7.0,66.5,0.0,...,0,0,0,0,1,0,0,0,0,0
4,12000.0,16.29,423.61,88365.0,1,16.85,0.0,6.0,96.4,0.0,...,0,0,0,0,0,0,1,0,0,0


## Splitting into predictors and target variable

In [11]:
X = loans.drop(columns = "loan_status")
y = loans["loan_status"]

### Test & Train Data
Using a 20/80 split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

## Logistic Regression Model

In [14]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)

LogisticRegression()

### Checking the mean accuracy of the training data.

In [15]:
model.score(X_train, y_train)

0.8547033663107716

### Mean accuracy of the test data

In [16]:
model.score(X_test, y_test)

0.8607809847198642

### Predicted Probabilities

In [17]:
pred_test = model.predict_proba(X_test)
pred_test

array([[0.1013938 , 0.8986062 ],
       [0.06946096, 0.93053904],
       [0.21446979, 0.78553021],
       ...,
       [0.11219175, 0.88780825],
       [0.0880616 , 0.9119384 ],
       [0.06267082, 0.93732918]])

### AUC score from ROC

In [18]:
from sklearn.metrics import roc_auc_score
pred_test = pred_test[:, 1]
pred_test
roc_auc_score(y_true = y_test, y_score = pred_test)

0.6908906710298852

## Decision Tree

In [19]:
from sklearn import tree

model = tree.DecisionTreeClassifier(max_features = 3, max_depth = 5)

model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, max_features=3)

In [20]:
model.score(X_test, y_test)

0.8605197858168996

## Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

#using 10 decision trees
model = RandomForestClassifier(n_estimators = 10)

model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=10)

In [22]:
model.score(X_test, y_test)

0.8432806582212354

In [23]:
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier(random_state=10)
clf = clf.fit(X_train, y_train)
importances = clf.feature_importances_

In [24]:
features_to_drop = (importances <= 0.01)
features_indexes, = np.where(features_to_drop == True)
print(features_indexes)

[ 8 14 19 21 22 23 24 25 26 28 29 30 31 33 34 35 36 37 38 39 40]


In [25]:
data = [go.Bar(x = loans.columns, y = clf.feature_importances_)]
layout = go.Layout(
    title='Feature Importances',
    autosize=False,
    width=1000,
    height=500,
    margin=go.Margin(
        l=50,
        r=50,
        b=250,
        t=50,
        pad=4
    )
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename='feature_importances')

Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.Margin


Your filename `feature_importances` didn't end with .html. Adding .html to the end of your file.



'feature_importances.html'

In [26]:
predictions = model.predict(X_test)

from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions, pos_label=1)

metrics.auc(fpr, tpr)

0.5295460180364207