In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [2]:
# Path to the input CSV; being relative, it is resolved against the kernel's working directory.
data = 'loan_data.csv'

In [3]:
# Read the CSV at `data` into a DataFrame using pandas' default parsing options.
df = pd.read_csv(data)

In [4]:
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [6]:
# Split off the target column. `pop` also removes it from `df` in place, so
# re-running this cell without reloading `df` raises KeyError.
y = df.pop('not.fully.paid')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 13 columns):
credit.policy        9578 non-null int64
purpose              9578 non-null object
int.rate             9578 non-null float64
installment          9578 non-null float64
log.annual.inc       9578 non-null float64
dti                  9578 non-null float64
fico                 9578 non-null int64
days.with.cr.line    9578 non-null float64
revol.bal            9578 non-null int64
revol.util           9578 non-null float64
inq.last.6mths       9578 non-null int64
delinq.2yrs          9578 non-null int64
pub.rec              9578 non-null int64
dtypes: float64(6), int64(6), object(1)
memory usage: 935.4+ KB


In [19]:
df['purpose'].value_counts()

debt_consolidation    3957
all_other             2331
credit_card           1262
home_improvement       629
small_business         619
major_purchase         437
educational            343
Name: purpose, dtype: int64

In [27]:
# One-hot encode `purpose`: the original column is dropped and one indicator
# column per category (named after the category) is appended on the right.
# Not re-runnable: a second execution raises KeyError because `purpose` is gone.
df = pd.concat(
    [df.drop(columns='purpose'), pd.get_dummies(df['purpose'])],
    axis='columns',
)

In [28]:
y.value_counts()

0    8045
1    1533
Name: not.fully.paid, dtype: int64

In [77]:
# Hold out 40% of the rows for testing; the fixed random_state keeps the split repeatable.
train,test,y_train,y_test = train_test_split(df,y,test_size=0.4,random_state=42)

In [78]:
# Baseline: logistic regression with default settings on the unscaled features.
lr = LogisticRegression().fit(train, y_train)
y_pred = lr.predict(test)
print('Accuracy score baseline:', accuracy_score(y_test, y_pred))

Accuracy score baseline: 0.8413361169102297


In [79]:
def fit_predict(train,
                test,
                y_train,
                y_test,
                scaler,
                max_depth,criterion='entropy',
                max_features=1,
                min_samples_split=4):
    """Scale the features, fit a decision tree and report its test accuracy.

    The scaler is fitted on ``train`` only and then applied to ``test``, so no
    statistics from the test split are used during fitting.

    Parameters
    ----------
    train, test : array-like
        Feature matrices used for fitting and for evaluation.
    y_train, y_test : array-like
        Labels matching ``train`` and ``test``.
    scaler : object
        Transformer exposing ``fit_transform`` and ``transform``
        (e.g. ``StandardScaler()``).
    max_depth, criterion, max_features, min_samples_split
        Forwarded unchanged to ``DecisionTreeClassifier``. Note that the
        default ``max_features=1`` is an *integer*: scikit-learn reads it as
        "consider one feature per split", whereas a float is a fraction of
        the available features.

    Returns
    -------
    The value returned by ``accuracy_score(y_test, predictions)``. The same
    value is also printed, so existing callers that rely on the printed
    output keep working.
    """
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)

    dt = DecisionTreeClassifier(criterion=criterion,
                                max_depth=max_depth,
                                random_state=42,  # fixed so repeated calls are comparable
                                max_features=max_features,
                                min_samples_split=min_samples_split)
    dt.fit(train_scaled, y_train)
    y_pred = dt.predict(test_scaled)

    score = accuracy_score(y_test, y_pred)
    print(score)
    # Returning the score lets callers collect and compare sweep results
    # instead of reading them off stdout.
    return score

In [80]:
# Reference point: a decision tree with default settings on the unscaled features.
# This reassigns `y_pred`, replacing the logistic-regression predictions.
dt = DecisionTreeClassifier().fit(train, y_train)
y_pred = dt.predict(test)
print(accuracy_score(y_test, y_pred))

0.7382567849686847


### Max Depth tuning

In [81]:
# Sweep the tree depth over 1..49; every other fit_predict argument stays at its default.
for depth in range(1, 50):
    print('Accuracy score using max_depth=', depth, end=':')
    fit_predict(train, test, y_train, y_test, StandardScaler(), max_depth=depth)

Accuracy score using max_depth= 1:0.8415970772442589
Accuracy score using max_depth= 2:0.8415970772442589
Accuracy score using max_depth= 3:0.8413361169102297
Accuracy score using max_depth= 4:0.8415970772442589
Accuracy score using max_depth= 5:0.8413361169102297
Accuracy score using max_depth= 6:0.840553235908142
Accuracy score using max_depth= 7:0.8374217118997912
Accuracy score using max_depth= 8:0.8327244258872651
Accuracy score using max_depth= 9:0.8319415448851775
Accuracy score using max_depth= 10:0.8353340292275574
Accuracy score using max_depth= 11:0.8280271398747391
Accuracy score using max_depth= 12:0.8288100208768268
Accuracy score using max_depth= 13:0.8186325678496869
Accuracy score using max_depth= 14:0.8178496868475992
Accuracy score using max_depth= 15:0.8113256784968684
Accuracy score using max_depth= 16:0.7974947807933194
Accuracy score using max_depth= 17:0.7969728601252609
Accuracy score using max_depth= 18:0.7914926931106472
Accuracy score using max_depth= 19:0.7

### Max Features tuning

In [82]:
# Sweep max_features as a fraction (0.1 .. 0.9) at a fixed depth of 2.
# np.arange with a float step accumulates rounding error, which is why some
# labels print as e.g. 0.30000000000000004.
for frac in np.arange(0.1, 1.0, 0.1):
    print('Accuracy score using max_depth of 2, max features=', frac, end=':')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 2, max_features=frac)

Accuracy score using max_depth of 2, max features= 0.1:0.8415970772442589
Accuracy score using max_depth of 2, max features= 0.2:0.8415970772442589
Accuracy score using max_depth of 2, max features= 0.30000000000000004:0.8415970772442589
Accuracy score using max_depth of 2, max features= 0.4:0.8415970772442589
Accuracy score using max_depth of 2, max features= 0.5:0.8415970772442589
Accuracy score using max_depth of 2, max features= 0.6:0.8415970772442589
Accuracy score using max_depth of 2, max features= 0.7000000000000001:0.8415970772442589
Accuracy score using max_depth of 2, max features= 0.8:0.8415970772442589
Accuracy score using max_depth of 2, max features= 0.9:0.8415970772442589


### Min samples split tuning

In [85]:
# Sweep min_samples_split over 2..9 with max_depth=2 and max_features=0.1 held fixed.
# The printed label now states the max_features value that is actually passed (0.1).
for i in range(2,10):
    print('Accuracy score using max_depth of 2, max features=0.1, min sample split=',i, end=':')
    fit_predict(train,test,y_train,y_test,StandardScaler(),2,
                max_features=0.1,
                min_samples_split=i)

Accuracy score using max_depth of 2, max features=1, min sample split= 2:0.8415970772442589
Accuracy score using max_depth of 2, max features=1, min sample split= 3:0.8415970772442589
Accuracy score using max_depth of 2, max features=1, min sample split= 4:0.8415970772442589
Accuracy score using max_depth of 2, max features=1, min sample split= 5:0.8415970772442589
Accuracy score using max_depth of 2, max features=1, min sample split= 6:0.8415970772442589
Accuracy score using max_depth of 2, max features=1, min sample split= 7:0.8415970772442589
Accuracy score using max_depth of 2, max features=1, min sample split= 8:0.8415970772442589
Accuracy score using max_depth of 2, max features=1, min sample split= 9:0.8415970772442589


### Criterion tuning

In [86]:
# Compare the two split criteria with max_depth=2, max_features=0.1 and
# min_samples_split=2 held fixed. The label now reports the max_features value
# that is actually passed (0.1) and names the criterion being varied.
for i in ['gini','entropy']:
    print('Accuracy score using max_depth of 2, max features=0.1, min sample split=2, criterion=',i, end=':')
    fit_predict(train,test,y_train,y_test,StandardScaler(),2,
                max_features=0.1,
                min_samples_split=2,
                criterion=i)

Accuracy score using max_depth of 2, max features=1, min sample split=2 gini:0.8415970772442589
Accuracy score using max_depth of 2, max features=1, min sample split=2 entropy:0.8415970772442589


In [87]:
def create_poly(train,test,degree):
    """Expand both splits into polynomial features of the given degree.

    The transformer is fitted on ``train`` only and then applied to ``test``,
    which is the standard fit-on-train / transform-on-test pattern.

    Parameters
    ----------
    train, test : array-like
        Feature matrices for the training and the test split.
    degree : int
        Polynomial degree handed to ``PolynomialFeatures``.

    Returns
    -------
    tuple
        ``(train_poly, test_poly)`` - the expanded training and test matrices.
    """
    poly = PolynomialFeatures(degree=degree)
    train_poly = poly.fit_transform(train)
    # Reuse the transformer fitted on train; the test split is only transformed.
    test_poly = poly.transform(test)
    return train_poly, test_poly

In [88]:
# Polynomial-degree sweep using the entropy criterion at the tuned tree settings.
for degree in (1, 2, 3):
    train_poly, test_poly = create_poly(train, test, degree)
    print('Polynomial degree', degree)
    fit_predict(train_poly, test_poly, y_train, y_test,
                scaler=StandardScaler(),
                max_depth=2,
                criterion='entropy',
                max_features=0.1,
                min_samples_split=2)
    print('-' * 10)

Polynomial degree 1
0.8415970772442589
----------
Polynomial degree 2
0.8415970772442589
----------
Polynomial degree 3
0.8415970772442589
----------


In [89]:
# Polynomial-degree sweep using the gini criterion at the tuned tree settings.
for degree in (1, 2, 3):
    train_poly, test_poly = create_poly(train, test, degree)
    print('Polynomial degree', degree)
    fit_predict(train_poly, test_poly, y_train, y_test,
                scaler=StandardScaler(),
                max_depth=2,
                criterion='gini',
                max_features=0.1,
                min_samples_split=2)
    print('-' * 10)

Polynomial degree 1
0.8415970772442589
----------
Polynomial degree 2
0.8415970772442589
----------
Polynomial degree 3
0.8415970772442589
----------


### Random Forest

In [90]:
from sklearn.ensemble import RandomForestClassifier

In [91]:
# Random forest with default hyperparameters; oob_score=True makes fit() also
# compute an out-of-bag accuracy estimate, exposed afterwards as `oob_score_`.
# random_state is not set, so results depend on the global NumPy seed and on
# the order in which cells are executed.
rf= RandomForestClassifier(oob_score=True)

In [92]:
rf.fit(train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [93]:
pred_rf = rf.predict(test)

In [94]:
print(accuracy_score(y_test,pred_rf))

0.8301148225469729


In [95]:
rf.oob_score_

0.8054298642533937

In [96]:
from sklearn.model_selection import GridSearchCV

In [97]:
# Hyperparameter grid for the forest: 3 x 3 x 3 = 27 combinations.
params = dict(
    n_estimators=[200, 500, 700],
    max_depth=[1, 5, 10],
    min_samples_leaf=[3, 6, 10],
)

In [98]:
# Exhaustive search over `params`, using `rf` as the template estimator.
# No `scoring` is passed, so candidates are ranked by the classifier's default
# score (mean accuracy).
# NOTE(review): about 84% of the labels are class 0, so accuracy can favour a
# model that predicts only class 0 - consider an imbalance-aware metric.
gs = GridSearchCV(rf,params,verbose=3)

In [99]:
gs.fit(train,y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] max_depth=1, min_samples_leaf=3, n_estimators=200 ...............
[CV]  max_depth=1, min_samples_leaf=3, n_estimators=200, score=0.8387265135699373, total=   0.6s
[CV] max_depth=1, min_samples_leaf=3, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV]  max_depth=1, min_samples_leaf=3, n_estimators=200, score=0.8387265135699373, total=   0.5s
[CV] max_depth=1, min_samples_leaf=3, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.4s remaining:    0.0s


[CV]  max_depth=1, min_samples_leaf=3, n_estimators=200, score=0.8390804597701149, total=   0.7s
[CV] max_depth=1, min_samples_leaf=3, n_estimators=500 ...............
[CV]  max_depth=1, min_samples_leaf=3, n_estimators=500, score=0.8387265135699373, total=   1.6s
[CV] max_depth=1, min_samples_leaf=3, n_estimators=500 ...............
[CV]  max_depth=1, min_samples_leaf=3, n_estimators=500, score=0.8387265135699373, total=   1.5s
[CV] max_depth=1, min_samples_leaf=3, n_estimators=500 ...............
[CV]  max_depth=1, min_samples_leaf=3, n_estimators=500, score=0.8390804597701149, total=   1.5s
[CV] max_depth=1, min_samples_leaf=3, n_estimators=700 ...............
[CV]  max_depth=1, min_samples_leaf=3, n_estimators=700, score=0.8387265135699373, total=   2.2s
[CV] max_depth=1, min_samples_leaf=3, n_estimators=700 ...............
[CV]  max_depth=1, min_samples_leaf=3, n_estimators=700, score=0.8387265135699373, total=   2.1s
[CV] max_depth=1, min_samples_leaf=3, n_estimators=700 ........

[CV]  max_depth=5, min_samples_leaf=10, n_estimators=700, score=0.8387265135699373, total=   4.0s
[CV] max_depth=5, min_samples_leaf=10, n_estimators=700 ..............
[CV]  max_depth=5, min_samples_leaf=10, n_estimators=700, score=0.8387265135699373, total=   4.1s
[CV] max_depth=5, min_samples_leaf=10, n_estimators=700 ..............
[CV]  max_depth=5, min_samples_leaf=10, n_estimators=700, score=0.8390804597701149, total=   3.9s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.8387265135699373, total=   1.6s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.8392484342379958, total=   1.7s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.8385579937304075, total=   1.7s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 .

[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:  4.1min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [200, 500, 700], 'max_depth': [1, 5, 10], 'min_samples_leaf': [3, 6, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [100]:
gs.best_params_

{'max_depth': 1, 'min_samples_leaf': 3, 'n_estimators': 200}

In [101]:
gs.best_score_

0.838844413505047

In [102]:
from sklearn.metrics import confusion_matrix,classification_report

In [103]:
print(confusion_matrix(y_test,pred_rf))

[[3152   73]
 [ 578   29]]


In [104]:
print(classification_report(y_test,pred_rf))

             precision    recall  f1-score   support

          0       0.85      0.98      0.91      3225
          1       0.28      0.05      0.08       607

avg / total       0.76      0.83      0.78      3832



In [105]:
# `best_estimator_` is handed back by reference and `set_params` returns the
# estimator itself, so `rf1` and `gs.best_estimator_` are the same object.
rf1 = gs.best_estimator_.set_params(oob_score=True)

In [106]:
rf1.fit(train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=1, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [107]:
pred_rf1 = rf1.predict(test)

In [108]:
print(accuracy_score(y_test,pred_rf1))

0.8415970772442589


In [109]:
rf1.oob_score_

0.838844413505047

In [110]:
rf1.feature_importances_

array([0.185, 0.115, 0.045, 0.04 , 0.05 , 0.14 , 0.055, 0.02 , 0.065,
       0.185, 0.005, 0.035, 0.   , 0.   , 0.   , 0.005, 0.   , 0.01 ,
       0.045])

In [112]:
# Rank features by importance, highest first. The tuples compare on importance
# and then on column name, so ties come out in reverse alphabetical order.
sorted(zip(rf1.feature_importances_, df.columns), reverse=True)

[(0.185, 'inq.last.6mths'),
 (0.185, 'credit.policy'),
 (0.14, 'fico'),
 (0.115, 'int.rate'),
 (0.065, 'revol.util'),
 (0.055, 'days.with.cr.line'),
 (0.05, 'dti'),
 (0.045, 'small_business'),
 (0.045, 'installment'),
 (0.04, 'log.annual.inc'),
 (0.035, 'pub.rec'),
 (0.02, 'revol.bal'),
 (0.01, 'major_purchase'),
 (0.005, 'educational'),
 (0.005, 'delinq.2yrs'),
 (0.0, 'home_improvement'),
 (0.0, 'debt_consolidation'),
 (0.0, 'credit_card'),
 (0.0, 'all_other')]

In [113]:
print(confusion_matrix(y_test,pred_rf1))

[[3225    0]
 [ 607    0]]


In [114]:
print(classification_report(y_test,pred_rf1))

             precision    recall  f1-score   support

          0       0.84      1.00      0.91      3225
          1       0.00      0.00      0.00       607

avg / total       0.71      0.84      0.77      3832



In [115]:
print(pred_rf1)

[0 0 0 ... 0 0 0]
