In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [7]:
df1 = pd.read_csv('loan_data.csv')

In [8]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [9]:
df1.isnull().sum()

credit.policy        0
purpose              0
int.rate             0
installment          0
log.annual.inc       0
dti                  0
fico                 0
days.with.cr.line    0
revol.bal            0
revol.util           0
inq.last.6mths       0
delinq.2yrs          0
pub.rec              0
not.fully.paid       0
dtype: int64

In [10]:
df1.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [11]:
df1['not.fully.paid'].value_counts()

0    8045
1    1533
Name: not.fully.paid, dtype: int64

In [12]:
loans=df1.copy(deep=True)

In [13]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
loans['purpose']=enc.fit_transform(loans['purpose'].values)

In [14]:
loans

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,2,0.1189,829.10,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,1,0.1071,228.22,11.082143,14.29,707,2760.000000,33623,76.7,0,0,0,0
2,1,2,0.1357,366.86,10.373491,11.63,682,4710.000000,3511,25.6,1,0,0,0
3,1,2,0.1008,162.34,11.350407,8.10,712,2699.958333,33667,73.2,1,0,0,0
4,1,1,0.1426,102.92,11.299732,14.97,667,4066.000000,4740,39.5,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,0,0.1461,344.76,12.180755,10.39,672,10474.000000,215372,82.1,2,0,0,1
9574,0,0,0.1253,257.70,11.141862,0.21,722,4380.000000,184,1.1,5,0,0,1
9575,0,2,0.1071,97.81,10.596635,13.09,687,3450.041667,10036,82.9,8,0,0,1
9576,0,4,0.1600,351.58,10.819778,19.18,692,1800.000000,0,3.2,5,0,0,1


In [15]:
loans[['int.rate','not.fully.paid']].corr()

Unnamed: 0,int.rate,not.fully.paid
int.rate,1.0,0.159552
not.fully.paid,0.159552,1.0


In [16]:
from sklearn.tree import DecisionTreeClassifier

In [17]:
from sklearn.model_selection import train_test_split


In [18]:
X = loans.drop('not.fully.paid',axis=1)
y= loans['not.fully.paid']


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [20]:
dtree = DecisionTreeClassifier()

In [21]:
dtree.fit(X_train,y_train)

DecisionTreeClassifier()

In [22]:
predictions = dtree.predict(X_test)

In [23]:
from sklearn.metrics import classification_report,confusion_matrix

In [24]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.86      0.82      0.84      2431
           1       0.20      0.25      0.23       443

    accuracy                           0.73      2874
   macro avg       0.53      0.54      0.53      2874
weighted avg       0.76      0.73      0.74      2874



In [25]:
dt=DecisionTreeClassifier()

In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"],
    'splitter':['best','random'],
    'min_samples_split':[2,5,8,15],
    'max_features':['auto','sqrt','log2'],
    'class_weight':['balanced',None]}


In [28]:
dt_grid = GridSearchCV(estimator=dt,param_grid=params,cv=4,n_jobs=-1,verbose=True,scoring='accuracy')


In [29]:
dt_grid.fit(X_train,y_train)

Fitting 4 folds for each of 2400 candidates, totalling 9600 fits


GridSearchCV(cv=4, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 5, 10, 20],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [5, 10, 20, 50, 100],
                         'min_samples_split': [2, 5, 8, 15],
                         'splitter': ['best', 'random']},
             scoring='accuracy', verbose=True)

In [30]:
dt_grid.best_params_

{'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 'log2',
 'min_samples_leaf': 50,
 'min_samples_split': 15,
 'splitter': 'best'}

In [36]:
dt_tuned = DecisionTreeClassifier(class_weight=None,
 criterion='gini',
 max_depth=20,
 max_features='log2',
 min_samples_leaf=50,
 min_samples_split=2,
 splitter='best')

In [37]:
dt_tuned.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=20, max_features='log2', min_samples_leaf=50)

In [38]:
tunes_pred = dt_tuned.predict(X_test)

In [39]:
print(classification_report(y_test,tunes_pred))

              precision    recall  f1-score   support

           0       0.85      0.99      0.91      2431
           1       0.29      0.02      0.03       443

    accuracy                           0.84      2874
   macro avg       0.57      0.50      0.47      2874
weighted avg       0.76      0.84      0.78      2874

