In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn convergence and
# deprecation warnings raised by later cells — confirm that is intended.
warnings.filterwarnings("ignore")
# Seed NumPy's legacy global random generator so a top-to-bottom run is repeatable.
np.random.seed(42)

In [2]:
# Read the raw loan table, then separate the target ("credit.policy")
# from the remaining columns, which become the feature frame `df`.
raw = pd.read_csv("loan_data.csv")
y = raw["credit.policy"]
df = raw.drop(columns=["credit.policy"])


In [3]:
# Preview the first rows of the feature frame (the target was popped out above).
df.head()

Unnamed: 0,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [4]:
# Display the target Series `credit.policy`.
y

0       1
1       1
2       1
3       1
4       1
       ..
9573    0
9574    0
9575    0
9576    0
9577    0
Name: credit.policy, Length: 9578, dtype: int64

In [5]:
# Column names, non-null counts and dtypes of the feature frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   purpose            9578 non-null   object 
 1   int.rate           9578 non-null   float64
 2   installment        9578 non-null   float64
 3   log.annual.inc     9578 non-null   float64
 4   dti                9578 non-null   float64
 5   fico               9578 non-null   int64  
 6   days.with.cr.line  9578 non-null   float64
 7   revol.bal          9578 non-null   int64  
 8   revol.util         9578 non-null   float64
 9   inq.last.6mths     9578 non-null   int64  
 10  delinq.2yrs        9578 non-null   int64  
 11  pub.rec            9578 non-null   int64  
 12  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(6), object(1)
memory usage: 972.9+ KB


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
count,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0,9578.0
mean,0.12264,319.089413,10.932117,12.606679,710.846314,4560.767197,16913.96,46.799236,1.577469,0.163708,0.062122,0.160054
std,0.026847,207.071301,0.614813,6.88397,37.970537,2496.930377,33756.19,29.014417,2.200245,0.546215,0.262126,0.366676
min,0.06,15.67,7.547502,0.0,612.0,178.958333,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.1039,163.77,10.558414,7.2125,682.0,2820.0,3187.0,22.6,0.0,0.0,0.0,0.0
50%,0.1221,268.95,10.928884,12.665,707.0,4139.958333,8596.0,46.3,1.0,0.0,0.0,0.0
75%,0.1407,432.7625,11.291293,17.95,737.0,5730.0,18249.5,70.9,2.0,0.0,0.0,0.0
max,0.2164,940.14,14.528354,29.96,827.0,17639.95833,1207359.0,119.0,33.0,13.0,5.0,1.0


In [7]:
# Pairwise correlations between the numeric features.
# `df` still contains the string column 'purpose' at this point; recent pandas
# releases raise on non-numeric columns instead of silently dropping them, so
# the numeric columns are selected explicitly (works on old and new pandas).
df.select_dtypes(include="number").corr()

Unnamed: 0,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
int.rate,1.0,0.27614,0.056383,0.220006,-0.714821,-0.124022,0.092527,0.464837,0.20278,0.156079,0.098162,0.159552
installment,0.27614,1.0,0.448102,0.050202,0.086039,0.183297,0.233625,0.081356,-0.010419,-0.004368,-0.03276,0.049955
log.annual.inc,0.056383,0.448102,1.0,-0.054065,0.114576,0.336896,0.37214,0.054881,0.029171,0.029203,0.016506,-0.033439
dti,0.220006,0.050202,-0.054065,1.0,-0.241191,0.060101,0.188748,0.337109,0.029189,-0.021792,0.006209,0.037362
fico,-0.714821,0.086039,0.114576,-0.241191,1.0,0.26388,-0.015553,-0.541289,-0.185293,-0.21634,-0.147592,-0.149666
days.with.cr.line,-0.124022,0.183297,0.336896,0.060101,0.26388,1.0,0.229344,-0.024239,-0.041736,0.081374,0.071826,-0.029237
revol.bal,0.092527,0.233625,0.37214,0.188748,-0.015553,0.229344,1.0,0.203779,0.022394,-0.033243,-0.03101,0.053699
revol.util,0.464837,0.081356,0.054881,0.337109,-0.541289,-0.024239,0.203779,1.0,-0.01388,-0.04274,0.066717,0.082088
inq.last.6mths,0.20278,-0.010419,0.029171,0.029189,-0.185293,-0.041736,0.022394,-0.01388,1.0,0.021245,0.072673,0.149452
delinq.2yrs,0.156079,-0.004368,0.029203,-0.021792,-0.21634,0.081374,-0.033243,-0.04274,0.021245,1.0,0.009184,0.008881


In [8]:
# List the distinct values of 'purpose': grouping by the column and taking
# unique() yields one row per category level.
df.groupby('purpose')['purpose'].unique()

purpose
all_other                      [all_other]
credit_card                  [credit_card]
debt_consolidation    [debt_consolidation]
educational                  [educational]
home_improvement        [home_improvement]
major_purchase            [major_purchase]
small_business            [small_business]
Name: purpose, dtype: object

In [9]:
# Preview the first rows again; 'purpose' is still a raw string column here.
df.head()

Unnamed: 0,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [10]:
# (rows, columns) of the feature frame before encoding.
df.shape

(9578, 13)

In [11]:
# Store 'purpose' as a pandas categorical; every other column keeps its dtype.
df = df.astype({'purpose': 'category'})

In [12]:
# Re-inspect dtypes to confirm 'purpose' is now a category column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   purpose            9578 non-null   category
 1   int.rate           9578 non-null   float64 
 2   installment        9578 non-null   float64 
 3   log.annual.inc     9578 non-null   float64 
 4   dti                9578 non-null   float64 
 5   fico               9578 non-null   int64   
 6   days.with.cr.line  9578 non-null   float64 
 7   revol.bal          9578 non-null   int64   
 8   revol.util         9578 non-null   float64 
 9   inq.last.6mths     9578 non-null   int64   
 10  delinq.2yrs        9578 non-null   int64   
 11  pub.rec            9578 non-null   int64   
 12  not.fully.paid     9578 non-null   int64   
dtypes: category(1), float64(6), int64(6)
memory usage: 907.8 KB


In [13]:
# Shape check: the dtype cast must not change the row or column count.
df.shape

(9578, 13)

In [14]:
# One-hot encode 'purpose' into a separate frame: one indicator column per
# category level. No level is dropped (drop_first is left at its default), and
# `df` itself is not modified by this call.
encodedDF = pd.get_dummies(df['purpose'])

In [15]:
# `df` is unchanged at this point; the indicator columns live in `encodedDF`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   purpose            9578 non-null   category
 1   int.rate           9578 non-null   float64 
 2   installment        9578 non-null   float64 
 3   log.annual.inc     9578 non-null   float64 
 4   dti                9578 non-null   float64 
 5   fico               9578 non-null   int64   
 6   days.with.cr.line  9578 non-null   float64 
 7   revol.bal          9578 non-null   int64   
 8   revol.util         9578 non-null   float64 
 9   inq.last.6mths     9578 non-null   int64   
 10  delinq.2yrs        9578 non-null   int64   
 11  pub.rec            9578 non-null   int64   
 12  not.fully.paid     9578 non-null   int64   
dtypes: category(1), float64(6), int64(6)
memory usage: 907.8 KB


In [16]:
# Display the one-hot indicator frame (pandas truncates the middle rows).
encodedDF

Unnamed: 0,all_other,credit_card,debt_consolidation,educational,home_improvement,major_purchase,small_business
0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...
9573,1,0,0,0,0,0,0
9574,1,0,0,0,0,0,0
9575,0,0,1,0,0,0,0
9576,0,0,0,0,1,0,0


In [17]:
# Remove the original categorical column; its information is carried by the
# indicator columns in `encodedDF`.
df = df.drop(columns=['purpose'])


In [18]:
# Append the indicator columns to the numeric features, aligned on the row index.
df = pd.concat((df, encodedDF), axis="columns")

df.info()

In [19]:
# Confirm the final schema: numeric features followed by the 'purpose' indicators.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   int.rate            9578 non-null   float64
 1   installment         9578 non-null   float64
 2   log.annual.inc      9578 non-null   float64
 3   dti                 9578 non-null   float64
 4   fico                9578 non-null   int64  
 5   days.with.cr.line   9578 non-null   float64
 6   revol.bal           9578 non-null   int64  
 7   revol.util          9578 non-null   float64
 8   inq.last.6mths      9578 non-null   int64  
 9   delinq.2yrs         9578 non-null   int64  
 10  pub.rec             9578 non-null   int64  
 11  not.fully.paid      9578 non-null   int64  
 12  all_other           9578 non-null   uint8  
 13  credit_card         9578 non-null   uint8  
 14  debt_consolidation  9578 non-null   uint8  
 15  educational         9578 non-null   uint8  
 16  home_i

In [20]:
# (rows, columns) after replacing 'purpose' with its indicator columns.
df.shape

(9578, 19)

In [21]:
# Preview the fully encoded feature frame.
df.head()

Unnamed: 0,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid,all_other,credit_card,debt_consolidation,educational,home_improvement,major_purchase,small_business
0,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0,0,0,1,0,0,0,0
1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0,0,1,0,0,0,0,0
2,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0,0,0,1,0,0,0,0
3,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0,0,0,1,0,0,0,0
4,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0,0,1,0,0,0,0,0


In [22]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.2, random_state=101)

# Impute missing values AFTER splitting, using means computed on the training
# rows only, so no statistic derived from the test rows leaks into the
# features. With a frame that has no missing values this is a no-op.
train_means = train.mean(numeric_only=True)
train = train.fillna(train_means)
test = test.fillna(train_means)

In [23]:
# Baseline: logistic regression with default settings on the unscaled features.
# fit() returns the estimator itself, so `lr` is the fitted model.
lr = LogisticRegression().fit(train, y_train)
y_pred = lr.predict(test)
baseline_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy score baseline:', baseline_accuracy)

Accuracy score baseline: 0.8721294363256785


In [24]:
def fit_predict(train, test, y_train, y_test, scaler, max_depth,
                criterion = 'entropy', max_features = 1, min_samples_split = 4):
    """Scale the features, fit a decision tree, and print its test accuracy.

    Parameters
    ----------
    train, test : feature matrices for the training and evaluation splits.
    y_train, y_test : labels matching ``train`` and ``test``.
    scaler : object exposing ``fit_transform`` and ``transform``; it is fitted
        on ``train`` only and then applied to ``test``.
    max_depth, criterion, max_features, min_samples_split :
        forwarded unchanged to ``DecisionTreeClassifier``.

    Returns
    -------
    None. The accuracy on ``test`` is printed, not returned.
    """
    # The tree's random_state is pinned so repeated calls are comparable.
    tree_params = dict(
        criterion=criterion,
        max_depth=max_depth,
        random_state=42,
        max_features=max_features,
        min_samples_split=min_samples_split,
    )
    X_train = scaler.fit_transform(train)
    X_test = scaler.transform(test)
    model = DecisionTreeClassifier(**tree_params)
    model.fit(X_train, y_train)
    print(accuracy_score(y_test, model.predict(X_test)))

In [25]:
# Reference point: a decision tree with default hyperparameters on the
# unscaled features. fit() returns the estimator, so `dt` is the fitted tree.
dt = DecisionTreeClassifier().fit(train, y_train)
y_pred = dt.predict(test)
print(accuracy_score(y_test, y_pred))

0.9937369519832986


In [26]:
# max_depth tuning: sweep depths 1..49 with the other fit_predict defaults.
# NOTE(review): the score printed is on the held-out `test` split, so picking a
# depth from this sweep tunes on the evaluation data.
for depth in range(1, 50):
    print('Accuracy score using max_depth =', depth, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), depth)

Accuracy score using max_depth = 1: 0.808455114822547
Accuracy score using max_depth = 2: 0.808455114822547
Accuracy score using max_depth = 3: 0.8079331941544885
Accuracy score using max_depth = 4: 0.8152400835073069
Accuracy score using max_depth = 5: 0.8220250521920668
Accuracy score using max_depth = 6: 0.8376826722338204
Accuracy score using max_depth = 7: 0.8778705636743215
Accuracy score using max_depth = 8: 0.8408141962421712
Accuracy score using max_depth = 9: 0.8303757828810021
Accuracy score using max_depth = 10: 0.9018789144050104
Accuracy score using max_depth = 11: 0.8987473903966597
Accuracy score using max_depth = 12: 0.9044885177453027
Accuracy score using max_depth = 13: 0.8820459290187892
Accuracy score using max_depth = 14: 0.8810020876826722
Accuracy score using max_depth = 15: 0.8632567849686847
Accuracy score using max_depth = 16: 0.9164926931106472
Accuracy score using max_depth = 17: 0.8997912317327766
Accuracy score using max_depth = 18: 0.8815240083507306
Acc

In [27]:
# max_features tuning: sweep the fraction of features considered per split
# (0.1 .. 0.9) with max_depth held at 24.
# The progress label now names the parameter actually being swept; it
# previously said "max_depth".
for frac in np.arange(0.1,1.0,0.1):
    print('Accuracy score using max_features =', frac, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), max_depth = 24,  max_features = frac)

Accuracy score using max_depth = 0.1: 0.9279749478079332
Accuracy score using max_depth = 0.2: 0.9582463465553236
Accuracy score using max_depth = 0.30000000000000004: 0.9791231732776617
Accuracy score using max_depth = 0.4: 0.982776617954071
Accuracy score using max_depth = 0.5: 0.9754697286012526
Accuracy score using max_depth = 0.6: 0.9895615866388309
Accuracy score using max_depth = 0.7000000000000001: 0.9906054279749478
Accuracy score using max_depth = 0.8: 0.9932150313152401
Accuracy score using max_depth = 0.9: 0.9926931106471816


In [28]:
# min_samples_split tuning: sweep 2..9 with max_depth = 24 and max_features = 0.8.
# The progress label now names the parameter actually being swept; it
# previously said "max_depth".
for min_split in range(2,10):
    print('Accuracy score using min_samples_split =', min_split, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), max_depth = 24,  max_features = 0.8,
               min_samples_split = min_split)

Accuracy score using max_depth = 2: 0.9906054279749478
Accuracy score using max_depth = 3: 0.9932150313152401
Accuracy score using max_depth = 4: 0.9932150313152401
Accuracy score using max_depth = 5: 0.9906054279749478
Accuracy score using max_depth = 6: 0.9911273486430062
Accuracy score using max_depth = 7: 0.9911273486430062
Accuracy score using max_depth = 8: 0.9911273486430062
Accuracy score using max_depth = 9: 0.9921711899791231


In [29]:
# Criterion tuning: compare the two split-quality measures with the other
# hyperparameters held fixed.
# The progress label now names the parameter actually being compared; it
# previously said "max_depth".
for split_criterion in ['entropy','gini']:
    print('Accuracy score using criterion =', split_criterion, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), max_depth = 24,  max_features = 0.8,
               min_samples_split = 3, criterion = split_criterion)

Accuracy score using max_depth = entropy: 0.9932150313152401
Accuracy score using max_depth = gini: 0.9906054279749478


In [30]:
def create_poly(train,test,degree):
    """Expand both splits with polynomial features of the given degree.

    The transformer is fitted on ``train`` only and then applied to ``test``,
    so the evaluation split is never used for fitting.

    Parameters
    ----------
    train, test : feature matrices with the same columns.
    degree : polynomial degree passed to ``PolynomialFeatures``.

    Returns
    -------
    (train_poly, test_poly) : the expanded training and test matrices.
    """
    poly = PolynomialFeatures(degree=degree)
    train_poly = poly.fit_transform(train)
    # transform(), not fit_transform(): reuse the transformer fitted on train.
    test_poly = poly.transform(test)
    return train_poly, test_poly

In [31]:
# Compare the tuned decision tree across polynomial feature expansions of
# degree 1 to 4 (typo in the printed label fixed: "Polynomial").
for degree in [1,2,3,4]:
    train_poly, test_poly = create_poly(train, test, degree)
    print('Polynomial degree', degree)
    fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), max_depth = 24,  max_features = 0.8, 
                min_samples_split = 3, criterion = 'entropy')
    print(20*'-')

# The loop leaves the degree-4 expansion in train_poly/test_poly; replace it
# with the degree-2 expansion.
train_poly, test_poly = create_poly(train, test, 2)

Polyniminal degree 1
0.9895615866388309
--------------------
Polyniminal degree 2
0.9921711899791231
--------------------
Polyniminal degree 3
0.9926931106471816
--------------------
Polyniminal degree 4
0.9864300626304802
--------------------


In [33]:
# Same tuned settings with the gini criterion, evaluated first on the plain
# features and then on the degree-3 polynomial expansion.
gini_settings = dict(max_depth=24, max_features=0.8, min_samples_split=3, criterion='gini')

fit_predict(train, test, y_train, y_test, StandardScaler(), **gini_settings)

train_poly, test_poly = create_poly(train, test, 3)

fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), **gini_settings)

0.9906054279749478
0.9853862212943633


In [34]:
# Random Forest

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Random forest with library-default hyperparameters; no random_state is passed here.
rf = RandomForestClassifier()

In [37]:
# Fit the default forest on the training split (unscaled features).
rf.fit(train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [38]:
# Predict labels for the held-out rows with the default forest.
pred_rf = rf.predict(test)

In [39]:
from sklearn.metrics import accuracy_score
# Held-out accuracy of the default random forest.
print(accuracy_score(y_test,pred_rf))

0.988517745302714


In [40]:
from sklearn.model_selection import GridSearchCV

In [42]:
# Hyperparameter grid for the random-forest search: every combination of
# tree count, maximum depth and minimum leaf size listed here is evaluated.
params = dict(
    n_estimators=[200, 500, 700],
    max_depth=[10, 15, 18, 20],
    min_samples_leaf=[3, 5, 7],
)

In [43]:
# Exhaustive search over `params` using `rf` as the base estimator.
# cv is left at its default; verbose=3 logs every individual fit.
gs = GridSearchCV(rf,params,verbose=3)

In [44]:
# Run the grid search on the training split only.
# Expensive: the recorded run performed 180 fits and took about 13 minutes.
gs.fit(train,y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.981, total=   1.9s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.985, total=   1.9s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.7s remaining:    0.0s


[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.994, total=   1.9s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.990, total=   2.0s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.986, total=   2.1s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=0.982, total=   4.7s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=0.984, total=   4.8s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=0.995, total=   4.5s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=

[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.981, total=   6.6s
[CV] max_depth=15, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.986, total=   6.6s
[CV] max_depth=15, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.995, total=   6.4s
[CV] max_depth=15, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.989, total=   6.4s
[CV] max_depth=15, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=3, n_estimators=700, score=0.987, total=   6.6s
[CV] max_depth=15, min_samples_leaf=5, n_estimators=200 ..............
[CV]  max_depth=15, min_samples_leaf=5, n_estimators=200, score=0.980, total=   1.8s
[CV] max_depth=15, min_samples_leaf=5, n_estimators=200 ..............
[CV]  max_depth=15, min_samples_leaf=5, n_estimators=200, score=

[CV]  max_depth=18, min_samples_leaf=5, n_estimators=200, score=0.989, total=   2.0s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=200 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=200, score=0.984, total=   1.9s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=0.981, total=   4.6s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=0.984, total=   4.8s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=0.993, total=   4.7s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=0.988, total=   4.7s
[CV] max_depth=18, min_samples_leaf=5, n_estimators=500 ..............
[CV]  max_depth=18, min_samples_leaf=5, n_estimators=500, score=

[CV]  max_depth=20, min_samples_leaf=5, n_estimators=700, score=0.984, total=   6.9s
[CV] max_depth=20, min_samples_leaf=5, n_estimators=700 ..............
[CV]  max_depth=20, min_samples_leaf=5, n_estimators=700, score=0.993, total=   6.6s
[CV] max_depth=20, min_samples_leaf=5, n_estimators=700 ..............
[CV]  max_depth=20, min_samples_leaf=5, n_estimators=700, score=0.988, total=   6.7s
[CV] max_depth=20, min_samples_leaf=5, n_estimators=700 ..............
[CV]  max_depth=20, min_samples_leaf=5, n_estimators=700, score=0.986, total=   6.5s
[CV] max_depth=20, min_samples_leaf=7, n_estimators=200 ..............
[CV]  max_depth=20, min_samples_leaf=7, n_estimators=200, score=0.981, total=   1.8s
[CV] max_depth=20, min_samples_leaf=7, n_estimators=200 ..............
[CV]  max_depth=20, min_samples_leaf=7, n_estimators=200, score=0.982, total=   1.8s
[CV] max_depth=20, min_samples_leaf=7, n_estimators=200 ..............
[CV]  max_depth=20, min_samples_leaf=7, n_estimators=200, score=

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 13.1min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [45]:
# Hyperparameter combination selected by the grid search.
gs.best_params_

{'max_depth': 10, 'min_samples_leaf': 3, 'n_estimators': 700}

In [46]:
# The estimator configured with the selected hyperparameters.
gs.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=700,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [47]:
# Rebuild the forest with the hyperparameters reported by the grid search
# (max_depth=10, min_samples_leaf=3, n_estimators=700), plus out-of-bag scoring
# and a fixed seed.
# Only non-default arguments are passed. The previous call spelled out every
# default copied from an estimator repr, including `min_impurity_split` and
# `max_features='auto'`, which newer scikit-learn releases no longer accept.
rf1 = RandomForestClassifier(
    n_estimators=700,
    max_depth=10,
    min_samples_leaf=3,
    oob_score=True,
    random_state=101,
)

In [49]:
# Fit the tuned forest on the training split.
rf1.fit(train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=700,
                       n_jobs=None, oob_score=True, random_state=101, verbose=0,
                       warm_start=False)

In [50]:
# Predict labels for the held-out rows with the tuned forest.
pred_rf1 = rf1.predict(test)

In [51]:
from sklearn.metrics import accuracy_score
# Held-out accuracy of the tuned random forest.
print(accuracy_score(y_test,pred_rf1))

0.9879958246346555


In [52]:
# Out-of-bag score of the tuned forest; available because rf1 was built with oob_score=True.
rf1.oob_score_

0.9877316627512399

In [53]:
# Raw feature importances, in the same order as the training columns.
rf1.feature_importances_

array([4.28155533e-02, 1.02396219e-02, 1.07592456e-02, 5.34896430e-02,
       2.43839556e-01, 1.16336240e-01, 6.42242589e-02, 1.86156364e-02,
       4.26816575e-01, 1.81998102e-03, 6.74353202e-04, 6.21322805e-03,
       7.36551178e-04, 5.08152190e-04, 8.33934880e-04, 1.98098624e-04,
       9.35067964e-04, 1.12149232e-04, 8.32152909e-04])

In [54]:
# Pair each importance with its column name and rank from most to least important.
importance_pairs = zip(rf1.feature_importances_, train.columns)
sorted(importance_pairs, reverse=True)

[(0.42681657530448713, 'inq.last.6mths'),
 (0.24383955624653234, 'fico'),
 (0.11633624007866779, 'days.with.cr.line'),
 (0.0642242589125792, 'revol.bal'),
 (0.05348964301679589, 'dti'),
 (0.042815553281783526, 'int.rate'),
 (0.018615636384794666, 'revol.util'),
 (0.010759245592893561, 'log.annual.inc'),
 (0.010239621933797266, 'installment'),
 (0.0062132280489296775, 'not.fully.paid'),
 (0.0018199810204895426, 'delinq.2yrs'),
 (0.0009350679638081287, 'home_improvement'),
 (0.0008339348798587869, 'debt_consolidation'),
 (0.0008321529087110512, 'small_business'),
 (0.0007365511778078435, 'all_other'),
 (0.0006743532024639461, 'pub.rec'),
 (0.0005081521898995906, 'credit_card'),
 (0.00019809862381698829, 'educational'),
 (0.00011214923188311706, 'major_purchase')]

In [56]:
# Summary of the held-out accuracies recorded in the cells above.
# NOTE(review): these numbers are hard-coded from a previous run and will not
# update if the models are re-trained; re-check them after any change.
# Fixes the "Ramdom Forest" typo in the printed labels.
print('Logistic Regression --> Accuracy score baseline: 0.8721294363256785') 
print('Decision Tree --> Accuracy score baseline: 0.9937369519832986') 
print('Decision Tree with entropy tuning (max depth,max features,Min samples split)--> Accuracy score baseline: 0.9932150313152401') 
print('Decision Tree with gini tuning (max depth,max features,Min samples split)--> Accuracy score baseline: 0.9906054279749478') 
print('Random Forest --> Accuracy score baseline: 0.988517745302714') 
print('Random Forest Grid Search CV --> Accuracy score baseline: 0.9879958246346555')

Logistic Regression --> Accuracy score baseline: 0.8721294363256785
Decision Tree --> Accuracy score baseline: 0.9937369519832986
Decision Tree with entropy tuning (max depth,max features,Min samples split)--> Accuracy score baseline: 0.9932150313152401
Decision Tree with gini tuning (max depth,max features,Min samples split)--> Accuracy score baseline: 0.9906054279749478
Ramdom Forest --> Accuracy score baseline: 0.988517745302714
Ramdom Forest Grid Search CV --> Accuracy score baseline: 0.9879958246346555


# Logistic Regression --> Accuracy score baseline: 0.8721294363256785

# Decision Tree --> Accuracy score baseline: 0.9937369519832986

# Decision Tree with entropy tuning (max depth,max features,Min samples split)--> Accuracy score baseline: 0.9932150313152401

# Decision Tree with gini tuning (max depth,max features,Min samples split)--> Accuracy score baseline: 0.9906054279749478

# Random Forest --> Accuracy score baseline: 0.988517745302714

# Random Forest Grid Search CV --> Accuracy score baseline: 0.9879958246346555