In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore')

In [2]:
# Read the loan CSV (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='cleaned loan data.csv')

In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index saved with the CSV — confirm).
# The result is rebound instead of dropped in place, and errors='ignore' makes a
# second execution of this cell a no-op rather than a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Target vector: the 'Loan_Status' column.
y = df.loc[:, 'Loan_Status']
# Feature matrix: every remaining column.
X = df.drop(columns=['Loan_Status'])

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test split; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=70,
)

In [6]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from xgboost import XGBClassifier

# Random Forest

In [7]:
# Tune the number of trees with a 5-fold, accuracy-scored grid search on the training split.
# The seed is an explicit integer: a boolean here would be coerced to 1 and reads
# like an on/off switch rather than a seed.
estimator=RandomForestClassifier(random_state=1)
param_grid={'n_estimators':list(range(1,50))}  # candidates 1..49

rf_grid=GridSearchCV(estimator,param_grid,scoring='accuracy',cv=5)
rf_grid.fit(X_train,y_train)

# Estimator refit by the search with the best-scoring parameters; shown as cell output.
rf=rf_grid.best_estimator_
rf

In [8]:
# Per-feature importances of the tuned random forest, labelled with the column names.
# Named feats_rf (not feats_ab) so the frame matches the model it describes.
feats_rf=pd.DataFrame(data=rf.feature_importances_,
                     index=X.columns,
                     columns=['Importance'])
# Keep only the columns whose importance is strictly positive.
important_features_rf=feats_rf[feats_rf['Importance']>0].index.tolist()
important_features_rf

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Income']

In [9]:
# Restrict both splits to the features the tuned forest gave non-zero importance.
X_train_rf=X_train[important_features_rf]
X_test_rf=X_test[important_features_rf]

# Fit an unfitted copy carrying the tuned hyperparameters. Cloning leaves
# rf_grid.best_estimator_ (fitted on every column) untouched, so the grid-search
# object keeps working on the full feature set.
rf=clone(rf_grid.best_estimator_)
rf.fit(X_train_rf,y_train)

ypred_train=rf.predict(X_train_rf)
ypred_test=rf.predict(X_test_rf)

# Report accuracy on the training split, the mean 5-fold CV accuracy on the
# training split, and accuracy on the held-out test split.
print('Train accuracy:',accuracy_score(y_train,ypred_train))
print('CV score:',cross_val_score(rf,X_train_rf,y_train,cv=5,scoring='accuracy').mean())
print('Test accuracy:',accuracy_score(y_test,ypred_test))

Train accuracy: 0.9952718676122931
CV score: 0.8203641456582634
Test accuracy: 0.7735849056603774


# AdaBoost

In [10]:
# Tune the number of boosting rounds with a 5-fold, accuracy-scored grid search.
# The seed is an explicit integer: a boolean here would be coerced to 1 and reads
# like an on/off switch rather than a seed.
estimator=AdaBoostClassifier(random_state=1)
param_grid={'n_estimators':list(range(1,51))}  # candidates 1..50
ab_grid=GridSearchCV(estimator,param_grid,scoring='accuracy',cv=5)
ab_grid.fit(X_train,y_train)

# Estimator refit by the search with the best-scoring parameters; shown as cell output.
ab=ab_grid.best_estimator_
ab

In [11]:
# One-column frame of AdaBoost feature importances, indexed by column name.
feats_ab = pd.DataFrame({'Importance': ab.feature_importances_}, index=X.columns)
# Names of the columns whose importance is strictly positive.
important_features_ab = feats_ab.loc[feats_ab['Importance'].gt(0)].index.tolist()
important_features_ab

['Credit_History', 'Property_Area', 'Income']

In [16]:
# Restrict both splits to the features AdaBoost gave non-zero importance.
X_train_ab=X_train[important_features_ab]
X_test_ab=X_test[important_features_ab]

# Fit an unfitted copy carrying the tuned hyperparameters. Cloning leaves
# ab_grid.best_estimator_ (fitted on every column) untouched, so the grid-search
# object keeps working on the full feature set.
ab=clone(ab_grid.best_estimator_)
ab.fit(X_train_ab,y_train)

ypred_train=ab.predict(X_train_ab)
ypred_test=ab.predict(X_test_ab)

# Report accuracy on the training split, the mean 5-fold CV accuracy on the
# training split, and accuracy on the held-out test split.
print('Train accuracy:',accuracy_score(y_train,ypred_train))
print('CV score:',cross_val_score(ab,X_train_ab,y_train,cv=5,scoring='accuracy').mean())
print('Test accuracy:',accuracy_score(y_test,ypred_test))

Train accuracy: 0.83451536643026
CV score: 0.8298879551820729
Test accuracy: 0.7641509433962265


# Gradient Boosting

In [14]:
# Tune the number of boosting stages and the learning rate with a 5-fold,
# accuracy-scored grid search.
# The seed is an explicit integer: a boolean here would be coerced to 1 and reads
# like an on/off switch rather than a seed.
estimator=GradientBoostingClassifier(random_state=1)
param_grid={'n_estimators':list(range(1,10)),  # candidates 1..9
           'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]}

gb_grid=GridSearchCV(estimator,param_grid,cv=5,scoring='accuracy')
gb_grid.fit(X_train,y_train)

# Estimator refit by the search with the best-scoring parameters; shown as cell output.
gb=gb_grid.best_estimator_
gb

In [15]:
# Gradient-boosting feature importances as a single 'Importance' column keyed by feature name.
feats_gb = pd.DataFrame(
    {'Importance': gb.feature_importances_},
    index=X.columns,
)
# Retain the features with a strictly positive importance.
important_features_gb = feats_gb.loc[feats_gb['Importance'] > 0].index.tolist()
important_features_gb

['Gender',
 'Married',
 'Self_Employed',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Income']

In [17]:
# Restrict both splits to the features gradient boosting gave non-zero importance.
X_train_gb=X_train[important_features_gb]
X_test_gb=X_test[important_features_gb]

# Fit an unfitted copy carrying the tuned hyperparameters. Cloning leaves
# gb_grid.best_estimator_ (fitted on every column) untouched, so the grid-search
# object keeps working on the full feature set.
gb=clone(gb_grid.best_estimator_)
gb.fit(X_train_gb,y_train)

ypred_train=gb.predict(X_train_gb)
ypred_test=gb.predict(X_test_gb)

# Report accuracy on the training split, the mean 5-fold CV accuracy on the
# training split, and accuracy on the held-out test split.
print('Train accuracy:',accuracy_score(y_train,ypred_train))
print('CV score:',cross_val_score(gb,X_train_gb,y_train,cv=5,scoring='accuracy').mean())
print('Test accuracy:',accuracy_score(y_test,ypred_test))

Train accuracy: 0.8747044917257684
CV score: 0.8228011204481793
Test accuracy: 0.7358490566037735


# XGBoost

In [20]:
# Grid-search tree count, maximum depth and gamma for XGBoost using 5-fold CV accuracy.
param_grid = {
    'n_estimators': [10, 20, 40, 100],
    'max_depth': [3, 4, 5],
    'gamma': [0, 0.15, 0.3, 0.5, 1],
}
estimator = XGBClassifier()

xgb_grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
xgb_grid.fit(X_train, y_train)

# Best-scoring estimator from the search; displayed as the cell output.
xgb = xgb_grid.best_estimator_
xgb

In [21]:
# XGBoost feature importances in a one-column frame labelled by feature name.
feats_xgb = pd.DataFrame({'Importance': xgb.feature_importances_}, index=X.columns)
# Select the features whose importance is strictly greater than zero.
important_features_xgb = (
    feats_xgb
    .loc[lambda frame: frame['Importance'] > 0]
    .index
    .tolist()
)
important_features_xgb

['Dependents',
 'Self_Employed',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Income']

In [22]:
# Restrict both splits to the features XGBoost gave non-zero importance.
X_train_xgb=X_train[important_features_xgb]
X_test_xgb=X_test[important_features_xgb]

# Fit an unfitted copy carrying the tuned hyperparameters. Cloning leaves
# xgb_grid.best_estimator_ (fitted on every column) untouched, so the grid-search
# object keeps working on the full feature set.
xgb=clone(xgb_grid.best_estimator_)
xgb.fit(X_train_xgb,y_train)

ypred_train=xgb.predict(X_train_xgb)
ypred_test=xgb.predict(X_test_xgb)

# Report accuracy on the training split, the mean 5-fold CV accuracy on the
# training split, and accuracy on the held-out test split.
print('Train accuracy:',accuracy_score(y_train,ypred_train))
print('CV score:',cross_val_score(xgb,X_train_xgb,y_train,cv=5,scoring='accuracy').mean())
print('Test accuracy:',accuracy_score(y_test,ypred_test))

Train accuracy: 0.8321513002364066
CV score: 0.82515406162465
Test accuracy: 0.7830188679245284
