In [248]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

import pickle

In [249]:
data=pd.read_csv('loan_prediction.csv')

In [250]:
data = data.drop(['Loan_ID'], axis = 1)

In [251]:
data['Gender'].fillna(data['Gender'].mode()[0],inplace=True)
data['Married'].fillna(data['Married'].mode()[0],inplace=True)
data['Dependents'].fillna(data['Dependents'].mode()[0],inplace=True)
data['Self_Employed'].fillna(data['Self_Employed'].mode()[0],inplace=True)
data['Credit_History'].fillna(data['Credit_History'].mode()[0],inplace=True)
data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mode()[0],inplace=True)

data['LoanAmount'].fillna(data['LoanAmount'].mean(),inplace=True)

In [252]:
data = pd.get_dummies(data)

# Drop columns
data = data.drop(['Gender_Female', 'Married_No', 'Education_Not Graduate', 
              'Self_Employed_No', 'Loan_Status_N'], axis = 1)

# Rename columns name
new = {'Gender_Male': 'Gender', 'Married_Yes': 'Married', 
       'Education_Graduate': 'Education', 'Self_Employed_Yes': 'Self_Employed',
       'Loan_Status_Y': 'Loan_Status'}
       
data.rename(columns=new, inplace=True)

In [253]:
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1

data = data[~((data < (Q1 - 1.5 * IQR)) |(data > (Q3 + 1.5 * IQR))).any(axis=1)]

In [254]:
data[['ApplicantIncome']] = np.sqrt(data[['ApplicantIncome']])
data[['CoapplicantIncome']] = np.sqrt(data[['CoapplicantIncome']])
data[['LoanAmount']] = np.sqrt(data[['LoanAmount']])

In [255]:
X = data.drop(["Loan_Status"], axis=1)
y = data["Loan_Status"]

In [256]:
X

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender,Married,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education,Self_Employed,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,76.478755,0.000000,12.100089,360.0,1.0,1,0,1,0,0,0,1,0,0,0,1
4,77.459667,0.000000,11.874342,360.0,1.0,1,0,1,0,0,0,1,0,0,0,1
13,43.046487,53.291650,10.677078,360.0,1.0,1,0,1,0,0,0,1,0,1,0,0
15,70.356236,0.000000,11.180340,360.0,1.0,1,0,1,0,0,0,1,0,0,0,1
19,50.990195,59.160798,10.723805,360.0,1.0,1,1,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,56.409219,54.009258,12.688578,360.0,1.0,1,0,1,0,0,0,1,0,0,0,1
586,47.927028,39.012818,10.198039,360.0,1.0,1,1,1,0,0,0,1,0,0,0,1
588,68.920244,0.000000,9.695360,360.0,1.0,1,0,1,0,0,0,1,0,0,1,0
603,60.630026,65.582010,13.114877,360.0,1.0,1,0,1,0,0,0,1,0,1,0,0


In [257]:
X, y = SMOTE().fit_resample(X, y)

In [258]:
X = MinMaxScaler().fit_transform(X)

In [259]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [260]:
dt_model = pickle.load(open('dt_model.pkl', 'rb'))
knn_model = pickle.load(open('knn_model.pkl', 'rb'))
rf_model = pickle.load(open('rf_model.pkl', 'rb'))
xg_model = pickle.load(open('xg_model.pkl', 'rb'))

In [261]:
dt_model.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=3, min_samples_leaf=35)

In [262]:
dt_acc = dt_model.score(X_test,y_test)
knn_acc = knn_model.score(X_test, y_test)
rf_acc = rf_model.score(X_test, y_test)
xg_acc = xg_model.score(X_test, y_test)

In [263]:
estimators=[('knn', knn_model), ('rf', rf_model), ('dt', dt_model), ('xg', xg_model)]

ensemble = VotingClassifier(estimators, voting='hard')

In [264]:
ensemble.fit(X_train, y_train)

VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=3)),
                             ('rf',
                              RandomForestClassifier(max_leaf_nodes=20,
                                                     n_estimators=1000,
                                                     random_state=1)),
                             ('dt',
                              DecisionTreeClassifier(max_depth=3,
                                                     min_samples_leaf=35)),
                             ('xg', XGBClassifier(max_depth=6, missing=nan))])

In [265]:
ens_acc = ensemble.score(X_test, y_test)

In [266]:
comp_acc = pd.DataFrame({'Model':['Decision Tree','K-Nearest Neighbours','Random Forest','XGBoost','Ensemble'],'Accuracy':[dt_acc,knn_acc,rf_acc,xg_acc,ens_acc]})
print(comp_acc)

                  Model  Accuracy
0         Decision Tree  0.822222
1  K-Nearest Neighbours  0.844444
2         Random Forest  0.888889
3               XGBoost  0.822222
4              Ensemble  0.911111


In [267]:
pickle.dump(ensemble,open("final_ens_model.pkl","wb"))

In [268]:
dt_model = pickle.load(open('final_ens_model.pkl', 'rb'))
dt_acc = dt_model.score(X_test,y_test)
dt_acc

0.9111111111111111