In [20]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import GridSearchCV

import pickle

In [2]:
df = pd.read_csv("loan_prediction.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
print(df.shape)

(614, 13)


In [4]:
df = df.drop(['Loan_ID'], axis = 1)

In [5]:
df['Gender'].fillna(df['Gender'].mode()[0],inplace=True)
df['Married'].fillna(df['Married'].mode()[0],inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0],inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0],inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0],inplace=True)
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0],inplace=True)

In [6]:
df['LoanAmount'].fillna(df['LoanAmount'].mean(),inplace=True)

In [7]:
df = pd.get_dummies(df)

# Drop columns
df = df.drop(['Gender_Female', 'Married_No', 'Education_Not Graduate', 
              'Self_Employed_No', 'Loan_Status_N'], axis = 1)

# Rename columns name
new = {'Gender_Male': 'Gender', 'Married_Yes': 'Married', 
       'Education_Graduate': 'Education', 'Self_Employed_Yes': 'Self_Employed',
       'Loan_Status_Y': 'Loan_Status'}
       
df.rename(columns=new, inplace=True)

In [8]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

In [10]:
X = df.drop(["Loan_Status"], axis=1)
y = df["Loan_Status"]

In [11]:
X = MinMaxScaler().fit_transform(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [13]:
knn = KNeighborsClassifier()
params_knn = {'n_neighbors': np.arange(1, 25)}
knn_gs = GridSearchCV(knn, params_knn, cv=5)
knn_gs.fit(X_train, y_train)
knn_best = knn_gs.best_estimator_

In [14]:
dt = DecisionTreeClassifier()
params_dt = {'max_leaf_nodes': np.arange(2,20)}
dt_gs = GridSearchCV(dt, params_dt, cv=5)
dt_gs.fit(X_train, y_train)
dt_best = dt_gs.best_estimator_

In [15]:
rf = RandomForestClassifier()
params_rf = {'n_estimators': [50, 100, 200]}
rf_gs = GridSearchCV(rf, params_rf, cv=5)
rf_gs.fit(X_train, y_train)
rf_best = rf_gs.best_estimator_

In [16]:
xg = XGBClassifier()
params_xg = {'max_depth': np.arange(2,10)}
xg_gs = GridSearchCV(xg, params_xg, cv=5)
xg_gs.fit(X_train, y_train)
xg_best = xg_gs.best_estimator_

In [17]:
estimators=[('knn', knn_best), ('rf', rf_best), ('dt', dt_best), ('xg', xg_best)]

ensemble = VotingClassifier(estimators, voting='hard')
ensemble.fit(X_train, y_train)

VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=11)),
                             ('rf', RandomForestClassifier(n_estimators=50)),
                             ('dt', DecisionTreeClassifier(max_leaf_nodes=5)),
                             ('xg', XGBClassifier(max_depth=2))])

In [18]:
ensemble.score(X_test, y_test)

0.8529411764705882

In [21]:
filename="model.pkl"
pickle.dump(ensemble,open(filename,'wb'))

In [23]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.8529411764705882


In [None]:
from google.colab import drive
drive.mount('/content/drive')