## Best Model Selection and Hyperparameter Tuning

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split 
from sklearn.metrics import plot_confusion_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from math import sqrt
import re
import os
import string
import nltk 
from nltk.corpus import stopwords
import category_encoders as cat_encoder

In [2]:
df = pd.read_csv('Loan_Train.csv')

In [3]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df.shape

(614, 13)

In [5]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [6]:
# Drop the column Loan_ID
df = df.drop(['Loan_ID'], axis = 1)

In [7]:
# Drop any rows with missing data
df = df.dropna()

In [8]:
# Confirm column drop
df.shape

(480, 12)

In [9]:
df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [10]:
df.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [11]:
df = pd.get_dummies(df, columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status'], drop_first = True)

In [12]:
df.shape

(480, 15)

In [13]:
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
5,5417,4196.0,267.0,360.0,1.0,1,1,0,1,0,0,1,0,1,1


In [14]:
# Split the data into training and test set.

In [46]:
X = df.drop(['Loan_Status_Y'], axis = 1)
y = df['Loan_Status_Y']

In [49]:
train = df.copy()
test = df.copy()

In [50]:
X = pd.get_dummies(X)
train=pd.get_dummies(train)
test=pd.get_dummies(test)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

In [52]:
print(X_train.shape)
print(X_test.shape)

(384, 14)
(96, 14)


In [18]:
print(y_train.value_counts())
print(y_test.value_counts())

1    263
0    121
Name: Loan_Status_Y, dtype: int64
1    69
0    27
Name: Loan_Status_Y, dtype: int64


In [19]:
standardizer = StandardScaler()

In [20]:
features_standardized = standardizer.fit_transform(X)

In [21]:
# Fit KNN Classifier
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

In [22]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_jobs=-1)

In [23]:
# Create a pipeline 
pipe = Pipeline([("standardizer", standardizer), ("knn", knn)])

In [24]:
# Create n_neighbors
search_space = [{"knn__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]

In [25]:
# Fit a grid search with your pipeline, search space, and 
# 5-fold cross-validation to find the best value for the “n_neighbors” parameter.
classifier = GridSearchCV(
pipe, search_space, cv=5, verbose=0).fit(features_standardized, y)

In [26]:
classifier

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardizer', StandardScaler()),
                                       ('knn',
                                        KNeighborsClassifier(n_jobs=-1))]),
             param_grid=[{'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}])

In [28]:
train_preds = knn.predict(X_train)
mse = mean_squared_error(y_train, train_preds)
rmse = sqrt(mse)
rmse

0.4973890160963884

In [29]:
train_preds

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,

In [None]:
score = accuracy_score(train_preds, X_train)

In [40]:
model = LogisticRegression(solver='lbfgs', max_iter=1000)

In [41]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [69]:
pred_cv = model.predict(X_test)
print('Accuracy = ', accuracy_score(y_test,pred_cv))
print('F1-Score = ', f1_score(y_test,pred_cv))

Accuracy =  0.8229166666666666
F1-Score =  0.8917197452229301


In [63]:
from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier(random_state=42,n_estimators=150,max_depth=4)
randomforest.fit(X_train,y_train)
print(rf)

RandomForestClassifier(max_depth=4, n_estimators=150, random_state=42)


In [70]:
y_pred_random_for = randomforest.predict(X_test)
f1 = f1_score(y_test,y_pred_random_for)
print('Accuracy = ', accuracy_score(y_test,y_pred_random_for))
print('F1-Score = ', f1)

Accuracy =  0.8125
F1-Score =  0.8860759493670887


When comparying both accuracy scores and F1 scores both are very similar. However, the logistic regression is slight better with an 82.29% accuracy versus randomforest with 81.25% accuracy. The F1-score is slight better as well too. The conclusion would be that the best model for predicting loan approval rate would be logistic regression. 