In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# pandas and scikit-learn deprecation notices; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load 'loan_prediction.csv' (relative to the working directory) into `df0` and display it.

df0=pd.read_csv('loan_prediction.csv')
df0

In [None]:
# (rows, columns) of the raw frame.
df0.shape

In [None]:
# Summary statistics of df0.
df0.describe()

In [None]:
# Column dtypes and non-null counts of df0.
df0.info()

# From the above statistics
1. Most columns have dtype `object`, so they need to be converted to numeric values with a label encoder.
2. `Loan_ID` is only an identifier, so it can be removed from the dataset to improve modelling.
3. The `Dependents` column contains the value `3+`, which needs to be replaced.

In [None]:
# Loan_ID is only a row identifier, so drop it before modelling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: executing it a second time no longer raises KeyError.

df0 = df0.drop(columns=['Loan_ID'], errors='ignore')
df0.dtypes

In [None]:
# The Dependents column contains the literal string "3+"; map it to "3".

df0['Dependents'] = df0['Dependents'].replace({"3+": "3"})

In [None]:
# Missing-value count per column, largest first.
df0.isna().sum().sort_values(ascending=False)

In [None]:
# Fill missing values.
# The per-column fill values are collected in one dict and applied with a single
# DataFrame.fillna call. The previous form, df0[col].fillna(..., inplace=True), is
# chained in-place assignment: pandas deprecates it and, with copy-on-write enabled,
# it leaves df0 unchanged.
# NOTE(review): Credit_History looks like a 0/1 flag; filling with the mean produces a
# fractional value - confirm whether the mode would be more appropriate.

fill_values = {
    'Credit_History': df0['Credit_History'].mean(),
    'Self_Employed': 'No',
    'Dependents': '2',
    'LoanAmount': df0['LoanAmount'].mean(),
    'Loan_Amount_Term': df0['Loan_Amount_Term'].mean(),
    'Gender': 'Male',
    'Married': 'Yes',
}
df0 = df0.fillna(value=fill_values)

In [None]:
# Re-check the missing-value count per column after filling, largest first.
df0.isna().sum().sort_values(ascending=False)

In [None]:
# Encoding columns as part of transformation.

from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()

# Integer-encode each listed column in turn. The single encoder is re-fitted for every
# column, so once the loop finishes `le` holds the classes of the last one, 'Loan_Status'.
columns_to_encode = [
    'Dependents',
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Loan_Status',
]
for column_name in columns_to_encode:
    df0[column_name] = le.fit_transform(df0[column_name])

In [None]:
# Skewness of each column of df0 after encoding.
df0.skew()

In [None]:
# Absolute z-score of every value in df0; the following cells use `z` to locate and
# remove outlier rows.

from scipy.stats import zscore
z= np.abs(zscore(df0))
z

In [None]:
# Print the (row, column) positions whose absolute z-score exceeds the threshold
# of 3 standard deviations.

threshold = 3
print(np.where(z > threshold))

In [None]:
# Keep only the rows in which every column has an absolute z-score below 3.
# .copy() makes `data` an independent frame, so the later drops and column assignments
# on `data` do not operate on a slice of df0.
data = df0[(z < 3).all(axis=1)].copy()
data

In [None]:
# Bar plot of Self_Employed against Loan_Status on the outlier-filtered data.

plt.bar(data['Loan_Status'], data['Self_Employed'], color='g')

plt.ylabel('Self')
plt.xlabel('Loan_Status')
plt.show()

In [None]:
# Annotated correlation heatmap over all columns of df0.
# NOTE(review): this plots df0 (before outlier removal) rather than `data` - confirm
# that is intended.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df0.corr(), annot=True, fmt=".2f", ax=ax)
fig.suptitle("Correlation Map", fontsize=18)
plt.show()

# Correlation Status

1. Based on the correlation map, Loan_Status is noticeably correlated only with Credit_History.
2. LoanAmount is also correlated with ApplicantIncome.
3. Self_Employed, Dependents and ApplicantIncome show little correlation with Loan_Status, so they can be dropped.

In [None]:
# Drop the features judged to be weakly correlated with Loan_Status.
# Reassigning with errors='ignore' keeps the cell re-runnable (no KeyError on a second
# execution) and avoids an in-place mutation of a frame derived from df0.
data = data.drop(columns=['Self_Employed', 'Dependents', 'ApplicantIncome'], errors='ignore')

In [None]:
# Skewness of the remaining columns of `data`, before the power transform below.
data.skew()

In [None]:
# Reduce skewness with scipy's boxcox1p, which applies the Box-Cox power transform
# to 1 + x and therefore accepts zeros in the input.
# NOTE: the columns are overwritten, so re-running this cell applies the transform again.

from scipy.special import boxcox1p
# lambda = 0   -> log(1 + x)
# lambda = 0.5 -> square-root-style transform

power_lambda = 0.5
for skewed_column in ('Education', 'CoapplicantIncome', 'LoanAmount'):
    data[skewed_column] = boxcox1p(data[skewed_column], power_lambda)

In [None]:
# Skewness of the columns of `data` after the boxcox1p transform.
data.skew()

In [None]:
# Histogram of the Credit_History column.
data.hist(column='Credit_History')

In [None]:
# Contingency table of Credit_History against Loan_Status, with a 'Total' margin row and column.
pd.crosstab(data['Credit_History'],data['Loan_Status'],margins=True,margins_name='Total')

In [None]:
# Feature matrix x: every column of `data` except the prediction target, Loan_Status.
x=data.drop(['Loan_Status'],axis=1)
x

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Standardize the features; `x` is replaced by the scaler's output.
# NOTE(review): the scaler is fitted on all rows before the train/test split performed
# in later cells, so test-set statistics influence the training features - consider
# fitting the scaler on the training split only (e.g. inside a Pipeline).
scale = StandardScaler()
#scale = MinMaxScaler()
x=scale.fit_transform(x)

In [None]:
# Target y: the encoded Loan_Status column of `data`.

y=data['Loan_Status']
y

In [None]:
# Try train/test splits with random_state 42..100, fit a logistic regression on each,
# and remember the first seed that reaches the highest test accuracy.
# NOTE(review): picking the split seed by its test accuracy makes the reported score
# optimistic; cross-validation gives a less biased estimate.
max_acc_score = 0
final_rstate = None  # initialised so the final print cannot hit an unbound name
for r_state in range(42, 101):
    train_x, test_x, train_y, test_y = train_test_split(
        x, y, test_size=.25, random_state=r_state
    )
    lg = LogisticRegression()
    lg.fit(train_x, train_y)
    pred = lg.predict(test_x)
    accuracyScore = accuracy_score(test_y, pred)
    # Strict '>' keeps the earliest seed when several splits tie for the best accuracy.
    if accuracyScore > max_acc_score:
        max_acc_score = accuracyScore
        final_rstate = r_state

print("\n\n")
print("Max_accuracy_Score corresponding to final_r_state: ",final_rstate," is ",max_acc_score)

In [None]:
# Create the final train/test split with the random state selected by the seed search
# (`final_rstate`), instead of a hard-coded number that can drift from the search result
# (the previous comment said 81 while the code used 68).

train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=.25, random_state=final_rstate)

In [None]:
# Shape of the training features.
train_x.shape

In [None]:
# Shape of the training target.
train_y.shape

In [None]:
# Shape of the test features.
test_x.shape

In [None]:
# Shape of the test target.
test_y.shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Baseline estimators with hand-picked hyper-parameters.
KNN = KNeighborsClassifier(n_neighbors=13)
SV = SVC(C=1, kernel="linear")
LR = LogisticRegression()
DT = DecisionTreeClassifier(max_depth=4, criterion='entropy')
GNB = GaussianNB()

# The ensemble estimators receive an explicit random_state.
RFC = RandomForestClassifier(random_state=100, n_estimators=100)
ADC = AdaBoostClassifier(random_state=10, n_estimators=100)
GBC = GradientBoostingClassifier(random_state=10, n_estimators=100)

In [None]:
# (display name, estimator) pairs, in evaluation order.
models = [
    ('KNeighborsClassifier', KNN),
    ('SVC', SV),
    ('LogisticRegression', LR),
    ('DecisionTreeClassifier', DT),
    ('GaussianNB', GNB),
    ('RandomForestClassifier', RFC),
    ('AdaBoostClassifier', ADC),
    ('GradientBoostingClassifier', GBC),
]

In [None]:
# Fit each candidate on the training split, then record its hold-out accuracy and its
# mean 10-fold cross-validated accuracy over the full x / y (both stored as percentages).
Model, score, cvs, rocscore = [], [], [], []

for name, model in models:
    print("--------------", name, "--------------")
    Model.append(name)

    model.fit(train_x, train_y)
    print(model)

    # Hold-out accuracy on the test split.
    pre = model.predict(test_x)
    AS = accuracy_score(test_y, pre)
    print("Accuracy Score: ", AS)
    score.append(AS*100)

    # Mean accuracy across the 10 cross-validation folds.
    sc = cross_val_score(model, x, y, scoring='accuracy', cv=10).mean()
    print("Cross_Val_Score: ", sc)
    cvs.append(sc*100)

    cm = confusion_matrix(test_y, pre)
    print(cm)
    print("\n")

In [None]:
# One row per model: hold-out accuracy and cross-validated accuracy, both in percent.
summary_columns = {"Model": Model, "Score": score, "Cross Val Score": cvs}
result = pd.DataFrame(summary_columns)
result

# Selecting best parameters for the models using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter grids.
params_svc = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
params_knn = {'n_neighbors': np.arange(5, 20)}
params_dtc = {'criterion': ('gini', 'entropy'), 'max_depth': (4, 6, 8, 12)}
# Grid shared by the AdaBoost and GradientBoosting searches below.
# NOTE(review): random_state is searched as if it were a hyper-parameter - confirm intent.
params = {'n_estimators': [100, 500], 'random_state': [10, 100]}


def _fit_and_report(search, label):
    """Fit `search` on the training split, print `label` with its best parameters, and return it."""
    search.fit(train_x, train_y)
    print(label, search.best_params_)
    return search


svc = _fit_and_report(
    GridSearchCV(SVC(), params_svc),
    "Best parameters for Support Vector Classification:",
)

knn = _fit_and_report(
    GridSearchCV(KNeighborsClassifier(), param_grid=params_knn),
    "Best parameters for KNeighborsClassifier",
)

dtc = _fit_and_report(
    GridSearchCV(DecisionTreeClassifier(), param_grid=params_dtc, cv=10, scoring='accuracy'),
    "Best parameters for DecisionTreeClassifier",
)

ada = _fit_and_report(
    GridSearchCV(AdaBoostClassifier(), param_grid=params, scoring='accuracy'),
    "Best parameters for AdaBoostClassifier: ",
)

gbc = _fit_and_report(
    GridSearchCV(GradientBoostingClassifier(), param_grid=params, scoring='accuracy'),
    "Best parameters for GradientBoostingClassifier: ",
)


# Setting the best possible parameters for the models

In [None]:
# Estimators re-created for the second evaluation round. The parameter values are
# hard-coded here rather than read from the grid-search objects' best_params_.
KNN = KNeighborsClassifier(n_neighbors=15)
SV = SVC(C=1, kernel="linear")
LR = LogisticRegression()
DT = DecisionTreeClassifier(max_depth=4, criterion='entropy')
GNB = GaussianNB()
ADC = AdaBoostClassifier(random_state=10, n_estimators=100)
GBC = GradientBoostingClassifier(random_state=100, n_estimators=100)

In [None]:
# (display name, estimator) pairs for the second evaluation round, in evaluation order.
models = [
    ('KNeighborsClassifier', KNN),
    ('SVC', SV),
    ('LogisticRegression', LR),
    ('DecisionTreeClassifier', DT),
    ('GaussianNB', GNB),
    ('AdaBoostClassifier', ADC),
    ('GradientBoostingClassifier', GBC),
]

In [None]:
# Second evaluation round: fit each estimator on the training split, then record its
# hold-out accuracy and mean 10-fold cross-validated accuracy (both stored as percentages).
# After the loop, `pre` holds the test predictions of the last model in `models`.
Model, score, cvs, rocscore = [], [], [], []

for name, model in models:
    print("--------------", name, "--------------")
    Model.append(name)

    model.fit(train_x, train_y)
    print(model)

    # Hold-out accuracy on the test split.
    pre = model.predict(test_x)
    AS = accuracy_score(test_y, pre)
    print("Accuracy Score: ", AS)
    score.append(AS*100)

    # Mean accuracy across the 10 cross-validation folds.
    sc = cross_val_score(model, x, y, scoring='accuracy', cv=10).mean()
    print("Cross_Val_Score: ", sc)
    cvs.append(sc*100)

    cm = confusion_matrix(test_y, pre)
    print(cm)
    print("\n")

In [None]:
# One row per model from the second round: hold-out and cross-validated accuracy, in percent.
summary_columns = {"Model": Model, "Score": score, "Cross Val Score": cvs}
result = pd.DataFrame(summary_columns)
result

# Logistic Regression is the best fit model here with 88% accuracy.

In [None]:
# Save the test-set predictions of the selected model (Logistic Regression) to a file.
# `LR` was fitted on train_x in the evaluation loop. Predicting explicitly avoids relying
# on `pre`, which after that loop holds the predictions of the last model evaluated.
# The frame written to disk is now the predictions; previously `data` (the preprocessed
# feature table) was written while `predictData` was only displayed.

predictData = pd.DataFrame(LR.predict(test_x))
predictData.to_csv('Loan_Predict.csv')
predictData

In [None]:
# Persist the fitted Logistic Regression model to 'LR_Loan.pkl'.
# sklearn.externals.joblib was removed from scikit-learn (0.23 and later), so import the
# standalone joblib package and fall back to the old location only on old installations.

try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(LR,"LR_Loan.pkl")