In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the loan-prediction CSV directly from its public GitHub location.
DATA_URL = 'https://raw.githubusercontent.com/dsrscientist/DSData/master/loan_prediction.csv'
loan_dataset = pd.read_csv(DATA_URL)
# Bare last expression so the notebook renders the frame as a table.
loan_dataset

In [None]:
# (rows, columns) of the frame as loaded.
loan_dataset.shape

In [None]:
# Column names, dtypes and non-null counts (info() prints its report itself).
loan_dataset.info()

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
loan_dataset.isna().sum()

In [None]:
# Summary statistics of the frame before any imputation.
loan_dataset.describe()

In [None]:
# Print a frequency table for each listed column, with a ruler line between tables.
category = [
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
    'Loan_Amount_Term', 'Property_Area', 'Credit_History', 'Loan_Status',
]
separator = '=' * 100
for i in category:
    # One print call with newline separators emits the same three lines as before.
    print(i, loan_dataset[i].value_counts(), separator, sep='\n')

In [None]:
# Count plot for each listed column, laid out on a 3x3 grid.
sns.set_palette('gist_rainbow_r')
category=['Gender','Married','Dependents','Education','Self_Employed',
          'Loan_Amount_Term','Property_Area','Credit_History','Loan_Status']
fig, axes = plt.subplots(3, 3, figsize=(20, 20))
# zip() stops at the shorter sequence, so at most 9 panels are drawn.
for ax, column in zip(axes.flat, category):
    # Pass the column as x= : seaborn >= 0.12 treats a positional argument as
    # `data`, which no longer means "count the values of this Series".
    sns.countplot(x=loan_dataset[column], ax=ax)
    ax.set_xlabel(column, fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
# LoanAmount is a numeric column with missing values: check it for outliers
# (box plot, left) and skew (distribution, right) before choosing how to impute it.
fig, (box_ax, dist_ax) = plt.subplots(1, 2, figsize=(14, 6))
sns.boxplot(y='LoanAmount', data=loan_dataset, color='red', ax=box_ax)
box_ax.set_ylabel('Loan Amount', fontsize=15)
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws
# the equivalent density histogram with a KDE overlay.
sns.histplot(x=loan_dataset['LoanAmount'], kde=True, stat='density', color='b', ax=dist_ax)
dist_ax.set_xlabel('Loan Amount', fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# Report both candidate imputation statistics for LoanAmount.
loan_amount = loan_dataset['LoanAmount']
print("Mean of Loan Amount:", loan_amount.mean())
print("Median of Loan Amount:", loan_amount.median())

In [None]:
# Impute missing values by assigning each filled column back to the frame.
# The previous form, loan_dataset[col].fillna(..., inplace=True), is chained
# assignment: it is deprecated in pandas 2.x and leaves the frame unchanged
# when Copy-on-Write is enabled.

# Columns filled with their most frequent value (mode): the categorical
# features and Loan_Amount_Term.
mode_imputed = ['Credit_History', 'Self_Employed', 'Dependents', 'Gender',
                'Married', 'Loan_Amount_Term']
for column in mode_imputed:
    loan_dataset[column] = loan_dataset[column].fillna(loan_dataset[column].mode()[0])

# LoanAmount is filled with its median.
loan_dataset['LoanAmount'] = loan_dataset['LoanAmount'].fillna(loan_dataset['LoanAmount'].median())

In [None]:
# Missing-value audit after imputation: count and percentage per column, largest first.
missing_values = loan_dataset.isna().sum().sort_values(ascending=False)
percentage_missing_values = missing_values.div(len(loan_dataset)).mul(100)
missing_report = pd.concat(
    [missing_values, percentage_missing_values],
    axis=1,
    keys=['Missing Values', '% Missing data'],
)
print(missing_report)

In [None]:
# Show the describe() table as an annotated heatmap.
summary_stats = loan_dataset.describe()
plt.figure(figsize=(12, 8))
sns.heatmap(summary_stats, annot=True, fmt='0.1f', linewidths=0.1, cmap='PiYG')

In [None]:
# Summary statistics again, now that the missing values have been imputed.
loan_dataset.describe()

In [None]:
#Target Variable
# Left: share of each Loan_Status class (pie). Right: absolute counts (bar).
plt.rcParams["figure.autolayout"] = True
sns.set_palette('husl')
f,ax=plt.subplots(1,2,figsize=(18,8))
loan_dataset['Loan_Status'].value_counts().plot.pie(explode=[0,0.1],autopct='%2.1f%%',
                                          textprops ={ 'fontweight': 'bold','fontsize':13}, ax=ax[0],shadow=True)
ax[0].set_title('Loan Status', fontsize=20,fontweight ='bold')
ax[0].set_ylabel('')
# x must be given by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so countplot('Loan_Status', data=...) raises TypeError there.
sns.countplot(x='Loan_Status',data=loan_dataset,ax=ax[1])
ax[1].set_title('Loan Status',fontsize=20,fontweight ='bold')
ax[1].set_xlabel("Loan Status",fontsize=18,fontweight ='bold')
plt.show()


In [None]:
#Gender Vs Loan Status
# Left: share of each gender (pie). Right: applicant counts per gender, split by Loan_Status.
plt.rcParams["figure.autolayout"] = True
sns.set_palette('husl')
f,ax=plt.subplots(1,2,figsize=(16,8))
loan_dataset['Gender'].value_counts().plot.pie(explode=[0,0.1],autopct='%2.1f%%',
                                          textprops ={ 'fontweight': 'bold','fontsize':13}, ax=ax[0],shadow=True)
ax[0].set_title('Gender', fontsize=20,fontweight ='bold')
ax[0].set_ylabel('')
# x must be given by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so countplot('Gender', data=...) raises TypeError there.
sns.countplot(x='Gender',hue="Loan_Status",data=loan_dataset,ax=ax[1])
ax[1].set_title('Gender Vs Loan Status',fontsize=20,fontweight ='bold')
# The x axis shows Gender (Loan_Status is the hue), so label it as such.
ax[1].set_xlabel("Gender",fontsize=18,fontweight ='bold')
plt.xticks(fontsize=14,fontweight ='bold')
plt.tight_layout()
plt.show()

In [None]:
# Loan_Status counts per gender, with row/column totals, shaded by magnitude.
gender_vs_status = pd.crosstab(
    loan_dataset['Gender'],
    loan_dataset['Loan_Status'],
    margins=True,
)
gender_vs_status.style.background_gradient(cmap='summer_r')


In [None]:
#Married Vs Loan Status
# Left: share of married vs. unmarried applicants (pie). Right: counts split by Loan_Status.
plt.rcParams["figure.autolayout"] = True
sns.set_palette('husl')
f,ax=plt.subplots(1,2,figsize=(16,8))
loan_dataset['Married'].value_counts().plot.pie(explode=[0,0.1],autopct='%2.1f%%',
                                          textprops ={ 'fontweight': 'bold','fontsize':13}, ax=ax[0],shadow=True)
ax[0].set_title('Married', fontsize=20,fontweight ='bold')
ax[0].set_ylabel('')
# x must be given by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so countplot('Married', data=...) raises TypeError there.
sns.countplot(x='Married',hue="Loan_Status",data=loan_dataset,ax=ax[1])
ax[1].set_title('Married Vs Loan Status',fontsize=20,fontweight ='bold')
ax[1].set_xlabel("Married",fontsize=18,fontweight ='bold')
plt.xticks(fontsize=14,fontweight ='bold')
plt.tight_layout()
plt.show()


In [None]:
# Loan_Status counts for every (Gender, Married) combination, with totals.
row_keys = [loan_dataset['Gender'], loan_dataset['Married']]
column_keys = [loan_dataset['Loan_Status']]
gender_married_vs_status = pd.crosstab(row_keys, column_keys, margins=True)
gender_married_vs_status.style.background_gradient(cmap='gist_rainbow_r')


In [None]:
# Number of dependents vs. loan status.
# Left: share of each Dependents value (pie). Right: counts split by Loan_Status.
plt.rcParams["figure.autolayout"] = True
sns.set_palette('Set2')
f,ax=plt.subplots(1,2,figsize=(16,8))
# The four-element explode list requires Dependents to have exactly four distinct values.
loan_dataset['Dependents'].value_counts().plot.pie(explode=[0,0.1,0.15,0.2],autopct='%2.1f%%',
                                          textprops ={ 'fontweight': 'bold','fontsize':13}, ax=ax[0],shadow=True)
ax[0].set_title('No. of Dependents', fontsize=20,fontweight ='bold')
ax[0].set_ylabel('')
# x must be given by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so countplot('Dependents', data=...) raises TypeError there.
sns.countplot(x='Dependents',hue="Loan_Status",data=loan_dataset,ax=ax[1])
ax[1].set_title('No. of Dependents Vs Loan Status',fontsize=20,fontweight ='bold')
ax[1].set_xlabel("Dependents",fontsize=18,fontweight ='bold')
plt.xticks(fontsize=14,fontweight ='bold')
plt.tight_layout()
plt.show()

In [None]:
# Dependents (rows) against Loan_Status and Gender (nested columns), with totals.
row_keys = [loan_dataset['Dependents']]
column_keys = [loan_dataset['Loan_Status'], loan_dataset['Gender']]
dependents_vs_status_gender = pd.crosstab(row_keys, column_keys, margins=True)
dependents_vs_status_gender.style.background_gradient(cmap='summer_r')


In [None]:
# Loan_Status counts for every (Dependents, Gender) combination, with totals.
row_keys = [loan_dataset['Dependents'], loan_dataset['Gender']]
column_keys = [loan_dataset['Loan_Status']]
dependents_gender_vs_status = pd.crosstab(row_keys, column_keys, margins=True)
dependents_gender_vs_status.style.background_gradient(cmap='Blues')

In [None]:
#Education Vs Loan status
# Left: share of each Education level (pie). Right: counts split by Loan_Status.
plt.rcParams["figure.autolayout"] = True
sns.set_palette('prism')
f,ax=plt.subplots(1,2,figsize=(16,8))
loan_dataset['Education'].value_counts().plot.pie(explode=[0,0.1],autopct='%2.1f%%',
                                          textprops ={ 'fontweight': 'bold','fontsize':13}, ax=ax[0],shadow=True)
ax[0].set_title('Education', fontsize=20,fontweight ='bold')
ax[0].set_ylabel('')
# x must be given by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so countplot('Education', data=...) raises TypeError there.
sns.countplot(x='Education',hue="Loan_Status",data=loan_dataset,ax=ax[1])
ax[1].set_title('Education Vs Loan Status',fontsize=20,fontweight ='bold')
ax[1].set_xlabel("Education",fontsize=18,fontweight ='bold')
plt.xticks(fontsize=14,fontweight ='bold')
plt.tight_layout()
plt.show()

In [None]:
# Loan_Status counts for every (Education, Gender) combination, with totals.
row_keys = [loan_dataset['Education'], loan_dataset['Gender']]
column_keys = [loan_dataset['Loan_Status']]
education_gender_vs_status = pd.crosstab(row_keys, column_keys, margins=True)
education_gender_vs_status.style.background_gradient(cmap='Blues')

In [None]:
# Columns to be integer-encoded (string-valued features plus the target).
Category = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Loan_Status',
]

In [None]:
# Replace every column named in `Category` with its LabelEncoder integer codes.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for i in Category:
    # The single encoder is refitted per column; afterwards it holds only the last column's classes.
    encoded = le.fit_transform(loan_dataset[i])
    loan_dataset[i] = encoded
loan_dataset.head()


In [None]:
#Outliers Detection and Removal
# One horizontal box plot per numeric column on a 2x2 grid.
Numerical =['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']
fig, axes = plt.subplots(2, 2, figsize=(12, 8), facecolor='white')
# zip() stops at the shorter sequence, so at most 4 panels are drawn.
for ax, column in zip(axes.flat, Numerical):
    # Pass the column as x= : seaborn >= 0.12 treats a positional argument as
    # `data`, which changes how the Series is drawn.
    sns.boxplot(x=loan_dataset[column], color='c', ax=ax)
    ax.set_xlabel(column, fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
# Drop the identifier column.
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because Loan_ID is already gone.
loan_dataset = loan_dataset.drop(columns=["Loan_ID"], errors="ignore")

In [None]:
# Convert Dependents ('0', '1', '2', '3+') to numbers, treating '3+' as 3.
# replace + to_numeric is a no-op on an already-numeric column, so the cell can
# be re-run; the previous .map({...}) turned every value into NaN on a second run.
loan_dataset['Dependents'] = pd.to_numeric(loan_dataset['Dependents'].replace('3+', '3'))

In [None]:
# Coerce Dependents to a numeric dtype.
loan_dataset['Dependents'] =pd.to_numeric(loan_dataset['Dependents'])

In [None]:
# Keep only rows whose absolute z-score is below `threshold` in every column.
from scipy.stats import zscore

threshold = 3
z = np.abs(zscore(loan_dataset))
# Use the named threshold; the filter previously repeated the literal 3 and
# left `threshold` unused.
loan_dataset1 = loan_dataset[(z < threshold).all(axis=1)]

print ("Shape of the dataframe before removing outliers: ", loan_dataset.shape)
print ("Shape of the dataframe after removing outliers: ", loan_dataset1.shape)
print ("Percentage of data loss post outlier removal: ", (loan_dataset.shape[0]-loan_dataset1.shape[0])/loan_dataset.shape[0]*100)

# NOTE(review): this overwrites loan_dataset, so re-running the cell filters the
# already-filtered frame again.
loan_dataset=loan_dataset1.copy() # reassigning the changed dataframe name to our original dataframe name

In [None]:
#Data Loss
# Take the row counts from notebook state instead of hard-coding 614 and 577:
# `z` has one row per record before filtering, and loan_dataset now holds the
# rows that survived the outlier filter.
rows_before = len(z)
rows_after = len(loan_dataset)
print("\033[1m"+'Percentage Data Loss :'+"\033[0m",((rows_before-rows_after)/rows_before)*100,'%')

In [None]:
#Skewness of features
# Density histogram with KDE overlay for each numeric column, side by side.
fig, axes = plt.subplots(1, 4, figsize=(22, 5), facecolor='white')
# zip() stops at the shorter sequence, so at most 4 panels are drawn.
for ax, col in zip(axes, Numerical):
    # sns.distplot is deprecated; histplot with kde=True and stat='density'
    # draws the equivalent plot.
    sns.histplot(x=loan_dataset[col], kde=True, stat='density', color='r', ax=ax)
    ax.set_xlabel(col, fontsize=20)
plt.show()

In [None]:
# Per-column skewness before the power transform.
loan_dataset.skew()

In [None]:
# Set up a Yeo-Johnson power transform to reduce skew in the listed columns.
from sklearn.preprocessing import PowerTransformer

skew = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
]
scaler = PowerTransformer(method='yeo-johnson')

In [None]:
# Fit the transformer on the skewed columns and write the transformed values back.
# NOTE(review): the transform is fitted on the full dataset, before any
# train/test split — confirm this is intended.
transformed = scaler.fit_transform(loan_dataset[skew].to_numpy())
loan_dataset[skew] = transformed
loan_dataset[skew].head()

In [None]:
# Checking skewness after applying the Yeo-Johnson method
loan_dataset.skew()

In [None]:
# Correlation matrix of all columns
loan_dataset.corr()

In [None]:
# Annotated heatmap of the full correlation matrix, colour scale fixed to [-1, 1].
corr_matrix = loan_dataset.corr()
plt.figure(figsize=(21, 13))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='0.3f',
    annot_kws={'size': 10},
    vmin=-1,
    vmax=1,
    square=True,
    cmap="gist_stern",
)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Bar chart of each feature's correlation with the target, highest first.
target_corr = (
    loan_dataset.corr()['Loan_Status']
    .drop(['Loan_Status'])
    .sort_values(ascending=False)
)
plt.figure(figsize = (18,6))
target_corr.plot(kind='bar',color = 'purple')
plt.xlabel('Features',fontsize=15)
# The bars are correlation coefficients, not income; the axis was mislabelled 'Income'.
plt.ylabel('Correlation with Loan_Status',fontsize=15)
plt.title('Correlation of features with Target Variable Loan_Status',fontsize = 18)
plt.show()

In [None]:
#Checking Multicollinearity between features using variance_inflation_factor
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF describes collinearity among predictors, so the target column is left
# out; it was previously included as one of the regressors.
vif_features = loan_dataset.drop(columns=['Loan_Status'])
# NOTE(review): no intercept column is added to the design matrix — confirm
# whether a constant should be included before reading these values.
vif= pd.DataFrame()
vif['VIF']= [variance_inflation_factor(vif_features.values,i) for i in range(vif_features.shape[1])]
vif['Features']= vif_features.columns
vif

In [None]:
# Separate the feature matrix (X) from the target vector (Y).
target_column = 'Loan_Status'
Y = loan_dataset[target_column]
X = loan_dataset.drop(columns=[target_column])

In [None]:
# Class counts of the target (inspection only; this cell does no resampling).
loan_dataset['Loan_Status'].value_counts()

In [None]:
#Standard Scaling
# Standardise every feature column. This rebinds `scaler`, which previously
# held the PowerTransformer.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling — confirm this is acceptable.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
X_scale = scaler.transform(X)

In [None]:
#Machine Learning Model Building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,f1_score

In [None]:
# 70/30 split with a fixed seed, then report the shape of each part.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scale, Y, test_size=.3, random_state=99
)
for label, part in (
    ('Training feature matrix size:', X_train),
    ('Training target vector size:', Y_train),
    ('Test feature matrix size:', X_test),
    ('Test target vector size:', Y_test),
):
    print(label, part.shape)

In [None]:
#Finding best Random state
# Try split seeds 1..249 and remember the first seed that gives the highest
# test accuracy for a default LogisticRegression.
# NOTE(review): choosing the seed by test accuracy tunes the split to the test
# set; the reported accuracy is therefore optimistic.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,f1_score

maxAccu, maxRS = 0, 0
for i in range(1, 250):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_scale, Y, test_size=0.3, random_state=i
    )
    log_reg = LogisticRegression().fit(X_train, Y_train)
    y_pred = log_reg.predict(X_test)
    acc = accuracy_score(Y_test, y_pred)
    if acc <= maxAccu:
        # Not a strict improvement: keep the earlier seed.
        continue
    maxAccu, maxRS = acc, i
print('Best accuracy is', maxAccu ,'on Random_state', maxRS)

In [None]:
#Logistics Regression Model
# Fit a default LogisticRegression on a 70/30 split and report accuracy,
# confusion matrix and classification report.
# NOTE(review): random_state=78 is hard-coded; presumably it came from the seed
# search's printed result — confirm it still matches.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scale, Y, test_size=.3, random_state=78
)
log_reg = LogisticRegression().fit(X_train, Y_train)
y_pred = log_reg.predict(X_test)

# Compute every metric first, then print the report.
accuracy = accuracy_score(Y_test, y_pred)
conf_matrix = confusion_matrix(Y_test, y_pred)
report = classification_report(Y_test, y_pred)

print('\033[1m'+'Logistics Regression Evaluation'+'\033[0m')
print('\n')
print('\033[1m'+'Accuracy Score of Logistics Regression :'+'\033[0m', accuracy)
print('\n')
print('\033[1m'+'Confusion matrix of Logistics Regression :'+'\033[0m \n', conf_matrix)
print('\n')
print('\033[1m'+'classification Report of Logistics Regression'+'\033[0m \n', report)