In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from scipy.stats import zscore
from sklearn.preprocessing import OrdinalEncoder

# Models:
from sklearn.model_selection import train_test_split,GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Metrics for evaluation:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,plot_roc_curve,roc_auc_score


In [2]:
# Load the loan-prediction CSV straight from its raw GitHub URL and preview the first rows.
DATA_URL = 'https://raw.githubusercontent.com/dsrscientist/DSData/master/loan_prediction.csv'

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Loans are an important line of business for banks. This dataset contains the details of applicants (customers) applying for a loan. The target variable is the "Loan_Status" column, which takes the values Y (Yes) and N (No); this is therefore a classification problem in which we have to predict whether a loan will be approved or not. The company wants to automate the loan eligibility process based on the customer details provided while filling in the online application form. Of these details, Loan_ID, Gender, Married, Dependents, Education, Self_Employed, Property_Area and Loan_Status are of object dtype, ApplicantIncome is an integer, and CoapplicantIncome, LoanAmount, Loan_Amount_Term and Credit_History are floats.

# Dataset description

Variables:

Loan_ID -Unique Loan ID

Gender -Male/ Female

Married -Applicant married (Yes/No)

Dependents -Number of dependents

Education -Applicant Education (Graduate/ Not Graduate)

Self_Employed -Self employed (Yes/No)

ApplicantIncome -Applicant income

CoapplicantIncome- Coapplicant income

LoanAmount -Loan amount in thousands

Loan_Amount_Term -Term of loan in months

Credit_History -credit history meets guidelines

Property_Area -Urban/ Semi Urban/ Rural

Loan_Status -Loan approved (Y/N)

In [3]:
# Distinct-value count per column: separates identifier-like and continuous
# columns (many unique values) from categorical ones (few unique values).
unique_counts = df.nunique()
unique_counts

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

We can see that Loan_ID has the most unique values (614, i.e. one per row, so it is an identifier), followed by ApplicantIncome (505), CoapplicantIncome (287) and LoanAmount (203).

In [4]:
# Dimensions of the loaded frame as (rows, columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

(614, 13)

There are 614 rows and 13 columns in this dataset.

In [5]:
# Show the column labels so their exact spelling is on record.
column_labels = df.columns
column_labels

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [6]:
# Print the frame summary: non-null count and dtype for every column, which
# exposes the columns that contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


We can see that Loan_ID, Gender, Married, Dependents, Education, Self_Employed, Property_Area and Loan_Status are of object dtype.

ApplicantIncome is of integer dtype, while CoapplicantIncome, LoanAmount, Loan_Amount_Term and Credit_History are of float dtype.


In [7]:
# dtype of every column (the same information df.info() reports, without the counts).
column_dtypes = df.dtypes
column_dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

We can see that Loan_ID, Gender, Married, Dependents, Education, Self_Employed, Property_Area and Loan_Status are of object dtype.

ApplicantIncome is of integer dtype, while CoapplicantIncome, LoanAmount, Loan_Amount_Term and Credit_History are of float dtype.

In [8]:
# Summary statistics of the numeric columns, transposed so that each
# feature occupies one row and each statistic one column.
numeric_summary = df.describe()
numeric_summary.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ApplicantIncome,614.0,5403.459283,6109.041673,150.0,2877.5,3812.5,5795.0,81000.0
CoapplicantIncome,614.0,1621.245798,2926.248369,0.0,0.0,1188.5,2297.25,41667.0
LoanAmount,592.0,146.412162,85.587325,9.0,100.0,128.0,168.0,700.0
Loan_Amount_Term,600.0,342.0,65.12041,12.0,360.0,360.0,360.0,480.0
Credit_History,564.0,0.842199,0.364878,0.0,1.0,1.0,1.0,1.0


The count values are not the same for every column, so the dataset has missing values.

ApplicantIncome has a minimum value of 150 and a maximum value of 81000.

CoapplicantIncome has a minimum value of 0.0 and a maximum value of 41667.

LoanAmount has a minimum value of 9.0 and a maximum value of 700.0.

Loan_Amount_Term has a minimum value of 12.0 months and a maximum value of 480 months.

Credit_History has a maximum value of 1.0 and a minimum value of 0.0.

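There is skewness in the dataset: for ApplicantIncome, CoapplicantIncome and LoanAmount the mean is noticeably larger than the median (50%), and the maximum lies far above the 75th percentile, which indicates right-skewed distributions. We will remove the skewness later.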

In [9]:
# Number of missing entries in each column.
null_counts = df.isnull().sum()
null_counts

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

There are null values in several columns: Gender, Married, Dependents, Self_Employed, LoanAmount, Loan_Amount_Term and Credit_History.

In [10]:
# Impute the numeric columns that contain NaNs with their own column mean.
# NOTE(review): Credit_History only takes the values 0.0/1.0 and
# Loan_Amount_Term has just 10 distinct values, so the mean is a value that
# never occurs in the data for those two columns -- consider whether the
# mode would be a better fill value there.
numeric_cols_with_nulls = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']
for num_col in numeric_cols_with_nulls:
    col_mean = df[num_col].mean()
    df[num_col] = df[num_col].fillna(col_mean)

The missing values in the numerical columns are filled with the column mean.

In [11]:
# Impute the categorical columns that contain NaNs with their most frequent value.
# mode() returns a Series (several values can tie), so [0] picks its first entry.
categorical_cols_with_nulls = ['Gender', 'Dependents', 'Married', 'Self_Employed']
for cat_col in categorical_cols_with_nulls:
    most_frequent = df[cat_col].mode()[0]
    df[cat_col] = df[cat_col].fillna(most_frequent)

The missing values in the categorical columns are filled with the column mode (the most frequent value).

In [12]:
# Re-count the missing entries per column to confirm the imputation left none behind.
nulls_after_imputation = df.isnull().sum()
nulls_after_imputation

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

All the null values are filled and all the columns are showing 0 missing values.

EDA:

BIVARIATE ANALYSIS:

In [14]:
# Frequency of each category in the Gender column.
gender_counts = df['Gender'].value_counts()
gender_counts

Male      502
Female    112
Name: Gender, dtype: int64