# Libraries

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split

# Load Data

In [32]:
# Read the loan CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable
# data directory so the notebook runs on other machines.
csv_path = 'C:\\Users\\gbhan\\Downloads\\loan_data.csv'
loan_dataset = pd.read_csv(csv_path)
loan_dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [33]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
loan_dataset.shape

(614, 13)

# Imputing the missing values

In [34]:
# Remove the Loan_ID column; it is not used by any later step.
# errors='ignore' keeps the cell re-runnable once the column is already gone, and
# `columns=` alone is enough (an extra axis=1 is redundant when `columns` is given).
loan_dataset = loan_dataset.drop(columns=['Loan_ID'], errors='ignore')

# Number of missing values in each remaining column.
loan_dataset.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Filling Missing values with mode

In [35]:
# Fill the gaps in these columns with each column's most frequent value.
# Series.mode() ignores NaN by default, so no explicit dropna() is required;
# [0] takes the first entry when several values tie for most frequent.
mode_filled_columns = ["Gender", "Married", "Dependents", "Self_Employed", "Credit_History"]
for column in mode_filled_columns:
    most_frequent = loan_dataset[column].mode()[0]
    loan_dataset[column] = loan_dataset[column].fillna(most_frequent)

Use an iterative imputer to fill the missing values of LoanAmount and Loan_Amount_Term

In [36]:
from sklearn.ensemble import RandomForestRegressor

# Only the two numeric loan columns take part in the model-based imputation.
data = loan_dataset.loc[:, ['LoanAmount', 'Loan_Amount_Term']]

# Run the imputer with a random-forest estimator.
# The forest is seeded explicitly: the imputer's own random_state is documented as
# covering feature selection, imputation order and posterior sampling, so an unseeded
# estimator would make the imputed values change from run to run.
imp = IterativeImputer(RandomForestRegressor(random_state=0), max_iter=10, random_state=0)

# fit_transform returns a bare array; rebuild the frame with the original column names
# and row index so that writing the values back into loan_dataset aligns row by row.
data = pd.DataFrame(imp.fit_transform(data), columns=data.columns, index=data.index)

In [37]:
# Confirm the imputed columns no longer contain missing values.
data.isna().sum()

LoanAmount          0
Loan_Amount_Term    0
dtype: int64

In [38]:
# Copy the imputed values over the existing columns. Assigning to a column name that
# already exists overwrites it in place and keeps the column order, so no drop() is
# needed first (a drop() whose result is not assigned has no effect on the frame).
for column in ['LoanAmount', 'Loan_Amount_Term']:
    loan_dataset[column] = data[column]

In [39]:
# Per-column count of missing values after all imputation steps.
loan_dataset.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

#  Label Encoding

In [40]:
# Dependents: fold the '3+' bucket into the code 4 and make the whole column integer.
# Replacing with the string '4' first and casting afterwards avoids leaving a column
# that mixes strings ('0', '1', '2') with an int; the cell also stays re-runnable.
loan_dataset['Dependents'] = loan_dataset['Dependents'].replace('3+', '4').astype(int)

# Integer codes for the remaining categorical feature columns.
category_codes = {
    'Married': {'No': 0, 'Yes': 1},
    'Gender': {'Male': 1, 'Female': 0},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
}
# Non-inplace replace followed by an explicit cast: the resulting dtype no longer
# depends on pandas' implicit downcasting, and any label missing from the mapping
# makes astype(int) raise instead of silently staying a string.
loan_dataset = loan_dataset.replace(category_codes)
loan_dataset = loan_dataset.astype({column: int for column in category_codes})

# Train Test Split

In [41]:
# Separate the target column (y) from the predictor columns (x).
y = loan_dataset['Loan_Status']
x = loan_dataset.drop(columns='Loan_Status')

In [42]:
# Preview the first five rows of the feature frame.
x.head(n=5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,1,0,5849,0.0,146.584354,360.0,1.0,2
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2


In [43]:
# Preview the first five target labels.
y.head(n=5)

0    Y
1    N
2    Y
3    Y
4    Y
Name: Loan_Status, dtype: object

In [44]:
# Hold out 10% of the rows for testing, stratified on the target, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    stratify=y,
    random_state=2,
)

# Train a Logistic Regression model

In [47]:
import warnings
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()

# Suppress warnings only while fitting. A global filterwarnings("ignore") would hide
# every warning raised by any later cell for the rest of the session.
# NOTE(review): if what is being silenced here is a convergence warning, consider
# scaling the features or raising max_iter instead of suppressing it.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    classifier.fit(x_train, y_train)

# Last expression, so the fitted estimator is still displayed as the cell output.
classifier

LogisticRegression()

# Predict on the test data and determine the accuracy

In [49]:
from sklearn.metrics import accuracy_score

# Predicted labels for the held-out rows.
x_test_prediction = classifier.predict(x_test)

# accuracy_score expects (y_true, y_pred). Accuracy is the same either way round, but
# the documented order keeps this call consistent with metrics where order matters.
test_data_accuray = accuracy_score(y_test, x_test_prediction)
print(test_data_accuray)

0.7741935483870968


# We made it to the end. Thank you!