## Company wants to automate the loan eligibility process (real time) based on customer detail provided while filling online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, they have provided a dataset to identify the customers segments that are eligible for loan amount so that they can specifically target these customers. 

## Use Logistic Regression to predict the Loan status

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
os.chdir(r"C:\Users\kalya\Santhosh\Data Science\Notebook\Preprocessing")

### Import the train and Test files

In [3]:
train = pd.read_csv('Loan_train.csv')

In [4]:
test = pd.read_csv('Loan_test.csv')

In [5]:
submission = pd.read_csv('Loan_sample_submission.csv')

### Preprocessing

In [6]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [7]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ApplicantIncome,614.0,5403.459283,6109.041673,150.0,2877.5,3812.5,5795.0,81000.0
CoapplicantIncome,614.0,1621.245798,2926.248369,0.0,0.0,1188.5,2297.25,41667.0
LoanAmount,592.0,146.412162,85.587325,9.0,100.0,128.0,168.0,700.0
Loan_Amount_Term,600.0,342.0,65.12041,12.0,360.0,360.0,360.0,480.0
Credit_History,564.0,0.842199,0.364878,0.0,1.0,1.0,1.0,1.0


In [8]:
display(train.duplicated().sum(), test.duplicated().sum())

0

0

In [9]:
train[train.Dependents.isna() == True]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
102,LP001350,Male,Yes,,Graduate,No,13650,0.0,,360.0,1.0,Urban,Y
104,LP001357,Male,,,Graduate,No,3816,754.0,160.0,360.0,1.0,Urban,Y
120,LP001426,Male,Yes,,Graduate,No,5667,2667.0,180.0,360.0,1.0,Rural,Y
226,LP001754,Male,Yes,,Not Graduate,Yes,4735,0.0,138.0,360.0,1.0,Urban,N
228,LP001760,Male,,,Graduate,No,4758,0.0,158.0,480.0,1.0,Semiurban,Y
293,LP001945,Female,No,,Graduate,No,5417,0.0,143.0,480.0,0.0,Urban,N
301,LP001972,Male,Yes,,Not Graduate,No,2875,1750.0,105.0,360.0,1.0,Semiurban,Y
332,LP002100,Male,No,,Graduate,No,2833,0.0,71.0,360.0,1.0,Urban,Y
335,LP002106,Male,Yes,,Graduate,Yes,5503,4490.0,70.0,,1.0,Semiurban,Y
346,LP002130,Male,Yes,,Not Graduate,No,3523,3230.0,152.0,360.0,0.0,Rural,N


In [10]:
train.Property_Area.value_counts()

Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64

In [11]:
train.Credit_History.value_counts()

1.0    475
0.0     89
Name: Credit_History, dtype: int64

In [12]:
display(train.isna().sum(), test.isna().sum())

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [13]:
train.shape, test.shape

((614, 13), (367, 12))

In [14]:
train.duplicated().sum()

0

In [15]:
train.select_dtypes(include='object')

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,Urban,Y
4,LP001008,Male,No,0,Graduate,No,Urban,Y
...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,Urban,Y


In [16]:
[print(i, train[i].unique(),'\n') for i in train.select_dtypes(include='object').columns]

Loan_ID ['LP001002' 'LP001003' 'LP001005' 'LP001006' 'LP001008' 'LP001011'
 'LP001013' 'LP001014' 'LP001018' 'LP001020' 'LP001024' 'LP001027'
 'LP001028' 'LP001029' 'LP001030' 'LP001032' 'LP001034' 'LP001036'
 'LP001038' 'LP001041' 'LP001043' 'LP001046' 'LP001047' 'LP001050'
 'LP001052' 'LP001066' 'LP001068' 'LP001073' 'LP001086' 'LP001087'
 'LP001091' 'LP001095' 'LP001097' 'LP001098' 'LP001100' 'LP001106'
 'LP001109' 'LP001112' 'LP001114' 'LP001116' 'LP001119' 'LP001120'
 'LP001123' 'LP001131' 'LP001136' 'LP001137' 'LP001138' 'LP001144'
 'LP001146' 'LP001151' 'LP001155' 'LP001157' 'LP001164' 'LP001179'
 'LP001186' 'LP001194' 'LP001195' 'LP001197' 'LP001198' 'LP001199'
 'LP001205' 'LP001206' 'LP001207' 'LP001213' 'LP001222' 'LP001225'
 'LP001228' 'LP001233' 'LP001238' 'LP001241' 'LP001243' 'LP001245'
 'LP001248' 'LP001250' 'LP001253' 'LP001255' 'LP001256' 'LP001259'
 'LP001263' 'LP001264' 'LP001265' 'LP001266' 'LP001267' 'LP001273'
 'LP001275' 'LP001279' 'LP001280' 'LP001282' 'LP001289

[None, None, None, None, None, None, None, None]

### Concatenating the train and teset data

In [17]:
data = pd.concat([train,test])

In [18]:
data.reset_index(inplace=True, drop=True)

In [19]:
train.shape, test.shape, data.shape

((614, 13), (367, 12), (981, 13))

In [20]:
train.shape[0]+test.shape[0]

981

### Manipulating the data as part of the preprocessing

In [21]:
data['Gender'] = data.Gender.fillna('Unknown')

In [22]:
 data['Married'] = data.Married.fillna('Unknown')

In [23]:
data.Married.isnull().sum()

0

In [24]:
data['Dependents'] = data.Dependents.fillna('na')

In [25]:
data['Self_Employed'] = data.Self_Employed.fillna('Unknown')

In [26]:
data['LoanAmount'] = data.LoanAmount.fillna(0.0)

In [27]:
data['Credit_History'] = data.Credit_History.fillna(0.0)

In [28]:
data.Credit_History.value_counts()

1.0    754
0.0    227
Name: Credit_History, dtype: int64

In [29]:
data.isnull().sum()

Loan_ID                0
Gender                 0
Married                0
Dependents             0
Education              0
Self_Employed          0
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount             0
Loan_Amount_Term      20
Credit_History         0
Property_Area          0
Loan_Status          367
dtype: int64

In [30]:
train.shape, test.shape, data.shape, train.shape[0]+test.shape[0]

((614, 13), (367, 12), (981, 13), 981)

In [31]:
data['Loan_Amount_Term'] = data.Loan_Amount_Term.fillna(0.0)

In [32]:
data.isnull().sum()

Loan_ID                0
Gender                 0
Married                0
Dependents             0
Education              0
Self_Employed          0
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount             0
Loan_Amount_Term       0
Credit_History         0
Property_Area          0
Loan_Status          367
dtype: int64

In [33]:
[print(i, data[i].unique(),'\n') for i in data.select_dtypes(include='object').columns]

Loan_ID ['LP001002' 'LP001003' 'LP001005' 'LP001006' 'LP001008' 'LP001011'
 'LP001013' 'LP001014' 'LP001018' 'LP001020' 'LP001024' 'LP001027'
 'LP001028' 'LP001029' 'LP001030' 'LP001032' 'LP001034' 'LP001036'
 'LP001038' 'LP001041' 'LP001043' 'LP001046' 'LP001047' 'LP001050'
 'LP001052' 'LP001066' 'LP001068' 'LP001073' 'LP001086' 'LP001087'
 'LP001091' 'LP001095' 'LP001097' 'LP001098' 'LP001100' 'LP001106'
 'LP001109' 'LP001112' 'LP001114' 'LP001116' 'LP001119' 'LP001120'
 'LP001123' 'LP001131' 'LP001136' 'LP001137' 'LP001138' 'LP001144'
 'LP001146' 'LP001151' 'LP001155' 'LP001157' 'LP001164' 'LP001179'
 'LP001186' 'LP001194' 'LP001195' 'LP001197' 'LP001198' 'LP001199'
 'LP001205' 'LP001206' 'LP001207' 'LP001213' 'LP001222' 'LP001225'
 'LP001228' 'LP001233' 'LP001238' 'LP001241' 'LP001243' 'LP001245'
 'LP001248' 'LP001250' 'LP001253' 'LP001255' 'LP001256' 'LP001259'
 'LP001263' 'LP001264' 'LP001265' 'LP001266' 'LP001267' 'LP001273'
 'LP001275' 'LP001279' 'LP001280' 'LP001282' 'LP001289

[None, None, None, None, None, None, None, None]

In [34]:
data.select_dtypes(include='object')

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,Urban,Y
4,LP001008,Male,No,0,Graduate,No,Urban,Y
...,...,...,...,...,...,...,...,...
976,LP002971,Male,Yes,3+,Not Graduate,Yes,Urban,
977,LP002975,Male,Yes,0,Graduate,No,Urban,
978,LP002980,Male,No,0,Graduate,No,Semiurban,
979,LP002986,Male,Yes,0,Graduate,No,Rural,


In [35]:
data.reset_index(inplace=True, drop=True)

### Converting the object data into numerical data format using One-hot encoding

In [36]:
encoded_data = pd.get_dummies(data, columns = ['Gender','Married', 'Dependents','Education','Self_Employed','Property_Area'])

In [37]:
encoded_data.shape

(981, 26)

In [38]:
encoded_data.columns

Index(['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Loan_Status', 'Gender_Female',
       'Gender_Male', 'Gender_Unknown', 'Married_No', 'Married_Unknown',
       'Married_Yes', 'Dependents_0', 'Dependents_1', 'Dependents_2',
       'Dependents_3+', 'Dependents_na', 'Education_Graduate',
       'Education_Not Graduate', 'Self_Employed_No', 'Self_Employed_Unknown',
       'Self_Employed_Yes', 'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban'],
      dtype='object')

In [39]:
data.select_dtypes(exclude='object')

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,5849,0.0,0.0,360.0,1.0
1,4583,1508.0,128.0,360.0,1.0
2,3000,0.0,66.0,360.0,1.0
3,2583,2358.0,120.0,360.0,1.0
4,6000,0.0,141.0,360.0,1.0
...,...,...,...,...,...
976,4009,1777.0,113.0,360.0,1.0
977,4158,709.0,115.0,360.0,1.0
978,3250,1993.0,126.0,360.0,0.0
979,5000,2393.0,158.0,360.0,1.0


In [40]:
from sklearn.preprocessing import StandardScaler

In [41]:
data.select_dtypes(exclude='object').columns

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

### Performing scaling on the existing numerical data to avoid high deviation on the values

In [42]:
sc = StandardScaler()
scaled_data = sc.fit_transform(encoded_data[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount','Loan_Amount_Term','Credit_History']])
scaled_data = pd.DataFrame(scaled_data, columns=['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount','Loan_Amount_Term','Credit_History'])
print(scaled_data.shape)

(981, 5)


In [43]:
scaled_data.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,0.117565,-0.589506,-1.736899,0.307622,0.54869
1,-0.104844,-0.034561,-0.132711,0.307622,0.54869
2,-0.382944,-0.589506,-0.90974,0.307622,0.54869
3,-0.456202,0.278239,-0.232973,0.307622,0.54869
4,0.144093,-0.589506,0.030214,0.307622,0.54869


### End of the proprocessing.
### Construction(preprocessing) of the data completed for the model. 
### Finally concatenating the required and coverted columns into final dataset

In [44]:
preprocessed_data = pd.concat([scaled_data, encoded_data.drop(columns=['ApplicantIncome', 'CoapplicantIncome','LoanAmount','Loan_Amount_Term', 'Credit_History'])],axis=1)
preprocessed_data.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID,Loan_Status,Gender_Female,Gender_Male,Gender_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.117565,-0.589506,-1.736899,0.307622,0.54869,LP001002,Y,0,1,0,...,0,0,1,0,1,0,0,0,0,1
1,-0.104844,-0.034561,-0.132711,0.307622,0.54869,LP001003,N,0,1,0,...,0,0,1,0,1,0,0,1,0,0
2,-0.382944,-0.589506,-0.90974,0.307622,0.54869,LP001005,Y,0,1,0,...,0,0,1,0,0,0,1,0,0,1
3,-0.456202,0.278239,-0.232973,0.307622,0.54869,LP001006,Y,0,1,0,...,0,0,0,1,1,0,0,0,0,1
4,0.144093,-0.589506,0.030214,0.307622,0.54869,LP001008,Y,0,1,0,...,0,0,1,0,1,0,0,0,0,1


In [45]:
preprocessed_data.shape

(981, 26)

In [46]:
preprocessed_data.isna().sum()

ApplicantIncome              0
CoapplicantIncome            0
LoanAmount                   0
Loan_Amount_Term             0
Credit_History               0
Loan_ID                      0
Loan_Status                367
Gender_Female                0
Gender_Male                  0
Gender_Unknown               0
Married_No                   0
Married_Unknown              0
Married_Yes                  0
Dependents_0                 0
Dependents_1                 0
Dependents_2                 0
Dependents_3+                0
Dependents_na                0
Education_Graduate           0
Education_Not Graduate       0
Self_Employed_No             0
Self_Employed_Unknown        0
Self_Employed_Yes            0
Property_Area_Rural          0
Property_Area_Semiurban      0
Property_Area_Urban          0
dtype: int64

### Splitting the data into train and test as we get in the original format

In [47]:
preprc_train = preprocessed_data[preprocessed_data['Loan_Status'].notna()]
preprc_test = preprocessed_data[preprocessed_data['Loan_Status'].isna()]

In [48]:
display(preprc_train.head(), preprc_train.tail())

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID,Loan_Status,Gender_Female,Gender_Male,Gender_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.117565,-0.589506,-1.736899,0.307622,0.54869,LP001002,Y,0,1,0,...,0,0,1,0,1,0,0,0,0,1
1,-0.104844,-0.034561,-0.132711,0.307622,0.54869,LP001003,N,0,1,0,...,0,0,1,0,1,0,0,1,0,0
2,-0.382944,-0.589506,-0.90974,0.307622,0.54869,LP001005,Y,0,1,0,...,0,0,1,0,0,0,1,0,0,1
3,-0.456202,0.278239,-0.232973,0.307622,0.54869,LP001006,Y,0,1,0,...,0,0,0,1,1,0,0,0,0,1
4,0.144093,-0.589506,0.030214,0.307622,0.54869,LP001008,Y,0,1,0,...,0,0,1,0,1,0,0,0,0,1


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID,Loan_Status,Gender_Female,Gender_Male,Gender_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
609,-0.400512,-0.589506,-0.847076,0.307622,0.54869,LP002978,Y,1,0,0,...,0,0,1,0,1,0,0,1,0,0
610,-0.188643,-0.589506,-1.23559,-1.927395,0.54869,LP002979,Y,0,1,0,...,1,0,1,0,1,0,0,1,0,0
611,0.5081,-0.501186,1.433878,0.307622,0.54869,LP002983,Y,0,1,0,...,0,0,1,0,1,0,0,0,0,1
612,0.422193,-0.589506,0.606719,0.307622,0.54869,LP002984,Y,0,1,0,...,0,0,1,0,1,0,0,0,0,1
613,-0.104844,-0.589506,-0.070048,0.307622,-1.822522,LP002990,N,1,0,0,...,0,0,1,0,0,0,1,0,1,0


In [49]:
display(preprc_test.head(), preprc_test.tail())

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID,Loan_Status,Gender_Female,Gender_Male,Gender_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
614,0.094903,-0.589506,-0.3583,0.307622,0.54869,LP001015,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
615,-0.369593,-0.037505,-0.157777,0.307622,0.54869,LP001022,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
616,-0.031586,0.072895,0.869906,0.307622,0.54869,LP001031,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
617,-0.498892,0.347423,-0.483627,0.307622,-1.822522,LP001035,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
618,-0.334457,-0.589506,-0.759347,0.307622,0.54869,LP001051,,0,1,0,...,0,0,0,1,1,0,0,0,0,1


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID,Loan_Status,Gender_Female,Gender_Male,Gender_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
976,-0.205684,0.064431,-0.320702,0.307622,0.54869,LP002971,,0,1,0,...,1,0,0,1,0,0,1,0,0,1
977,-0.179508,-0.328594,-0.295637,0.307622,0.54869,LP002975,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
978,-0.339024,0.143919,-0.157777,0.307622,-1.822522,LP002980,,0,1,0,...,0,0,1,0,1,0,0,0,1,0
979,-0.031586,0.291119,0.24327,0.307622,0.54869,LP002986,,0,1,0,...,0,0,1,0,1,0,0,1,0,0
980,0.706265,-0.589506,-0.508693,-1.927395,0.54869,LP002989,,0,1,0,...,0,0,1,0,0,0,1,1,0,0


In [50]:

preprc_test.reset_index(inplace=True, drop=True)

In [51]:
display(preprc_train.head(), preprc_train.tail())

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID,Loan_Status,Gender_Female,Gender_Male,Gender_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.117565,-0.589506,-1.736899,0.307622,0.54869,LP001002,Y,0,1,0,...,0,0,1,0,1,0,0,0,0,1
1,-0.104844,-0.034561,-0.132711,0.307622,0.54869,LP001003,N,0,1,0,...,0,0,1,0,1,0,0,1,0,0
2,-0.382944,-0.589506,-0.90974,0.307622,0.54869,LP001005,Y,0,1,0,...,0,0,1,0,0,0,1,0,0,1
3,-0.456202,0.278239,-0.232973,0.307622,0.54869,LP001006,Y,0,1,0,...,0,0,0,1,1,0,0,0,0,1
4,0.144093,-0.589506,0.030214,0.307622,0.54869,LP001008,Y,0,1,0,...,0,0,1,0,1,0,0,0,0,1


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID,Loan_Status,Gender_Female,Gender_Male,Gender_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
609,-0.400512,-0.589506,-0.847076,0.307622,0.54869,LP002978,Y,1,0,0,...,0,0,1,0,1,0,0,1,0,0
610,-0.188643,-0.589506,-1.23559,-1.927395,0.54869,LP002979,Y,0,1,0,...,1,0,1,0,1,0,0,1,0,0
611,0.5081,-0.501186,1.433878,0.307622,0.54869,LP002983,Y,0,1,0,...,0,0,1,0,1,0,0,0,0,1
612,0.422193,-0.589506,0.606719,0.307622,0.54869,LP002984,Y,0,1,0,...,0,0,1,0,1,0,0,0,0,1
613,-0.104844,-0.589506,-0.070048,0.307622,-1.822522,LP002990,N,1,0,0,...,0,0,1,0,0,0,1,0,1,0


In [52]:
display(preprc_test.head(), preprc_test.tail())

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID,Loan_Status,Gender_Female,Gender_Male,Gender_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.094903,-0.589506,-0.3583,0.307622,0.54869,LP001015,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
1,-0.369593,-0.037505,-0.157777,0.307622,0.54869,LP001022,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
2,-0.031586,0.072895,0.869906,0.307622,0.54869,LP001031,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
3,-0.498892,0.347423,-0.483627,0.307622,-1.822522,LP001035,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
4,-0.334457,-0.589506,-0.759347,0.307622,0.54869,LP001051,,0,1,0,...,0,0,0,1,1,0,0,0,0,1


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID,Loan_Status,Gender_Female,Gender_Male,Gender_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
362,-0.205684,0.064431,-0.320702,0.307622,0.54869,LP002971,,0,1,0,...,1,0,0,1,0,0,1,0,0,1
363,-0.179508,-0.328594,-0.295637,0.307622,0.54869,LP002975,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
364,-0.339024,0.143919,-0.157777,0.307622,-1.822522,LP002980,,0,1,0,...,0,0,1,0,1,0,0,0,1,0
365,-0.031586,0.291119,0.24327,0.307622,0.54869,LP002986,,0,1,0,...,0,0,1,0,1,0,0,1,0,0
366,0.706265,-0.589506,-0.508693,-1.927395,0.54869,LP002989,,0,1,0,...,0,0,1,0,0,0,1,1,0,0


### Transferring the train and test data into csv file on the computer

In [53]:
preprc_train.to_csv('preprocessed_Loan_train_data.csv',index=False)
preprc_test.to_csv('preprocessed_Loan_test_data.csv',index=False)

### Reading the same file as it is

In [54]:
tot_train = pd.read_csv('preprocessed_Loan_train_data.csv')
tot_test = pd.read_csv('preprocessed_Loan_test_data.csv')

In [55]:
display(tot_train.head(), tot_test.head())

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID,Loan_Status,Gender_Female,Gender_Male,Gender_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.117565,-0.589506,-1.736899,0.307622,0.54869,LP001002,Y,0,1,0,...,0,0,1,0,1,0,0,0,0,1
1,-0.104844,-0.034561,-0.132711,0.307622,0.54869,LP001003,N,0,1,0,...,0,0,1,0,1,0,0,1,0,0
2,-0.382944,-0.589506,-0.90974,0.307622,0.54869,LP001005,Y,0,1,0,...,0,0,1,0,0,0,1,0,0,1
3,-0.456202,0.278239,-0.232973,0.307622,0.54869,LP001006,Y,0,1,0,...,0,0,0,1,1,0,0,0,0,1
4,0.144093,-0.589506,0.030214,0.307622,0.54869,LP001008,Y,0,1,0,...,0,0,1,0,1,0,0,0,0,1


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID,Loan_Status,Gender_Female,Gender_Male,Gender_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.094903,-0.589506,-0.3583,0.307622,0.54869,LP001015,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
1,-0.369593,-0.037505,-0.157777,0.307622,0.54869,LP001022,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
2,-0.031586,0.072895,0.869906,0.307622,0.54869,LP001031,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
3,-0.498892,0.347423,-0.483627,0.307622,-1.822522,LP001035,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
4,-0.334457,-0.589506,-0.759347,0.307622,0.54869,LP001051,,0,1,0,...,0,0,0,1,1,0,0,0,0,1


## Modelling on the data

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
X = tot_train.drop(columns = ['Loan_ID','Loan_Status'])

In [58]:
X.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_Unknown,Married_No,Married_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.117565,-0.589506,-1.736899,0.307622,0.54869,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,1
1,-0.104844,-0.034561,-0.132711,0.307622,0.54869,0,1,0,0,0,...,0,0,1,0,1,0,0,1,0,0
2,-0.382944,-0.589506,-0.90974,0.307622,0.54869,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1
3,-0.456202,0.278239,-0.232973,0.307622,0.54869,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,1
4,0.144093,-0.589506,0.030214,0.307622,0.54869,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,1


In [59]:
y = tot_train['Loan_Status']

In [60]:
y.head()

0    Y
1    N
2    Y
3    Y
4    Y
Name: Loan_Status, dtype: object

### train test split

In [61]:
train_X, test_X, train_y, test_y = train_test_split(X,y,test_size=0.2,random_state = 33)

In [62]:
train_y.head()

421    N
366    N
126    Y
42     Y
65     N
Name: Loan_Status, dtype: object

In [63]:
train_X.shape, test_X.shape, train_y.shape, test_y.shape

((491, 24), (123, 24), (491,), (123,))

In [64]:
from sklearn.linear_model import LogisticRegression

In [65]:
model = LogisticRegression(max_iter=500)

In [66]:
model.fit(train_X,train_y)

LogisticRegression(max_iter=500)

In [67]:
display(model.coef_, model.intercept_)

array([[-0.08492502, -0.12899123,  0.00118934, -0.07926598,  0.94281031,
         0.05262438,  0.13972637, -0.19219558, -0.45750795,  0.36199484,
         0.09566827,  0.09501697, -0.42994111,  0.32228546,  0.08890422,
        -0.07611038,  0.14021024, -0.14005508, -0.17436318,  0.14228527,
         0.03223307, -0.37251753,  0.49288192, -0.12020923]])

array([0.81770241])

In [68]:
train_pred_proba = model.predict_proba(train_X)

In [69]:
train_pred_proba

array([[0.76910768, 0.23089232],
       [0.1759728 , 0.8240272 ],
       [0.332261  , 0.667739  ],
       [0.65302392, 0.34697608],
       [0.12440923, 0.87559077],
       [0.65246926, 0.34753074],
       [0.15369733, 0.84630267],
       [0.26957385, 0.73042615],
       [0.26804257, 0.73195743],
       [0.17927616, 0.82072384],
       [0.37783305, 0.62216695],
       [0.22668317, 0.77331683],
       [0.35978269, 0.64021731],
       [0.73392032, 0.26607968],
       [0.21104444, 0.78895556],
       [0.133224  , 0.866776  ],
       [0.54506823, 0.45493177],
       [0.54412243, 0.45587757],
       [0.10825904, 0.89174096],
       [0.36098222, 0.63901778],
       [0.10517707, 0.89482293],
       [0.08155164, 0.91844836],
       [0.27602894, 0.72397106],
       [0.74407617, 0.25592383],
       [0.32900062, 0.67099938],
       [0.27973707, 0.72026293],
       [0.73547752, 0.26452248],
       [0.10275345, 0.89724655],
       [0.68090307, 0.31909693],
       [0.176962  , 0.823038  ],
       [0.

### Predicting the values

In [70]:
train_pred = model.predict(train_X)

In [71]:
train_pred

array(['N', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'N',
       'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y',
       'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y

In [73]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [74]:
train_y

421    N
366    N
126    Y
42     Y
65     N
      ..
57     N
201    Y
578    Y
391    Y
20     N
Name: Loan_Status, Length: 491, dtype: object

In [75]:
train_pred

array(['N', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'N',
       'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y',
       'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y

In [76]:
print('Train accuracy')
print('accuracy score',accuracy_score(train_y,train_pred))
print('f1 score',f1_score(train_y,train_pred,pos_label='Y'))
print('confusion matrix\n',confusion_matrix(train_y,train_pred))


Train accuracy
accuracy score 0.769857433808554
f1 score 0.8432732316227461
confusion matrix
 [[ 74  82]
 [ 31 304]]


In [77]:
test_X.columns

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Female', 'Gender_Male',
       'Gender_Unknown', 'Married_No', 'Married_Unknown', 'Married_Yes',
       'Dependents_0', 'Dependents_1', 'Dependents_2', 'Dependents_3+',
       'Dependents_na', 'Education_Graduate', 'Education_Not Graduate',
       'Self_Employed_No', 'Self_Employed_Unknown', 'Self_Employed_Yes',
       'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban'],
      dtype='object')

In [78]:
test_pred  = model.predict(test_X)

In [79]:
print('test accuracy')
print('accuracy score',accuracy_score(test_y,test_pred))
print('f1 score',f1_score(test_y,test_pred,pos_label='Y'))
print('confusion matrix\n',confusion_matrix(test_y,test_pred))

test accuracy
accuracy score 0.7642276422764228
f1 score 0.8379888268156424
confusion matrix
 [[19 17]
 [12 75]]


In [80]:
tot_test.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID,Loan_Status,Gender_Female,Gender_Male,Gender_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.094903,-0.589506,-0.3583,0.307622,0.54869,LP001015,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
1,-0.369593,-0.037505,-0.157777,0.307622,0.54869,LP001022,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
2,-0.031586,0.072895,0.869906,0.307622,0.54869,LP001031,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
3,-0.498892,0.347423,-0.483627,0.307622,-1.822522,LP001035,,0,1,0,...,0,0,1,0,1,0,0,0,0,1
4,-0.334457,-0.589506,-0.759347,0.307622,0.54869,LP001051,,0,1,0,...,0,0,0,1,1,0,0,0,0,1


In [81]:
tot_test1 = tot_test.drop(columns = ['Loan_ID', 'Loan_Status'])

In [82]:
tot_test1.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Gender_Unknown,Married_No,Married_Unknown,...,Dependents_3+,Dependents_na,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Unknown,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0.094903,-0.589506,-0.3583,0.307622,0.54869,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1
1,-0.369593,-0.037505,-0.157777,0.307622,0.54869,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1
2,-0.031586,0.072895,0.869906,0.307622,0.54869,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1
3,-0.498892,0.347423,-0.483627,0.307622,-1.822522,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1
4,-0.334457,-0.589506,-0.759347,0.307622,0.54869,0,1,0,1,0,...,0,0,0,1,1,0,0,0,0,1


In [83]:
tot_test_pred  = model.predict(tot_test1)

In [84]:
tot_test_pred 

array(['Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y',
       'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N',
       'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y

In [85]:
print('tot test accuracy')
print('accuracy score',accuracy_score(test_y,test_pred))
print('f1 score',f1_score(test_y,test_pred,pos_label='Y'))
print('confusion matrix\n',confusion_matrix(test_y,test_pred))

tot test accuracy
accuracy score 0.7642276422764228
f1 score 0.8379888268156424
confusion matrix
 [[19 17]
 [12 75]]


In [86]:
submission.shape

(367, 2)

In [87]:
display(submission.head(),submission.tail())

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,N
1,LP001022,N
2,LP001031,N
3,LP001035,N
4,LP001051,N


Unnamed: 0,Loan_ID,Loan_Status
362,LP002971,N
363,LP002975,N
364,LP002980,N
365,LP002986,N
366,LP002989,N


## Deploy

In [88]:
submission.Loan_Status = tot_test_pred

In [89]:
display(submission.head(),submission.tail())

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,N
4,LP001051,Y


Unnamed: 0,Loan_ID,Loan_Status
362,LP002971,Y
363,LP002975,Y
364,LP002980,N
365,LP002986,Y
366,LP002989,Y


In [90]:
submission.to_csv('Loan_submission_res.csv',index=False)