In [3]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

In [4]:
loan_data  = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_train.csv",index_col=False)
loan_data = loan_data.drop(['Unnamed: 0'], axis=1)
loan_data.head()
     

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP002305,Female,No,0,Graduate,No,4547,0.0,115.0,360.0,1.0,Semiurban,1
1,LP001715,Male,Yes,3+,Not Graduate,Yes,5703,0.0,130.0,360.0,1.0,Rural,1
2,LP002086,Female,Yes,0,Graduate,No,4333,2451.0,110.0,360.0,1.0,Urban,0
3,LP001136,Male,Yes,0,Not Graduate,Yes,4695,0.0,96.0,,1.0,Urban,1
4,LP002529,Male,Yes,2,Graduate,No,6700,1750.0,230.0,300.0,1.0,Semiurban,1


In [5]:

test_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_test.csv')
test_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001116,Male,No,0,Not Graduate,No,3748,1668.0,110.0,360.0,1.0,Semiurban
1,LP001488,Male,Yes,3+,Graduate,No,4000,7750.0,290.0,360.0,1.0,Semiurban
2,LP002138,Male,Yes,0,Graduate,No,2625,6250.0,187.0,360.0,1.0,Rural
3,LP002284,Male,No,0,Not Graduate,No,3902,1666.0,109.0,360.0,1.0,Rural
4,LP002328,Male,Yes,0,Not Graduate,No,6096,0.0,218.0,360.0,0.0,Rural


In [6]:
print(loan_data.shape)
print(test_data.shape)

(491, 13)
(123, 12)


In [42]:
loan_status=loan_data['Loan_Status'].value_counts(normalize=True).mul(100).round(1).rename_axis('unique_value').reset_index(name='count')
education=loan_data['Education'].value_counts().rename_axis('unique_value').reset_index(name='count')
education


Unnamed: 0,unique_value,count
0,Graduate,388
1,Not Graduate,103


In [47]:
loan_data['Dependents']

0       0
1      3+
2       0
3       0
4       2
       ..
486     1
487     1
488     1
489     0
490     0
Name: Dependents, Length: 491, dtype: object

In [36]:
loan_data.isnull().sum()

Loan_ID               0
Gender               10
Married               1
Dependents            9
Education             0
Self_Employed        29
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           16
Loan_Amount_Term     13
Credit_History       43
Property_Area         0
Loan_Status           0
dtype: int64

In [40]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491 entries, 0 to 490
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            491 non-null    object 
 1   Gender             481 non-null    object 
 2   Married            490 non-null    object 
 3   Dependents         482 non-null    object 
 4   Education          491 non-null    object 
 5   Self_Employed      462 non-null    object 
 6   ApplicantIncome    491 non-null    int64  
 7   CoapplicantIncome  491 non-null    float64
 8   LoanAmount         475 non-null    float64
 9   Loan_Amount_Term   478 non-null    float64
 10  Credit_History     448 non-null    float64
 11  Property_Area      491 non-null    object 
 12  Loan_Status        491 non-null    int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 50.0+ KB


In [70]:
loan_data.drop(columns=['Loan_ID'],inplace=True)

In [65]:
loan_data['Dependents'].replace('3+',3,inplace=True)
loan_data['Dependents']=loan_data['Dependents'].astype('float')

In [83]:
X=loan_data.drop(columns=['Loan_Status'])
y=loan_data[['Loan_Status']]


In [78]:
x_train, x_valid, y_train, y_valid = train_test_split(X,y, test_size=0.3)

In [86]:
num_column=X.select_dtypes(exclude='object').columns
cat_column=y.select_dtypes(include='object').columns


In [87]:
Gender_value=['Male','Female']
Married_value=['Yes','No']
Education_value=['Graduate','Not Graduate']
Property_Area_value=['Semiurban','Urban','Rural']

In [88]:
##Numerical_Pipieline
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scalar',StandardScaler())
    ]
)
##categorical_pipeline
cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoder',OrdinalEncoder(categories=[Gender_value,Married_value,Education_value,Property_Area_value])),
        ('scalar',StandardScaler())
    ]
)


preprocessor=ColumnTransformer([
    ('num_pipe',num_pipeline,num_column),
    ('cat_pipe',cat_pipeline,cat_column)
]
)

In [89]:
x_train=preprocessor.fit_transform(x_train)
x_valid=preprocessor.transform(x_valid)

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
model = LogisticRegression()
model.fit(x_train, y_train)

In [93]:
pred_data=model.predict(x_valid)
print('Model Accuracy = ', accuracy_score(y_valid,pred_data))
print('Model F1-Score = ', f1_score(y_valid,pred_data))

Model Accuracy =  0.7905405405405406
Model F1-Score =  0.858447488584475


In [95]:
test_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001116,Male,No,0,Not Graduate,No,3748,1668.0,110.0,360.0,1.0,Semiurban
1,LP001488,Male,Yes,3+,Graduate,No,4000,7750.0,290.0,360.0,1.0,Semiurban
2,LP002138,Male,Yes,0,Graduate,No,2625,6250.0,187.0,360.0,1.0,Rural
3,LP002284,Male,No,0,Not Graduate,No,3902,1666.0,109.0,360.0,1.0,Rural
4,LP002328,Male,Yes,0,Not Graduate,No,6096,0.0,218.0,360.0,0.0,Rural
...,...,...,...,...,...,...,...,...,...,...,...,...
118,LP002683,Male,No,0,Graduate,No,4683,1915.0,185.0,360.0,1.0,Semiurban
119,LP002054,Male,Yes,2,Not Graduate,No,3601,1590.0,,360.0,1.0,Rural
120,LP002757,Female,Yes,0,Not Graduate,No,3017,663.0,102.0,360.0,,Semiurban
121,LP002582,Female,No,0,Not Graduate,Yes,17263,0.0,225.0,360.0,1.0,Semiurban
