In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw loan applications table from loan.csv (path is relative to the
# notebook's working directory).
data  = pd.read_csv("loan.csv")

In [3]:
data.shape

(614, 13)

In [4]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [6]:
data.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
data.duplicated().sum()

0

In [8]:
data.Gender

0        Male
1        Male
2        Male
3        Male
4        Male
        ...  
609    Female
610      Male
611      Male
612      Male
613    Female
Name: Gender, Length: 614, dtype: object

In [9]:
data.Married.value_counts()

Yes    398
No     213
Name: Married, dtype: int64

In [10]:
# Convert the textual dependents counts to numbers ("3+" is stored as 3).
# Any value that is not a key of the table -- including NaN -- is passed
# through unchanged, so missing entries stay missing.
dependents_codes = {'0': 0, '1': 1, '2': 2, '3+': 3}
data.Dependents = data.Dependents.apply(lambda raw: dependents_codes.get(raw, raw))

In [11]:
data.Dependents.unique()

array([ 0.,  1.,  2.,  3., nan])

In [12]:
data.Self_Employed.unique()

array(['No', 'Yes', nan], dtype=object)

In [13]:
data.LoanAmount.unique()

array([ nan, 128.,  66., 120., 141., 267.,  95., 158., 168., 349.,  70.,
       109., 200., 114.,  17., 125., 100.,  76., 133., 115., 104., 315.,
       116., 112., 151., 191., 122., 110.,  35., 201.,  74., 106., 320.,
       144., 184.,  80.,  47.,  75., 134.,  96.,  88.,  44., 286.,  97.,
       135., 180.,  99., 165., 258., 126., 312., 136., 172.,  81., 187.,
       113., 176., 130., 111., 167., 265.,  50., 210., 175., 131., 188.,
        25., 137., 160., 225., 216.,  94., 139., 152., 118., 185., 154.,
        85., 259., 194.,  93., 370., 182., 650., 102., 290.,  84., 242.,
       129.,  30., 244., 600., 255.,  98., 275., 121.,  63., 700.,  87.,
       101., 495.,  67.,  73., 260., 108.,  58.,  48., 164., 170.,  83.,
        90., 166., 124.,  55.,  59., 127., 214., 240.,  72.,  60., 138.,
        42., 280., 140., 155., 123., 279., 192., 304., 330., 150., 207.,
       436.,  78.,  54.,  89., 143., 105., 132., 480.,  56., 159., 300.,
       376., 117.,  71., 490., 173.,  46., 228., 30

In [14]:
data.Loan_Amount_Term

0      360.0
1      360.0
2      360.0
3      360.0
4      360.0
       ...  
609    360.0
610    180.0
611    360.0
612    360.0
613    360.0
Name: Loan_Amount_Term, Length: 614, dtype: float64

In [15]:
data.Credit_History.unique()

array([ 1.,  0., nan])

### Filling Missing Data After Column-Wise Study

In [16]:
# Impute every column that has missing values: the most frequent value (mode)
# for the categorical / discrete columns, the median for LoanAmount.
fill_values = {
    "Gender": data["Gender"].mode().iloc[0],
    "Married": data["Married"].mode().iloc[0],
    "Dependents": data["Dependents"].mode().iloc[0],
    "Self_Employed": data["Self_Employed"].mode().iloc[0],
    "LoanAmount": data["LoanAmount"].median(),
    "Loan_Amount_Term": data["Loan_Amount_Term"].mode().iloc[0],
    "Credit_History": data["Credit_History"].mode().iloc[0],
}
data = data.fillna(value=fill_values)

In [17]:
data.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# Data Transformations

In [18]:
# Encode the four two-valued columns as 0/1 flags: the listed label becomes 1
# and every other value becomes 0.
positive_labels = {
    "Gender": "Male",
    "Married": "Yes",
    "Education": "Graduate",
    "Self_Employed": "Yes",
}
for column, positive in positive_labels.items():
    data[column] = data[column].eq(positive).astype("int64")


In [19]:
data.Education.value_counts()

1    480
0    134
Name: Education, dtype: int64

In [20]:
data.Self_Employed

0      0
1      0
2      1
3      0
4      0
      ..
609    0
610    0
611    0
612    0
613    1
Name: Self_Employed, Length: 614, dtype: int64

In [21]:
data.CoapplicantIncome

0         0.0
1      1508.0
2         0.0
3      2358.0
4         0.0
        ...  
609       0.0
610       0.0
611     240.0
612       0.0
613       0.0
Name: CoapplicantIncome, Length: 614, dtype: float64

In [22]:
data.Loan_Amount_Term

0      360.0
1      360.0
2      360.0
3      360.0
4      360.0
       ...  
609    360.0
610    180.0
611    360.0
612    360.0
613    360.0
Name: Loan_Amount_Term, Length: 614, dtype: float64

In [23]:
data.Credit_History.value_counts()

1.0    525
0.0     89
Name: Credit_History, dtype: int64

In [24]:
data.Property_Area.value_counts()

Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64

In [25]:
# Reduce Property_Area to a rural / non-rural flag: 'Rural' -> 0, anything else -> 1.
# NOTE(review): this merges 'Semiurban' and 'Urban' into a single class, so the
# three-way distinction is lost before any one-hot encoding -- confirm this is intended.
data.Property_Area = data.Property_Area.ne('Rural').astype('int64')

In [26]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the (already 0/1) Property_Area column and show the dense
# result; one indicator column is produced per distinct value.
ohe = OneHotEncoder()
property_area_sparse = ohe.fit_transform(data[['Property_Area']])
encoded = property_area_sparse.toarray()
encoded

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [27]:
# pandas equivalent of the one-hot encoding above: one 0/1 indicator column per
# distinct Property_Area value. The result is only displayed, not stored.
pd.get_dummies(data.Property_Area).astype(int)

Unnamed: 0,0,1
0,0,1
1,1,0
2,0,1
3,0,1
4,0,1
...,...,...
609,1,0
610,1,0
611,0,1
612,0,1


In [28]:
# Encode the target: "Y" (approved) -> 1, every other value -> 0.
data.Loan_Status = data.Loan_Status.eq("Y").astype("int64")

In [29]:
# Loan_ID is only a row identifier, so it is removed before modelling.
data = data.drop(columns=["Loan_ID"])

In [30]:
data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0.0,1,0,5849,0.0,128.0,360.0,1.0,1,1
1,1,1,1.0,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0.0,1,1,3000,0.0,66.0,360.0,1.0,1,1
3,1,1,0.0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
4,1,0,0.0,1,0,6000,0.0,141.0,360.0,1.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0.0,1,0,2900,0.0,71.0,360.0,1.0,0,1
610,1,1,3.0,1,0,4106,0.0,40.0,180.0,1.0,0,1
611,1,1,1.0,1,0,8072,240.0,253.0,360.0,1.0,1,1
612,1,1,2.0,1,0,7583,0.0,187.0,360.0,1.0,1,1


In [31]:
# Feature matrix: every column except the Loan_Status target.
X = data.drop(columns=["Loan_Status"])
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0.0,1,0,5849,0.0,128.0,360.0,1.0,1
1,1,1,1.0,1,0,4583,1508.0,128.0,360.0,1.0,0
2,1,1,0.0,1,1,3000,0.0,66.0,360.0,1.0,1
3,1,1,0.0,0,0,2583,2358.0,120.0,360.0,1.0,1
4,1,0,0.0,1,0,6000,0.0,141.0,360.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0.0,1,0,2900,0.0,71.0,360.0,1.0,0
610,1,1,3.0,1,0,4106,0.0,40.0,180.0,1.0,0
611,1,1,1.0,1,0,8072,240.0,253.0,360.0,1.0,1
612,1,1,2.0,1,0,7583,0.0,187.0,360.0,1.0,1


In [32]:
# Target vector: the 0/1 encoded Loan_Status column.
y = data["Loan_Status"]
y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int64

In [33]:
from sklearn.model_selection import train_test_split
# Hold out part of the rows for evaluation; random_state=0 fixes the shuffle so
# the split is reproducible. No test_size is given, so the library default is used
# (the reports further down show 460 training rows and 154 test rows).
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

In [34]:
from sklearn.linear_model import LogisticRegression


In [35]:
# Logistic regression with all scikit-learn default settings.
# NOTE(review): the features are not scaled before fitting; with the default
# iteration limit the solver may not fully converge -- confirm no
# ConvergenceWarning is raised when the model is fitted.
classifer=LogisticRegression()

In [36]:
# Fit the model on the training portion only; the test rows stay unseen.
classifer.fit(X_train,y_train)

# Predictions of Training Data

In [37]:
# Predicted 0/1 labels for the rows the model was trained on.
y_trian_pred=classifer.predict(X_train)

In [38]:
pd.DataFrame({"orginal":y_train,"predicted":y_trian_pred})

Unnamed: 0,orginal,predicted
46,1,1
272,1,1
474,1,1
382,1,1
283,1,1
...,...,...
277,1,1
9,0,1
359,1,1
192,0,1


# Evaluation Metrics of a Classification Model
# Accuracy
Accuracy = correct predictions / total predictions

In [39]:
# Hand-computed accuracy on the training set: count the positions where the
# actual and predicted labels agree, then express that as a percentage.
correct = sum(1 for actual, predicted in zip(y_train, y_trian_pred) if actual == predicted)

print(f"Accuracy of our classification model is {(correct/len(y_train)*100)}%")

Accuracy of our classification model is 80.21739130434783%


In [40]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train,y_trian_pred)

0.8021739130434783

In [41]:
from sklearn.metrics import classification_report
print(classification_report(y_train,y_trian_pred))

              precision    recall  f1-score   support

           0       0.89      0.44      0.59       149
           1       0.78      0.97      0.87       311

    accuracy                           0.80       460
   macro avg       0.84      0.71      0.73       460
weighted avg       0.82      0.80      0.78       460



In [42]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the training set: rows are the actual classes (0, 1),
# columns are the predicted classes (0, 1).
print(confusion_matrix(y_train,y_trian_pred))

[[ 66  83]
 [  8 303]]


# Predictions on Test Data

In [43]:
# Predicted 0/1 labels for the held-out test rows.
y_test_pred=classifer.predict(X_test)

In [44]:
pd.DataFrame({"Orginal":y_test,"predicted":y_test_pred})

Unnamed: 0,Orginal,predicted
454,1,1
52,0,1
536,1,1
469,0,1
55,1,1
...,...,...
399,0,0
89,1,1
271,1,1
563,1,1


In [45]:
# Hand-computed accuracy on the test set: count the positions where the actual
# and predicted labels agree, then express that as a percentage.
correct = sum(1 for actual, predicted in zip(y_test, y_test_pred) if actual == predicted)
print(f"Accuracy of our classifictio model is {(correct/len(y_test)*100)}%")
        

Accuracy of our classifictio model is 83.11688311688312%


In [46]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_test_pred))

0.8311688311688312


In [47]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.87      0.47      0.61        43
           1       0.82      0.97      0.89       111

    accuracy                           0.83       154
   macro avg       0.85      0.72      0.75       154
weighted avg       0.84      0.83      0.81       154



In [48]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the test set: rows are the actual classes (0, 1),
# columns are the predicted classes (0, 1).
print(confusion_matrix(y_test,y_test_pred))

[[ 20  23]
 [  3 108]]


In [49]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

# Print the four headline scores for the test set, one per line, in the order
# accuracy, precision, recall, F1.
for metric in (accuracy_score, precision_score, recall_score, f1_score):
    print(metric(y_test, y_test_pred))

0.8311688311688312
0.8244274809160306
0.972972972972973
0.8925619834710745


In [50]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0.0,1,0,5849,0.0,128.0,360.0,1.0,1,1
1,1,1,1.0,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0.0,1,1,3000,0.0,66.0,360.0,1.0,1,1
3,1,1,0.0,0,0,2583,2358.0,120.0,360.0,1.0,1,1
4,1,0,0.0,1,0,6000,0.0,141.0,360.0,1.0,1,1


In [None]:
# Row 0 of the encoded table, kept as a reference input for trying out a prediction.
# The pasted row was "index, 11 features, Loan_Status"; the leading index and
# the trailing label are NOT model inputs, so they are separated out here.
# Feature order: Gender, Married, Dependents, Education, Self_Employed,
# ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term,
# Credit_History, Property_Area.
reference_features = [1, 0, 0.0, 1, 0, 5849, 0.0, 128.0, 360.0, 1.0, 1]
reference_label = 1  # Loan_Status recorded for this row

In [51]:
def loan_status(data, model=None):
    """Return "Yes" or "No" for a single applicant record.

    Parameters
    ----------
    data : array-like of shape (1, n_features)
        One row of feature values; it is passed unchanged to ``model.predict``.
    model : object with a ``predict`` method, optional
        Classifier used for the prediction. When omitted, the notebook-level
        ``classifer`` is used.

    Returns
    -------
    str
        "No" when the predicted label is 0, otherwise "Yes".

    Raises
    ------
    ValueError
        If the model does not return exactly one prediction (for example when
        several rows are passed at once).
    """
    if model is None:
        model = classifer
    predictions = np.asarray(model.predict(data)).ravel()
    # Comparing a whole prediction array with 0 inside an ``if`` is only
    # well-defined for one element, so require exactly one and read it explicitly.
    if predictions.size != 1:
        raise ValueError(
            f"loan_status expects exactly one record, got {predictions.size} predictions"
        )
    return "No" if predictions[0] == 0 else "Yes"

In [53]:
# Collect one applicant record interactively and classify it.
# Values must be entered in feature order; the row index and Loan_Status are
# not inputs.
feature_names = list(data.columns[:-1])  # every column except the trailing Loan_Status target
record = []
for feature in feature_names:
    # Re-prompt on blank or non-numeric input instead of letting float() raise
    # ValueError and throw away everything entered so far.
    while True:
        raw_value = input(f"{feature} : ")
        try:
            record.append(float(raw_value))
            break
        except ValueError:
            print(f"'{raw_value}' is not a number, please re-enter {feature}.")
# Keep the column labels so the model receives the same named features it was
# fitted on, rather than an unlabeled array.
loan_status(pd.DataFrame([record], columns=feature_names))

Gender : 0
Married : 1
Dependents : 0
Education : 0.0
Self_Employed : 1
ApplicantIncome : 0
CoapplicantIncome : 5849
LoanAmount : 0.0
Loan_Amount_Term : 128.0
Credit_History : 360.0
Property_Area : 1.0




'Yes'