# Practice Project

## Loan Eligibility Classification

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
loan = pd.read_csv("datasets/loan detection/loan.csv")

In [3]:
loan.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Pre-processing training data

In [4]:
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
loan.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
loan['Credit_History'].value_counts()

Credit_History
1.0    475
0.0     89
Name: count, dtype: int64

In [7]:
loan['Credit_History'] = loan['Credit_History'].fillna(0.0)

In [8]:
loan['Self_Employed'].value_counts()

Self_Employed
No     500
Yes     82
Name: count, dtype: int64

In [9]:
loan['Self_Employed'] = loan['Self_Employed'].fillna('No')

In [10]:
loan.dropna(inplace=True)

In [11]:
categoricals = ['Gender', 'Education', 'Property_Area', 'Self_Employed', 'Married']
for cat in categoricals:
    print(loan[cat].value_counts())
    print("_"*20)

Gender
Male      449
Female    104
Name: count, dtype: int64
____________________
Education
Graduate        437
Not Graduate    116
Name: count, dtype: int64
____________________
Property_Area
Semiurban    215
Urban        175
Rural        163
Name: count, dtype: int64
____________________
Self_Employed
No     481
Yes     72
Name: count, dtype: int64
____________________
Married
Yes    359
No     194
Name: count, dtype: int64
____________________


In [12]:
loan['Gender'] = loan['Gender'].map({'Female':0, 'Male':1})
loan['Education'] = loan['Education'].map({'Not Graduate':0, 'Graduate':1})
loan['Married'] = loan['Married'].map({'No':0, 'Yes':1})
loan['Self_Employed'] = loan['Self_Employed'].map({'No':0, 'Yes':1})

In [13]:
loan.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,Urban,Y


In [14]:
encoded = pd.get_dummies(loan, columns=['Property_Area'], dtype=int, drop_first=True)
encoded

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Semiurban,Property_Area_Urban
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,N,0,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,Y,0,1
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,Y,0,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,Y,0,1
5,LP001011,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,Y,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0,0,0,1,0,2900,0.0,71.0,360.0,1.0,Y,0,0
610,LP002979,1,1,3+,1,0,4106,0.0,40.0,180.0,1.0,Y,0,0
611,LP002983,1,1,1,1,0,8072,240.0,253.0,360.0,1.0,Y,0,1
612,LP002984,1,1,2,1,0,7583,0.0,187.0,360.0,1.0,Y,0,1


In [15]:
encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 553 entries, 1 to 613
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Loan_ID                  553 non-null    object 
 1   Gender                   553 non-null    int64  
 2   Married                  553 non-null    int64  
 3   Dependents               553 non-null    object 
 4   Education                553 non-null    int64  
 5   Self_Employed            553 non-null    int64  
 6   ApplicantIncome          553 non-null    int64  
 7   CoapplicantIncome        553 non-null    float64
 8   LoanAmount               553 non-null    float64
 9   Loan_Amount_Term         553 non-null    float64
 10  Credit_History           553 non-null    float64
 11  Loan_Status              553 non-null    object 
 12  Property_Area_Semiurban  553 non-null    int32  
 13  Property_Area_Urban      553 non-null    int32  
dtypes: float64(4), int32(2), int64(

In [16]:
encoded['Dependents'].value_counts()

Dependents
0     316
1      96
2      96
3+     45
Name: count, dtype: int64

In [17]:
encoded['Dependents'] = encoded['Dependents'].replace('3+', 3).astype(int) # '3'

In [18]:
encoded['Dependents'].dtype

dtype('int32')

In [19]:
encoded.drop('Loan_ID', axis=1, inplace=True)

In [20]:
X = encoded.drop('Loan_Status', axis=1)
y = encoded['Loan_Status']

In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [22]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)

In [26]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [27]:
nb_acc = accuracy_score(nb_pred, y_test)
nb_acc*100

77.47747747747748

In [29]:
nb_report = classification_report(nb_pred, y_test)
print(nb_report)

              precision    recall  f1-score   support

           0       0.53      0.67      0.59        27
           1       0.88      0.81      0.84        84

    accuracy                           0.77       111
   macro avg       0.71      0.74      0.72       111
weighted avg       0.80      0.77      0.78       111



In [30]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

In [31]:
knn_acc = accuracy_score(knn_pred, y_test)
knn_report = classification_report(knn_pred, y_test)

In [32]:
knn_acc

0.7027027027027027

In [33]:
print(knn_report)

              precision    recall  f1-score   support

           0       0.38      0.52      0.44        25
           1       0.84      0.76      0.80        86

    accuracy                           0.70       111
   macro avg       0.61      0.64      0.62       111
weighted avg       0.74      0.70      0.72       111



In [36]:
from sklearn.svm import SVC
svc = SVC()

svc.fit(X_train, y_train)
scv_pred = svc.predict(X_test)

In [39]:
svc_acc = accuracy_score(scv_pred, y_test)
svc_report = classification_report(scv_pred, y_test)

In [40]:
svc_acc

0.7387387387387387

In [41]:
print(svc_report)

              precision    recall  f1-score   support

           0       0.44      0.60      0.51        25
           1       0.87      0.78      0.82        86

    accuracy                           0.74       111
   macro avg       0.66      0.69      0.67       111
weighted avg       0.77      0.74      0.75       111



In [42]:
nb_pred

array([1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1])

In [43]:
le.inverse_transform(nb_pred)

array(['Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y',
       'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y'], dtype=object)