In [1]:
import pandas as pd 
import numpy as np
import sklearn 

## Reading the data with pandas


In [3]:
# Load the loan data set (relative path, resolved against the current working
# directory) and preview the first rows.
loan_csv_path = "loan_data_set.csv"
data = pd.read_csv(loan_csv_path)
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# View data information: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


# Count the missing values and remove incomplete rows

In [6]:
# Number of missing values in each column
data.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [7]:
# Work on an independent (deep) copy so the raw frame stays untouched
data2 = data.copy(deep=True)

In [8]:
# Discard every row that contains at least one missing value, then show the result
data2 = data2.dropna(axis=0, how="any")
data2

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of data2's numeric columns
data2.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,480.0,480.0,480.0,480.0,480.0
mean,5364.23125,1581.093583,144.735417,342.05,0.854167
std,5668.251251,2617.692267,80.508164,65.212401,0.353307
min,150.0,0.0,9.0,36.0,0.0
25%,2898.75,0.0,100.0,360.0,1.0
50%,3859.0,1084.5,128.0,360.0,1.0
75%,5852.5,2253.25,170.0,360.0,1.0
max,81000.0,33837.0,600.0,480.0,1.0


# One-Hot Encoding

In [10]:
# Start again from the raw frame (not the NaN-filtered data2) for the one-hot example
data3 = data.copy(deep=True)

In [14]:
from sklearn.preprocessing import OneHotEncoder

# The encoder expects a 2-D input, so the Education column is reshaped into a
# single-column array before fitting.
education_2d = data3['Education'].values.reshape(-1, 1)

ohe = OneHotEncoder()
# fit_transform returns a sparse matrix here; convert it to a dense array
ohef = ohe.fit_transform(education_2d).toarray()

# Categories discovered by the encoder, in encoded-column order
print(ohe.categories_)


[array(['Graduate', 'Not Graduate'], dtype=object)]


In [16]:
# Name each encoded column "Education<category>", following the encoder's category order
education_categories = ohe.categories_[0]
one_hot_column_names = ["Education" + str(category) for category in education_categories]
sampleOneHot = pd.DataFrame(ohef, columns=one_hot_column_names)

# Keep all indicator columns except the last one and append them column-wise
# to the original frame.
one_hot_kept = sampleOneHot.iloc[:, :-1]
sample_one_hot_final = pd.concat([data3, one_hot_kept], axis=1)

print(sample_one_hot_final.head())

    Loan_ID Gender Married Dependents     Education Self_Employed   
0  LP001002   Male      No          0      Graduate            No  \
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term   
0             5849                0.0         NaN             360.0  \
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  EducationGraduate  
0             1.0         Urban           Y                1.0  
1             1.0         Ru

# Label Encoding

In [19]:
data4 = data.copy(deep=True)

# pd.factorize returns (codes, uniques); codes number the distinct values in
# order of first appearance, and missing values receive the sentinel code -1.
self_employed_codes, self_employed_uniques = pd.factorize(data4['Self_Employed'])
data4['Self_Employed_Encoded'] = self_employed_codes

data4.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Self_Employed_Encoded
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,0
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,0
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,0


In [22]:
# Print every Property_Area value as plain text, without the index column
property_area_text = data4['Property_Area'].to_string(index=False)
print(property_area_text)

    Urban
    Rural
    Urban
    Urban
    Urban
    Urban
    Urban
Semiurban
    Urban
Semiurban
    Urban
    Urban
    Urban
    Rural
    Urban
    Urban
    Urban
    Urban
    Rural
    Urban
    Urban
    Urban
Semiurban
    Rural
Semiurban
Semiurban
Semiurban
    Urban
    Urban
Semiurban
    Urban
    Urban
    Rural
Semiurban
    Rural
    Urban
    Urban
Semiurban
    Urban
Semiurban
    Urban
    Urban
    Urban
Semiurban
    Urban
    Urban
    Urban
    Urban
    Urban
Semiurban
Semiurban
Semiurban
Semiurban
    Urban
    Urban
Semiurban
Semiurban
    Rural
    Urban
    Urban
    Urban
    Urban
    Rural
    Rural
Semiurban
Semiurban
    Urban
    Urban
    Urban
Semiurban
    Urban
Semiurban
Semiurban
Semiurban
Semiurban
    Urban
    Urban
    Urban
Semiurban
Semiurban
Semiurban
Semiurban
    Urban
Semiurban
    Urban
Semiurban
Semiurban
Semiurban
    Urban
Semiurban
Semiurban
Semiurban
    Urban
Semiurban
Semiurban
    Urban
Semiurban
Semiurban
Semiurban
Semiurban


In [21]:
# Display the encoded column only, as plain text without the index
encoded_column_text = data4['Self_Employed_Encoded'].to_string(index=False)
print(encoded_column_text)

 0
 0
 1
 0
 0
 1
 0
 0
 0
 0
 0
-1
 0
 0
 0
 0
 0
 0
 0
-1
 0
 0
 0
 0
-1
 1
 0
 0
 0
-1
-1
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 1
 0
 0
 1
 0
 0
 1
 0
 0
 1
 0
 1
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
-1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
-1
 0
 0
 0
-1
 0
 1
-1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 1
 0
 0
 1
 0
 0
 0
 1
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
-1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
-1
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 1
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 1
 0
 0
 0
 0
-1
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 1
 0
-1
 0
 0
 0
 0
-1
 1
 0
 0
 0
 0
 0
 1
 0
 1
 0
 0
 0
 0
 0
 0
 1
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
-1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 0
 0
 0
-1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 1
 0
 0
 0
 0
 0
 1
 0
 1
 0
 0
 0
 0
 0
-

# Ordinal Encoding

In [25]:
# Integer code assigned to each Property_Area category.
# NOTE(review): Semiurban is coded above Urban — confirm this ordering is intended.
dic = dict(Rural=1, Urban=2, Semiurban=3)

data5 = data.copy(deep=True)

# Translate each category through the lookup table; a value missing from the
# table would map to NaN.
property_area = data5['Property_Area']
data5['Enc_Property_Area'] = property_area.map(dic)

# Display the encoded column only
encoded_area_text = data5['Enc_Property_Area'].to_string(index=False)
print(encoded_area_text)

2
1
2
2
2
2
2
3
2
3
2
2
2
1
2
2
2
2
1
2
2
2
3
1
3
3
3
2
2
3
2
2
1
3
1
2
2
3
2
3
2
2
2
3
2
2
2
2
2
3
3
3
3
2
2
3
3
1
2
2
2
2
1
1
3
3
2
2
2
3
2
3
3
3
3
2
2
2
3
3
3
3
2
3
2
3
3
3
2
3
3
3
2
3
3
2
3
3
3
3
2
3
2
3
2
2
2
1
2
3
2
3
1
3
3
1
3
2
1
2
1
3
3
3
1
1
1
1
2
1
2
2
3
3
3
3
1
2
3
1
1
2
3
3
2
3
2
2
1
3
1
1
2
1
2
3
1
2
1
3
3
2
3
1
2
1
1
1
3
3
1
2
1
3
3
1
1
3
3
2
2
1
3
3
3
3
1
1
1
1
1
3
2
3
1
3
1
2
3
2
3
3
2
2
3
3
2
1
2
3
3
3
2
1
2
3
1
3
3
3
2
3
3
3
3
1
2
3
3
1
3
1
1
3
3
1
2
2
1
3
1
2
2
1
3
2
2
2
3
2
3
2
1
3
2
1
1
2
1
3
2
3
3
1
3
1
3
2
1
2
2
2
1
3
3
3
3
2
3
1
2
3
2
2
1
1
3
1
3
1
1
3
2
2
3
2
3
2
1
2
2
3
1
2
1
2
1
2
1
1
3
3
1
1
1
2
3
2
3
1
3
3
1
1
1
1
1
3
2
2
2
3
2
2
2
3
1
1
2
3
1
1
2
3
1
3
1
2
3
1
3
1
1
1
3
2
1
2
2
3
3
3
2
2
1
1
3
1
3
1
3
3
1
2
2
2
1
3
2
2
3
3
2
2
2
2
3
2
2
1
1
1
2
3
2
3
2
2
1
3
2
1
3
3
1
3
2
1
3
1
3
2
1
1
3
2
2
2
1
3
1
2
2
2
3
1
1
3
1
2
3
1
3
3
2
3
3
2
2
1
3
1
3
3
1
1
1
1
1
3
1
2
1
1
3
3
2
2
1
3
3
2
1
3
3
1
2
3
3
2
3
1
3
2
1
1
3
3
3
2
1
1
3
3
3
1
1
2
3
2
3
2
1
3
1
3
1
2
3
1


In [46]:
# Remove all string columns in data, except the Loan_Status target.
# The split cell slices features/target by position (last column = target),
# so Loan_Status has to remain the final column of data6.
data6 = data.copy()
non_numeric_features = data6.select_dtypes(exclude='number').columns.difference(['Loan_Status'])
data6 = data6.drop(columns=non_numeric_features)

# LogisticRegression rejects NaN inputs, so keep complete rows only.
data6 = data6.dropna()


# Split Data

In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Features are every column except the last; the target is the last column.
x, y = data6.iloc[:, :-1], data6.iloc[:, -1]

# Hold out 5% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.05,
    random_state=0,
)

In [None]:
# Logistic Regression
# accuracy_score lives in sklearn.metrics; sklearn.base does not export it,
# so importing it from there raises ImportError.
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Fit the classifier on the training split
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Predict the training and testing data
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# Calculate the accuracy of the model (fraction of correctly predicted labels)
train_acc_lr = accuracy_score(y_train, y_train_pred)
test_acc_lr = accuracy_score(y_test, y_test_pred)

print("Training accuracy of Logistic Regression: ", train_acc_lr)
print("Testing accuracy of Logistic Regression: ", test_acc_lr)