### Import Libraries

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
# `plt` is the conventional alias for the pyplot plotting interface; the
# top-level matplotlib package itself does not provide plot()/show().
import matplotlib.pyplot as plt

# Silence every warning for the rest of the session so cell output stays uncluttered.
# NOTE(review): this is a blanket filter - it also hides any warnings emitted by
# pandas or scikit-learn in later cells; consider narrowing it to a category.
import warnings
warnings.filterwarnings('ignore')

#### Load Dataset

In [2]:
# Read the loan records from the CSV file (the path is relative to the
# notebook's working directory) and preview the first rows.
df = pd.read_csv("loan_data.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [4]:
# Non-null entries per column; a count below the 614 total rows means missing values.
df.count()

Loan_ID              614
Gender               601
Married              611
Dependents           599
Education            614
Self_Employed        582
ApplicantIncome      614
CoapplicantIncome    614
LoanAmount           592
Loan_Amount_Term     600
Credit_History       564
Property_Area        614
Loan_Status          614
dtype: int64

#### As can be seen from the counts above, some columns in the dataset have missing values

#### Now, the rows with missing values need to be removed so that the dataset is clean and consistent

In [5]:
# Discard every row that has at least one missing value, then any exact
# duplicate rows, and confirm that all columns now report the same count.
df = df.dropna(how='any').drop_duplicates()
df.count()

Loan_ID              480
Gender               480
Married              480
Dependents           480
Education            480
Self_Employed        480
ApplicantIncome      480
CoapplicantIncome    480
LoanAmount           480
Loan_Amount_Term     480
Credit_History       480
Property_Area        480
Loan_Status          480
dtype: int64

### All columns now have the same count, i.e. the dataset has no missing or null values

#### Now, the target variable needs to be encoded in binary form

In [6]:
# Convert the Loan_Status labels into integer codes for the classifier.
# LabelEncoder numbers the classes in sorted order (presumably 'N' -> 0 and
# 'Y' -> 1, given the values visible in the data preview).
encode = LabelEncoder()
df['Loan_Status'] = encode.fit_transform(df['Loan_Status'])

### Splitting the data into training and testing sets

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=0)

# Separate the target from the independent variables. Loan_ID is dropped
# together with the target, so neither column is used as a feature.
non_feature_cols = ['Loan_ID', 'Loan_Status']
train_x, train_y = train.drop(columns=non_feature_cols), train['Loan_Status']
test_x, test_y = test.drop(columns=non_feature_cols), test['Loan_Status']

#### Encoding the data

In [8]:
# One-hot encode the categorical feature columns of the training set.
train_x = pd.get_dummies(train_x)

# Encode the test set the same way, then align it to the training columns.
# Encoding the two frames independently only yields matching columns when every
# category happens to appear in both splits; reindexing guarantees the model
# sees the same features in the same order (a category absent from the test
# split becomes an all-zero column, one unseen in training is dropped).
test_x = pd.get_dummies(test_x).reindex(columns=train_x.columns, fill_value=0)

In [9]:
# (rows, columns) of the encoded training features.
train_x.shape

(384, 20)

In [10]:
# (rows, columns) of the encoded test features; the column count should match the training set.
test_x.shape

(96, 20)

### Creating the model for prediction

In [15]:
# Fit a logistic-regression classifier with default settings on the training
# split (fit() returns the fitted estimator itself).
model = LogisticRegression().fit(train_x, train_y)

# Predict the held-out rows and report the share of correct predictions as a percentage.
pred = model.predict(test_x)
accuracy_pct = accuracy_score(test_y, pred) * 100

print("\nAccuracy Score on Test Data : \n")
print(accuracy_pct)


Accuracy Score on Test Data : 

72.91666666666666


## :)