# Logistic Regression

In [1]:
# ------------------------------------------------------------------
# Build the Logistic Regression Model
# Predict the loan approval status based on various data elements
# ------------------------------------------------------------------

In [2]:
# Import Libraries
import pandas as pd

In [3]:
# Load the raw loan records from disk; LoanData stays untouched while all
# preprocessing is done on an independent (deep) copy named LoanPrep
LoanData = pd.read_csv(filepath_or_buffer="Loan_Data.csv")
LoanPrep = LoanData.copy(deep=True)
LoanPrep

Unnamed: 0,gender,married,ch,income,loanamt,status
0,Male,No,1.0,5849,,Y
1,Male,Yes,1.0,4583,128.0,N
2,Male,Yes,1.0,3000,66.0,Y
3,Male,Yes,1.0,2583,120.0,Y
4,Male,No,1.0,6000,141.0,Y
...,...,...,...,...,...,...
609,Female,No,1.0,2900,71.0,Y
610,Male,Yes,1.0,4106,40.0,Y
611,Male,Yes,1.0,8072,253.0,Y
612,Male,Yes,1.0,7583,187.0,Y


In [8]:
# Report the dataset dimensions, then count the missing entries per column
print(LoanPrep.shape)
LoanPrep.isna().sum()

(614, 6)


gender     13
married     3
ch         50
income      0
loanamt    22
status      0
dtype: int64

In [10]:
# Replace the missing values

# Categorical columns: fill with the mode (most frequent value) of each column.
# ch stands for Credit History - the values are either 1.0 or 0.0
# The mode is computed on cols_cat only, so no unrelated columns are scanned.
cols_cat = ['gender', 'married', 'ch']
LoanPrep[cols_cat] = LoanPrep[cols_cat].fillna(LoanPrep[cols_cat].mode().iloc[0])


# Numerical columns: fill with the column mean.
# The mean is taken over cols_num only. Calling DataFrame.mean() on the whole
# frame would also try to average the text columns (married, status), which
# pandas 2.0+ rejects with a TypeError instead of silently skipping them.
cols_num = ['loanamt']
LoanPrep[cols_num] = LoanPrep[cols_num].fillna(LoanPrep[cols_num].mean())

# Confirm that no missing values remain
LoanPrep.isnull().sum(axis=0)

gender     0
married    0
ch         0
income     0
loanamt    0
status     0
dtype: int64

In [11]:
# Remove columns considered irrelevant - the working assumption here is that
# gender is not a factor any bank uses when deciding on loans
LoanPrep = LoanPrep.drop(columns=['gender'])

In [17]:
# Show the dtype of every column currently in LoanPrep
LoanPrep.dtypes

married     object
ch         float64
income       int64
loanamt    float64
status      object
dtype: object

In [18]:
# One-hot encode (create dummy variables for) the categorical columns

# No column list is passed, so get_dummies picks the categorical columns itself;
# drop_first=True drops the first level of each encoded column, leaving a single
# indicator column for every two-level feature
LoanPrep = pd.get_dummies(data=LoanPrep, drop_first=True)

In [19]:
# Show the dtype of every column currently in LoanPrep
LoanPrep.dtypes

ch             float64
income           int64
loanamt        float64
married_Yes      uint8
status_Y         uint8
dtype: object

In [20]:
# Normalize the data (Income and Loan Amount) Using StandardScaler
from sklearn.preprocessing import StandardScaler
scalar_ = StandardScaler()

# Fit once on both columns. StandardScaler standardizes each feature
# independently, so the scaled values are the same as with one fit per column,
# but the fitted scaler now retains the mean/scale of BOTH columns and can be
# reused to transform new rows (a second fit_transform call would overwrite
# the 'income' statistics with those of 'loanamt').
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the training features - consider
# fitting on the training split only.
cols_scale = ['income', 'loanamt']
scaled_values = scalar_.fit_transform(LoanPrep[cols_scale])

# Write each scaled feature back as a 1-D column
for position, column in enumerate(cols_scale):
    LoanPrep[column] = scaled_values[:, position]

In [21]:
# Separate the features from the target
# -------------------------------------
# X (independent variables): every column except the encoded loan status
# Y (dependent variable): the status_Y indicator, kept as a one-column DataFrame
X = LoanPrep.drop(columns=['status_Y'])
Y = LoanPrep.loc[:, ['status_Y']]

In [22]:
# Hold out 30% of the rows as the test set. The split is stratified on Y and
# uses a fixed random_state so that it is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
    stratify=Y,
)

In [23]:
# Build the Logistic Regression model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

# Y_train is a one-column DataFrame (a column vector). scikit-learn expects a
# 1-D target and emits a DataConversionWarning otherwise, so flatten it to a
# 1-D array before fitting. The fitted model is the same; only the warning goes.
lr.fit(X_train, Y_train.values.ravel())

  return f(**kwargs)


LogisticRegression()

In [24]:
# Predict the outcome using Test data:
# the fitted model returns one predicted class label for each row of X_test
Y_predict = lr.predict(X_test)

In [26]:
# Display the predicted class labels for the test rows
Y_predict

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1], dtype=uint8)

In [28]:
# Build the confusion matrix from the true and predicted test labels
# (rows correspond to the actual class, columns to the predicted class)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=Y_test, y_pred=Y_predict)

cm

array([[ 32,  26],
       [  2, 125]])

In [29]:
# Mean accuracy of the fitted classifier on the held-out test data
score = lr.score(X_test, Y_test)
score

0.8486486486486486