In [4]:
import numpy as np
import pandas as pd

In [5]:
raw_data = pd.read_csv('loan-train.csv')
raw_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
raw_data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

## Preprocessing Data

In [7]:
#Dealing with Null Vales. Interpolate for numbers and replace with mode for categorical data
raw_data.interpolate(method = 'linear', limit_direction = 'both', inplace=True)
raw_data.fillna(raw_data.select_dtypes(include='object').mode().iloc[0], inplace=True)
print(raw_data.isnull().sum())

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [8]:
#Checkpoint
df = raw_data.copy()

In [9]:
#Drop ID
df  = df.drop(['Loan_ID'], axis = 1)

In [10]:
#Convert categorical to numeric
#Some cateogries were binary. Could convert them using map as well
#Ex: df['Gender'] = df['Gender'].map({'Female':1, 'Male':0})
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,5849,0.0,128.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1


In [11]:
df.rename({'Loan_Status_Y': 'Loan_Status'}, axis=1, inplace=True)
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status
0,5849,0.0,128.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1


In [12]:
df.to_csv('df-preprocessed.csv', index = False)

### Part 2: Scaling required inputs

In [13]:
df_copy = pd.read_csv('df-preprocessed.csv')
df_copy.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status
0,5849,0.0,128.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1


### Balancing the dataset

In [14]:
#Check if training set is balanced
df['Loan_Status'].sum()/df.shape[0]

0.6872964169381107

In [19]:
#Checkpoint
df = df_copy.copy()
df.shape
df_equal = df.copy()

In [20]:
"""
#Balance dataset using downsampling. since we have less data, we'll use upsampling after dividing into train and test sets
# Count how many targets are 1 (meaning that the customer did convert)
num_zero_targets = int(df.shape[0] - df['Loan_Status'].sum())

# Set a counter for targets that are 0 (meaning that the customer did not convert)
one_targets_counter = 0

# We want to create a "balanced" dataset, so we will have to remove some input/target pairs.
# Declare a variable that will do that:
indices_to_remove = []

# Count the number of targets that are 1. 
# Once there are as many 0s as 1s, mark entries where the target is 1.
for i in range(df.shape[0]):
    if df['Loan_Status'][i] == 1:
        one_targets_counter += 1
        if one_targets_counter > num_zero_targets:
            indices_to_remove.append(i)

#print(df_copy.shape)
# We delete all indices that we marked "to remove" in the loop above.
df_equal = df.drop(df.index[indices_to_remove])
df_equal = df_equal.reset_index()
#Check if training set is balanced
print(df_equal.shape)
df_equal['Loan_Status'].sum()/df_equal.shape[0]"""


'\n#Balance dataset using downsampling. since we have less data, we\'ll use upsampling after dividing into train and test sets\n# Count how many targets are 1 (meaning that the customer did convert)\nnum_zero_targets = int(df.shape[0] - df[\'Loan_Status\'].sum())\n\n# Set a counter for targets that are 0 (meaning that the customer did not convert)\none_targets_counter = 0\n\n# We want to create a "balanced" dataset, so we will have to remove some input/target pairs.\n# Declare a variable that will do that:\nindices_to_remove = []\n\n# Count the number of targets that are 1. \n# Once there are as many 0s as 1s, mark entries where the target is 1.\nfor i in range(df.shape[0]):\n    if df[\'Loan_Status\'][i] == 1:\n        one_targets_counter += 1\n        if one_targets_counter > num_zero_targets:\n            indices_to_remove.append(i)\n\n#print(df_copy.shape)\n# We delete all indices that we marked "to remove" in the loop above.\ndf_equal = df.drop(df.index[indices_to_remove])\ndf_equ

### Scale required columns

In [22]:
#df_equal  = df_equal.drop(['index'], axis = 1)
df_equal

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status
0,5849,0.0,128.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,2900,0.0,71.0,360.0,1.0,0,0,0,0,0,0,0,0,0,1
610,4106,0.0,40.0,180.0,1.0,1,1,0,0,1,0,0,0,0,1
611,8072,240.0,253.0,360.0,1.0,1,1,1,0,0,0,0,0,1,1
612,7583,0.0,187.0,360.0,1.0,1,1,0,1,0,0,0,0,1,1


In [23]:
unscaled_inputs = df_equal.iloc[:,:-1]
targets = df_equal['Loan_Status']
print(targets.shape)
unscaled_inputs.shape

(614,)


(614, 14)

In [24]:
#Scaling columns
from sklearn.preprocessing import StandardScaler

# define scaler as an object
insurance_scaler = StandardScaler()

In [25]:
#Custom scaler to prevent scaling of categoric columns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler(copy,with_mean,with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method, which, again based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        print(X_scaled.shape)
        print(X_not_scaled.shape)
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [26]:
column_names = unscaled_inputs.columns.tolist()
columns_to_scale = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History']
columns_to_not_scale = [x for x in column_names if x not in columns_to_scale]
columns_to_not_scale


['Gender_Male',
 'Married_Yes',
 'Dependents_1',
 'Dependents_2',
 'Dependents_3+',
 'Education_Not Graduate',
 'Self_Employed_Yes',
 'Property_Area_Semiurban',
 'Property_Area_Urban']

In [27]:
#Declare instance of abv class with columns to scale
loan_scaler = CustomScaler(columns_to_scale)
loan_scaler.fit(unscaled_inputs)
scaled_inputs = loan_scaler.transform(unscaled_inputs)
scaled_inputs

(614, 5)
(614, 9)




Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,0.072991,-0.554487,-0.220885,0.278198,0.447830,1,0,0,0,0,0,0,0,1
1,-0.134412,-0.038732,-0.220885,0.278198,0.447830,1,1,1,0,0,0,0,0,0
2,-0.393747,-0.554487,-0.946114,0.278198,0.447830,1,1,0,0,0,0,1,0,1
3,-0.462062,0.251980,-0.314463,0.278198,0.447830,1,1,0,0,0,1,0,0,1
4,0.097728,-0.554487,-0.068821,0.278198,0.447830,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,-0.410130,-0.554487,-0.887628,0.278198,0.447830,0,0,0,0,0,0,0,0,0
610,-0.212557,-0.554487,-1.250242,-2.506802,0.447830,1,1,0,0,1,0,0,0,0
611,0.437174,-0.472404,1.241269,0.278198,0.447830,1,1,1,0,0,0,0,0,1
612,0.357064,-0.554487,0.469252,0.278198,0.447830,1,1,0,1,0,0,0,0,1


In [28]:
scaled_inputs.shape

(614, 14)

## Upsampling(Balancing train data)

In [53]:
from imblearn.over_sampling import SMOTE

In [54]:
sm = SMOTE(random_state=23, sampling_strategy=1.0)
print(len(scaled_inputs), len(targets))
scaled_inputs_sm,targets_sm = sm.fit_resample(scaled_inputs, targets)
print(len(scaled_inputs_sm), len(targets_sm))

614 614
844 844


## Split the data into train & test and shuffle

In [55]:
# import train_test_split so we can split our data into train and test
from sklearn.model_selection import train_test_split

In [56]:
# declare 4 variables for the split
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, #train_size = 0.8, 
                                                                            test_size = 0.2, random_state = 20)

In [57]:
# check the shape of the train inputs and targets
print (x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(491, 14) (491,) (123, 14) (123,)


## Logistic regression with sklearn

In [58]:
# import the LogReg model from sklearn
from sklearn.linear_model import LogisticRegression

# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics

In [60]:
reg = LogisticRegression()
reg.fit(x_train,y_train)
reg.score(x_train,y_train)
#Accuracy with downsampling ~ 71%
#Accuracy with upsampling ~ 82%

0.824847250509165

### Manually check the accuracy

In [61]:
model_outputs = reg.predict(x_test)
model_outputs

array([1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1], dtype=int64)

In [62]:
# ACTUALLY compare the two variables
model_outputs == y_test

591    False
613     True
500     True
242     True
387     True
       ...  
263     True
63      True
148    False
342     True
545     True
Name: Loan_Status, Length: 123, dtype: bool

In [63]:
# calculate the accuracy of the model
np.sum((model_outputs==y_test)) / model_outputs.shape[0]

0.7317073170731707

In [64]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print(f1_score(y_test, model_outputs, average="macro"))
print(precision_score(y_test, model_outputs, average="macro"))
print(recall_score(y_test, model_outputs, average="macro"))  

0.6880811496196112
0.7735923869944488
0.6879084967320261


### Finding the intercept and coefficients

In [65]:
feature_name = unscaled_inputs.columns.values
summary_table = pd.DataFrame (columns=['Feature name'], data = feature_name)

# add the coefficient values to the summary table
summary_table['Coefficient'] = np.transpose(reg.coef_)

# display the summary table
summary_table

Unnamed: 0,Feature name,Coefficient
0,ApplicantIncome,0.006821
1,CoapplicantIncome,-0.276111
2,LoanAmount,-0.114277
3,Loan_Amount_Term,-0.044278
4,Credit_History,1.242244
5,Gender_Male,0.168618
6,Married_Yes,0.56066
7,Dependents_1,-0.299724
8,Dependents_2,0.412385
9,Dependents_3+,0.165984


In [66]:
# create a new Series called: 'Odds ratio' which will show the.. odds ratio of each feature
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
4,Credit_History,1.242244,3.463377
12,Property_Area_Semiurban,0.744383,2.105142
6,Married_Yes,0.56066,1.751829
8,Dependents_2,0.412385,1.510415
13,Property_Area_Urban,0.228519,1.256737
11,Self_Employed_Yes,0.184615,1.202755
5,Gender_Male,0.168618,1.183667
9,Dependents_3+,0.165984,1.180554
0,ApplicantIncome,0.006821,1.006844
3,Loan_Amount_Term,-0.044278,0.956688


## Testing the model

In [67]:
# assess the test accuracy of the model
reg.score(x_test,y_test)
predicted_proba = reg.predict_proba(x_test)

predicted_proba

array([[0.05056005, 0.94943995],
       [0.83845464, 0.16154536],
       [0.38536693, 0.61463307],
       [0.28749534, 0.71250466],
       [0.90516314, 0.09483686],
       [0.25781397, 0.74218603],
       [0.12783332, 0.87216668],
       [0.05940921, 0.94059079],
       [0.31060844, 0.68939156],
       [0.7041377 , 0.2958623 ],
       [0.74567816, 0.25432184],
       [0.07886005, 0.92113995],
       [0.85702794, 0.14297206],
       [0.21298139, 0.78701861],
       [0.37081   , 0.62919   ],
       [0.15950854, 0.84049146],
       [0.08247923, 0.91752077],
       [0.13575413, 0.86424587],
       [0.10958353, 0.89041647],
       [0.11393515, 0.88606485],
       [0.16522133, 0.83477867],
       [0.24544314, 0.75455686],
       [0.57789115, 0.42210885],
       [0.11680811, 0.88319189],
       [0.11598548, 0.88401452],
       [0.72447109, 0.27552891],
       [0.88692269, 0.11307731],
       [0.09581283, 0.90418717],
       [0.90447723, 0.09552277],
       [0.14964501, 0.85035499],
       [0.

In [68]:
predicted_proba[:,1]

array([0.94943995, 0.16154536, 0.61463307, 0.71250466, 0.09483686,
       0.74218603, 0.87216668, 0.94059079, 0.68939156, 0.2958623 ,
       0.25432184, 0.92113995, 0.14297206, 0.78701861, 0.62919   ,
       0.84049146, 0.91752077, 0.86424587, 0.89041647, 0.88606485,
       0.83477867, 0.75455686, 0.42210885, 0.88319189, 0.88401452,
       0.27552891, 0.11307731, 0.90418717, 0.09552277, 0.85035499,
       0.22481102, 0.89241795, 0.83842588, 0.84397537, 0.8234783 ,
       0.78583203, 0.90561644, 0.71932243, 0.77488052, 0.77198309,
       0.16524575, 0.83854879, 0.78197435, 0.80091032, 0.84179645,
       0.7616711 , 0.64194504, 0.85201128, 0.72396663, 0.10124857,
       0.70465706, 0.79722405, 0.78084805, 0.2132773 , 0.78695065,
       0.8341716 , 0.92233127, 0.82234072, 0.78347337, 0.70190922,
       0.07481483, 0.85305916, 0.9088733 , 0.86292764, 0.12706392,
       0.78831859, 0.50551357, 0.90706195, 0.62420305, 0.05994971,
       0.84008708, 0.8974018 , 0.715855  , 0.89342073, 0.85784

## Save the model

In [69]:
# import the relevant module
import pickle
with open('model', 'wb') as file:
    pickle.dump(reg, file)
with open('scaler','wb') as file:
    pickle.dump(insurance_scaler, file)