In [1]:
# Importing the dependencies

import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

from sklearn import preprocessing
from sklearn import tree
from sklearn import svm
from sklearn import neighbors
from sklearn.metrics import accuracy_score

In [2]:
# Read the training CSV into a DataFrame, then report its dimensions and preview the first rows
trainCsvPath = 'Datasets/train.csv'
trainDf = pd.read_csv(trainCsvPath)
print(trainDf.shape)
trainDf.head()

(614, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Data Preparation

In [3]:
# Report every training column that holds at least one missing (null) entry
trainNullCounts = trainDf.isnull().sum()
for column in trainNullCounts[trainNullCounts > 0].index:
    print('There are missing values in the column: ',column)

There are missing values in the column:  Gender
There are missing values in the column:  Married
There are missing values in the column:  Dependents
There are missing values in the column:  Self_Employed
There are missing values in the column:  LoanAmount
There are missing values in the column:  Loan_Amount_Term
There are missing values in the column:  Credit_History


In [4]:
# Impute each incomplete training column with its most frequent non-null value
columnsToImpute = ['Gender', 'Married', 'Dependents', 'Self_Employed',
                   'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
for columnName in columnsToImpute:
    observedValues = trainDf[columnName].dropna()
    mostFrequentValue = observedValues.mode().values[0]
    trainDf[columnName] = trainDf[columnName].fillna(mostFrequentValue)

In [5]:
# Inspect the dtype of every column; the object-typed feature columns are the
# categorical ones that get one-hot encoded in the next step
trainDf.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [6]:
# One-hot encode the categorical feature columns; all other columns are kept as they are
categoricalColumns = ['Gender', 'Married', 'Dependents',
                      'Education', 'Self_Employed', 'Property_Area']
trainDummiesDf = pd.get_dummies(trainDf, columns=categoricalColumns)

In [7]:
# Show the encoded frame's dimensions and column labels, then preview its first rows
for encodedSummary in (trainDummiesDf.shape, trainDummiesDf.columns):
    print(encodedSummary)
trainDummiesDf.head()

(614, 22)
Index(['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Loan_Status', 'Gender_Female',
       'Gender_Male', 'Married_No', 'Married_Yes', 'Dependents_0',
       'Dependents_1', 'Dependents_2', 'Dependents_3+', 'Education_Graduate',
       'Education_Not Graduate', 'Self_Employed_No', 'Self_Employed_Yes',
       'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban'],
      dtype='object')


Unnamed: 0,Loan_ID,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Female,Gender_Male,Married_No,...,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,LP001002,5849,0.0,120.0,360.0,1.0,Y,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,LP001003,4583,1508.0,128.0,360.0,1.0,N,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,LP001005,3000,0.0,66.0,360.0,1.0,Y,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,LP001006,2583,2358.0,120.0,360.0,1.0,Y,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,LP001008,6000,0.0,141.0,360.0,1.0,Y,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [8]:
# Binarizer used to turn the two Loan_Status classes into 0/1 labels
lb = preprocessing.LabelBinarizer()

# Remove the non-predictive identifier and the target column from the data.
# The columns are dropped by name rather than by position, so this cell keeps
# removing the right columns even if the column order of the encoded frame changes.
dataPredict = trainDummiesDf.drop(['Loan_ID', 'Loan_Status'], axis = 1)

# Convert the target column into a flat (1-D) binary label vector
targetBinomial = lb.fit_transform(trainDummiesDf['Loan_Status']).flatten()

print ('The shape of dataPredict after preprocessing is: ', dataPredict.shape)
print ('The shape of targetBinomial after preprocessing is: ', targetBinomial.shape)
print ('first five targetBinomial:',targetBinomial[:5])
dataPredict.head()

The shape of dataPredict after preprocessing is:  (614, 20)
The shape of targetBinomial after preprocessing is:  (614,)
first five targetBinomial: [1 0 1 1 1]


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,5849,0.0,120.0,360.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,4583,1508.0,128.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,3000,0.0,66.0,360.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,2583,2358.0,120.0,360.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,6000,0.0,141.0,360.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


## Training

#### Now we create the classifiers using the sklearn library and store each one in its own variable.

In [9]:
# Creating the classifiers

# Decision tree classifier. The seed is fixed because the tree randomly permutes
# the candidate features at each split, so an unseeded tree can differ between
# runs and make the notebook's predictions non-reproducible.
clfDtree = tree.DecisionTreeClassifier(random_state=0)

# Support vector machine classifier (library default settings)
clfSVM = svm.SVC()

# k-nearest neighbors classifier (library default settings)
clfKNN = neighbors.KNeighborsClassifier()

#### Now we build the training set on which we train our models, so that they can later predict results for the test data.

In [10]:
# X is the encoded predictor frame used to predict Loan_Status;
# Y is Loan_Status converted into binary labels
X, Y = dataPredict, targetBinomial

In [11]:
# Fit every classifier (decision tree, SVM, k-nearest neighbors) on the same
# training features and labels. fit() trains the estimator in place and returns
# that same estimator, so clfDtree, clfSVM and clfKNN refer to the fitted models.
for classifier in (clfDtree, clfSVM, clfKNN):
    classifier.fit(X, Y)

## Testing

#### Now we load a test dataset and predict its results using the models we trained on X and Y

In [12]:
# Read the test CSV into a DataFrame, then report its dimensions and preview the first rows
testCsvPath = 'Datasets/test.csv'
testDf = pd.read_csv(testCsvPath)
print(testDf.shape)
testDf.head()

(367, 12)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [13]:
# Report every test column that holds at least one missing (null) entry
testNullCounts = testDf.isnull().sum()
for column in testNullCounts[testNullCounts > 0].index:
    print('There are missing values in the column: ',column)

There are missing values in the column:  Gender
There are missing values in the column:  Dependents
There are missing values in the column:  Self_Employed
There are missing values in the column:  LoanAmount
There are missing values in the column:  Loan_Amount_Term
There are missing values in the column:  Credit_History


In [14]:
# Handling the missing values in the test dataset.
# Each gap is filled with the mode of the matching TRAINING column rather than
# the test column's own mode: the models were fitted on data imputed with the
# training modes, so the test data must be preprocessed with the same values,
# and imputation statistics should not be derived from the test set itself.
for columnName in ['Gender', 'Dependents', 'Self_Employed',
                   'LoanAmount', 'Loan_Amount_Term', 'Credit_History']:
    trainingMode = trainDf[columnName].dropna().mode().values[0]
    testDf[columnName] = testDf[columnName].fillna(trainingMode)

In [15]:
# One-hot encode the categorical feature columns of the test dataset;
# all other columns are kept as they are
categoricalColumns = ['Gender', 'Married', 'Dependents',
                      'Education', 'Self_Employed', 'Property_Area']
testDummiesDf = pd.get_dummies(testDf, columns=categoricalColumns)

In [16]:
# Show the encoded test frame's dimensions and column labels, then preview its first rows
for encodedSummary in (testDummiesDf.shape, testDummiesDf.columns):
    print(encodedSummary)
testDummiesDf.head()

(367, 21)
Index(['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Female', 'Gender_Male',
       'Married_No', 'Married_Yes', 'Dependents_0', 'Dependents_1',
       'Dependents_2', 'Dependents_3+', 'Education_Graduate',
       'Education_Not Graduate', 'Self_Employed_No', 'Self_Employed_Yes',
       'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban'],
      dtype='object')


Unnamed: 0,Loan_ID,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,...,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,LP001015,5720,0,110.0,360.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,LP001022,3076,1500,126.0,360.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,LP001031,5000,1800,208.0,360.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,LP001035,2340,2546,100.0,360.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,LP001051,3276,0,78.0,360.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [17]:
# Remove the non-predictive identifier column from the test data
# (the encoded test frame has no target column to remove).
# The column is dropped by name, and the result is then aligned to the training
# feature columns: same columns in the same order, with any dummy column that is
# absent from the test data added as all zeros and any column not used in
# training removed. This keeps the feature layout identical to what the models
# were fitted on even when a category is missing from one of the two files.
dataTestPredict = testDummiesDf.drop(['Loan_ID'], axis = 1)
dataTestPredict = dataTestPredict.reindex(columns=dataPredict.columns, fill_value=0)

print ('The shape of dataTestPredict after preprocessing is: ', dataTestPredict.shape)
dataTestPredict.head()

The shape of dataTestPredict after preprocessing is:  (367, 20)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,5720,0,110.0,360.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,3076,1500,126.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,5000,1800,208.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,2340,2546,100.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,3276,0,78.0,360.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


#### Now we predict the output of our test set using our trained models

In [18]:
# Label the test rows with the fitted decision tree and preview the first five predictions
predictionDtree = clfDtree.predict(X=dataTestPredict)
predictionDtree[0:5]

array([1, 1, 1, 1, 1])

In [19]:
# Label the test rows with the fitted SVM and preview the first five predictions
predictionSVM = clfSVM.predict(X=dataTestPredict)
predictionSVM[0:5]

array([1, 1, 1, 1, 1])

In [20]:
# Label the test rows with the fitted k-nearest neighbors model and preview the first five predictions
predictionKNN = clfKNN.predict(X=dataTestPredict)
predictionKNN[0:5]

array([1, 1, 0, 1, 1])