In [12]:
# Importing the dependencies
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
sns.set(style='darkgrid')
%matplotlib inline

from sklearn import preprocessing
from sklearn import tree
from sklearn import svm
from sklearn import neighbors
from sklearn.metrics import accuracy_score

In [13]:
# Load the training data from the CSV file (path is relative to the notebook)
trainDf = pd.read_csv(filepath_or_buffer='Datasets/train.csv')

In [14]:
# Count the nulls of every column in one pass, then report only the
# columns that have at least one missing value (in column order).
nullCounts = trainDf.isnull().sum()
for column in nullCounts.index[nullCounts > 0]:
    print('There are missing values in the column: ',column)

There are missing values in the column:  Gender
There are missing values in the column:  Married
There are missing values in the column:  Dependents
There are missing values in the column:  Self_Employed
There are missing values in the column:  LoanAmount
There are missing values in the column:  Loan_Amount_Term
There are missing values in the column:  Credit_History


In [15]:
# Handling the missing values: every gap is filled with the most frequent
# (modal) value of its own column.
columnsWithMissing = ['Gender', 'Married', 'Dependents', 'Self_Employed',
                      'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
for columnName in columnsWithMissing:
    # mode() can return several values on a tie; the first one is used
    mostFrequent = trainDf[columnName].dropna().mode().values[0]
    trainDf[columnName] = trainDf[columnName].fillna(mostFrequent)

In [16]:
# Inspect the dtype of each column to see which ones are non-numeric
# (object) and therefore need encoding before modelling.
trainDf.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [17]:
# One-hot encode the listed categorical columns; every other column of
# trainDf is carried over unchanged.
categoricalColumns = ['Gender', 'Married', 'Dependents',
                      'Education', 'Self_Employed', 'Property_Area']
trainDfDummies = pd.get_dummies(trainDf, columns=categoricalColumns)

In [18]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric
# columns after one-hot encoding.
trainDfDummies.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
count,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0
mean,5403.459283,1621.245798,145.465798,342.410423,0.855049,0.18241,0.81759,0.346906,0.653094,0.586319,0.166124,0.164495,0.083062,0.781759,0.218241,0.86645,0.13355,0.291531,0.379479,0.32899
std,6109.041673,2926.248369,84.180967,64.428629,0.352339,0.386497,0.386497,0.476373,0.476373,0.492894,0.372495,0.371027,0.276201,0.413389,0.413389,0.340446,0.340446,0.454838,0.485653,0.470229
min,150.0,0.0,9.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2877.5,0.0,100.25,360.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,3812.5,1188.5,125.0,360.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,5795.0,2297.25,164.75,360.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [36]:
# Convert the label into a proper binary (0/1) label vector
lb = preprocessing.LabelBinarizer()

# Remove the non-predictive identifier and the target column from the data.
# The columns are dropped by name rather than by position, so the result stays
# correct even if the column order of trainDfDummies changes.
dataContinuous = trainDfDummies.drop(['Loan_ID', 'Loan_Status'], axis=1)

# Convert the target column into a binomial label vector; for a two-class
# target fit_transform returns a single column, which flatten() turns into 1-D.
targetBinomial = lb.fit_transform(trainDfDummies['Loan_Status']).flatten()

print ('The shape of dataContinuous after preprocessing is: ', dataContinuous.shape)
print ('The shape of targetBinomial after preprocessing is: ', targetBinomial.shape)
print (targetBinomial[:5])
dataContinuous.head()

The shape of dataContinuous after preprocessing is:  (614, 20)
The shape of targetBinomial after preprocessing is:  (614,)
[1 0 1 1 1]


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,5849,0.0,120.0,360.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,4583,1508.0,128.0,360.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,3000,0.0,66.0,360.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,2583,2358.0,120.0,360.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,6000,0.0,141.0,360.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


#### Now we create the classifiers using the sklearn library and store each of them in its own variable.

In [8]:
# Creating the classifiers

# Fixed seed so that re-running the notebook builds the same decision tree
RANDOM_STATE = 42

# Classifier implementing Decision tree classifier
clfDtree = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)

# Classifier implementing Support vector machines
# NOTE(review): SVC and k-NN are sensitive to feature scale, and the features
# mix raw incomes with 0/1 dummy columns — consider scaling before fitting.
clfSVM = svm.SVC()

# Classifier implementing the k-nearest neighbors
clfKNN = neighbors.KNeighborsClassifier()

#### Now we build the training set on which we train our models, so that they can predict the results for future test data.

In [None]:
# Feature matrix: every column except the row identifier (Loan_ID) and the
# target (Loan_Status). The original `trainDfDummies[]` was a syntax error.
X = trainDfDummies.drop(['Loan_ID', 'Loan_Status'], axis=1)

# Target vector: the Loan_Status column
Y = trainDfDummies['Loan_Status']

In [30]:
# Look up the positional index of the Loan_ID column.
# NOTE(review): the saved output (6) appears to match the position of
# Loan_Status rather than Loan_ID, so this cell was probably edited after it
# last ran — re-run it on a fresh kernel or remove this scratch cell.
trainDfDummies.columns.get_loc("Loan_ID")

6