In [1]:
#Data preparation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the loans data from "loans.csv" (path relative to the working directory)
# into a DataFrame.
loans = pd.read_csv(filepath_or_buffer="loans.csv")

In [3]:
# Inspect the loans dataset (it has only 10 rows, so the whole frame is displayed)
loans

Unnamed: 0,City,Age,Salary,Approved
0,Apple Valley,25.0,65000.0,Yes
1,Maplewood,30.0,81000.0,No
2,Eagan,33.0,,Yes
3,Apple Valley,39.0,100000.0,No
4,Maplewood,28.0,91000.0,Yes
5,Eagan,,66000.0,No
6,Apple Valley,40.0,98000.0,Yes
7,Maplewood,34.0,86000.0,Yes
8,Eagan,25.0,70000.0,No
9,Maplewood,24.0,62000.0,Yes


In [4]:
# Dimensions of the dataset as (number of rows, number of columns)
loans.shape

(10, 4)

In [5]:
# Separating features and target variable.
# The target (Approved) is the last column; every column before it is a feature.
# Both selections are made relative to the end of the frame so that X and Y
# stay in sync if the column layout of the CSV ever changes.

X = loans.iloc[:, :-1].values
Y = loans.iloc[:, -1].values

print(X)
print(Y)

[['Apple Valley' 25.0 65000.0]
 ['Maplewood' 30.0 81000.0]
 ['Eagan' 33.0 nan]
 ['Apple Valley' 39.0 100000.0]
 ['Maplewood' 28.0 91000.0]
 ['Eagan' nan 66000.0]
 ['Apple Valley' 40.0 98000.0]
 ['Maplewood' 34.0 86000.0]
 ['Eagan' 25.0 70000.0]
 ['Maplewood' 24.0 62000.0]]
['Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes']


In [6]:
# Check for missing values by displaying the frame and looking for empty (NaN) cells
loans
# The Age and Salary columns each have 1 missing value

Unnamed: 0,City,Age,Salary,Approved
0,Apple Valley,25.0,65000.0,Yes
1,Maplewood,30.0,81000.0,No
2,Eagan,33.0,,Yes
3,Apple Valley,39.0,100000.0,No
4,Maplewood,28.0,91000.0,Yes
5,Eagan,,66000.0,No
6,Apple Valley,40.0,98000.0,Yes
7,Maplewood,34.0,86000.0,Yes
8,Eagan,25.0,70000.0,No
9,Maplewood,24.0,62000.0,Yes


In [7]:
#loans.dropna(axis=0) #drop every row that has at least one NaN value
#loans.dropna(axis=1) #drop every column that has at least one NaN value
#loans.dropna(how='all') #drop only the rows in which every value is NaN
#loans.dropna(thresh=2) #keep only the rows that have at least 2 non-NaN values
#Imputing the missing values of a column with that column's mean using the scikit-learn library
from sklearn.impute import SimpleImputer

In [8]:
# Replace NaN in the Age and Salary columns (positions 1 and 2 of X) with the
# mean of the respective column. X is updated in place.
numeric_cols = [1, 2]
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, numeric_cols] = imputer.fit_transform(X[:, numeric_cols])

In [9]:
# Check the feature columns for missing values after the mean replacement
X

array([['Apple Valley', 25.0, 65000.0],
       ['Maplewood', 30.0, 81000.0],
       ['Eagan', 33.0, 79888.88888888889],
       ['Apple Valley', 39.0, 100000.0],
       ['Maplewood', 28.0, 91000.0],
       ['Eagan', 30.88888888888889, 66000.0],
       ['Apple Valley', 40.0, 98000.0],
       ['Maplewood', 34.0, 86000.0],
       ['Eagan', 25.0, 70000.0],
       ['Maplewood', 24.0, 62000.0]], dtype=object)

In [10]:
#Dealing with categorical values-Transforming them into numeric levels before passing to model using OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the City column (position 0). The remaining columns are passed
# through unchanged and end up after the encoded columns.
# NOTE: X is overwritten here, so re-running this cell on its own would encode
# the new column 0 again; re-run from the feature/target separation cell instead.
col_trans = make_column_transformer(
    (OneHotEncoder(), [0]),
    remainder="passthrough",
)
X = col_trans.fit_transform(X)
print(X)

[[1.0 0.0 0.0 25.0 65000.0]
 [0.0 0.0 1.0 30.0 81000.0]
 [0.0 1.0 0.0 33.0 79888.88888888889]
 [1.0 0.0 0.0 39.0 100000.0]
 [0.0 0.0 1.0 28.0 91000.0]
 [0.0 1.0 0.0 30.88888888888889 66000.0]
 [1.0 0.0 0.0 40.0 98000.0]
 [0.0 0.0 1.0 34.0 86000.0]
 [0.0 1.0 0.0 25.0 70000.0]
 [0.0 0.0 1.0 24.0 62000.0]]


In [11]:
#Transforming dependent target variable
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the target labels, then map every label to its integer code.
labelencoder_y = LabelEncoder().fit(Y)
Y = labelencoder_y.transform(Y)

In [12]:
# Display the label-encoded target ('No' -> 0, 'Yes' -> 1)
print(Y)

[1 0 1 0 1 0 1 1 0 1]


In [13]:
#Splitting the dataset into training and testing datasets
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Inspect the training features (8 of the 10 rows, since test_size=0.2)
X_train

array([[0.0, 0.0, 1.0, 28.0, 91000.0],
       [0.0, 0.0, 1.0, 24.0, 62000.0],
       [0.0, 0.0, 1.0, 30.0, 81000.0],
       [1.0, 0.0, 0.0, 40.0, 98000.0],
       [0.0, 0.0, 1.0, 34.0, 86000.0],
       [1.0, 0.0, 0.0, 39.0, 100000.0],
       [1.0, 0.0, 0.0, 25.0, 65000.0],
       [0.0, 1.0, 0.0, 30.88888888888889, 66000.0]], dtype=object)

In [15]:
#Standardizing the features (zero mean, unit variance per column)
from sklearn.preprocessing import StandardScaler
# Learn the per-column mean and standard deviation from the training rows only,
# then apply that same scaling to both the training and the test features so
# that no statistics from the test set are used.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)