In [6]:
# Machine-learning models do not understand categorical data; they only understand numerical data.
# Hence we convert the categorical data into numerical data using encoding.
# There are many ways of encoding:
# one of them is one-hot encoding (it is used for columns having only 2-3 distinct values, e.g. Married can only be Yes or No)

In [7]:
import pandas as pd

In [8]:
# Read the loan data from disk and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer="loan.csv")
dataset.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [10]:
# Goal: encode the Gender and Married columns.
# Before encoding, count the missing values per column so that any nulls
# in Gender / Married can be filled first.
dataset.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [12]:
# Fill the missing values in Gender and Married with each column's most
# frequent value (its mode).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: the in-place form is a
# chained assignment, which raises a FutureWarning in pandas 2.x and no longer
# updates `dataset` once Copy-on-Write is enabled (the default in pandas 3.0).
dataset["Gender"] = dataset["Gender"].fillna(dataset["Gender"].mode()[0])
dataset["Married"] = dataset["Married"].fillna(dataset["Married"].mode()[0])

In [14]:
# Gender and Married should now report zero missing values; verify:
dataset.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [15]:
# Separate the Gender and Married columns from the dataset for encoding.
en_data = dataset.loc[:, ["Gender", "Married"]]
en_data

Unnamed: 0,Gender,Married
0,Male,No
1,Male,Yes
2,Male,Yes
3,Male,Yes
4,Male,No
...,...,...
609,Female,No
610,Male,Yes
611,Male,Yes
612,Male,Yes


In [16]:
# there are two ways of One-Hot-Encoding
# 1) using get dummies ... pandas method
# 2) One Hot Encoder ..... scikit learn class

In [17]:
# 1) pandas approach: get_dummies creates one indicator column per category
pd.get_dummies(data=en_data)

Unnamed: 0,Gender_Female,Gender_Male,Married_No,Married_Yes
0,False,True,True,False
1,False,True,False,True
2,False,True,False,True
3,False,True,False,True
4,False,True,True,False
...,...,...,...,...
609,True,False,True,False
610,False,True,False,True
611,False,True,False,True
612,False,True,False,True


In [18]:
# Inspect the column names and dtypes of the get_dummies result.
pd.get_dummies(data=en_data).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Gender_Female  614 non-null    bool 
 1   Gender_Male    614 non-null    bool 
 2   Married_No     614 non-null    bool 
 3   Married_Yes    614 non-null    bool 
dtypes: bool(4)
memory usage: 2.5 KB


In [19]:
# as we can see, it converted the two columns into four and the data values into True and False
# now we can convert the True values into 1 and the False values into 0 (for example with pd.get_dummies(en_data, dtype=int)), and so the conversion of categorical to numerical will be done
# but OneHotEncoder from scikit-learn does this work as well, i.e. it produces 0s and 1s directly
# hence we prefer OneHotEncoder over get_dummies

In [20]:
from sklearn.preprocessing import OneHotEncoder

In [34]:
# 2) scikit-learn approach: OneHotEncoder
# Fit the encoder on the two columns, then transform those same columns.
ohe = OneHotEncoder()
ohe.fit(en_data).transform(en_data)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1228 stored elements and shape (614, 4)>

In [35]:
# The encoder returns a sparse matrix, but we want the result as a table.
# So first turn the sparse matrix into a dense array; the array is turned
# into a table afterwards.
ar = ohe.fit(en_data).transform(en_data).toarray()
ar

array([[0., 1., 1., 0.],
       [0., 1., 0., 1.],
       [0., 1., 0., 1.],
       ...,
       [0., 1., 0., 1.],
       [0., 1., 0., 1.],
       [1., 0., 1., 0.]])

In [36]:
# Convert the array into a table.
# The column labels are taken from the fitted encoder instead of being typed
# by hand, so they always match the order of the encoded columns.
pd.DataFrame(ar, columns=ohe.get_feature_names_out())

Unnamed: 0,Gender_Female,Gender_Male,Married_No,Married_Yes
0,0.0,1.0,1.0,0.0
1,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,1.0
3,0.0,1.0,0.0,1.0
4,0.0,1.0,1.0,0.0
...,...,...,...,...
609,1.0,0.0,1.0,0.0
610,0.0,1.0,0.0,1.0
611,0.0,1.0,0.0,1.0
612,0.0,1.0,0.0,1.0


In [37]:
# and we successfully converted the categorical data into numerical data,
# but each original column became two columns, and since each of them can take only the values 0 and 1 (one is always the opposite of the other) we can drop one column from each pair

In [38]:
# drop="first": encode, then drop the first category column of each feature
ohe = OneHotEncoder(drop="first")
ohe.fit(en_data).transform(en_data)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 903 stored elements and shape (614, 2)>

In [39]:
# Dense array version of the drop-first encoding.
ar2 = ohe.fit(en_data).transform(en_data).toarray()
ar2

array([[1., 0.],
       [1., 1.],
       [1., 1.],
       ...,
       [1., 1.],
       [1., 1.],
       [0., 0.]])

In [40]:
# Build the final table. The labels come from the fitted encoder, so they
# reflect whichever category drop="first" actually kept for each feature
# rather than relying on hand-typed names.
pd.DataFrame(ar2, columns=ohe.get_feature_names_out())

Unnamed: 0,Gender_Male,Married_Yes
0,1.0,0.0
1,1.0,1.0
2,1.0,1.0
3,1.0,1.0
4,1.0,0.0
...,...,...
609,0.0,0.0
610,1.0,1.0
611,1.0,1.0
612,1.0,1.0
