# Feature encoding

In [63]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [64]:
# data load
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [65]:
df['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

# **1-`Labelencoder`**

In [66]:
# let's encode the time in labelencoder with sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

le = LabelEncoder()
df['encoded_time'] = le.fit_transform(df['time'])
df.sample(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time
222,8.58,1.92,Male,Yes,Fri,Lunch,1,1
79,17.29,2.71,Male,No,Thur,Lunch,2,1
78,22.76,3.0,Male,No,Thur,Lunch,2,1
229,22.12,2.88,Female,Yes,Sat,Dinner,2,0
166,20.76,2.24,Male,No,Sun,Dinner,2,0
169,10.63,2.0,Female,Yes,Sat,Dinner,2,0
170,50.81,10.0,Male,Yes,Sat,Dinner,3,0
216,28.15,3.0,Male,Yes,Sat,Dinner,5,0
159,16.49,2.0,Male,No,Sun,Dinner,4,0
232,11.61,3.39,Male,No,Sat,Dinner,2,0


In [67]:
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

# **2- `Ordinal Encoding`**

In [68]:
# ordinal encoding the day columns using specific order

oe = OrdinalEncoder()
oe = OrdinalEncoder(categories=[['Thur', 'Fri','Sat', 'Sun' ]])
df['encoded_day'] = oe.fit_transform(df[['day']])
df.sample(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,encoded_time,encoded_day
243,18.78,3.0,Female,No,Thur,Dinner,2,0,0.0
204,20.53,4.0,Male,Yes,Thur,Lunch,4,1,0.0
79,17.29,2.71,Male,No,Thur,Lunch,2,1,0.0
106,20.49,4.06,Male,Yes,Sat,Dinner,2,0,2.0
93,16.32,4.3,Female,Yes,Fri,Dinner,2,0,1.0
235,10.07,1.25,Male,No,Sat,Dinner,2,0,2.0
58,11.24,1.76,Male,Yes,Sat,Dinner,2,0,2.0
195,7.56,1.44,Male,No,Thur,Lunch,2,1,0.0
85,34.83,5.17,Female,No,Thur,Lunch,4,1,0.0
182,45.35,3.5,Male,Yes,Sun,Dinner,3,0,3.0


In [69]:
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [70]:
df['encoded_day'].value_counts()

encoded_day
2.0    87
3.0    76
0.0    62
1.0    19
Name: count, dtype: int64

# **3- `One Hot Encoding`**

In [71]:
# one hot encoding on day columns
ohe = OneHotEncoder()
ohe.fit_transform(df[['sex']]).toarray()

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.

In [72]:
# example of one hot encoding 
titanic = sns.load_dataset('titanic')

ohe = OneHotEncoder()
embarked_ohe = ohe.fit_transform(titanic[['embarked']])

embarked_ohe.toarray()

array([[0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]], shape=(891, 4))

In [73]:
# pip install category_encoders

In [74]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# **4- `BinaryEncoder`**

In [75]:
from category_encoders import BinaryEncoder

Binary_encoder = BinaryEncoder()

In [76]:
binary_day =  Binary_encoder.fit_transform(df['day'])

In [77]:
binary_day

Unnamed: 0,day_0,day_1,day_2
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
239,0,1,0
240,0,1,0
241,0,1,0
242,0,1,0


# **Feature encoding with `Pandas`**

In [80]:
df = sns.load_dataset('tips')


In [81]:
# Use pandas for feature encoding
pd.get_dummies(df['day'])

Unnamed: 0,Thur,Fri,Sat,Sun
0,False,False,False,True
1,False,False,False,True
2,False,False,False,True
3,False,False,False,True
4,False,False,False,True
...,...,...,...,...
239,False,False,True,False
240,False,False,True,False
241,False,False,True,False
242,False,False,True,False
