![@mikegchambers](../images/header.png)

# Feature Encoding

In this notebook, we get some data ready for an algorithm.

![transform](butterfly.png)

We do this with our old old friends:

## Libraries

In [2]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

## OneHotEncoding

In [3]:
date_parser = pd.to_datetime

In [4]:
data = pd.read_csv("customers.csv", parse_dates=['datetime'], date_parser=date_parser)

In [5]:
data.head()

Unnamed: 0,datetime,age,eye_color,satisfaction,upsell
0,2020-07-12 18:41:00,48,gray,satisfied,yes
1,2020-07-12 15:54:00,44,blue,extremely satisfied,no
2,2020-07-12 11:23:00,42,green,not satisfied,yes
3,2020-06-12 23:59:00,59,amber,very satisfied,no
4,2020-07-12 15:41:00,50,hazel,slightly satisfied,yes


In [6]:
data.dtypes

datetime        datetime64[ns]
age                      int64
eye_color               object
satisfaction            object
upsell                  object
dtype: object

In [7]:
data = pd.get_dummies(data,prefix=['eye_color'], columns = ['eye_color'], drop_first=True)

In [8]:
data.head()

Unnamed: 0,datetime,age,satisfaction,upsell,eye_color_blue,eye_color_brown,eye_color_gray,eye_color_green,eye_color_hazel
0,2020-07-12 18:41:00,48,satisfied,yes,0,0,1,0,0
1,2020-07-12 15:54:00,44,extremely satisfied,no,1,0,0,0,0
2,2020-07-12 11:23:00,42,not satisfied,yes,0,0,0,1,0
3,2020-06-12 23:59:00,59,very satisfied,no,0,0,0,0,0
4,2020-07-12 15:41:00,50,slightly satisfied,yes,0,0,0,0,1


In [9]:
data.dtypes

datetime           datetime64[ns]
age                         int64
satisfaction               object
upsell                     object
eye_color_blue              uint8
eye_color_brown             uint8
eye_color_gray              uint8
eye_color_green             uint8
eye_color_hazel             uint8
dtype: object

## Label-Encoding

In [10]:
labelencoder = LabelEncoder()
data['satisfaction'] = labelencoder.fit_transform(data['satisfaction'])

In [11]:
data.head()

Unnamed: 0,datetime,age,satisfaction,upsell,eye_color_blue,eye_color_brown,eye_color_gray,eye_color_green,eye_color_hazel
0,2020-07-12 18:41:00,48,2,yes,0,0,1,0,0
1,2020-07-12 15:54:00,44,0,no,1,0,0,0,0
2,2020-07-12 11:23:00,42,1,yes,0,0,0,1,0
3,2020-06-12 23:59:00,59,4,no,0,0,0,0,0
4,2020-07-12 15:41:00,50,3,yes,0,0,0,0,1


In [12]:
data.dtypes

datetime           datetime64[ns]
age                         int64
satisfaction                int64
upsell                     object
eye_color_blue              uint8
eye_color_brown             uint8
eye_color_gray              uint8
eye_color_green             uint8
eye_color_hazel             uint8
dtype: object

## Binary Encoding

In [13]:
labelencoder = LabelEncoder()
data['upsell'] = labelencoder.fit_transform(data['upsell'])
data = data.astype({'upsell': bool})

In [14]:
data.head()

Unnamed: 0,datetime,age,satisfaction,upsell,eye_color_blue,eye_color_brown,eye_color_gray,eye_color_green,eye_color_hazel
0,2020-07-12 18:41:00,48,2,True,0,0,1,0,0
1,2020-07-12 15:54:00,44,0,False,1,0,0,0,0
2,2020-07-12 11:23:00,42,1,True,0,0,0,1,0
3,2020-06-12 23:59:00,59,4,False,0,0,0,0,0
4,2020-07-12 15:41:00,50,3,True,0,0,0,0,1


In [15]:
data.dtypes

datetime           datetime64[ns]
age                         int64
satisfaction                int64
upsell                       bool
eye_color_blue              uint8
eye_color_brown             uint8
eye_color_gray              uint8
eye_color_green             uint8
eye_color_hazel             uint8
dtype: object

In [16]:
data.head()

Unnamed: 0,datetime,age,satisfaction,upsell,eye_color_blue,eye_color_brown,eye_color_gray,eye_color_green,eye_color_hazel
0,2020-07-12 18:41:00,48,2,True,0,0,1,0,0
1,2020-07-12 15:54:00,44,0,False,1,0,0,0,0
2,2020-07-12 11:23:00,42,1,True,0,0,0,1,0
3,2020-06-12 23:59:00,59,4,False,0,0,0,0,0
4,2020-07-12 15:41:00,50,3,True,0,0,0,0,1


## Date Encoding

In [17]:
data['datetime'] = data['datetime'].dt.hour

In [18]:
data.dtypes

datetime           int64
age                int64
satisfaction       int64
upsell              bool
eye_color_blue     uint8
eye_color_brown    uint8
eye_color_gray     uint8
eye_color_green    uint8
eye_color_hazel    uint8
dtype: object

In [19]:
data.head(10)

Unnamed: 0,datetime,age,satisfaction,upsell,eye_color_blue,eye_color_brown,eye_color_gray,eye_color_green,eye_color_hazel
0,18,48,2,True,0,0,1,0,0
1,15,44,0,False,1,0,0,0,0
2,11,42,1,True,0,0,0,1,0
3,23,59,4,False,0,0,0,0,0
4,15,50,3,True,0,0,0,0,1
5,10,43,4,True,0,1,0,0,0
6,18,30,4,False,0,1,0,0,0
7,21,31,2,True,1,0,0,0,0
8,22,56,4,True,0,0,1,0,0
9,7,47,4,True,0,0,1,0,0


## Carve out labels

In [20]:
X = data.drop(columns=['upsell'])

In [21]:
X.head(10)

Unnamed: 0,datetime,age,satisfaction,eye_color_blue,eye_color_brown,eye_color_gray,eye_color_green,eye_color_hazel
0,18,48,2,0,0,1,0,0
1,15,44,0,1,0,0,0,0
2,11,42,1,0,0,0,1,0
3,23,59,4,0,0,0,0,0
4,15,50,3,0,0,0,0,1
5,10,43,4,0,1,0,0,0
6,18,30,4,0,1,0,0,0
7,21,31,2,1,0,0,0,0
8,22,56,4,0,0,1,0,0
9,7,47,4,0,0,1,0,0


In [22]:
y = pd.DataFrame(data['upsell'])

In [23]:
y.head(10)

Unnamed: 0,upsell
0,True
1,False
2,True
3,False
4,True
5,True
6,False
7,True
8,True
9,True
