# Convert categorical data
 - Value replacing
 - Label encoding
 - One-hot encoding
 - Binary encoding

## 1. Value replacing

### Predefined numbers

In [1]:
import pandas as pd
# Five-row sample frame: one categorical column ('Job') and one numeric column ('Salary')
df = pd.DataFrame(
    {
        'Job': ['Engineer', 'Sale', 'Marketing', 'Finance', 'HR'],
        'Salary': [20000, 30000, 15000, 20000, 15000],
    }
)

In [2]:
# Show the sample frame that was just built
df

Unnamed: 0,Job,Salary
0,Engineer,20000
1,Sale,30000
2,Marketing,15000
3,Finance,20000
4,HR,15000


In [None]:
# Print the column dtypes and non-null counts of the original frame
df.info()

In [None]:
# Hand-picked code for every job, nested under the column name it applies to
mapping = {
    'Job': dict(Engineer=101, Sale=102, Marketing=103, Finance=201, HR=202),
}

In [None]:
# Work on a copy so the original frame stays untouched.
df_map = df.copy()
# Look every job up in the predefined table. Series.map returns a numeric
# column directly, whereas DataFrame.replace only does so by silently
# downcasting the text column to int, which is deprecated since pandas 2.2.
# Note: a job that is missing from the table becomes NaN instead of being
# left as text.
df_map['Job'] = df_map['Job'].map(mapping['Job'])

In [None]:
# Show the frame with the jobs replaced by their predefined codes
df_map

In [None]:
# Check the dtype of 'Job' after the replacement
df_map.info()

In [None]:
# Convert 'Job' to pandas' category dtype (intended to make later
# operations on the column faster)
df_map = df_map.astype({'Job': 'category'})

In [None]:
# Confirm that 'Job' now has the category dtype
df_map.info()

### Auto-number

In [None]:
import pandas as pd
# Rebuild the sample frame so this section can run on its own
df = pd.DataFrame(
    data=dict(
        Job=['Engineer', 'Sale', 'Marketing', 'Finance', 'HR'],
        Salary=[20000, 30000, 15000, 20000, 15000],
    )
)

In [None]:
labels = df['Job']
# Number the distinct jobs 1..n in order of first appearance.
# unique() drops repeated values, so every category gets exactly one code
# and the codes stay contiguous even when a job occurs in several rows.
mapping = {'Job': {job: code for code, job in enumerate(labels.unique(), start=1)}}

In [None]:
# Preview the numbers 1 .. len(labels)
list(range(1,len(labels)+1))


In [None]:
# Show the generated job -> number mapping
mapping

In [None]:
# zip() demo: items are paired up position by position
z = zip('abc', (2, 4, 9))
print(tuple(z))

In [None]:
# Work on a copy so the original frame stays untouched.
df_map = df.copy()
# Look every job up in the generated table. Series.map returns a numeric
# column directly, whereas DataFrame.replace only does so by silently
# downcasting the text column to int, which is deprecated since pandas 2.2.
# Note: a job that is missing from the table becomes NaN instead of being
# left as text.
df_map['Job'] = df_map['Job'].map(mapping['Job'])

In [None]:
# Show the frame with the auto-numbered jobs
df_map

## 2. Label Encoding
 - Numerical labels are always between 0 and n_categories-1

### Built-in function in dataframe

In [None]:
import pandas as pd
# Sample data for the label-encoding examples
df = pd.DataFrame.from_dict(
    {
        'Job': ['Engineer', 'Sale', 'Marketing', 'Finance', 'HR'],
        'Salary': [20000, 30000, 15000, 20000, 15000],
    }
)

In [None]:
# Encode a copy so that the original df stays unchanged
df_label = df.copy()

In [None]:
# Cast 'Job' to a categorical and keep only the integer code of each value
df_label['Job'] = df_label['Job'].astype('category').cat.codes

In [None]:
# Show the label-encoded frame
df_label

### Built-in function in 'sklearn'

In [None]:
import pandas as pd
# Same sample data, written row by row
df = pd.DataFrame(
    [
        ['Engineer', 20000],
        ['Sale', 30000],
        ['Marketing', 15000],
        ['Finance', 20000],
        ['HR', 15000],
    ],
    columns=['Job', 'Salary'],
)

In [None]:
# Encode a copy so that the original df stays unchanged
df_label = df.copy()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the set of job names first, then swap each name for its integer label
label = LabelEncoder()
label.fit(df_label['Job'])
df_label['Job'] = label.transform(df_label['Job'])

In [None]:
# Show the label-encoded frame
df_label

### Built-in function in NumPy
 (In the case of only two categories)

In [None]:
import pandas as pd
# Sample frame with a two-valued categorical column ('Sex') and a numeric one ('Height')
df = pd.DataFrame(
    {
        'Sex': ['M', 'F', 'M', 'M', 'F'],
        'Height': [170, 165, 168, 165, 161],
    }
)

In [None]:
# Show the sample frame with the two-valued 'Sex' column
df

In [None]:
# Encode a copy so that the original df stays unchanged
df_label = df.copy()

In [None]:
import numpy as np
# Encode 'M' as 1 and every other value as 0.
# An exact comparison is used instead of a substring search, so a value that
# merely contains the letter "M" is not counted as 'M'; missing values
# compare unequal and are encoded as 0.
df_label['Sex'] = np.where(df_label['Sex'] == 'M', 1, 0)

In [None]:
# Show the frame with 'Sex' encoded as 1/0
df_label

## 3. One-hot encoding
 - Each category value will be a new column
 - No implied order or weight between the categories

### Built-in function in Pandas

In [None]:
import pandas as pd
# Sample data for the one-hot examples: start from the jobs, then attach the salaries
df = pd.DataFrame(
    {'Job': ['Engineer', 'Sale', 'Marketing', 'Finance', 'HR']}
).assign(Salary=[20000, 30000, 15000, 20000, 15000])

In [None]:
# Encode a copy so that the original df stays unchanged
df_label = df.copy()

In [None]:
# Replace 'Job' by one indicator column per job; the new columns are
# named with the prefix 'Label'
df_label = pd.get_dummies(
    df_label,
    columns=['Job'],
    prefix={'Job': 'Label'},
)

In [None]:
# Show the frame with one indicator column per job
df_label

### Built-in function in 'sklearn'

In [None]:
import pandas as pd
# Sample data again, with the column order spelled out explicitly
df = pd.DataFrame(
    data={
        'Job': ['Engineer', 'Sale', 'Marketing', 'Finance', 'HR'],
        'Salary': [20000, 30000, 15000, 20000, 15000],
    },
    columns=['Job', 'Salary'],
)

In [None]:
from sklearn.preprocessing import LabelBinarizer
# Learn the job names, then turn the column into a 0/1 indicator matrix
lb = LabelBinarizer()
lb.fit(df['Job'])
lb_results = lb.transform(df['Job'])
# Wrap the matrix in a frame whose columns are named after the learned classes
df_label = pd.DataFrame(data=lb_results, columns=lb.classes_)

In [None]:
# Show the raw result returned by the binarizer
lb_results

In [None]:
# Show the binarizer result as a frame with one column per class
df_label

In [None]:
# Carry the untouched 'Salary' column over from the original frame
df_label = df_label.assign(Salary=df['Salary'])

In [None]:
# Show the one-hot frame together with the 'Salary' column
df_label

## 4. Binary encoding
 - Produces fewer columns than one-hot encoding: each category's number is written in binary, one column per bit

In [1]:
import pandas as pd
# Sample data for the binary-encoding example
df = pd.DataFrame(
    data={
        'Job': ['Engineer', 'Sale', 'Marketing', 'Finance', 'HR'],
        'Salary': [20000, 30000, 15000, 20000, 15000],
    },
)

In [2]:
# Show the sample frame
df

Unnamed: 0,Job,Salary
0,Engineer,20000
1,Sale,30000
2,Marketing,15000
3,Finance,20000
4,HR,15000


In [3]:
# Encode a copy so that the original df stays unchanged
df_label = df.copy()

In [None]:
pip install category_encoders

In [4]:
import category_encoders as ce
# Binary-encode only the 'Job' column
encoder = ce.BinaryEncoder(cols=['Job'])
# Fit on the frame and replace 'Job' by its bit columns (Job_0, Job_1, ...)
df_label = encoder.fit_transform(df_label)

In [5]:
# Show the binary-encoded frame
df_label

Unnamed: 0,Job_0,Job_1,Job_2,Salary
0,0,0,1,20000
1,0,1,0,30000
2,0,1,1,15000
3,1,0,0,20000
4,1,0,1,15000
