## Encoder
https://scikit-learn.org/stable/api/sklearn.preprocessing.html

# 1. Label Encoder

In [5]:
# The four distinct class names; a label's position in this list is the
# integer code it receives in the manual encoding.
classes = ['ClassA', 'ClassB', 'ClassC', 'ClassD']
# Ten sample labels: the classes repeated in order and cut off after the
# second 'ClassB'.
instances = (classes * 3)[:10]

### Manual

In [7]:
label_to_int = {label:index for index, label in enumerate(classes)}
encoded_labels = [label_to_int[label] for label in instances]

print("Encoded labels: ", encoded_labels)

Encoded labels:  [0, 1, 2, 3, 0, 1, 2, 3, 0, 1]


In [8]:
# Invert the label -> code mapping so integer codes can be turned back into
# their class names.
int_to_label = dict(zip(label_to_int.values(), label_to_int.keys()))
decoded_labels = [int_to_label[code] for code in encoded_labels]

print("Encoded labels:", encoded_labels)
print("Decoded labels:", decoded_labels)

Encoded labels: [0, 1, 2, 3, 0, 1, 2, 3, 0, 1]
Decoded labels: ['ClassA', 'ClassB', 'ClassC', 'ClassD', 'ClassA', 'ClassB', 'ClassC', 'ClassD', 'ClassA', 'ClassB']


### Sklearn

In [9]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of labels from the samples, then map each sample to its
# integer code (the two-step form of a single fit_transform call).
label_encoder = LabelEncoder().fit(instances)
encoded_labels = label_encoder.transform(instances)

In [10]:
# Map the integer codes back to the string labels the encoder was fitted on.
original_labels = label_encoder.inverse_transform(encoded_labels)

# Show both representations to confirm the round trip.
print("Encoded labels:", encoded_labels)
print("Original labels:", original_labels)

Encoded labels: [0 1 2 3 0 1 2 3 0 1]
Original labels: ['ClassA' 'ClassB' 'ClassC' 'ClassD' 'ClassA' 'ClassB' 'ClassC' 'ClassD'
 'ClassA' 'ClassB']


# 2. One Hot Encoding

In [12]:
import pandas as pd
# Toy dataset: one categorical column cycling through A, B, C three times.
data = {'Category': ['A', 'B', 'C'] * 3}
df = pd.DataFrame(data)
# Preview the first rows.
df.head()

Unnamed: 0,Category
0,A
1,B
2,C
3,A
4,B


In [16]:
# One-hot encode `Category`: the column is replaced by one boolean indicator
# column per distinct value (Category_A, Category_B, Category_C).
one_hot_encoded_df = pd.get_dummies(df, columns=['Category'])
one_hot_encoded_df.head()

Unnamed: 0,Category_A,Category_B,Category_C
0,True,False,False
1,False,True,False
2,False,False,True
3,True,False,False
4,False,True,False


In [18]:
# Same encoding, but `prefix` replaces the source column name in the new
# column names (Dummy_A, Dummy_B, Dummy_C instead of Category_*).
one_hot_encoded_df = pd.get_dummies(df, columns=['Category'], prefix='Dummy')
one_hot_encoded_df.head()

Unnamed: 0,Dummy_A,Dummy_B,Dummy_C
0,True,False,False
1,False,True,False
2,False,False,True
3,True,False,False
4,False,True,False


In [19]:
# drop_first=True omits the first category's indicator column (Dummy_A), so a
# row that is False in every remaining column stands for category A.
one_hot_encoded_df = pd.get_dummies(
    df,
    columns=['Category'],
    prefix='Dummy',
    drop_first=True,
)
one_hot_encoded_df.head()

Unnamed: 0,Dummy_B,Dummy_C
0,False,False
1,True,False
2,False,True
3,False,False
4,True,False


# 3. Binary Encoder

In [None]:
import pandas as pd
import category_encoders as ce

# Three distinct categories fit in 2 bits, so binary encoding produces
# 2 output columns for `Category`.
data = {'Category': ['A', 'B', 'C'] * 3}
df = pd.DataFrame(data)
# Preview the first rows.
df.head()

Unnamed: 0,Category
0,A
1,B
2,C
3,A
4,B


In [3]:
# Binary-encode only the `Category` column and ask for a DataFrame back.
encoder = ce.BinaryEncoder(cols=['Category'], return_df=True)

# Fit on df and encode it in one step; `Category` becomes the bit columns
# Category_0 and Category_1 (A -> 0,1  B -> 1,0  C -> 1,1).
df_binary_encoder = encoder.fit_transform(df)
df_binary_encoder.head()

Unnamed: 0,Category_0,Category_1
0,0,1
1,1,0
2,1,1
3,0,1
4,1,0


# 4. Ordinal Encoder

In [10]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Ten samples with a single feature each; every inner list is one row holding
# a review rating.
data = [
    [rating]
    for rating in (
        'good', 'bad', 'excellent', 'average',
        'good', 'average', 'excellent', 'bad',
        'average', 'good',
    )
]
# Ratings in the order their ordinal codes should follow, lowest first.
categories = [['bad', 'average', 'good', 'excellent']]

In [8]:
# Wrap the raw rows in a single-column DataFrame named 'review'.
# Note: this rebinds `data` from the list of lists to the DataFrame.
data = pd.DataFrame(data=data, columns=['review'])
data.head()

Unnamed: 0,review
0,good
1,bad
2,excellent
3,average
4,good


In [11]:
# Supply the category order explicitly so the codes follow it
# (bad=0, average=1, good=2, excellent=3) instead of leaving the order for
# the encoder to determine.
encoder = OrdinalEncoder(categories=categories)

In [12]:
# Fit the encoder on the reviews, then convert them to their ordinal codes
# (the two-step form of a single fit_transform call).
encoded_data = encoder.fit(data).transform(data)
encoded_data

array([[2.],
       [0.],
       [3.],
       [1.],
       [2.],
       [1.],
       [3.],
       [0.],
       [1.],
       [2.]])

In [14]:
# Map the numeric codes back to the original rating strings.
decoded_data = encoder.inverse_transform(encoded_data)
decoded_data

array([['good'],
       ['bad'],
       ['excellent'],
       ['average'],
       ['good'],
       ['average'],
       ['excellent'],
       ['bad'],
       ['average'],
       ['good']], dtype=object)