In [3]:
# The four distinct class names used throughout the label-encoding examples.
classes = [
    'ClassA',
    'ClassB',
    'ClassC',
    'ClassD',
]
# Ten sample labels that cycle through `classes` in order
# (two full passes, then 'ClassA' and 'ClassB').
instances = [classes[position % len(classes)] for position in range(10)]

In [4]:
label_to_int = {label: index for index, label in enumerate(classes)}
encoded_labels= [label_to_int[label] for label in instances]
print("Encoded labels:", encoded_labels)

Encoded labels: [0, 1, 2, 3, 0, 1, 2, 3, 0, 1]


In [5]:
# Invert the mapping so integer codes can be turned back into class names.
int_to_label = dict(zip(label_to_int.values(), label_to_int.keys()))
# Decode every integer code back to its label to verify the round trip.
decoded_labels = [int_to_label[code] for code in encoded_labels]

print("Encoded labels: ", encoded_labels)
print("Decoded labels: ", decoded_labels)

Encoded labels:  [0, 1, 2, 3, 0, 1, 2, 3, 0, 1]
Decoded labels:  ['ClassA', 'ClassB', 'ClassC', 'ClassD', 'ClassA', 'ClassB', 'ClassC', 'ClassD', 'ClassA', 'ClassB']


# scikit-learn: LabelEncoder

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
# Create an unfitted LabelEncoder; it is fitted on `instances` in the following cell.
label_encoder = LabelEncoder()

In [8]:
# Fit the encoder on the sample labels and encode them in a single call.
# NOTE: this rebinds `encoded_labels`, which previously held the hand-built list of codes;
# the space-separated printed output shows the result is now an array rather than a list.
encoded_labels = label_encoder.fit_transform(instances)
print("Encoded labels:", encoded_labels)

Encoded labels: [0 1 2 3 0 1 2 3 0 1]


In [9]:
# Map the integer codes back to their string labels to confirm the encoding round-trips.
original_labels = label_encoder.inverse_transform(encoded_labels)
print("Encoded labels:", encoded_labels)
print("Original labels:", original_labels)

Encoded labels: [0 1 2 3 0 1 2 3 0 1]
Original labels: ['ClassA' 'ClassB' 'ClassC' 'ClassD' 'ClassA' 'ClassB' 'ClassC' 'ClassD'
 'ClassA' 'ClassB']


# One-Hot Encoding

In [10]:
import pandas as pd 

In [11]:
# Toy dataset: one categorical column whose values repeat 'A', 'B', 'C' three times (9 rows).
data = {'Category': ['A', 'B', 'C'] * 3}

In [12]:
# Wrap the dict in a DataFrame with a single 'Category' column.
df = pd.DataFrame(data)

In [13]:
# Preview the first rows of the frame (five are shown in the output).
df.head()

Unnamed: 0,Category
0,A
1,B
2,C
3,A
4,B


In [14]:
# One-hot encode 'Category': the column is replaced by one boolean indicator column
# per distinct value (Category_A, Category_B, Category_C).
one_hot_encoded_df = pd.get_dummies(df, columns=['Category'])
one_hot_encoded_df

Unnamed: 0,Category_A,Category_B,Category_C
0,True,False,False
1,False,True,False
2,False,False,True
3,True,False,False
4,False,True,False
5,False,False,True
6,True,False,False
7,False,True,False
8,False,False,True


In [15]:
# Same encoding with an 'Encoded' column prefix and drop_first=True, which omits the
# indicator for the first level ('A'); rows where both remaining columns are False
# therefore represent 'A'. NOTE: this rebinds `one_hot_encoded_df` from the previous cell.
one_hot_encoded_df = pd.get_dummies(df, columns=['Category'], prefix ='Encoded', drop_first=True)
one_hot_encoded_df

Unnamed: 0,Encoded_B,Encoded_C
0,False,False
1,True,False
2,False,True
3,False,False
4,True,False
5,False,True
6,False,False
7,True,False
8,False,True


# Binary Encoding

In [17]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
     -------------------------------------- 81.9/81.9 kB 507.9 kB/s eta 0:00:00
Collecting statsmodels>=0.9.0
  Downloading statsmodels-0.14.2-cp310-cp310-win_amd64.whl (9.8 MB)
     ---------------------------------------- 9.8/9.8 MB 7.2 MB/s eta 0:00:00
Collecting patsy>=0.5.1
  Downloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
     ------------------------------------- 233.9/233.9 kB 14.0 MB/s eta 0:00:00
Installing collected packages: patsy, statsmodels, category_encoders
Successfully installed category_encoders-2.6.3 patsy-0.5.6 statsmodels-0.14.2



[notice] A new release of pip available: 22.2.2 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
import pandas as pd
import category_encoders as ce

In [19]:
# Rebuild the toy dataset for the binary-encoding example:
# one 'Category' column whose values repeat 'A', 'B', 'C' three times (9 rows).
data = {'Category': ['A', 'B', 'C'] * 3}
df = pd.DataFrame(data)

In [21]:
# Preview the first rows of the rebuilt frame (five are shown in the output).
df.head()

Unnamed: 0,Category
0,A
1,B
2,C
3,A
4,B


In [23]:
# (rows, columns) of the frame: 9 rows, 1 column.
df.shape

(9, 1)

In [24]:
# Binary encoder applied only to the 'Category' column; return_df=True asks for a
# DataFrame result (the output of the next cell is rendered as a table).
encoder = ce.BinaryEncoder(cols=['Category'], return_df=True)

In [25]:
# Fit the encoder on df and transform it: 'Category' is replaced by two 0/1 columns
# (Category_0, Category_1), giving each distinct value its own bit pattern
# (A -> 0,1; B -> 1,0; C -> 1,1 in the output below).
df_binary_encoded = encoder.fit_transform(df)
df_binary_encoded

Unnamed: 0,Category_0,Category_1
0,0,1
1,1,0
2,1,1
3,0,1
4,1,0
5,1,1
6,0,1
7,1,0
8,1,1
