In [1]:
import pandas as pd

In [2]:
# Toy dataset: three categorical feature columns plus a binary target column
data = dict(
    Category=['A', 'B', 'A', 'C', 'B', 'A', 'C', 'C', 'B'],
    Color=['Red', 'Green', 'Blue', 'Red', 'Green', 'Red', 'Blue', 'Red', 'Green'],
    Size=['Small', 'Medium', 'Large', 'Small', 'Medium', 'Large', 'Medium', 'Small', 'Large'],
    Label=[1, 0, 1, 0, 1, 0, 1, 1, 0],
)
df = pd.DataFrame.from_dict(data)

In [3]:
# Example 1: One-hot encoding with pd.get_dummies.
# The prefix is given as a column -> prefix mapping, so each source column is
# paired explicitly with the prefix of the indicator columns derived from it.
df_encoded = pd.get_dummies(
    df,
    columns=['Category', 'Color', 'Size'],
    prefix={'Category': 'Cat', 'Color': 'Col', 'Size': 'Size'},
)


In [4]:
# Example 2: Label encoding by factorizing the 'Size' column.
# factorize() returns (integer codes, unique values); only the codes are stored.
size_codes, size_levels = df['Size'].factorize()
df['Size_Label'] = size_codes


In [5]:
# Example 3: Ordinal encoding with a hand-written rank per size
# (Small -> 1, Medium -> 2, Large -> 3).
size_mapping = dict(Small=1, Medium=2, Large=3)
df['Size_Ordinal'] = df['Size'].map(size_mapping)


In [8]:
pip install category-encoders

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
# Example 4: Binary encoding of the 'Size' column (category_encoders package)
import category_encoders as ce

encoder = ce.BinaryEncoder(
    cols=['Size'],
)
df_binary_encoded = encoder.fit_transform(df)


In [10]:
# Example 5: Count encoding of the 'Category' column (category_encoders package)
encoder = ce.CountEncoder(
    cols=['Category'],
)
df_count_encoded = encoder.fit_transform(df)

In [11]:
# Example 6: Target encoding of 'Color', fitted against the 'Label' column
encoder = ce.TargetEncoder(
    cols=['Color'],
)
df_target_encoded = encoder.fit_transform(df, y=df['Label'])

In [12]:
# Example 7: Mean encoding — each row receives the average 'Label' of the rows
# that share its 'Color' (group-wise transform keeps the original row order).
mean_encoded = df['Label'].groupby(df['Color']).transform('mean')
df['Color_Mean_Encoded'] = mean_encoded

In [13]:
# Example 8: Frequency encoding — replace each 'Color' by the number of rows
# carrying that value, looked up from its value_counts() table.
color_frequency = df['Color'].value_counts()
freq_encoding = df['Color'].map(color_frequency)
df['Color_Freq_Encoded'] = freq_encoding

In [14]:
# Example 9: Hashing encoding of 'Category' into 3 hashed components
# (category_encoders package)
encoder = ce.HashingEncoder(
    cols=['Category'],
    n_components=3,
)
df_hash_encoded = encoder.fit_transform(df)

In [15]:
# Example 10: Leave-one-out encoding of 'Color', fitted against 'Label'
encoder = ce.LeaveOneOutEncoder(
    cols=['Color'],
)
df_loo_encoded = encoder.fit_transform(df, y=df['Label'])

In [16]:
# Show the first three rows of the working DataFrame as plain text
print(df.head(n=3))

  Category  Color    Size  Label  Size_Label  Size_Ordinal  \
0        A    Red   Small      1           0             1   
1        B  Green  Medium      0           1             2   
2        A   Blue   Large      1           2             3   

   Color_Mean_Encoded  Color_Freq_Encoded  
0            0.500000                   4  
1            0.333333                   3  
2            1.000000                   2  


In [17]:
# Example 11: Weight-of-Evidence encoding of 'Category', fitted against 'Label'
encoder = ce.WOEEncoder(
    cols=['Category'],
)
df_woe_encoded = encoder.fit_transform(df, y=df['Label'])

In [18]:
# Example 12: James-Stein encoding of 'Category', fitted against 'Label'
encoder = ce.JamesSteinEncoder(
    cols=['Category'],
)
df_js_encoded = encoder.fit_transform(df, y=df['Label'])

In [19]:
# Example 13: Backward-difference contrast encoding of the 'Size' column
encoder = ce.BackwardDifferenceEncoder(
    cols=['Size'],
)
df_bd_encoded = encoder.fit_transform(df)



In [20]:
# Example 14: Helmert contrast encoding of the 'Size' column
encoder = ce.HelmertEncoder(
    cols=['Size'],
)
df_helmert_encoded = encoder.fit_transform(df)



In [21]:
# Example 15: Polynomial contrast encoding of the 'Size' column
encoder = ce.PolynomialEncoder(
    cols=['Size'],
)
df_poly_encoded = encoder.fit_transform(df)



In [22]:
# Example 16: CatBoost encoding of 'Category' (ce.CatBoostEncoder), fitted
# against 'Label'
encoder = ce.CatBoostEncoder(
    cols=['Category'],
)
df_cb_encoded = encoder.fit_transform(df, y=df['Label'])

In [23]:
# Example 17: Feature Hashing using sklearn's FeatureHasher
from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(n_features=3, input_type='string')

# With input_type='string' every sample must itself be an iterable of string
# tokens. Passing the raw column hands the hasher bare strings, which it
# rejects with "ValueError: Samples can not be a single string", so each value
# is wrapped in a one-element list.
size_tokens = [[size] for size in df['Size']]
hashed_features = hasher.transform(size_tokens)

# transform() returns a sparse matrix; densify it and keep df's row index so the
# hashed columns line up with the source rows.
df_hashed_features = pd.DataFrame(
    hashed_features.toarray(),
    columns=['Size_hashed_1', 'Size_hashed_2', 'Size_hashed_3'],
    index=df.index,
)

In [24]:
# Example 18: Frequency and mean encoding of 'Category' (a common pairing for
# high-cardinality columns): per-category row count and per-category mean 'Label'.
category_key = df['Category']
category_counts = category_key.value_counts()
category_means = df['Label'].groupby(category_key).mean()
df['Category_Freq_Encoded'] = category_key.map(category_counts)
df['Category_Mean_Encoded'] = category_key.map(category_means)


In [25]:
# Example 19: Frequency and mean encoding for both 'Color' and 'Size'.
# Lookup tables first (row count and mean 'Label' per value) ...
color_key = df['Color']
size_key = df['Size']
color_counts = color_key.value_counts()
color_means = df['Label'].groupby(color_key).mean()
size_counts = size_key.value_counts()
size_means = df['Label'].groupby(size_key).mean()

# ... then map every row's value through the matching table.
df['Color_Freq_Encoded'] = color_key.map(color_counts)
df['Color_Mean_Encoded'] = color_key.map(color_means)
df['Size_Freq_Encoded'] = size_key.map(size_counts)
df['Size_Mean_Encoded'] = size_key.map(size_means)


In [26]:
# Example 20: Frequency and mean encoding applied to the integer 'Size_Label'
# codes: row count and mean 'Label' per code, mapped back onto each row.
size_label_key = df['Size_Label']
size_label_counts = size_label_key.value_counts()
size_label_means = df['Label'].groupby(size_label_key).mean()
df['Size_Label_Freq_Encoded'] = size_label_key.map(size_label_counts)
df['Size_Label_Mean_Encoded'] = size_label_key.map(size_label_means)


In [27]:
# Show the first three rows again, now including the newly added encoded columns
print(df.head(n=3))

  Category  Color    Size  Label  Size_Label  Size_Ordinal  \
0        A    Red   Small      1           0             1   
1        B  Green  Medium      0           1             2   
2        A   Blue   Large      1           2             3   

   Color_Mean_Encoded  Color_Freq_Encoded  Category_Freq_Encoded  \
0            0.500000                   4                      3   
1            0.333333                   3                      3   
2            1.000000                   2                      3   

   Category_Mean_Encoded  Size_Freq_Encoded  Size_Mean_Encoded  \
0               0.666667                  3           0.666667   
1               0.333333                  3           0.666667   
2               0.666667                  3           0.333333   

   Size_Label_Freq_Encoded  Size_Label_Mean_Encoded  
0                        3                 0.666667  
1                        3                 0.666667  
2                        3                 0.333333