One-Hot Encoding

In [19]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [3]:
data = {
    "EmployeeID": [1,2,3,4,5],
    "Gender": ['m','f','m','f','m'],
    "behavior" : ['Good', 'Bad', 'Good', 'Bad', 'Good']
}
df = pd.DataFrame(data)
print(df)

   EmployeeID Gender behavior
0           1      m     Good
1           2      f      Bad
2           3      m     Good
3           4      f      Bad
4           5      m     Good


In [5]:
df_pandas_encoded = pd.get_dummies(df,columns=['Gender','behavior'],drop_first=True)
print(df_pandas_encoded)

   EmployeeID  Gender_m  behavior_Good
0           1      True           True
1           2     False          False
2           3      True           True
3           4     False          False
4           5      True           True


In [7]:
categorical_columns = ['Gender', 'behavior']
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df[categorical_columns])
one_hot_df = pd.DataFrame(one_hot_encoded,columns=encoder.get_feature_names_out(categorical_columns))
df_sklearn_encoded = pd.concat([df.drop(categorical_columns, axis=1), one_hot_df], axis=1)

In [8]:
print(df_sklearn_encoded)

   EmployeeID  Gender_f  Gender_m  behavior_Bad  behavior_Good
0           1       0.0       1.0           0.0            1.0
1           2       1.0       0.0           1.0            0.0
2           3       0.0       1.0           0.0            1.0
3           4       1.0       0.0           1.0            0.0
4           5       0.0       1.0           0.0            1.0


In [9]:
employee_data = {
    'ID': [101, 102, 103, 104, 105],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Female'],
    'Salary_Payout': ['High', 'Low', 'Low', 'High', 'High']
}
df = pd.DataFrame(employee_data)
print(df)

    ID  Gender Salary_Payout
0  101    Male          High
1  102  Female           Low
2  103    Male           Low
3  104  Female          High
4  105  Female          High


In [10]:
df_pandas_encoded = pd.get_dummies(df,columns=['Gender', 'Salary_Payout'])
print(df_pandas_encoded)

    ID  Gender_Female  Gender_Male  Salary_Payout_High  Salary_Payout_Low
0  101          False         True                True              False
1  102           True        False               False               True
2  103          False         True               False               True
3  104           True        False                True              False
4  105           True        False                True              False


In [16]:
column_name = ['Gender', 'Salary_Payout']
encoder = OneHotEncoder(sparse_output=False)
df_one_hot_encoder = encoder.fit_transform(df[column_name])
df_one_hot = pd.DataFrame(df_one_hot_encoder, columns=encoder.get_feature_names_out(column_name))
df_encoded = pd.concat([df.drop(column_name, axis=1), df_one_hot], axis=1)
print(df_encoded)


    ID  Gender_Female  Gender_Male  Salary_Payout_High  Salary_Payout_Low
0  101            0.0          1.0                 1.0                0.0
1  102            1.0          0.0                 0.0                1.0
2  103            0.0          1.0                 0.0                1.0
3  104            1.0          0.0                 1.0                0.0
4  105            1.0          0.0                 1.0                0.0


Label Encoding

In [18]:
fruit_data = {
    'Name': ['Apple','Banana', 'Apple', 'Orange', 'Orange', 'Banana','Grapes'],
    "Price": [100, 50, 120, 80, 90, 55, 200]
}
df = pd.DataFrame(fruit_data)
print(df)

     Name  Price
0   Apple    100
1  Banana     50
2   Apple    120
3  Orange     80
4  Orange     90
5  Banana     55
6  Grapes    200


In [21]:
le = LabelEncoder()
df['Fruit_data_encoded'] = le.fit_transform(df['Name'])
print(df)

     Name  Price  Fruit_data_encoded
0   Apple    100                   0
1  Banana     50                   1
2   Apple    120                   0
3  Orange     80                   3
4  Orange     90                   3
5  Banana     55                   1
6  Grapes    200                   2


In [28]:
df.groupby(['Name']).count()

Unnamed: 0_level_0,Price,Fruit_data_encoded
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,2,2
Banana,2,2
Grapes,1,1
Orange,2,2


In [29]:
df.groupby(['Name']).mean()

Unnamed: 0_level_0,Price,Fruit_data_encoded
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,110.0,0.0
Banana,52.5,1.0
Grapes,200.0,2.0
Orange,85.0,3.0


In [31]:
mean_encoder = df.groupby('Name').mean().to_dict()
df['Mean_Encoded_Fruit'] = df['Name'].map(mean_encoder['Price'])
print(mean_encoder)

{'Price': {'Apple': 110.0, 'Banana': 52.5, 'Grapes': 200.0, 'Orange': 85.0}, 'Fruit_data_encoded': {'Apple': 0.0, 'Banana': 1.0, 'Grapes': 2.0, 'Orange': 3.0}, 'Mean_Encoded_Fruit': {'Apple': 110.0, 'Banana': 52.5, 'Grapes': 200.0, 'Orange': 85.0}}
