In [1]:
import pandas as pd

In [2]:
# Toy dataset with one nominal feature (color), one ordinal feature (size),
# one numeric feature (price) and a string class label.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

print(df)

   color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class2


In [3]:
# Ordinal feature: sizes have a natural order, so encode them as ranked integers.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
print(df['size'].map(size_mapping))
# Reverse lookup (integer code -> size label) for decoding.
inverse_size_mapping = dict((code, label) for label, code in size_mapping.items())

# When the numeric spacing between ordinal levels is unknown or ill-defined,
# encode the order with binary threshold columns instead:
# one flag for x > M and one flag for x > L.

copied_df = df.copy()
copied_df['x > M'] = copied_df['size'].apply(lambda size: int(size in ('L', 'XL')))
copied_df['x > L'] = copied_df['size'].apply(lambda size: int(size == 'XL'))
del copied_df['size']
# Show the untouched original next to the threshold-encoded copy.
print(df)
print(copied_df)

0    1
1    2
2    3
Name: size, dtype: int64
   color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class2
   color  price classlabel  x > M  x > L
0  green   10.1     class2      0      0
1    red   13.5     class1      1      0
2   blue   15.3     class2      1      1


In [4]:
# Encode the class labels as integers, numbered in order of first appearance.
unique_labels = df['classlabel'].unique()
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
df['classlabel'] = df['classlabel'].map(class_mapping)
print(df)

# Invert the mapping to restore the original string labels in the frame.
inv_class_mapping = {code: label for label, code in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
print(df)


# Alternatively, let scikit-learn's LabelEncoder do the encoding and decoding.
# Note that its integer codes need not match the manual mapping above
# (compare the two printed encodings).
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
label_values = df['classlabel'].values
y = class_le.fit_transform(label_values)
print(y)

# Round-trip back to the string labels.
print(class_le.inverse_transform(y))

   color size  price  classlabel
0  green    M   10.1           0
1    red    L   13.5           1
2   blue   XL   15.3           0
   color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class2
[1 0 1]
['class2' 'class1' 'class2']


In [5]:
# Nominal (unordered) features are better served by one-hot encoding:
# label-encoding R, G, B as 0, 1, 2 would wrongly imply R < G < B.

from sklearn.preprocessing import OneHotEncoder

X = df[['color', 'size', 'price']].values

# The encoder expects a 2-D input, so turn the color column into shape (n, 1).
color_values = X[:, 0]
print(color_values)
color_column = color_values.reshape(-1, 1)
print(color_column)

color_ohe = OneHotEncoder()
encoded_colors = color_ohe.fit_transform(color_column)
# fit_transform's result is converted with .toarray() so it prints as a dense array.
print(encoded_colors.toarray())

['green' 'red' 'blue']
[['green']
 ['red']
 ['blue']]
[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


In [6]:
# We can also do all columns at the same time
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Encode the ordinal 'size' column as integers in place.
# Series.map turns every value that is missing from the mapping into NaN, so
# only map while the column still holds the string labels; this keeps the
# cell safe to re-run without wiping the column.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
if not pd.api.types.is_numeric_dtype(df['size']):
    df['size'] = df['size'].map(size_mapping)

X = df[['color', 'size', 'price']].values

# ColumnTransformer takes a list of (name, transformer, columns) tuples:
# one-hot encode column 0 (color) and pass columns 1 and 2 (size, price)
# through unchanged.
c_transform = ColumnTransformer([
    ('onehot', OneHotEncoder(), [0]),
    ('nothing', 'passthrough', [1, 2]),
])

print(c_transform.fit_transform(X).astype(float))

[[ 0.   1.   0.   1.  10.1]
 [ 0.   0.   1.   2.  13.5]
 [ 1.   0.   0.   3.  15.3]]


In [7]:
# pandas offers the same one-hot encoding through get_dummies.
feature_frame = df[['color', 'size', 'price']]
full_dummies = pd.get_dummies(feature_frame)
print(full_dummies.astype(float))

# One-hot encoding this way makes the color columns linearly dependent:
# if R = 0 and G = 0, the color must be B.
# Dropping the first dummy column (drop_first=True) removes that redundancy.
reduced_dummies = pd.get_dummies(feature_frame, drop_first=True)
print(reduced_dummies.astype(float))

# The scikit-learn equivalent is an encoder configured with drop='first'.
color_ohe = OneHotEncoder(categories='auto', drop='first')

   size  price  color_blue  color_green  color_red
0   1.0   10.1         0.0          1.0        0.0
1   2.0   13.5         0.0          0.0        1.0
2   3.0   15.3         1.0          0.0        0.0
   size  price  color_green  color_red
0   1.0   10.1          1.0        0.0
1   2.0   13.5          0.0        1.0
2   3.0   15.3          0.0        0.0
