## 1. Nominal Encoding/One Hot Encoding

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
### Creating a nominal dataset
df=pd.DataFrame({
    'color':['red','blue','green','green','red','blue']
})
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [3]:
from sklearn.preprocessing import OneHotEncoder

In [4]:
encoder=OneHotEncoder()

In [5]:
encoder.fit_transform(df[['color']])

<6x3 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [6]:
encoder.fit_transform(df[['color']]).toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [7]:
encoded=encoder.fit_transform(df[['color']]).toarray()

In [8]:
## Creating a dataframe of this encoded data
encoded_df=pd.DataFrame(encoded,columns=encoder.get_feature_names_out())

In [9]:
encoded_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [10]:
## So we have encoded the categories of our categorical feature into numerical values
# Now whenever we get any new data,we can encode it too using the same object

In [11]:
encoder.transform([['blue']])



<1x3 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [12]:
encoder.transform([['blue']]).toarray()



array([[1., 0., 0.]])

In [13]:
pd.concat([df,encoded_df],axis=1)
# Alphabetically it does blue to 100, green to 010 and red to 001

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


In [14]:
# Another example
df1=sns.load_dataset('tips')
df1.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [15]:
# Encoding the time feature
df1['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [16]:
encoder_1=OneHotEncoder()
encoder_1.fit_transform(df1[['time']])

<244x2 sparse matrix of type '<class 'numpy.float64'>'
	with 244 stored elements in Compressed Sparse Row format>

In [17]:
encoded_1=encoder_1.fit_transform(df1[['time']]).toarray()

In [18]:
encoded_df1=pd.DataFrame(encoded_1,columns=encoder_1.get_feature_names_out())

In [19]:
pd.concat([df1,encoded_df1],axis=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,time_Dinner,time_Lunch
0,16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0
1,10.34,1.66,Male,No,Sun,Dinner,3,1.0,0.0
2,21.01,3.50,Male,No,Sun,Dinner,3,1.0,0.0
3,23.68,3.31,Male,No,Sun,Dinner,2,1.0,0.0
4,24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,1.0,0.0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,1.0,0.0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,1.0,0.0
242,17.82,1.75,Male,No,Sat,Dinner,2,1.0,0.0


## 2. Using Numpy to encode binary classified features

In [20]:
## Since time is a binary classified feature, we can also use numpy library to do encoding
df1['time_enc']=np.where(df1['time']=='Dinner',0,1)

In [21]:
df1.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,time_enc
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,0
2,21.01,3.5,Male,No,Sun,Dinner,3,0
3,23.68,3.31,Male,No,Sun,Dinner,2,0
4,24.59,3.61,Female,No,Sun,Dinner,4,0


## 3. Label Encoding

In [22]:
## Assigns labels to each category of a particular feature
# Usually the labels are assigned in alphabetic way or based on frequency

In [23]:
df

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red
5,blue


In [24]:
from sklearn.preprocessing import LabelEncoder
lbl_encoder=LabelEncoder()

In [30]:
lbl_encoder.fit_transform(df['color'])

array([2, 0, 1, 1, 2, 0])

In [26]:
## So here it is assigning labels alphabetically as 
# blue-->0
# green-->1
# red-->2

In [32]:
## For any new data
lbl_encoder.transform(['red','blue','green'])

array([2, 0, 1])

## 3. Ordinal Encoder

In [39]:
## Used for ordinal data, when we want to assign ranks to categories in our own way.
from sklearn.preprocessing import OrdinalEncoder

In [40]:
df2=pd.DataFrame({
    'size':['small','medium','large','medium','small','large']
})

In [41]:
df2

Unnamed: 0,size
0,small
1,medium
2,large
3,medium
4,small
5,large


In [42]:
## Now we want to assign ranking to these category in our own desired manner, i.e., I want to rank:
# small-->0
# medium-->1
# large-->2

In [43]:
ord_encoder=OrdinalEncoder(categories=[['small','medium','large']])

In [45]:
ord_encoder.fit_transform(df2[['size']])

array([[0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.]])

In [53]:
pd.DataFrame(ord_encoder.fit_transform(df2[['size']]),columns=['size'])

Unnamed: 0,size
0,0.0
1,1.0
2,2.0
3,1.0
4,0.0
5,2.0


In [48]:
## For any new data
ord_encoder.transform([['small']])

array([[0.]])

## 4. Target Guided Ordinal Encoding

In [54]:
## Encoding categorical variable based on their relationship with the target variable
# Useful when we have categorical feature with a a large no of categories
# We replace each category with a numerical value-> mean or median of the target value for that category
# Creates a monotonic relationship between categorical and target variable->improves predictive power of model

In [55]:
df3=pd.DataFrame({
    'city':['New York','London','Paris','Tokyo','New york','Paris'],
    'price':[200,150,300,250,180,320]
})

In [56]:
df3

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New york,180
5,Paris,320


In [57]:
# Now we want to encode these cities with the mean of the target values for them
# For eg New York catgeory will be replaced by (200+180)/2=190

In [68]:
# Groupby operation creating an object having grouped categories then we find mean price for each category
df3.groupby('city')['price'].mean()

city
London      150.0
New York    200.0
New york    180.0
Paris       310.0
Tokyo       250.0
Name: price, dtype: float64

In [70]:
# Converting it into a dictionary
mean_price=df3.groupby('city')['price'].mean().to_dict()

In [71]:
mean_price

{'London': 150.0,
 'New York': 200.0,
 'New york': 180.0,
 'Paris': 310.0,
 'Tokyo': 250.0}

In [75]:
df3['city_encoded']=df3['city'].map(mean_price)
df3['city_encoded']

0    200.0
1    150.0
2    310.0
3    250.0
4    180.0
5    310.0
Name: city_encoded, dtype: float64

In [76]:
df3

Unnamed: 0,city,price,city_encoded
0,New York,200,200.0
1,London,150,150.0
2,Paris,300,310.0
3,Tokyo,250,250.0
4,New york,180,180.0
5,Paris,320,310.0


In [78]:
# For training purpose now we'll give only price and city_encoded feature
# Model will be now able to give importance based on the value