# High cardinality categorical feature encoding

In [50]:
import pandas as pd
import numpy as np

# Load the Mercedes-Benz dataset used throughout this demonstration.
# Download it from:
# https://www.kaggle.com/aditya1702/mercedes-benz-data-exploration/data

# Only the two columns X1 and X2 are read from the file.
columns_to_load = ['X1', 'X2']
df = pd.read_csv('mercedesbenz.csv', usecols=columns_to_load)
df.head()

Unnamed: 0,X1,X2
0,v,at
1,t,av
2,w,n
3,t,n
4,v,n


In [51]:
# Cardinality of `X2`: count its distinct values.
# dropna=False counts a missing value (if any) as its own category, which
# matches what len(df['X2'].unique()) reports.
print('The total number of categories are:', df['X2'].nunique(dropna=False))

The total number of categories are: 44


## One hot encoding using pandas

Ref: [Link](https://github.com/krishnaik06/Complete-Feature-Engineering/blob/master/2.Count_frequency_encoding.ipynb)

In [74]:
# One-hot encode every column of df (dropping the first level of each) and
# report the resulting shape to show how many columns this produces.
one_hot_all = pd.get_dummies(df, drop_first=True)
one_hot_all.shape

(4209, 69)

## Count/ Frequency encoding

Replace each category with its count (frequency) in the dataset.

In [61]:
# Work on an independent copy so that `df` itself stays unchanged.
df1 = df.copy(deep=True)

In [62]:
# Build the lookup {category: number of rows in which it occurs} for X2.
x2_counts = df1['X2'].value_counts()
mapping_dict = x2_counts.to_dict()

In [63]:
# Replace every X2 category with its frequency using a vectorised dictionary
# lookup. A value that is absent from mapping_dict becomes NaN instead of
# raising KeyError.
df1['X2'] = df1['X2'].map(mapping_dict)


In [64]:
# Preview the first five rows of the frequency-encoded frame.
df1.head(n=5)

Unnamed: 0,X1,X2
0,v,6
1,t,4
2,w,137
3,t,137
4,v,137


### Advantages
* It is very simple to implement
* Does not increase the feature dimensional space

### Disadvantages
* If two or more labels have the same count, they are all replaced by the same number, so they can no longer be told apart and valuable information may be lost.
* It assigns somewhat arbitrary numbers, and therefore weights, to the different labels; these may not be related to their predictive power.

Follow this thread in Kaggle for more information: https://www.kaggle.com/general/16927

## One-hot encoding of the top few categories

Ref: [Link](https://github.com/krishnaik06/Complete-Feature-Engineering/blob/master/1_OHE_variables_with_many_labels.ipynb)

In [65]:
# Work on an independent copy so that `df` itself stays unchanged.
df2 = df.copy(deep=True)

In [85]:
# The 10 most frequently occurring categories account for ~90% of the rows.
# value_counts() already returns the counts in descending order, so its first
# ten entries are the ten largest and no additional sort is needed.
df2['X2'].value_counts().head(10).sum() / len(df2)

0.8918983131385128

In [90]:
# Labels of the ten most frequent X2 categories, taken from the counts
# sorted in descending order.
x2_counts_desc = df2['X2'].value_counts().sort_values(ascending=False)
top_10_level = list(x2_counts_desc.head(10).index)

In [97]:
# Keep the top-10 categories and collapse every other value of X2 into a
# single 'other' bucket. isin/where operate on the whole column at once
# instead of evaluating a Python lambda for each row.
is_top_level = df2['X2'].isin(top_10_level)
df2['X2'] = df2['X2'].where(is_top_level, 'other')

In [98]:
# One-hot encode the reduced X2 column: one indicator column per remaining
# level of X2.
x2_dummies = pd.get_dummies(df2['X2'])
x2_dummies

Unnamed: 0,ae,ai,ak,as,e,f,m,n,other,r,s
0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,0,0,1,0,0,0,0,0,0,0
4205,0,0,0,0,0,0,0,0,1,0,0
4206,0,0,0,0,0,0,0,0,0,1,0
4207,0,0,0,0,1,0,0,0,0,0,0
