In [7]:
import pandas as pd
import numpy as np 

data = pd.read_csv('mercedes-benz.csv',usecols=['X1','X2','X3','X4','X5','X6'])
data.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [8]:
##Let's load the mercedes benz dataset only categorical variables 

data = pd.read_csv('mercedes-benz.csv',usecols=['X1','X2','X3','X4','X5','X6'])
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [13]:
#let's have look at how many labels each variable has

for col in data.columns:
    print(col, ': ', len(data[col].unique()),' labels')

X1 :  27  labels
X2 :  44  labels
X3 :  7  labels
X4 :  4  labels
X5 :  29  labels
X6 :  12  labels


In [14]:
#Let's examine how many columns we will obtain after one hot encoding these variables 
pd.get_dummies(data,drop_first=True).shape

(4209, 117)

We can see that from just 6 initial categorical variables we end up with 117 new variables 


###### what can we do instead?

The authors limit one hot encoding to the 10 most frequent labels of the variable. This means that they would make one binary variable for each of the 10 most frequent labels only. This is equivalent to grouping all the other labels in under a new category, That in this case will be dropped.Thus, the 10 new dummy variables indicate if one of the 10 most frequent labels is present (1) or not (0) for a particular observation.

How can we do this with python?


In [16]:
#Let's find the top 10 most frequent categories for the variable x2

data.X2.value_counts().sort_values(ascending=False).head(20)

X2
as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z       19
Name: count, dtype: int64

In [20]:
#Let's make a list with the most frequent categories of the variable

top_10 = [x for x in data.X2.value_counts().sort_values(ascending=False).head(10).index]
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [18]:
#and now we make the 10 binary variables

for label in top_10:
    data[label]=np.where(data['X2']==label, 1, 0)
    
data[['X2']+top_10].head(40)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [33]:
#get whole set of dummy variables, for all categorical variables 

def one_hot_top_x(df,variable,top_x_labels):
    
    #function to create the dummy variables for the most frequent labels 
    #we can vary the number of most frequent labels that we encode
    
    for label in top_x_labels:
        df[variable+'_'+label] = np.where(data[variable]==label, 1, 0)
        
#read the data again
data = pd.read_csv('mercedes-benz.csv',usecols=['X1','X2','X3','X4','X5','X6'])

#encode X2 into the 10 most frequent categories 
one_hot_top_x(data, 'X2', top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0


In [35]:
#get whole set of dummy variables, for all categorical variables 

def one_hot_top_x(df,variable,top_x_labels):
    
    #function to create the dummy variables for the most frequent labels 
    #we can vary the number of most frequent labels that we encode
    
    for label in top_x_labels:
        df[variable+'_'+label] = np.where(data[variable]==label, 1, 0)
        
#read the data again
data = pd.read_csv('mercedes-benz.csv',usecols=['X1','X2','X3','X4','X5','X6'])

#encode X2 into the 10 most frequent categories 
one_hot_top_x(data, 'X1', top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_as,X1_ae,X1_ai,X1_m,X1_ak,X1_r,X1_n,X1_s,X1_f,X1_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,0,0,0,0


In [36]:
#get whole set of dummy variables, for all categorical variables 

def one_hot_top_x(df,variable,top_x_labels):
    
    #function to create the dummy variables for the most frequent labels 
    #we can vary the number of most frequent labels that we encode
    
    for label in top_x_labels:
        df[variable+'_'+label] = np.where(data[variable]==label, 1, 0)
        
#read the data again
data = pd.read_csv('mercedes-benz.csv',usecols=['X1','X2','X3','X4','X5','X6'])

#encode X2 into the 10 most frequent categories 
one_hot_top_x(data, 'X3', top_10)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X3_as,X3_ae,X3_ai,X3_m,X3_ak,X3_r,X3_n,X3_s,X3_f,X3_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,1
2,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,0,0,1,0
4,v,n,f,d,h,d,0,0,0,0,0,0,0,0,1,0


### One Hot Encoding Of Top Variables

#### Advantages
- StraightForward to Implement 
- Does not require hours of variable exploration
- Does not Expand massively the feature space (number of columns in the dataset) 

#### Disadvantages
- Does not add any information that may make the variable more predictive 
- Does not keep the information of the ignored labels
- If some of the labels have same count and then they will replaced with the same count and then they will loose some valuable information.

Because it is not unusual that categorical variables have a few dominating categories and the remaining labels add mostly noise, this is a quite simple and straightforward approach that may be useful on many occassions

It is worth nothing that top 10 variables is a totally arbitary number, you could also choose the top 5 or top 20.

This improved the performance of the variable dramatically..

http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf

https://www.kaggle.com/general/16927