In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the dataset, keeping only the categorical columns X1 through X6
df = pd.read_csv(
    "mercedes_benz.csv",
    usecols=["X1", "X2", "X3", "X4", "X5", "X6"],
)

In [3]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [4]:
# Print how many distinct categories each column contains
for column_name, column_values in df.items():
    print(column_name, "-", column_values.nunique())

X1 - 27
X2 - 44
X3 - 7
X4 - 4
X5 - 29
X6 - 12


In [5]:
# Shape (rows, columns) the frame would have if every variable were fully
# one-hot encoded (first level of each variable dropped)
pd.get_dummies(data=df, drop_first=True).shape

(4209, 117)

### KDD Cup Orange Challenge
What can we do instead?

The winning solution of the KDD Cup 2009 is described in the paper "Winning the KDD Cup Orange Challenge with Ensemble Selection": http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf

The team suggested keeping only the 10 most frequent labels of a categorical variable and converting those into dummy variables with one-hot encoding.

In [6]:
# Frequencies of the 10 most common categories of variable X2
# (value_counts sorts by descending count by default; spelled out here)
df["X2"].value_counts(sort=True, ascending=False).head(n=10)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
Name: X2, dtype: int64

In [7]:
# Labels of the 10 most frequent X2 categories, most frequent first
top_10_X2 = df["X2"].value_counts().head(10).index.tolist()
top_10_X2

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [8]:
# Add one binary indicator column per top-10 X2 label:
# 1 where the row's X2 value equals that label, 0 otherwise.
# The new columns are named after the labels themselves.
for top_label in top_10_X2:
    matches = df["X2"] == top_label
    df[top_label] = np.where(matches, 1, 0)

In [9]:
# Show X2 next to its 10 indicator columns for the first 10 rows
df[["X2", *top_10_X2]].head(n=10)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


### Function to implement

In [10]:
# Create dummy (0/1) variables for the most frequent labels of a variable.
# The caller decides how many labels to encode via the list it passes in.
def one_hot_top_x(df,variable,top_x_labels):
    """Add one binary indicator column to ``df`` per label in ``top_x_labels``.

    For each label, a column named ``"<variable>_<label>"`` is written into
    ``df``; it holds 1 where ``df[variable]`` equals the label and 0 elsewhere.
    Rows whose value is not among ``top_x_labels`` get 0 in every new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    variable : hashable
        Name of the column in ``df`` to encode.
    top_x_labels : iterable
        Labels to create indicator columns for. Labels may be of any type
        (e.g. integers); they are converted to text when building the new
        column names.

    Returns
    -------
    None
        The new columns are added to ``df`` itself.

    Raises
    ------
    KeyError
        If ``variable`` is not a column of ``df``.
    """
    # The source column does not change inside the loop, so look it up once.
    source = df[variable]

    for label in top_x_labels:
        # f-string formatting instead of ``+`` so non-string labels or column
        # names (e.g. integer-coded categories) do not raise TypeError.
        df[f"{variable}_{label}"] = np.where(source == label, 1, 0)

In [11]:
# Labels of the 10 most frequent X1 categories, most frequent first
top_10_X1 = df["X1"].value_counts().head(10).index.tolist()

In [12]:
# Add indicator columns for the top-10 X1 labels (df is modified in place),
# then preview the widened frame
one_hot_top_x(df=df, variable="X1", top_x_labels=top_10_X1)
df.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,as,ae,ai,m,...,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


### One Hot Encoding of top variables

#### Advantages
- Straightforward to implement
- Does not require hours of variable exploration
- Does not massively expand the feature space (number of columns in the dataset)

#### Disadvantages
- Does not add any information that may make the variable more predictive
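- Does not keep the information of the ignored labels: every label outside the top 10 is encoded as the same all-zero row, so those labels become indistinguishable from one another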