## One Hot Encoding - variables with many categories

In [2]:
#https://www.kaggle.com/aditya1702/mercedes-benz-data-exploration/data
import pandas as pd
import numpy as np

# Read only the two categorical columns used in this walkthrough and preview them.
df = pd.read_csv(
    "mercedesbenz.csv",
    usecols=["X1", "X2"],
)
df.head()

Unnamed: 0,X1,X2
0,v,at
1,t,av
2,w,n
3,t,n
4,v,n


In [3]:
# Print the array of distinct labels for every column, one array per column.
for column_name in df.columns:
    print(df[column_name].unique())

['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']


In [4]:
# distinct labels of X1, in order of first appearance
df.X1.unique()

array(['v', 't', 'w', 'b', 'r', 'l', 's', 'aa', 'c', 'a', 'e', 'h', 'z',
       'j', 'o', 'u', 'p', 'n', 'i', 'y', 'd', 'f', 'm', 'k', 'g', 'q',
       'ab'], dtype=object)

In [5]:
# distinct labels of X2, in order of first appearance
df.X2.unique()

array(['at', 'av', 'n', 'e', 'as', 'aq', 'r', 'ai', 'ak', 'm', 'a', 'k',
       'ae', 's', 'f', 'd', 'ag', 'ay', 'ac', 'ap', 'g', 'i', 'aw', 'y',
       'b', 'ao', 'al', 'h', 'x', 'au', 't', 'an', 'z', 'ah', 'p', 'am',
       'j', 'q', 'af', 'l', 'aa', 'c', 'o', 'ar'], dtype=object)

In [6]:
# number of distinct labels in X1 (dropna=False counts NaN as a label, like len(unique()))
df['X1'].nunique(dropna=False)

27

In [7]:
# number of distinct labels in X2 (dropna=False counts NaN as a label, like len(unique()))
df['X2'].nunique(dropna=False)

44

In [8]:
# Let's have a look at how many labels each variable has
for col in df.columns:
    # dropna=False keeps NaN in the count, matching len(df[col].unique())
    print(col, ':', df[col].nunique(dropna=False), 'labels')

X1 : 27 labels
X2 : 44 labels


In [9]:
# (rows, columns) of the frame before any encoding
df.shape

(4209, 2)

In [10]:
# let's examine how many columns we will obtain after one hot encoding these variables
# (drop_first=True drops one dummy per variable, so we expect (27 - 1) + (44 - 1) = 69 columns)
pd.get_dummies(df, drop_first = True).shape

(4209, 69)

We can observe that with just 2 categorical features we already get 69 features after one hot encoding.

### KDD Cup Orange Challenge
What can we do instead?

In the winning solution of the KDD 2009 cup, "Winning the KDD Cup Orange Challenge with Ensemble Selection" (http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf):

The team suggested taking the 10 most frequent labels of a variable and converting only those into dummy variables with one hot encoding.

How can we do that in python?

In [13]:
# let's look at the 20 most frequent categories of the variable X2
# (the 10 most frequent of them are the ones we will encode)
df.X2.value_counts().sort_values(ascending = False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [14]:
# Collect the 10 most frequent X2 labels into a plain list, most frequent first.
top_10_labels = list(
    df.X2.value_counts().sort_values(ascending=False).head(10).index
)
top_10_labels

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [19]:
# helper that creates dummy variables only for the most frequent labels of one categorical variable

def one_hot_encoding_top_x(df, variable, top_x_labels):
    """Add a 0/1 dummy column to ``df`` for each label in ``top_x_labels``.

    For every label a new column named ``<variable>_<label>`` is created,
    holding 1 where ``df[variable]`` equals that label and 0 elsewhere.
    Rows whose value is not in ``top_x_labels`` get 0 in every new column.
    The number of encoded labels is simply ``len(top_x_labels)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    variable : column label
        Name of the column in ``df`` to encode.
    top_x_labels : iterable
        Labels to create dummy columns for (e.g. the most frequent ones).

    Returns
    -------
    None
        The new columns are written directly into ``df``.
    """
    for label in top_x_labels:
        # format() builds the column name for any label type; plain string
        # concatenation raised TypeError for non-string labels (e.g. integers)
        column_name = '{}_{}'.format(variable, label)
        df[column_name] = np.where(df[variable] == label, 1, 0)

In [20]:
# read the data again so the dummy columns are added to a freshly loaded frame
df = pd.read_csv('mercedesbenz.csv', usecols = ['X1', 'X2'])

# add one 0/1 column for each of the 10 most frequent X2 categories
# (one_hot_encoding_top_x writes the new columns into df in place)
one_hot_encoding_top_x(df, 'X2', top_10_labels)
df.head()

Unnamed: 0,X1,X2,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,0,0,0,0,0,0,0,0,0,0
1,t,av,0,0,0,0,0,0,0,0,0,0
2,w,n,0,0,0,0,0,0,1,0,0,0
3,t,n,0,0,0,0,0,0,1,0,0,0
4,v,n,0,0,0,0,0,0,1,0,0,0
