In [1]:
import pandas as pd
import numpy as np

# 1. One Hot Encoding - When we have few categories

In [2]:
df = pd.read_csv("titanic_train.csv",usecols=["Sex"])
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [5]:
pd.get_dummies(data=df,drop_first=True).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [26]:
df = pd.read_csv("titanic_train.csv",usecols=["Embarked"])
df.head()

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S


In [27]:
df["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [28]:
df.dropna(inplace=True)

In [29]:
df["Embarked"].unique()

array(['S', 'C', 'Q'], dtype=object)

In [31]:
pd.get_dummies(data=df,drop_first=True)

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
886,0,1
887,0,1
888,0,1
889,0,0


# 2. KDD Encoding - When we have many/large categories

Applying one hot encoding to the 10 most frequent categories in each feature

In [53]:
df = pd.read_csv("mercedes.csv",usecols=["X0","X1","X2","X3","X4","X5","X6"])
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [34]:
df["X0"].unique()

array(['k', 'az', 't', 'al', 'o', 'w', 'j', 'h', 's', 'n', 'ay', 'f', 'x',
       'y', 'aj', 'ak', 'am', 'z', 'q', 'at', 'ap', 'v', 'af', 'a', 'e',
       'ai', 'd', 'aq', 'c', 'aa', 'ba', 'as', 'i', 'r', 'b', 'ax', 'bc',
       'u', 'ad', 'au', 'm', 'l', 'aw', 'ao', 'ac', 'g', 'ab'],
      dtype=object)

In [39]:
# top 10 most frequent categories in X0 feature
df["X0"].value_counts().head(10)

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
Name: X0, dtype: int64

In [37]:
for i in df.columns:
    print(i+" : ",len(df[i].unique()))

X0 :  47
X1 :  27
X2 :  44
X3 :  7
X4 :  4
X5 :  29
X6 :  12


In [56]:
top_10_X6 = df["X6"].value_counts().head(10).index
top_10_X6

Index(['g', 'j', 'd', 'i', 'l', 'a', 'h', 'k', 'c', 'b'], dtype='object')

In [44]:
top_10_X0 = list(top_10_X0)
top_10_X0

['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']

In [45]:
#doing one hot encoding manually to X0
for category in top_10_X0:
    df[category] = np.where(category==df["X0"],1,0)

In [46]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,z,ak,y,ay,t,x,o,f,n,w
0,k,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,0,0,0,0,0,0,0


In [54]:
# doing one hot encoding for all the features:

### get top 10 categories and then one hot encoding for each feature
for feature in df.columns:
    top_10 = df[feature].value_counts().head(10).index
    top_10 = list(top_10)
    for category in top_10:
        df[category] = np.where(category==df[feature],1,0)

In [55]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,z,ak,y,...,ai,m,e,d,g,q,p,j,h,k
0,k,v,at,a,d,u,j,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,k,t,av,e,d,y,l,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,az,t,n,f,d,x,l,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,...,0,0,0,1,0,0,0,0,0,0
