##### Handle Categorical Features
###### One Hot Encoding

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read only the 'Sex' column from the Titanic CSV.
df = pd.read_csv(
    'titanic.csv',
    usecols=['Sex'],
)

In [3]:
# Display the single-column 'Sex' frame that was just loaded.
df

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male
...,...
886,male
887,female
888,female
889,male


In [4]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [5]:
# One-hot encode 'Sex': Sex_female is 1 for female rows and 0 for male rows,
# and Sex_male is the reverse.
# dtype=int keeps the dummy columns as 0/1 integers; without it, pandas 2.0+
# returns True/False columns instead.
pd.get_dummies(df, dtype=int).head()

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [6]:
# drop_first=True keeps only Sex_male: a single 0/1 column is enough to tell
# the two classes apart, so the Sex_female column is redundant.
# dtype=int keeps the values as 0/1 integers; without it, pandas 2.0+ returns
# True/False columns instead.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [7]:
# Re-read the Titanic CSV, this time keeping only the 'Embarked' column.
df = pd.read_csv(
    'titanic.csv',
    usecols=['Embarked'],
)

In [8]:
# Display the single-column 'Embarked' frame that was just loaded.
df

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S
...,...
886,S
887,S
888,S
889,C


In [9]:
# Count the missing values in each column.
df.isnull().sum(axis=0)

Embarked    2
dtype: int64

In [10]:
# List the distinct 'Embarked' values (a missing value shows up as nan).
df.loc[:, 'Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [11]:
# Drop the rows whose 'Embarked' value is missing. Rebinding df instead of
# passing inplace=True avoids mutating the frame in place.
df = df.dropna()

In [12]:
# List the distinct 'Embarked' values again to confirm what is left.
df.loc[:, 'Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [13]:
# One-hot encode 'Embarked' into Embarked_C, Embarked_Q and Embarked_S.
# One column (for example Embarked_C) can be dropped without losing
# information: a row with 0 in both Embarked_Q and Embarked_S must be Embarked_C.
# dtype=int keeps the dummy columns as 0/1 integers; without it, pandas 2.0+
# returns True/False columns instead.
pd.get_dummies(df, dtype=int)

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [14]:
# drop_first=True removes the first dummy column (Embarked_C); a row with 0 in
# both Embarked_Q and Embarked_S then stands for Embarked_C.
# dtype=int keeps the values as 0/1 integers; without it, pandas 2.0+ returns
# True/False columns instead.
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [15]:
#### One-hot encoding with many categories in a feature

In [16]:
# Load the complete Mercedes dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='mercedes.csv')
df.head(n=5)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Reload the Mercedes data, keeping only the columns X0 through X6.
df = pd.read_csv(
    'mercedes.csv',
    usecols=["X0", "X1", "X2", "X3", "X4", "X5", "X6"],
)

In [18]:
# Preview the first five rows of the selected columns.
df.head(n=5)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [19]:
# 'X0' has many distinct values; one-hot encoding it would create one new
# feature per value.
df.loc[:, 'X0'].unique()

array(['k', 'az', 't', 'al', 'o', 'w', 'j', 'h', 's', 'n', 'ay', 'f', 'x',
       'y', 'aj', 'ak', 'am', 'z', 'q', 'at', 'ap', 'v', 'af', 'a', 'e',
       'ai', 'd', 'aq', 'c', 'aa', 'ba', 'as', 'i', 'r', 'b', 'ax', 'bc',
       'u', 'ad', 'au', 'm', 'l', 'aw', 'ao', 'ac', 'g', 'ab'],
      dtype=object)

In [20]:
# Number of distinct values in 'X0'.
len(df.loc[:, 'X0'].unique())

47

In [21]:
# Show the column labels of the current frame.
df.columns

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6'], dtype='object')

In [22]:
# Print how many distinct values each column holds. With this many categories
# per column, plain one-hot encoding would create a very large number of
# features, so it should not be used as-is for this dataset.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(len(distinct_values))

47
27
44
7
4
29
12


In [23]:
# Frequency of each 'X1' category.
df['X1'].value_counts()

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
y      23
f      23
j      22
n      19
k      17
p       9
g       6
d       3
q       3
ab      3
Name: X1, dtype: int64

In [24]:
# Frequency of each 'X1' category, ordered from most to least frequent.
(
    df['X1']
    .value_counts()
    .sort_values(ascending=False)
)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
w      52
z      46
u      37
e      33
m      32
t      31
h      29
y      23
f      23
j      22
n      19
k      17
p       9
g       6
d       3
q       3
ab      3
Name: X1, dtype: int64

In [25]:
# Keep only the 10 most frequent 'X1' categories together with their counts.
(
    df['X1']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

In [26]:
# The index of the top-10 counts holds the category names themselves.
(
    df['X1']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
)

Index(['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'], dtype='object')

In [27]:
# Check what kind of object holds the top-10 category names.
type(
    df['X1']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
)

pandas.core.indexes.base.Index

In [28]:
# Names of the 10 most frequent 'X1' categories, as a plain Python list.
# value_counts() already returns the counts in descending order, so no extra
# sort is needed before taking the first 10 entries; the index of the result
# carries the category names.
lst_10 = list(df['X1'].value_counts().head(10).index)

In [29]:
# These are the 10 most frequent categories of 'X1'. One-hot encoding will be
# applied to these categories only; all remaining categories are skipped.
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [30]:
# Confirm that lst_10 is a plain Python list.
type(lst_10)

list

In [38]:
# Display the raw 'X1' column.
df['X1']

0       v
1       t
2       w
3       t
4       v
       ..
4204    s
4205    o
4206    v
4207    r
4208    r
Name: X1, Length: 4209, dtype: object

In [31]:
# Select the rows whose 'X1' value is 'aa'.
df.loc[df['X1'] == 'aa']

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
17,ay,aa,as,c,d,j,c
21,t,aa,as,d,d,j,j
27,t,aa,as,c,d,j,d
42,z,aa,ai,c,d,j,g
47,s,aa,m,c,d,j,g
...,...,...,...,...,...,...,...
4175,z,aa,ai,a,d,aa,l
4178,ay,aa,as,c,d,aa,h
4183,z,aa,m,c,d,aa,l
4199,t,aa,ay,c,d,aa,l


In [32]:
# One-hot encode only the most frequent 'X1' categories listed in lst_10: each
# category gets its own indicator column that is 1 where 'X1' equals that
# category and 0 elsewhere. Rows whose category is not in the list get 0 in
# every indicator column.
# numpy is already imported as np in the first cell, so it is not re-imported.
for category in lst_10:
    # A later cell appends the column name 'X1' to lst_10. Skipping it here
    # keeps a re-run of this cell from overwriting the source column with zeros.
    if category == 'X1':
        continue
    df[category] = np.where(df['X1'] == category, 1, 0)

In [33]:
# Display the list of encoded category names.
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [34]:
# Show the ten indicator columns created for the most frequent categories.
df.loc[:, ['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o
0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0
4205,0,0,0,0,0,0,0,0,0,1
4206,0,0,0,0,1,0,0,0,0,0
4207,0,0,0,0,0,1,0,0,0,0


In [35]:
# Add the source column name so 'X1' can be shown next to its indicator
# columns. The membership check makes this cell safe to re-run: 'X1' is
# appended at most once instead of piling up duplicates.
if 'X1' not in lst_10:
    lst_10.append('X1')

In [36]:
# Display the list, which now ends with the source column name 'X1'.
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o', 'X1']

In [37]:
# Compare the indicator columns with the original 'X1' value, first 20 rows.
df.head(20)[lst_10]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
5,0,0,1,0,0,0,0,0,0,0,b
6,0,0,0,0,0,1,0,0,0,0,r
7,0,0,0,1,0,0,0,0,0,0,l
8,0,1,0,0,0,0,0,0,0,0,s
9,0,0,1,0,0,0,0,0,0,0,b


In [39]:
# Only the top 10 categories are inspected here, alongside the 'X1' column.
df.loc[:, lst_10]

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v
...,...,...,...,...,...,...,...,...,...,...,...
4204,0,1,0,0,0,0,0,0,0,0,s
4205,0,0,0,0,0,0,0,0,0,1,o
4206,0,0,0,0,1,0,0,0,0,0,v
4207,0,0,0,0,0,1,0,0,0,0,r
