In [1]:
import pandas as pd
import numpy as np

### Method 1: Using numpy where function:

This method is useful when we have only two categorical labels. eg: yes /no

In [2]:
Status = ['Yes', 'No', 'No', 'Yes','No', 'No','Yes','Yes']
df_1 = pd.DataFrame(Status, columns=['Status'])
df_1

Unnamed: 0,Status
0,Yes
1,No
2,No
3,Yes
4,No
5,No
6,Yes
7,Yes


In [3]:
df_1['Status'] = np.where(df_1['Status']=='Yes', 1,0)
df_1['Status']

0    1
1    0
2    0
3    1
4    0
5    0
6    1
7    1
Name: Status, dtype: int32

### Encoding when there are multiple labels are present in a single feature.

In [4]:
#Import data set
pd.set_option('display.max_columns',None)

In [5]:
df = pd.read_csv("mercedes.csv", usecols=['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'])

In [6]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n


In [7]:
for item in df.columns:
    print(item,':', len(df[item].unique()))

X0 : 47
X1 : 27
X2 : 44
X3 : 7
X4 : 4
X5 : 29
X6 : 12
X8 : 25


Each features contains muliple labels.

### Method 1: Using map function

In [8]:
df['X3'].unique()

array(['a', 'e', 'c', 'f', 'd', 'b', 'g'], dtype=object)

In [9]:
to_dict = {'a':1, 'e':2, 'c':3, 'f':4, 'd':5, 'b':6, 'g':7}

In [10]:
df['X3'] = df['X3'].map(to_dict)

In [11]:
df['X3'].head(10)

0    1
1    2
2    3
3    4
4    4
5    3
6    4
7    4
8    2
9    3
Name: X3, dtype: int64

### Method 2: pandas get_dummies method

In [12]:
df = pd.get_dummies(df,columns=['X4'], drop_first=True)

In [13]:
df

Unnamed: 0,X0,X1,X2,X3,X5,X6,X8,X4_b,X4_c,X4_d
0,k,v,at,1,u,j,o,0,0,1
1,k,t,av,2,y,l,o,0,0,1
2,az,w,n,3,x,j,x,0,0,1
3,az,t,n,4,x,l,e,0,0,1
4,az,v,n,4,h,d,n,0,0,1
...,...,...,...,...,...,...,...,...,...,...
4204,ak,s,as,3,aa,d,q,0,0,1
4205,j,o,t,5,aa,h,h,0,0,1
4206,ak,v,r,1,aa,g,e,0,0,1
4207,al,r,e,4,aa,l,u,0,0,1


### Method 3:Using sci-kit learn library approach

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
#creating instance of onehotencoder

#enc = OneHotEncoder(handle_unknown='ignore') #handle_unknown='ignore' is optinal

In [16]:
#df['X6'] = enc.fit_transform()
#df

In [17]:
#df_stat = pd.DataFrame(enc.fit_transform(df_stat[['Stat']]).toarray())
#df_stat

In [18]:
enc = OneHotEncoder(handle_unknown='ignore')

In [19]:
enc_df = pd.DataFrame(enc.fit_transform(df[['X6']]).toarray()).add_prefix('X6_')

In [20]:
#From the above line it will not print the column name of X6, 
#instead it will be printed as 0,1,2,3,...11.
#Therefore to give clear information about column which is encoded use "add_prefix".
enc_df = enc_df.add_prefix('X6_')

In [21]:
# merge with main df bridge_df on key values
df = df.join(enc_df)
df

Unnamed: 0,X0,X1,X2,X3,X5,X6,X8,X4_b,X4_c,X4_d,X6_X6_0,X6_X6_1,X6_X6_2,X6_X6_3,X6_X6_4,X6_X6_5,X6_X6_6,X6_X6_7,X6_X6_8,X6_X6_9,X6_X6_10,X6_X6_11
0,k,v,at,1,u,j,o,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,k,t,av,2,y,l,o,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,az,w,n,3,x,j,x,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,az,t,n,4,x,l,e,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,az,v,n,4,h,d,n,0,0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,ak,s,as,3,aa,d,q,0,0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4205,j,o,t,5,aa,h,h,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4206,ak,v,r,1,aa,g,e,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4207,al,r,e,4,aa,l,u,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
