## Encoding a categorical variable using Pandas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Location of the workbook. The original author's absolute path is kept as the
# default; set the STARK_DATA_PATH environment variable to load the file from
# somewhere else without editing the notebook.
import os
DATA_PATH = os.environ.get('STARK_DATA_PATH', 'C:\\Users\\Parul\\Desktop\\DATASETS\\stark_data.xlsx')
dataset = pd.read_excel(DATA_PATH)

In [4]:
# Preview the first rows to check that the sheet was parsed as expected.
dataset.head()

Unnamed: 0,Character,Age,Gender,Survived
0,Eddard,52.0,Male,0
1,John,27.0,Male,1
2,Arya,21.0,Female,1
3,Bran,19.0,Male,1
4,Robb,,Male,0


In [5]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(6, 4)

In [6]:
# Pull the third column (Gender) out of the frame as a plain NumPy array.
gender_position = 2
x_gender = dataset.iloc[:, gender_position].values
x_gender

array(['Male', 'Male', 'Female', 'Male', 'Male', 'Female'], dtype=object)

In [7]:
# One-hot encode the gender labels: one indicator column per category.
# dtype is pinned to uint8 (the historical default) because pandas >= 2.0
# returns boolean columns by default, which would show True/False instead of
# the 0/1 indicators expected here.
x_dummy1 = pd.get_dummies(x_gender, dtype='uint8')

In [8]:
# Show both indicator columns (Female, Male).
x_dummy1

Unnamed: 0,Female,Male
0,0,1
1,0,1
2,1,0
3,0,1
4,0,1
5,1,0


In [9]:
# Same encoding, but drop_first=True removes the first category's column
# (Female), leaving a single Male indicator that carries the same information.
# dtype is pinned to uint8 (the historical default) because pandas >= 2.0
# returns boolean columns by default instead of 0/1 indicators.
x_dummy2 = pd.get_dummies(x_gender, drop_first=True, dtype='uint8')

In [10]:
# Show the single remaining indicator column (Male).
x_dummy2

Unnamed: 0,Male
0,1
1,1
2,0
3,1
4,1
5,0


## Encoding a categorical variable using scikit-learn

In [14]:
#import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [15]:
# Reload the workbook so this section starts from the raw data. The original
# author's absolute path is kept as the default; set the STARK_DATA_PATH
# environment variable to load the file from somewhere else without editing
# the notebook.
import os
DATA_PATH = os.environ.get('STARK_DATA_PATH', 'C:\\Users\\Parul\\Desktop\\DATASETS\\stark_data.xlsx')
dataset = pd.read_excel(DATA_PATH)

In [16]:
# Preview the first rows of the freshly loaded frame.
dataset.head()

Unnamed: 0,Character,Age,Gender,Survived
0,Eddard,52.0,Male,0
1,John,27.0,Male,1
2,Arya,21.0,Female,1
3,Bran,19.0,Male,1
4,Robb,,Male,0


In [17]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(6, 4)

In [18]:
# Feature matrix: every column except the last one (Survived), as a NumPy array.
feature_columns = dataset.columns[:-1]
x = dataset[feature_columns].values
x

array([['Eddard', 52.0, 'Male'],
       ['John', 27.0, 'Male'],
       ['Arya', 21.0, 'Female'],
       ['Bran', 19.0, 'Male'],
       ['Robb', nan, 'Male'],
       ['Sansa', 24.0, 'Female']], dtype=object)

In [20]:
# Replace the missing Age (column 1) with the mean of the observed ages.
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement. It always
# works column-wise, which is what the old `axis=0` argument requested.
import numpy as np
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# `x` is an object array, so hand the imputer an explicit float copy of the column.
age_column = x[:, 1:2].astype(float)
imputer.fit(age_column)
x[:, 1:2] = imputer.transform(age_column)
x[:, 1:2]



array([[52.0],
       [27.0],
       [21.0],
       [19.0],
       [28.6],
       [24.0]], dtype=object)

In [21]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Turn the gender strings in column 2 into integer codes
# (learn the classes first, then map the column onto them).
labelencoder_x = LabelEncoder()
labelencoder_x.fit(x[:, 2])
x[:, 2] = labelencoder_x.transform(x[:, 2])
x[:, 2]

array([1, 1, 0, 1, 1, 0], dtype=object)

In [22]:
# Turn the character names in column 0 into integer codes.
# NOTE: this re-fits the same `labelencoder_x` object, so the classes it
# learned for the gender column are replaced by the name classes.
name_codes = labelencoder_x.fit_transform(x[:, 0])
x[:, 0] = name_codes
x[:, 0]

array([2, 3, 0, 1, 4, 5], dtype=object)

In [23]:
# Feature matrix after imputation and label encoding: [name code, age, gender code].
x

array([[2, 52.0, 1],
       [3, 27.0, 1],
       [0, 21.0, 0],
       [1, 19.0, 1],
       [4, 28.6, 1],
       [5, 24.0, 0]], dtype=object)

In [24]:
# One-hot encode the name codes in column 0 and keep the other columns as-is.
# The `categorical_features` argument of OneHotEncoder was deprecated in
# scikit-learn 0.20 and removed in 0.22, so the encoder is applied to the name
# column only and the result is stitched back together by hand. The layout
# matches what `categorical_features=[0]` produced: indicator columns first,
# untouched columns after them.
import numpy as np
onehotencoder = OneHotEncoder()
# The encoder expects a 2-D input, hence the `[0]` list index; the codes are
# stored in an object array, so cast them to int before encoding.
name_indicators = onehotencoder.fit_transform(x[:, [0]].astype(int)).toarray()
x = np.hstack([name_indicators, x[:, 1:].astype(float)])
x

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[ 0. ,  0. ,  1. ,  0. ,  0. ,  0. , 52. ,  1. ],
       [ 0. ,  0. ,  0. ,  1. ,  0. ,  0. , 27. ,  1. ],
       [ 1. ,  0. ,  0. ,  0. ,  0. ,  0. , 21. ,  0. ],
       [ 0. ,  1. ,  0. ,  0. ,  0. ,  0. , 19. ,  1. ],
       [ 0. ,  0. ,  0. ,  0. ,  1. ,  0. , 28.6,  1. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  1. , 24. ,  0. ]])

In [25]:
# Show the matrix without its first indicator column (one level of a one-hot
# group is redundant — presumably dropped here to avoid the dummy-variable trap).
# NOTE(review): the slice is only displayed; `x` itself still has all 8 columns.
x[:, 1:]

array([[ 0. ,  1. ,  0. ,  0. ,  0. , 52. ,  1. ],
       [ 0. ,  0. ,  1. ,  0. ,  0. , 27. ,  1. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. , 21. ,  0. ],
       [ 1. ,  0. ,  0. ,  0. ,  0. , 19. ,  1. ],
       [ 0. ,  0. ,  0. ,  1. ,  0. , 28.6,  1. ],
       [ 0. ,  0. ,  0. ,  0. ,  1. , 24. ,  0. ]])