# Handling Nominal Categorical Data using One Hot Encoding

In [1]:
# We need to convert categorical data from strings to numbers, because most ML models do not accept strings and a lot of real-world data is stored as strings.

In [2]:
# Nominal data -> the categories have no inherent order.
# E.g. -> Male, Female
# Encoded with the help of One-Hot Encoding.


# Ordinal data -> the categories have a meaningful order.
# E.g. -> Poor, Average, Good
# LabelEncoder is used for the output (target) column, and OrdinalEncoder for the input (feature) columns.

In [3]:
import numpy as np
import pandas as pd

In [8]:
# Small hand-made used-car dataset: 49 rows, five columns.
# 'brand', 'fuel' and 'owner' are categorical strings; 'km_driven' and
# 'selling_price' are integers.
df = pd.DataFrame(
    data=dict(
        brand=[
            'Toyota', 'Hyundai', 'Ford', 'Chevrolet', 'Nissan', 'Kia', 'Renault', 'Volkswagen', 'BMW', 'Audi',
            'Mercedes', 'Tata', 'Mahindra', 'Honda', 'Ford', 'Toyota', 'Hyundai', 'Kia', 'Maruti', 'Honda',
            'Skoda', 'Hyundai', 'Volkswagen', 'Chevrolet', 'Ford', 'Honda', 'Toyota', 'Hyundai', 'Kia', 'Nissan',
            'Maruti', 'Hyundai', 'Honda', 'Ford', 'Toyota', 'Skoda', 'Maruti', 'Hyundai', 'Chevrolet', 'Nissan',
            'Renault', 'Volkswagen', 'Audi', 'BMW', 'Mercedes', 'Mahindra', 'Tata', 'Kia', 'Maruti',
        ],
        km_driven=[
            135000, 95000, 80000, 125000, 160000, 105000, 85000, 120000, 60000, 75000,
            90000, 130000, 110000, 145000, 80000, 140000, 118000, 99000, 123000, 137000,
            87000, 96000, 134000, 102000, 107000, 95000, 87000, 115000, 108000, 143000,
            150000, 98000, 105000, 125000, 133000, 118000, 104000, 111000, 150000, 99000,
            130000, 112000, 90000, 95000, 105000, 135000, 120000, 98000, 102000,
        ],
        fuel=[
            'Diesel', 'Petrol', 'Petrol', 'Diesel', 'Petrol', 'Diesel', 'Diesel', 'Petrol', 'Petrol', 'Diesel',
            'Diesel', 'Diesel', 'Diesel', 'Petrol', 'Diesel', 'Petrol', 'Petrol', 'Diesel', 'CNG', 'Petrol',
            'CNG', 'CNG', 'Diesel', 'Petrol', 'CNG', 'Petrol', 'Diesel', 'Diesel', 'Petrol', 'Petrol',
            'Diesel', 'Petrol', 'CNG', 'Petrol', 'Diesel', 'Diesel', 'CNG', 'Diesel', 'Petrol', 'Diesel',
            'CNG', 'Petrol', 'Petrol', 'Diesel', 'Diesel', 'Diesel', 'CNG', 'Diesel', 'Petrol',
        ],
        owner=[
            'frst', 'scnd', 'frst', 'thrd', 'scnd', 'frst', 'scnd', 'thrd', 'frst', 'scnd',
            'thrd', 'frst', 'scnd', 'frst', 'thrd', 'frst', 'scnd', 'thrd', 'frst', 'scnd',
            'thrd', 'frst', 'scnd', 'frst', 'thrd', 'frst', 'scnd', 'thrd', 'frst', 'scnd',
            'thrd', 'frst', 'scnd', 'thrd', 'frst', 'scnd', 'thrd', 'frst', 'scnd', 'thrd',
            'frst', 'scnd', 'thrd', 'frst', 'scnd', 'thrd', 'frst', 'scnd', 'thrd',
        ],
        selling_price=[
            560000, 450000, 330000, 200000, 350000, 280000, 320000, 290000, 800000, 720000,
            850000, 150000, 280000, 180000, 230000, 500000, 470000, 390000, 310000, 300000,
            410000, 360000, 400000, 270000, 330000, 300000, 500000, 380000, 340000, 320000,
            450000, 230000, 280000, 270000, 380000, 340000, 260000, 290000, 310000, 250000,
            450000, 390000, 700000, 820000, 950000, 280000, 160000, 300000, 370000,
        ],
    )
)

In [10]:
# Peek at 5 randomly selected rows; no random_state is passed, so the rows shown change between runs.
df.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
8,BMW,60000,Petrol,frst,800000
43,BMW,95000,Diesel,frst,820000
11,Tata,130000,Diesel,frst,150000
37,Hyundai,111000,Diesel,frst,290000
12,Mahindra,110000,Diesel,scnd,280000


In [11]:
# How many cars there are of each brand, most frequent first.
df['brand'].value_counts()

brand
Hyundai       6
Toyota        4
Ford          4
Kia           4
Honda         4
Maruti        4
Chevrolet     3
Nissan        3
Volkswagen    3
Renault       2
BMW           2
Audi          2
Mercedes      2
Tata          2
Mahindra      2
Skoda         2
Name: count, dtype: int64

In [13]:
# Number of distinct brands in the dataset.
df['brand'].nunique()

16

In [14]:
# How many cars there are of each fuel type.
df['fuel'].value_counts()

fuel
Diesel    22
Petrol    19
CNG        8
Name: count, dtype: int64

In [15]:
# How many cars there are in each ownership category.
df['owner'].value_counts()

owner
frst    18
scnd    16
thrd    15
Name: count, dtype: int64

**One Hot Encoding using Pandas**

In [17]:
pd.get_dummies(df, columns = ['fuel', 'owner'])
# pd.get_dummies takes the DataFrame and the list of columns to one-hot encode;
# the columns that are not listed ('brand', 'km_driven', 'selling_price') are returned unchanged.

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_Petrol,owner_frst,owner_scnd,owner_thrd
0,Toyota,135000,560000,False,True,False,True,False,False
1,Hyundai,95000,450000,False,False,True,False,True,False
2,Ford,80000,330000,False,False,True,True,False,False
3,Chevrolet,125000,200000,False,True,False,False,False,True
4,Nissan,160000,350000,False,False,True,False,True,False
5,Kia,105000,280000,False,True,False,True,False,False
6,Renault,85000,320000,False,True,False,False,True,False
7,Volkswagen,120000,290000,False,False,True,False,False,True
8,BMW,60000,800000,False,False,True,True,False,False
9,Audi,75000,720000,False,True,False,False,True,False


In [21]:
# The original 'fuel' and 'owner' columns are replaced by their one-hot columns.
# Each new column is named columnName_Category, e.g. fuel_Diesel or owner_scnd.

In [22]:
# (rows, columns) of the original DataFrame; get_dummies returned a new frame and left df unchanged.
df.shape

(49, 5)

In [24]:
pd.get_dummies(df, columns = ['fuel', 'owner'], drop_first = True)
# drop_first=True drops one dummy column per encoded feature, which solves the
# multicollinearity problem (the dropped column is implied by the remaining ones).

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_Petrol,owner_scnd,owner_thrd
0,Toyota,135000,560000,True,False,False,False
1,Hyundai,95000,450000,False,True,True,False
2,Ford,80000,330000,False,True,False,False
3,Chevrolet,125000,200000,True,False,False,True
4,Nissan,160000,350000,False,True,True,False
5,Kia,105000,280000,True,False,False,False
6,Renault,85000,320000,True,False,True,False
7,Volkswagen,120000,290000,False,True,False,True
8,BMW,60000,800000,False,True,False,False
9,Audi,75000,720000,True,False,True,False


In [25]:
# With drop_first = True, the dummy column of the first category of each encoded feature is removed (here fuel_CNG and owner_frst).

In [26]:
# We usually do not use pd.get_dummies in ML, because pandas does not remember which categories (and which column positions) it produced, so new data may be encoded with different columns.

**ONE HOT ENCODING  by sklearn**

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    df.drop(columns='selling_price'),  # features: brand, km_driven, fuel, owner
    df['selling_price'],               # target
    test_size=0.2,
    random_state=42,
)

In [28]:
from sklearn.preprocessing import OneHotEncoder

In [29]:
# First five rows of the full DataFrame, for reference.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Toyota,135000,Diesel,frst,560000
1,Hyundai,95000,Petrol,scnd,450000
2,Ford,80000,Petrol,frst,330000
3,Chevrolet,125000,Diesel,thrd,200000
4,Nissan,160000,Petrol,scnd,350000


In [33]:
# First five rows of the training features (the target column is not included).
train_X.head()

Unnamed: 0,brand,km_driven,fuel,owner
12,Mahindra,110000,Diesel,scnd
4,Nissan,160000,Petrol,scnd
34,Toyota,133000,Diesel,frst
8,BMW,60000,Petrol,frst
3,Chevrolet,125000,Diesel,thrd


In [34]:
# First five rows of the held-out test features.
test_X.head()

Unnamed: 0,brand,km_driven,fuel,owner
13,Honda,145000,Petrol,frst
45,Mahindra,135000,Diesel,thrd
47,Kia,98000,Diesel,scnd
44,Mercedes,105000,Diesel,scnd
17,Kia,99000,Diesel,thrd


In [35]:
from sklearn.preprocessing import OneHotEncoder

In [36]:
# Encoder with default settings: it returns a sparse matrix and keeps one column per category.
ohe = OneHotEncoder()

In [39]:
# We need to encode only the 'fuel' and 'owner' columns.
# For that we pass only 'fuel' and 'owner' to the encoder and then join the encoded columns back to 'brand' and 'km_driven'.
# ColumnTransformer solves this problem easily.

In [40]:
# Learn the categories of 'fuel' and 'owner' from the training rows and encode them;
# the result is a sparse matrix with one column per category.
ohe.fit_transform(train_X[['fuel', 'owner']])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 78 stored elements and shape (39, 6)>

In [41]:
# Fit the encoder on the training rows and convert the sparse result to a dense NumPy array.
train_X_new = ohe.fit_transform(train_X[['fuel', 'owner']]).toarray()

In [42]:
# Encode the held-out rows with the encoder that was already fitted on the training data.
# Use transform (not fit_transform) and pass test_X (not train_X): re-fitting here would
# learn the categories again and simply reproduce the training encoding.
test_X_new = ohe.transform(test_X[['fuel', 'owner']]).toarray()

In [43]:
# Now we will remove the original 'fuel' and 'owner' columns from train_X and append the encoded train_X_new columns to the remaining ones.

In [44]:
# Display the dense one-hot array: one row per training sample, one column per category.
train_X_new

array([[0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 0., 1., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 1., 0.],
       [1., 0., 0., 1., 0., 0.],
       [0., 0., 1., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 0., 1., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 1., 0.],
       [1., 0., 0., 1., 0., 0.],
       [0., 0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1.],
       [0.

In [46]:
# The two columns that were not encoded, as a NumPy array.
train_X[['brand', 'km_driven']].values

array([['Mahindra', 110000],
       ['Nissan', 160000],
       ['Toyota', 133000],
       ['BMW', 60000],
       ['Chevrolet', 125000],
       ['Renault', 85000],
       ['Renault', 130000],
       ['Volkswagen', 112000],
       ['Tata', 120000],
       ['Toyota', 140000],
       ['Audi', 75000],
       ['Hyundai', 118000],
       ['Ford', 107000],
       ['Ford', 125000],
       ['Maruti', 150000],
       ['Toyota', 135000],
       ['BMW', 95000],
       ['Honda', 105000],
       ['Kia', 105000],
       ['Nissan', 143000],
       ['Tata', 130000],
       ['Maruti', 104000],
       ['Hyundai', 95000],
       ['Hyundai', 96000],
       ['Ford', 80000],
       ['Hyundai', 111000],
       ['Skoda', 118000],
       ['Chevrolet', 102000],
       ['Nissan', 99000],
       ['Mercedes', 90000],
       ['Volkswagen', 134000],
       ['Maruti', 123000],
       ['Maruti', 102000],
       ['Skoda', 87000],
       ['Volkswagen', 120000],
       ['Audi', 90000],
       ['Ford', 80000],
       ['Kia'

In [47]:
# Now we will be stacking both the np.arrays

In [48]:
# Place the untouched columns and the one-hot columns side by side (column-wise join).
np.concatenate(
    (train_X[['brand', 'km_driven']].to_numpy(), train_X_new),
    axis=1,
)

array([['Mahindra', 110000, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0],
       ['Nissan', 160000, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0],
       ['Toyota', 133000, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0],
       ['BMW', 60000, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0],
       ['Chevrolet', 125000, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
       ['Renault', 85000, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0],
       ['Renault', 130000, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
       ['Volkswagen', 112000, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0],
       ['Tata', 120000, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
       ['Toyota', 140000, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0],
       ['Audi', 75000, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0],
       ['Hyundai', 118000, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0],
       ['Ford', 107000, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0],
       ['Ford', 125000, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0],
       ['Maruti', 150000, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0],
       ['Toyota', 135000, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0],
       ['BMW', 95000, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0],
       ['Honda', 105000, 1.0, 0.0, 0.

In [50]:
# Shape check: 2 original columns + 6 one-hot columns for every training row.
np.concatenate(
    (train_X[['brand', 'km_driven']].to_numpy(), train_X_new),
    axis=1,
).shape

(39, 8)

In [51]:
# Appending has been done successfully

In [52]:
# Solving the multicollinearity problem
# ohe = OneHotEncoder(drop = 'first')

In [53]:
# ohe = OneHotEncoder(drop = 'first', sparse_output = False)  # `sparse_output` was named `sparse` before scikit-learn 1.2
# Then we do not need to call .toarray(); the output is already a dense np.array.

In [54]:
# Controlling the data type of the encoded output
# ohe = OneHotEncoder(drop = 'first', sparse_output = False, dtype = np.int32)

**ONE HOT ENCODING with Top Categories**

In [55]:
# Like Brand here

In [56]:
# Brand frequencies again: several brands occur only two or three times.
df['brand'].value_counts()

brand
Hyundai       6
Toyota        4
Ford          4
Kia           4
Honda         4
Maruti        4
Chevrolet     3
Nissan        3
Volkswagen    3
Renault       2
BMW           2
Audi          2
Mercedes      2
Tata          2
Mahindra      2
Skoda         2
Name: count, dtype: int64

In [57]:
# We will club together the brands that have fewer than 4 cars into a single category.

In [58]:
# Number of cars per brand, indexed by brand name.
counts = df['brand'].value_counts()

In [60]:
# Brands that occur this many times or fewer are treated as rare.
# (The former `df['brand'].nunique()` line was removed: it was not the last
# expression of the cell, so its value was neither displayed nor used.)
threshold = 3

In [63]:
# Names of the brands whose frequency does not exceed the threshold.
repl = counts.loc[counts.le(threshold)].index

In [64]:
# Every brand listed in `repl` is relabelled 'uncommon', then the result is one-hot encoded.
pd.get_dummies(df['brand'].mask(df['brand'].isin(repl), 'uncommon'))

Unnamed: 0,Ford,Honda,Hyundai,Kia,Maruti,Toyota,uncommon
0,False,False,False,False,False,True,False
1,False,False,True,False,False,False,False
2,True,False,False,False,False,False,False
3,False,False,False,False,False,False,True
4,False,False,False,False,False,False,True
5,False,False,False,True,False,False,False
6,False,False,False,False,False,False,True
7,False,False,False,False,False,False,True
8,False,False,False,False,False,False,True
9,False,False,False,False,False,False,True


In [65]:
# ColumnTransformer solves this problem easily.