In [1]:
import numpy as np
import pandas as pd

In [2]:
# Raw GitHub URL of the cars CSV used throughout this notebook.
url = 'https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day27-one-hot-encoding/cars.csv'

# Read the CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


### One Hot Encoding using Pandas

In [4]:
# One-hot encode 'fuel' and 'owner': each category becomes its own 0/1 (int64)
# column, while the remaining columns of df are kept as they are.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    dtype='int64',
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


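As shown above, one-hot encoding a nominal categorical column with k categories creates k new columns. Because exactly one of those k columns is 1 in every row, the columns always sum to 1, so each one can be computed from the others; this linear dependence introduces multicollinearity (the "dummy variable trap"). To avoid this problem, we drop one column per feature and keep only k-1 columns (k-1 encoding, `drop_first=True`).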

In [6]:
# k-1 encoding: same as plain one-hot encoding, but drop_first=True removes the
# first category's column for each encoded feature.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
    dtype='int64',
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


### One Hot Encoding using scikit-learn

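In ML projects, one-hot encoding is generally done with scikit-learn, because `pd.get_dummies` does not remember which categories (and in which column order) it encoded, so it cannot guarantee the same columns when applied to new data. A fitted `OneHotEncoder` stores the categories it learned and reuses them every time `transform` is called.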

In [7]:
from sklearn.model_selection import train_test_split

# Every column except the last one goes into X; the last column goes into y.
features = df.iloc[:, :-1]
target = df.iloc[:, -1]

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

In [13]:
from sklearn.preprocessing import OneHotEncoder

# Columns to encode, named once so the train and test selections cannot drift apart.
nominal_cols = ['fuel', 'owner']

# drop='first' turns k categories into k-1 columns; sparse_output=False returns a
# dense array instead of a sparse matrix; dtype sets the type of the 0/1 values.
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int64,
)

# Fit on the training rows only, then apply the already-fitted encoder to the test rows.
X_train_new = ohe.fit_transform(X_train[nominal_cols])
X_test_new = ohe.transform(X_test[nominal_cols])

In [17]:
# Take the 'brand' and 'km_driven' columns of X_train as a NumPy array and append
# the encoded columns (X_train_new) to their right, i.e. join along the column axis.
passthrough = X_train[['brand', 'km_driven']].values
np.concatenate((passthrough, X_train_new), axis=1)

array([['Maruti', 120000, 0, ..., 0, 0, 1],
       ['Toyota', 100000, 1, ..., 0, 0, 0],
       ['BMW', 39000, 1, ..., 0, 0, 0],
       ...,
       ['Hyundai', 35000, 0, ..., 0, 0, 0],
       ['Maruti', 27000, 1, ..., 0, 0, 0],
       ['Maruti', 70000, 0, ..., 1, 0, 0]], dtype=object)

One-hot encoding when a few categories are very frequent and the remaining ones are rare: in this case, we group the rare categories into a single 'others' or 'uncommon' category before encoding.

In [19]:
# Number of rows for each brand.
counts = df['brand'].value_counts()
# A brand with at most this many rows is treated as rare.
threshold = 100

# Labels of the rare brands.
repl = counts[counts <= threshold].index

# Replace every rare brand with the single label 'uncommon', one-hot encode the
# result, and preview five rows. random_state pins the sample so a fresh
# "Restart & Run All" shows the same rows instead of a different draw each run.
pd.get_dummies(df['brand'].replace(repl, 'uncommon'), dtype=np.int32).sample(5, random_state=42)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
5544,0,0,0,1,0,0,0,0,0,0,0,0,0
6861,0,0,0,0,0,1,0,0,0,0,0,0,0
2756,0,0,0,0,0,0,1,0,0,0,0,0,0
1078,0,0,0,0,0,0,0,0,0,0,0,0,1
7345,0,0,0,0,0,1,0,0,0,0,0,0,0
