<a href="https://colab.research.google.com/github/rajapramod/machine-learning/blob/main/OneHotEncoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [5]:
# Read the cars dataset from the GitHub-hosted CSV and preview the first rows.
DATA_URL = r"https://raw.githubusercontent.com/rajapramod/machine-learning/refs/heads/main/datasets/cars.csv"
data = pd.read_csv(DATA_URL)
data.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [6]:
# How many rows fall into each `owner` category.
data["owner"].value_counts()

Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,5289
Second Owner,2105
Third Owner,555
Fourth & Above Owner,174
Test Drive Car,5


In [12]:
# How many rows fall into each `fuel` category.
data["fuel"].value_counts()

Unnamed: 0_level_0,count
fuel,Unnamed: 1_level_1
Diesel,4402
Petrol,3631
CNG,57
LPG,38


### OneHotEncoding using `pandas`

For a single categorical variable, `pd.get_dummies()` creates one indicator column per category, i.e. as many new columns as the variable has distinct categories.

In [8]:
# Expand `fuel` and `owner` into one indicator column per category; the
# remaining columns are carried over as they are.
data_encoded = pd.get_dummies(data=data, columns=["fuel", "owner"])
data_encoded

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


In [11]:
# Compare dimensions before and after encoding.
tuple(frame.shape for frame in (data, data_encoded))

((8128, 5), (8128, 12))


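### $(k-1)$ OneHotEncoding

Passing `drop_first=True` drops the first category of each encoded variable, so a variable with $k$ categories is represented by $k-1$ columns.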

In [14]:
# Same encoding as before, but `drop_first=True` omits the first category of
# each encoded variable.
new_data_encoded = pd.get_dummies(
    data=data,
    columns=["fuel", "owner"],
    drop_first=True,
)
new_data_encoded

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


In [15]:
# Dimensions of the raw frame, the full encoding, and the drop-first encoding.
tuple(frame.shape for frame in (data, data_encoded, new_data_encoded))

((8128, 5), (8128, 12), (8128, 10))

### OneHotEncoding using `sklearn`

In [20]:
# Hold out 20% of the rows for testing. The first four columns
# (brand, km_driven, fuel, owner) are the features; the last column
# (selling_price) is the target.
X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[:, :4],
    data.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)

In [21]:
# Preview the training features.
X_train.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560,Petrol,First Owner
6144,Honda,80000,Petrol,Second Owner
6381,Hyundai,150000,Diesel,Fourth & Above Owner
438,Maruti,120000,Diesel,Second Owner
5939,Maruti,25000,Petrol,First Owner


In [31]:
# Learn the categories from the training split only, then apply the same
# fitted encoder to the test split.
encoded_cols = ['fuel', 'owner']
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
X_train_new = ohe.fit_transform(X_train[encoded_cols])
X_test_new = ohe.transform(X_test[encoded_cols])

In [32]:
# X_train_new holds only the encoded `fuel` and `owner` columns.
(X_train.shape, X_train_new.shape)

((6502, 4), (6502, 7))

### OneHotEncoding with top categories

In [41]:
# Number of rows per brand.
data.loc[:, 'brand'].value_counts()

Unnamed: 0_level_0,count
brand,Unnamed: 1_level_1
Maruti,2448
Hyundai,1415
Mahindra,772
Tata,734
Toyota,488
Honda,467
Ford,397
Chevrolet,230
Renault,228
Volkswagen,186


In [48]:
# Keep the per-brand frequencies so rare brands can be filtered by count.
counts = data.loc[:, 'brand'].value_counts()
counts

Unnamed: 0_level_0,count
brand,Unnamed: 1_level_1
Maruti,2448
Hyundai,1415
Mahindra,772
Tata,734
Toyota,488
Honda,467
Ford,397
Chevrolet,230
Renault,228
Volkswagen,186


In [55]:
threshold = 100

# Brands that occur at most `threshold` times. The index of the filtered
# counts holds the brand names themselves.
counts.loc[counts.le(threshold)].index

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [56]:
# Names of the brands that occur at most `threshold` times.
replace_ = counts.loc[counts.le(threshold)].index

In [57]:
# Collapse every rare brand (those listed in `replace_`) into a single
# "uncommon" bucket before encoding, so they share one indicator column
# instead of getting one column each.
brand_grouped = data['brand'].replace(replace_, 'uncommon')

# A fixed random_state keeps the displayed sample identical across re-runs.
pd.get_dummies(brand_grouped).sample(5, random_state=42)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
2687,False,False,False,False,True,False,False,False,False,False,False,False,False
3692,False,False,False,False,True,False,False,False,False,False,False,False,False
784,False,False,False,False,False,True,False,False,False,False,False,False,False
5580,False,False,False,False,True,False,False,False,False,False,False,False,False
5991,False,False,False,False,True,False,False,False,False,False,False,False,False
