# One Hot Encoding
- One Hot Encoding converts a categorical column into a set of binary indicator columns, one per category, so that machine learning models that require numeric input can use categorical variables.

In [8]:
import numpy as np
import pandas as pd



In [9]:
# Load the used-car dataset from a CSV file in the current working directory.
df = pd.read_csv('cars.csv')

In [10]:
# Peek at five random rows. No random_state is passed, so a different set of
# rows is shown every time the cell runs.
df.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
4112,Tata,120000,Diesel,Fourth & Above Owner,95000
1667,Tata,88000,Diesel,Second Owner,110000
5761,Chevrolet,50000,Diesel,First Owner,295000
6668,Tata,120000,Diesel,First Owner,150000
7040,Maruti,50000,CNG,First Owner,375000


In [11]:
# Frequency of each category in the 'owner' column, most common first.
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [12]:
# One-hot encode 'fuel' and 'owner' with pandas: each category becomes its own
# boolean column named '<column>_<category>'; the other columns are kept as-is.
pd.get_dummies(df,columns = ['fuel','owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


In [13]:
# Same encoding, but drop_first=True omits the first category of each encoded
# column (here fuel_CNG and owner_First Owner), leaving k-1 indicator columns
# per feature so the indicators are not perfectly collinear.
pd.get_dummies(df,columns = ['fuel','owner'],drop_first = True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing (random_state fixed for reproducibility).
# Features and target are selected by column name rather than by position, so
# the split does not silently change if the CSV's column order ever does.
x_train, x_test, y_train, y_test = train_test_split(
    df[['brand', 'km_driven', 'fuel', 'owner']],
    df['selling_price'],
    test_size=0.2,
    random_state=2,
)

In [15]:
# Show the first five rows of the training features.
x_train.head()


Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [2]:
from sklearn.preprocessing import OneHotEncoder

In [42]:


# Encoder configuration:
#   drop='first'        -> omit the first category of every encoded feature
#   sparse_output=False -> return a dense NumPy array instead of a sparse matrix
#   dtype=np.int32      -> emit the indicators as 32-bit integers (0/1)
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
)


In [43]:
# Fit the encoder on the training split's 'fuel' and 'owner' columns and
# encode that split in the same call.
x_train_new = ohe.fit_transform(x_train[['fuel','owner']])

In [44]:
# Encode the test split with the encoder that was already fitted on the
# training split. Use transform(), not fit_transform(): re-fitting here would
# overwrite the categories learned from the training data and could give the
# test matrix a different column layout than the training matrix.
x_test_new = ohe.transform(x_test[['fuel','owner']])

In [46]:
# Display the encoded training matrix (NumPy abbreviates large arrays).
x_train_new


array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], shape=(6502, 7), dtype=int32)

In [47]:
# Place the untouched columns and the encoded columns side by side.
# 'brand' holds strings, so the combined array has dtype=object.
np.concatenate((x_train[['brand', 'km_driven']].values, x_train_new), axis=1)

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], shape=(6502, 9), dtype=object)

In [40]:
# Shape check: rows of the training split x (2 untouched + encoded columns).
np.concatenate((x_train[['brand', 'km_driven']].values, x_train_new), axis=1).shape

(6502, 9)

# One Hot Encoding with Top Categories

In [51]:
# Number of rows per brand, used to decide which brands count as rare.
counts = df['brand'].value_counts()

In [52]:
# Brands that occur this many times or fewer are treated as rare.
# (The spelling of the name is kept because the following cell reads it.)
thresshold = 100

# Number of distinct brands. This is now the cell's last expression, so the
# value is actually displayed instead of being computed and thrown away.
df['brand'].nunique()


In [53]:
# Labels of the brands whose row count does not exceed the threshold.
repl = counts.loc[counts <= thresshold].index

In [56]:
# Replace every brand listed in repl with the single label 'uncommon', then
# one-hot encode: one column per remaining brand plus one 'uncommon' column.
pd.get_dummies(df['brand'].replace(repl,'uncommon'))

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,True,False,False,False


In [58]:
# Same encoding as above, showing five random rows (unseeded, so the rows
# differ between runs).
pd.get_dummies(df['brand'].replace(repl,'uncommon')).sample(5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
2460,False,False,False,False,False,False,True,False,False,False,False,False,False
7505,False,False,False,False,False,True,False,False,False,False,False,False,False
4534,False,False,False,False,False,False,False,False,False,False,False,True,False
3013,False,False,False,False,True,False,False,False,False,False,False,False,False
6518,False,False,False,False,False,False,False,False,False,True,False,False,False
