In [1]:
# One-Hot Encoding is a method to convert categorical (nominal) data into a binary matrix (0s and 1s), where:
# Each category gets its own column (feature).
# 1 is placed in the column corresponding to that category, and 0s everywhere else.

# Example:
# Suppose you have a column Color:
# Color:
# Red
# Blue
# Green
# Red

# After One-Hot Encoding:
# | Red | Blue | Green |
# | --- | ---- | ----- |
# | 1   | 0    | 0     |
# | 0   | 1    | 0     |
# | 0   | 0    | 1     |
# | 1   | 0    | 0     |

In [3]:
# 👉 One-hot encoding creates one column per category, so one column per feature is always redundant: it can be derived from the others (the "dummy variable trap").
# 👉 We use drop='first' to remove one column per feature and avoid multicollinearity.
# Multicollinearity means two or more independent variables (features) in a dataset are highly correlated, i.e., they carry duplicate information.
# If a feature has n categories, one column is dropped and n-1 columns remain.

In [3]:
import numpy as np
import pandas as pd

In [5]:
# Load the raw car listings from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer="cars.csv")

In [7]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [9]:
# Keep only the two categorical features and the target (drops brand and km_driven).
# Selecting by name instead of by position (df.iloc[:, 2:]) makes this cell
# idempotent: re-running a positional slice on the already-trimmed frame would
# silently drop further columns.
df = df[["fuel", "owner", "selling_price"]]

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :2],  # first two columns -> features
    df.iloc[:, 2:],  # remaining column(s) -> target, kept as a DataFrame
    test_size=0.2,
    random_state=0,
)

In [13]:
# (rows, columns) of the training features produced by the split on df.
x_train.shape

(6502, 2)

In [15]:
# (rows, columns) of the held-out test features produced by the split on df.
x_test.shape

(1626, 2)

In [17]:
from sklearn.preprocessing import OneHotEncoder

# pandas route: expand fuel and owner into one indicator column per category.
# Columns not listed in `columns` pass through unchanged.
pd.get_dummies(
    data=df,
    columns=["fuel", "owner"],
)

Unnamed: 0,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,450000,False,True,False,False,True,False,False,False,False
1,370000,False,True,False,False,False,False,True,False,False
2,158000,False,False,False,True,False,False,False,False,True
3,225000,False,True,False,False,True,False,False,False,False
4,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,320000,False,False,False,True,True,False,False,False,False
8124,135000,False,True,False,False,False,True,False,False,False
8125,382000,False,True,False,False,True,False,False,False,False
8126,290000,False,True,False,False,True,False,False,False,False


In [19]:
# Same encoding, but drop_first=True omits the first category of each feature
# (avoiding the dummy variable trap) and dtype=np.int32 emits 0/1 integers
# instead of booleans.
pd.get_dummies(
    data=df,
    columns=["fuel", "owner"],
    drop_first=True,
    dtype=np.int32,
)

Unnamed: 0,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,450000,1,0,0,0,0,0,0
1,370000,1,0,0,0,1,0,0
2,158000,0,0,1,0,0,0,1
3,225000,1,0,0,0,0,0,0
4,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
8123,320000,0,0,1,0,0,0,0
8124,135000,1,0,0,1,0,0,0
8125,382000,1,0,0,0,0,0,0
8126,290000,1,0,0,0,0,0,0


In [21]:
from sklearn.preprocessing import OneHotEncoder

# Reload the raw CSV into a separate frame for the scikit-learn encoder demo.
df1 = pd.read_csv(filepath_or_buffer="cars.csv")

In [23]:
# Preview the first five rows of the freshly loaded frame.
df1.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [25]:
# Keep only the two categorical features and the target (drops brand and km_driven).
# Selecting by name instead of by position (df1.iloc[:, 2:]) makes this cell
# idempotent: re-running a positional slice on the already-trimmed frame would
# silently drop further columns.
df1 = df1[["fuel", "owner", "selling_price"]]

In [27]:
# Confirm that only fuel, owner and selling_price remain.
df1.head(n=5)

Unnamed: 0,fuel,owner,selling_price
0,Diesel,First Owner,450000
1,Diesel,Second Owner,370000
2,Petrol,Third Owner,158000
3,Diesel,First Owner,225000
4,Petrol,First Owner,130000


In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of df1 for testing; random_state pins the shuffle so the split
# is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df1.iloc[:, :2],  # first two columns -> features
    df1.iloc[:, 2:],  # remaining column(s) -> target, kept as a DataFrame
    test_size=0.2,
    random_state=0,
)


In [31]:
# (rows, columns) of the training features produced by the split on df1.
x_train.shape

(6502, 2)

In [33]:
# (rows, columns) of the held-out test features produced by the split on df1.
x_test.shape

(1626, 2)

In [41]:
from sklearn.preprocessing import OneHotEncoder

# scikit-learn route to the same encoding:
#   drop="first"        -> omit the first category of each feature (n-1 columns per feature)
#   sparse_output=False -> return a dense NumPy array rather than a sparse matrix
#   dtype=np.int32      -> emit 0/1 integers
ohe = OneHotEncoder(drop="first", sparse_output=False, dtype=np.int32)

# NOTE(review): the encoder is fitted on every row of df1, not only on x_train.
# That is fine for demonstrating the transform, but in a modelling pipeline it
# should be fitted on the training split and then applied to the test split.
ohe_encoded = ohe.fit_transform(df1[["fuel", "owner"]])

# Carry df1's index over so the encoded frame stays row-aligned with df1 even if
# df1 has been filtered or re-indexed; without it pandas assigns a fresh 0..n-1
# index and a later join/concat against df1 would silently misalign rows.
ohe_encoded_df = pd.DataFrame(
    ohe_encoded,
    columns=ohe.get_feature_names_out(["fuel", "owner"]),
    index=df1.index,
)

In [45]:
# Display the encoded frame built from the OneHotEncoder output.
ohe_encoded_df

Unnamed: 0,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,1,0,0,0,0,0,0
1,1,0,0,0,1,0,0
2,0,0,1,0,0,0,1
3,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
8123,0,0,1,0,0,0,0
8124,1,0,0,1,0,0,0
8125,1,0,0,0,0,0,0
8126,1,0,0,0,0,0,0
