# <font color = 'red'><b>Encoding</b></font>

In [2]:
# Encoding Categorical variables
# Encoding is the process of converting categorical data into a numerical format.
# Categorical data is data that represents categories. For example
# a "color" variable may contain a list of colors: red, blue, green, etc.

# Categorical data is often represented as strings, but can also be encoded as numbers.
# This is useful as it allows the data to be used in machine learning algorithms.
# There are many ways to encode categorical data, such as one-hot encoding, label encoding, and target encoding.

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('customer.csv')

In [4]:
df = data.iloc[:, 2:]
df.sample(5)

Unnamed: 0,review,education,purchased
28,Poor,School,No
1,Poor,UG,No
44,Average,UG,No
13,Average,School,No
0,Average,School,No


In [8]:
df.purchased.value_counts()

purchased
No     26
Yes    24
Name: count, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X = df[['review', 'education']]
y = df[['purchased']]

In [13]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

In [16]:
oe = OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']])

In [17]:
oe.fit(Xtrain)

In [18]:
Xtrain = oe.transform(Xtrain)
Xtest = oe.transform(Xtest)

In [20]:
le = LabelEncoder()

In [21]:
le.fit(ytrain)

  y = column_or_1d(y, warn=True)


In [22]:
ytrain = le.transform(ytrain)
ytest = le.transform(ytest)

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [23]:
ytrain

array([0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0])

---

# <font color = 'red'><b>One-Hot Encoding</b></font>

One-hot encoding creates a binary column for each category and assigns a 1 to the column corresponding to the category of a data point and 0 to all others. For the same categories "red," "green," and "blue," one-hot encoding would create three columns: red, green, and blue, with a row having a 1 in the column of its category.


If there are `n` categories in the column then we have to keep only the `n-1` columns, because of multicolinearity (mathematical relationship between columns, so we should avoid this) we remove one column generally the first one.

And still we can represent the `n` categories with `n-1` columns.

In [22]:
import numpy as np
import pandas as pd

In [23]:
cars = pd.read_csv('cars.csv')

In [24]:
cars.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
6088,Datsun,80000,Petrol,Second Owner,329000
4765,Ford,64000,Diesel,First Owner,1275000
7606,Mahindra,69000,Diesel,First Owner,780000
5217,Tata,25000,Petrol,First Owner,400000
1647,Maruti,110000,Diesel,First Owner,470000


# One-Hot-Encoding with pandas

In [25]:
pd.get_dummies(cars, columns=['fuel', 'owner'], dtype='int', drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


Although we can perform one hot encoding with pandas but we dont usually use it in model creation, why ? Because pandas dont remember anything about the encoding means it doesnt remeber where it places which column, so if we do the same process again we may get different output.

So we prefered to use One-Hot-Encoding from scikit-learn which remember the position of columns after encoding.

In [27]:
from sklearn.preprocessing import OneHotEncoder

In [70]:
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=int)

In [71]:
X_categorical = ['fuel', 'owner']
encoded_data = ohe.fit_transform(cars[X_categorical])
encoded_data.shape

(8128, 7)

In [72]:
encoded_feature_names = ohe.get_feature_names_out(['fuel', 'owner'])
encoded_feature_names

array(['fuel_Diesel', 'fuel_LPG', 'fuel_Petrol',
       'owner_Fourth & Above Owner', 'owner_Second Owner',
       'owner_Test Drive Car', 'owner_Third Owner'], dtype=object)

In [73]:
cars_encoded = pd.DataFrame(encoded_data, columns=encoded_feature_names)
cars_encoded

Unnamed: 0,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,1,0,0,0,0,0,0
1,1,0,0,0,1,0,0
2,0,0,1,0,0,0,1
3,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
8123,0,0,1,0,0,0,0
8124,1,0,0,1,0,0,0
8125,1,0,0,0,0,0,0
8126,1,0,0,0,0,0,0


In [75]:
df = pd.concat([cars.drop(columns = X_categorical), cars_encoded], axis = 1)

In [77]:
df.shape

(8128, 10)

In [78]:
df.sample(5)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
2541,Mahindra,30000,1250000,1,0,0,0,0,0,0
7873,BMW,8500,5500000,1,0,0,0,0,0,0
4532,Maruti,65000,260000,0,0,1,0,1,0,0
6329,Ford,163000,200000,1,0,0,0,0,0,0
2968,Renault,20000,275000,0,0,1,0,0,0,0


---