In [1]:
# LabelEncoder in scikit-learn is used to convert categorical labels into numeric form, i.e., it encodes categorical values as integers.
# It is commonly used for output/target variables (like class labels in classification) because many ML models require numeric targets.

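# The labels it encodes are usually nominal categorical data—categories with no intrinsic order.
# Examples of nominal data: ["Red", "Blue", "Green"], ["Cat", "Dog", "Rabbit"], ["USA", "India", "UK"]
# It assigns each category a unique integer in sorted (alphabetical) order, not in order of appearance: Blue → 0, Green → 1, Red → 2.
# Because those integers imply an order that does not exist, avoid LabelEncoder for nominal input features (use OneHotEncoder for those).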

In [3]:
# Ordinal Encoding is a technique used to convert ordinal categorical data into numeric form.

# Key Points:
# Ordinal Data
# Categories have a natural order or ranking.

# Examples:
# ["Low", "Medium", "High"]
# ["Beginner", "Intermediate", "Advanced"]

In [154]:
import numpy as np
import pandas as pd

In [156]:
# Load the customer data; the relative path is resolved against the current working directory.
df = pd.read_csv("customer.csv")

In [158]:
# Show 5 randomly chosen rows; no random_state is passed, so the rows differ on every run.
df.sample(n=5)

Unnamed: 0,age,gender,review,education,purchased
10,98,Female,Good,UG,Yes
1,68,Female,Poor,UG,No
43,27,Male,Poor,PG,No
2,70,Female,Good,PG,No
25,57,Female,Good,School,No


In [11]:
# We apply LabelEncoder to the target column `purchased` and OrdinalEncoder to the ordinal categorical columns `review` and `education`.
# `gender` is nominal categorical (it would need one-hot encoding, which we are not applying here) and `age` is numeric (it needs no encoding),
# so both columns are dropped to keep the focus on ordinal and label encoding.

In [160]:
# Keep only the columns this notebook encodes. Selecting them by name (rather
# than by position with iloc[:, 2:]) makes the cell safe to re-run: a second
# run returns the same three columns instead of slicing two more away, and a
# missing column raises a KeyError instead of silently shifting the selection.
df = df[["review", "education", "purchased"]]

In [162]:
# Preview the first 10 rows of the reduced frame.
df.head(n=10)

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No
5,Average,School,Yes
6,Good,School,No
7,Poor,School,Yes
8,Average,UG,No
9,Good,UG,Yes


In [164]:
# Count the missing values in each column before encoding.
df.isna().sum()

review       0
education    0
purchased    0
dtype: int64

In [166]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :2],  # features: the first two columns
    df.iloc[:, 2],   # target: the third column
    test_size=0.2,
    random_state=0,
)

In [168]:
# Check the (rows, columns) of the training and test feature sets.
x_train.shape, x_test.shape

((40, 2), (10, 2))

# Ordinal Encoder

In [170]:
from sklearn.preprocessing import OrdinalEncoder

# List each column's categories from lowest to highest rank so the integer
# codes follow that ranking (position in the list becomes the code).
oe = OrdinalEncoder(
    categories=[
        ["Poor", "Average", "Good"],  # first feature column
        ["School", "UG", "PG"],       # second feature column
    ]
)
# Fit on the training features only, then replace them with their codes.
x_train = oe.fit_transform(x_train)


In [172]:
# Encoded training features: each column holds the position of the original
# value in the category list given to the encoder (e.g. Poor=0, Average=1, Good=2).
x_train

array([[2., 2.],
       [0., 0.],
       [0., 2.],
       [1., 0.],
       [2., 0.],
       [0., 0.],
       [0., 2.],
       [0., 2.],
       [2., 1.],
       [1., 1.],
       [0., 1.],
       [1., 1.],
       [1., 1.],
       [0., 1.],
       [2., 2.],
       [1., 0.],
       [0., 2.],
       [1., 1.],
       [1., 0.],
       [2., 0.],
       [1., 0.],
       [0., 1.],
       [2., 0.],
       [2., 1.],
       [0., 1.],
       [0., 0.],
       [1., 2.],
       [1., 2.],
       [2., 0.],
       [2., 0.],
       [2., 1.],
       [1., 2.],
       [0., 2.],
       [2., 1.],
       [0., 2.],
       [0., 2.],
       [2., 2.],
       [1., 0.],
       [2., 2.],
       [1., 1.]])

In [174]:
# Encode the test features with the already-fitted encoder (transform only, no refit).
x_test = oe.transform(x_test)

In [176]:
# Display the encoded test features.
x_test

array([[0., 0.],
       [2., 1.],
       [2., 1.],
       [2., 2.],
       [2., 2.],
       [0., 2.],
       [2., 0.],
       [0., 0.],
       [0., 2.],
       [1., 1.]])

# Label Encoder

In [182]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Learn the classes from the training target only, then replace the labels
# with their integer codes.
y_train = le.fit_transform(y_train)

In [184]:
# Encoded training target. LabelEncoder numbers classes in sorted order,
# so "No" becomes 0 and "Yes" becomes 1.
y_train

array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0])

In [186]:
# Encode the test target with the already-fitted label encoder (transform only, no refit).
y_test = le.transform(y_test)

In [188]:
# Display the encoded test target.
y_test

array([0, 1, 1, 1, 0, 0, 0, 1, 1, 0])

In [190]:
# df still holds the original string categories: the encoded values were
# assigned to x_train/x_test/y_train/y_test, never written back to df.
df

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No
5,Average,School,Yes
6,Good,School,No
7,Poor,School,Yes
8,Average,UG,No
9,Good,UG,Yes
