# Feature Encoding


## 1. Ordinal Encoding
It is for ordinal categorical data

In [2]:
import numpy as np
import pandas as pd

In [18]:
df = pd.read_csv('/content/customer (1).csv').drop(columns=['age','gender'])
df.sample(5)

Unnamed: 0,review,education,purchased
49,Good,UG,No
7,Poor,School,Yes
26,Poor,PG,No
3,Good,PG,No
16,Poor,UG,Yes


In [7]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test= train_test_split(df.iloc[:,0:2],df.iloc[:,-1],test_size=0.2)

In [20]:
X_train.head()

Unnamed: 0,review,education
31,Poor,School
8,Average,UG
32,Average,UG
28,Poor,School
40,Good,School


In [21]:
y_train.head()

Unnamed: 0,purchased
31,Yes
8,No
32,Yes
28,No
40,No


In [22]:
or_encod= OrdinalEncoder(categories=[['Poor','Average','Good'],['School','UG','PG']])  # We specify the category to control the number assigned to each category. Here poor will get 0 and good will get 2. If we keep it auto number assigning  will be random.

In [23]:
X_train= or_encod.fit_transform(X_train)
X_test= or_encod.transform(X_test)

In [25]:
X_train

array([[0., 0.],
       [1., 1.],
       [1., 1.],
       [0., 0.],
       [2., 0.],
       [1., 1.],
       [1., 0.],
       [0., 2.],
       [0., 2.],
       [0., 2.],
       [2., 0.],
       [1., 2.],
       [2., 2.],
       [1., 0.],
       [0., 0.],
       [1., 1.],
       [2., 1.],
       [0., 2.],
       [0., 1.],
       [0., 2.],
       [0., 2.],
       [2., 2.],
       [2., 1.],
       [2., 1.],
       [2., 2.],
       [2., 0.],
       [1., 0.],
       [0., 1.],
       [2., 0.],
       [0., 0.],
       [1., 2.],
       [0., 0.],
       [2., 0.],
       [1., 0.],
       [2., 1.],
       [0., 2.],
       [1., 2.],
       [0., 1.],
       [2., 2.],
       [1., 0.]])

In [26]:
or_encod.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [27]:
or_encod.feature_names_in_

array(['review', 'education'], dtype=object)

In [28]:
or_encod.n_features_in_

2

In [30]:
or_encod.get_feature_names_out()

array(['review', 'education'], dtype=object)

In [50]:
or_encod.inverse_transform(np.array([0,1]).reshape(1,2))

array([['Poor', 'UG']], dtype=object)

In [51]:
# handle unknown
or_encod.transform(np.array(['Poor','college']).reshape(1,2)) # i don't have category college



ValueError: Found unknown categories [np.str_('college')] in column 1 during transform

In [52]:
# i can set value for unknown value

or_encod= OrdinalEncoder(categories=[['Poor','Average','Good'],['School','UG','PG']],
                    handle_unknown='use_encoded_value',
                    unknown_value=-1)

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:2], df.iloc[:,-1], test_size=0.2)

X_train =or_encod.fit_transform(X_train)
or_encod.transform(np.array(['Poor','college']).reshape(1,2))



array([[ 0., -1.]])

In [72]:
# handling infrequent categories
X = np.array([['dog'] * 5 + ['cat'] * 20 + ['rabbit'] * 10 +['snake'] * 3 + ['horse'] * 2], dtype=object).T
X

array([['dog'],
       ['dog'],
       ['dog'],
       ['dog'],
       ['dog'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['cat'],
       ['rabbit'],
       ['rabbit'],
       ['rabbit'],
       ['rabbit'],
       ['rabbit'],
       ['rabbit'],
       ['rabbit'],
       ['rabbit'],
       ['rabbit'],
       ['rabbit'],
       ['snake'],
       ['snake'],
       ['snake'],
       ['horse'],
       ['horse']], dtype=object)

In [73]:
encoder= OrdinalEncoder(max_categories=3).fit(X) # it will sort categories, encode top 3, and then the remaining categories will be merged and treated as same

In [74]:
encoder.infrequent_categories_ # here cat and rabbit is top two, so remaninings are infrequent category

[array(['dog', 'horse', 'snake'], dtype=object)]

In [75]:
encoder.categories_

[array(['cat', 'dog', 'horse', 'rabbit', 'snake'], dtype=object)]

In [59]:
encoder.transform(np.array([['cat','rabbit','snake','dog','horse']]).reshape(5,1)) # cat and rabbit is treated differently but dog,horse and snake is treated as same,because they are rare.

array([[0.],
       [1.],
       [2.],
       [2.],
       [2.]])

In [60]:
# this is better one
enc = OrdinalEncoder(min_frequency=4).fit(X)
enc.infrequent_categories_

[array(['horse', 'snake'], dtype=object)]

In [61]:
encoder.transform(np.array([['cat','rabbit','snake','dog','horse']]).reshape(5,1))

array([[0.],
       [1.],
       [2.],
       [2.],
       [2.]])

In [76]:
# handling missing data

# Example categorical data with missing values
data = [['Cat'], [np.nan], ['Dog'], ['Fish'], [np.nan]]

In [78]:
encoder=OrdinalEncoder(encoded_missing_value=np.nan)

In [79]:
encoder.fit_transform(data)

array([[ 0.],
       [nan],
       [ 1.],
       [ 2.],
       [nan]])

# Lable Encoder
used only in target column

In [80]:
df.head()

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [81]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:2], df.iloc[:,-1], test_size=0.2)

In [82]:
from sklearn.preprocessing import LabelEncoder

In [83]:
le=LabelEncoder()

In [84]:
le.fit_transform(y_train)

array([1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0])

In [85]:
le.classes_

array(['No', 'Yes'], dtype=object)

In [87]:
le.inverse_transform(np.array([1,0,0,1]))

array(['Yes', 'No', 'No', 'Yes'], dtype=object)

# For Nominal Categorical Data

## 2.OneHotEncoder
one hot encoding

In [89]:
df=pd.read_csv('/content/cars.csv').drop(columns=['km_driven','owner'])
df.sample(5)

Unnamed: 0,brand,fuel,selling_price
6299,Maruti,Diesel,380000
3146,Honda,Diesel,500000
2360,Chevrolet,Petrol,130000
1825,Hyundai,Petrol,340000
6352,Chevrolet,Diesel,125000


In [103]:
df.brand.count()

np.int64(8128)

In [97]:
df.isnull().sum()

Unnamed: 0,0
brand,0
fuel,0
selling_price,0


In [104]:
X=df.iloc[:,0:2]
y=df.iloc[:,-1]

In [105]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [107]:
X_train['fuel'].unique()

array(['Petrol', 'Diesel', 'CNG', 'LPG'], dtype=object)

In [108]:
X_train['fuel'].nunique()

4

In [113]:
X_train.brand.nunique()

32

In [109]:
from sklearn.preprocessing import OneHotEncoder

In [110]:
oh= OneHotEncoder()

In [112]:
X_train.shape

(6502, 2)

In [114]:
# 34 category will turn into 34 column and 4 category will turn into 4 column , so total 36 column. [1,0,0,0,0,0,....36th] this type sparse matrix
oh.fit_transform(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 13004 stored elements and shape (6502, 36)>

In [116]:
oh.fit_transform(X_train).toarray() # this turns the sparse matrix into array

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [118]:
oh.fit_transform(X_train).toarray().shape

(6502, 36)

In [119]:
oh.categories_

[array(['Ambassador', 'Ashok', 'Audi', 'BMW', 'Chevrolet', 'Daewoo',
        'Datsun', 'Fiat', 'Force', 'Ford', 'Honda', 'Hyundai', 'Isuzu',
        'Jaguar', 'Jeep', 'Kia', 'Land', 'Lexus', 'MG', 'Mahindra',
        'Maruti', 'Mercedes-Benz', 'Mitsubishi', 'Nissan', 'Opel',
        'Peugeot', 'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen',
        'Volvo'], dtype=object),
 array(['CNG', 'Diesel', 'LPG', 'Petrol'], dtype=object)]

In [121]:
oh.get_feature_names_out()

array(['brand_Ambassador', 'brand_Ashok', 'brand_Audi', 'brand_BMW',
       'brand_Chevrolet', 'brand_Daewoo', 'brand_Datsun', 'brand_Fiat',
       'brand_Force', 'brand_Ford', 'brand_Honda', 'brand_Hyundai',
       'brand_Isuzu', 'brand_Jaguar', 'brand_Jeep', 'brand_Kia',
       'brand_Land', 'brand_Lexus', 'brand_MG', 'brand_Mahindra',
       'brand_Maruti', 'brand_Mercedes-Benz', 'brand_Mitsubishi',
       'brand_Nissan', 'brand_Opel', 'brand_Peugeot', 'brand_Renault',
       'brand_Skoda', 'brand_Tata', 'brand_Toyota', 'brand_Volkswagen',
       'brand_Volvo', 'fuel_CNG', 'fuel_Diesel', 'fuel_LPG',
       'fuel_Petrol'], dtype=object)

In [122]:
oh.fit_transform(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 13004 stored elements and shape (6502, 36)>

In [124]:
pd.DataFrame(oh.fit_transform(X_train).toarray(),columns=oh.get_feature_names_out())

Unnamed: 0,brand_Ambassador,brand_Ashok,brand_Audi,brand_BMW,brand_Chevrolet,brand_Daewoo,brand_Datsun,brand_Fiat,brand_Force,brand_Ford,...,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [126]:
# this gives direct array instead of sparse matrix
oh= OneHotEncoder(sparse_output=False,dtype=np.int32)
oh.fit_transform(X_train)

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=int32)

In [129]:
oh.inverse_transform(np.array([0., 0., 1., 0., 0.,0.,0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]).reshape(1,36))

ValueError: Samples [0] can not be inverted when drop=None and handle_unknown='error' because they contain all zeros

In [130]:
# drop column to reduce ralation between columns, some algorithm is alergic to this type of relation
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

ohe = OneHotEncoder(drop='first',sparse_output=False) # it drops first columns.[0 ,0, 1] can be written as [0, 1 ] also.
ohe.fit_transform(X_train).shape

(6502, 34)

In [131]:
# handling rare categories
X_train['brand'].value_counts()

Unnamed: 0_level_0,count
brand,Unnamed: 1_level_1
Maruti,1953
Hyundai,1127
Mahindra,635
Tata,586
Toyota,391
Honda,369
Ford,320
Chevrolet,185
Renault,183
Volkswagen,154


In [132]:
X_train.fuel.value_counts() # cng and lpg are rare

Unnamed: 0_level_0,count
fuel,Unnamed: 1_level_1
Diesel,3545
Petrol,2880
CNG,46
LPG,31


In [134]:
# using min frequency

oh=OneHotEncoder(sparse_output=False,min_frequency=100)
oh.fit_transform(X_train).shape

(6502, 14)

In [135]:
oh.get_feature_names_out() # cng and lpg mearged into fuel_infrequent_sklearn, similarly in brand also.

array(['brand_Chevrolet', 'brand_Ford', 'brand_Honda', 'brand_Hyundai',
       'brand_Mahindra', 'brand_Maruti', 'brand_Renault', 'brand_Tata',
       'brand_Toyota', 'brand_Volkswagen', 'brand_infrequent_sklearn',
       'fuel_Diesel', 'fuel_Petrol', 'fuel_infrequent_sklearn'],
      dtype=object)

In [136]:
# using max_categories
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', max_categories=15)
ohe.fit_transform(X_train).shape

(6502, 19)

In [138]:
ohe.get_feature_names_out()

array(['brand_BMW', 'brand_Chevrolet', 'brand_Ford', 'brand_Honda',
       'brand_Hyundai', 'brand_Jaguar', 'brand_Mahindra', 'brand_Maruti',
       'brand_Nissan', 'brand_Renault', 'brand_Skoda', 'brand_Tata',
       'brand_Toyota', 'brand_Volkswagen', 'brand_infrequent_sklearn',
       'fuel_CNG', 'fuel_Diesel', 'fuel_LPG', 'fuel_Petrol'], dtype=object)

In [139]:
# how to handle unknown category
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

ohe = OneHotEncoder(drop='first',sparse_output=False)
ohe.fit_transform(X_train)

ohe.transform(np.array(['local','Petrol']).reshape(1,2))



ValueError: Found unknown categories [np.str_('local')] in column 0 during transform

In [140]:
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit_transform(X_train)

ohe.transform(np.array(['local','Petrol']).reshape(1,2))



array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1.]])

In [141]:
ohe.inverse_transform(np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1.]).reshape(1,36))

array([[None, 'Petrol']], dtype=object)

# LabelBinarizer

It is used on target column. It perform onehotencoding on target column. We need it in multiclass logistic regression and softmax in deep learning


In [142]:
from sklearn.preprocessing import LabelBinarizer

# Sample target variable for a multi-class classification problem
y = ['cat', 'dog', 'fish', 'dog', 'cat']

# Initialize the LabelBinarizer
lb = LabelBinarizer()

# Fit and transform the target variable
y_binarized = lb.fit_transform(y)

print("Binarized labels:\n", y_binarized)

# Inverse transform to recover original labels
y_original = lb.inverse_transform(y_binarized)

print("Original labels:\n", y_original)

Binarized labels:
 [[1 0 0]
 [0 1 0]
 [0 0 1]
 [0 1 0]
 [1 0 0]]
Original labels:
 ['cat' 'dog' 'fish' 'dog' 'cat']


In [143]:
from sklearn.preprocessing import MultiLabelBinarizer
# If there exist multi layer target, for example movie genre= Action,Drama and Action,Thriller

# Example multi-label data
y = [('red', 'blue'), ('blue', 'green'), ('green',), ('red',)]

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit and transform the data to binary matrix format
Y = mlb.fit_transform(y)

print("Binary matrix:\n", Y)
print("Class labels:", mlb.classes_)

# Inverse transform to recover original labels
y_inv = mlb.inverse_transform(Y)
print("Inverse transformed labels:", y_inv)


Binary matrix:
 [[1 0 1]
 [1 1 0]
 [0 1 0]
 [0 0 1]]
Class labels: ['blue' 'green' 'red']
Inverse transformed labels: [('blue', 'red'), ('blue', 'green'), ('green',), ('red',)]
