# Handling Categorical Attributes (in pandas: `pd.get_dummies()`)

scikit-learn provides preprocessing techniques such as:

--> Label Encoding -> Assigns each distinct categorical value in the column an integer code, numbered in alphabetical (sorted) order of the values.

--> One-Hot Encoding -> Creates one new variable per category; each takes the value 1 when the row has that category and 0 otherwise, together representing the original categorical values.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy dataset: six names and a categorical 'places' column that contains
# four distinct values ('Vjwda' appears three times).
d = dict(
    Names=['Saketh', 'Sivaram', 'Sai', 'Rahul', 'Kiran', 'Eswar'],
    places=['Knr', 'Vjwda', 'Vzg', 'Vjwda', 'Antpr', 'Vjwda'],
)
data = pd.DataFrame(d)
data

Unnamed: 0,Names,places
0,Saketh,Knr
1,Sivaram,Vjwda
2,Sai,Vzg
3,Rahul,Vjwda
4,Kiran,Antpr
5,Eswar,Vjwda


In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Encoder that maps each distinct label to an integer code.
label = LabelEncoder()

In [5]:
# Learn the distinct place names and replace the column with their integer
# codes. The original strings in data['places'] are overwritten by this
# assignment. Codes follow the sorted order of the labels
# (Antpr=0, Knr=1, Vjwda=2, Vzg=3).
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels (y) rather than input features; the integer codes imply an order
# that these place names do not have.
data['places'] = label.fit_transform(data['places'])
data['places']

0    1
1    2
2    3
3    2
4    0
5    2
Name: places, dtype: int32

In [6]:
# Display the frame; 'places' now holds integer codes instead of strings.
data

Unnamed: 0,Names,places
0,Saketh,1
1,Sivaram,2
2,Sai,3
3,Rahul,2
4,Kiran,0
5,Eswar,2


In [7]:
# Rebuild the toy dataset from scratch so that 'places' holds the original
# strings again (the label-encoding step replaced them with integer codes).
d = dict(
    Names=['Saketh', 'Sivaram', 'Sai', 'Rahul', 'Kiran', 'Eswar'],
    places=['Knr', 'Vjwda', 'Vzg', 'Vjwda', 'Antpr', 'Vjwda'],
)
data = pd.DataFrame(d)
data

Unnamed: 0,Names,places
0,Saketh,Knr
1,Sivaram,Vjwda
2,Sai,Vzg
3,Rahul,Vjwda
4,Kiran,Antpr
5,Eswar,Vjwda


In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
# One-hot encoder with default settings; as the output of the next cell
# shows, its transform result is a sparse matrix.
one_hot_encoder = OneHotEncoder()

In [10]:
# Fit the encoder on the 'places' column and encode it in one step. The
# double brackets select a one-column DataFrame (2-D input) rather than a
# Series.
# NOTE: this rebinds `d`, which until now referred to the raw data dict.
d=one_hot_encoder.fit_transform(data[['places']])
d

<6x4 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [11]:
# Convert the sparse result to a dense array so the 0/1 indicators are visible.
d.toarray()

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]])

In [12]:
# Attach the dense one-hot columns to the original frame; `join` matches rows
# by index. The new columns get default integer names (0..3) rather than
# category names.
# NOTE(review): passing columns=one_hot_encoder.get_feature_names_out() to
# pd.DataFrame would label them by category instead.
final_data = data.join(pd.DataFrame(d.toarray()))
final_data

Unnamed: 0,Names,places,0,1,2,3
0,Saketh,Knr,0.0,1.0,0.0,0.0
1,Sivaram,Vjwda,0.0,0.0,1.0,0.0
2,Sai,Vzg,0.0,0.0,0.0,1.0
3,Rahul,Vjwda,0.0,0.0,1.0,0.0
4,Kiran,Antpr,1.0,0.0,0.0,0.0
5,Eswar,Vjwda,0.0,0.0,1.0,0.0


Alternatively, the same result can be obtained with `pd.concat`:

In [13]:
# Same combination done with column-wise concatenation (axis=1): the original
# columns followed by the one-hot columns.
e=pd.concat([data,pd.DataFrame(d.toarray())],axis=1)
e

Unnamed: 0,Names,places,0,1,2,3
0,Saketh,Knr,0.0,1.0,0.0,0.0
1,Sivaram,Vjwda,0.0,0.0,1.0,0.0
2,Sai,Vzg,0.0,0.0,0.0,1.0
3,Rahul,Vjwda,0.0,0.0,1.0,0.0
4,Kiran,Antpr,1.0,0.0,0.0,0.0
5,Eswar,Vjwda,0.0,0.0,1.0,0.0


In [14]:
# Start again from a fresh copy of the toy dataset; this also rebinds `d`
# to the raw dict (it previously held the one-hot sparse matrix).
d = dict(
    Names=['Saketh', 'Sivaram', 'Sai', 'Rahul', 'Kiran', 'Eswar'],
    places=['Knr', 'Vjwda', 'Vzg', 'Vjwda', 'Antpr', 'Vjwda'],
)
data = pd.DataFrame(d)
data

Unnamed: 0,Names,places
0,Saketh,Knr
1,Sivaram,Vjwda
2,Sai,Vzg
3,Rahul,Vjwda
4,Kiran,Antpr
5,Eswar,Vjwda


In [15]:
# Fit only (no transform) so the learned categories can be inspected next.
one_hot_encoder.fit(data[['places']])

In [16]:
# Categories learned during fit: a list holding one array per input column.
one_hot_encoder.categories_

[array(['Antpr', 'Knr', 'Vjwda', 'Vzg'], dtype=object)]

In [17]:
# Fit and transform in a single call (this re-fits the already fitted
# encoder). The sparse result is only displayed, not stored.
one_hot_encoder.fit_transform(data[['places']])

<6x4 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [18]:
# Names of the generated output columns, in the form '<input column>_<category>'.
one_hot_encoder.get_feature_names_out()

array(['places_Antpr', 'places_Knr', 'places_Vjwda', 'places_Vzg'],
      dtype=object)

--> Ordinal Encoding -> Transforms categorical variables into a numerical format by assigning a unique numeric value to each of their categories.

In [19]:
from sklearn.preprocessing import OrdinalEncoder

In [20]:
# Ordinal encoder with default settings (no explicit category order supplied).
ordinal_encoder = OrdinalEncoder()

In [21]:
# Second toy dataset: 'cost' (stored as strings) and a categorical 'size'.
# Note that `data` is rebound here to a plain dict; the frame itself is `df`.
data = dict(
    cost=['50', '35', '75', '42', '54', '71'],
    size=['large', 'small', 'extra large', 'medium', 'large', 'extra large'],
)
df = pd.DataFrame(data)

In [22]:
# Encode 'size' into a new float column. The encoder was created without an
# explicit category order, so the codes follow the sorted order of the strings
# (extra large=0, large=1, medium=2, small=3), not the actual size order.
# NOTE(review): if the codes should reflect size, construct the encoder with
# categories=[['small', 'medium', 'large', 'extra large']].
df['size_encoded']=ordinal_encoder.fit_transform(df[['size']])


In [23]:
# Display the frame, including the added 'size_encoded' column.
df

Unnamed: 0,cost,size,size_encoded
0,50,large,1.0
1,35,small,3.0
2,75,extra large,0.0
3,42,medium,2.0
4,54,large,1.0
5,71,extra large,0.0
