## Nominal Encoding / OHE (One-Hot Encoding)

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Build a small sample DataFrame holding a single categorical column.
df = pd.DataFrame(
    {"color": ["red", "blue", "green", "green", "red", "blue"]}
)

In [3]:
# Display the sample frame to confirm its contents.
df

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red
5,blue


In [7]:
# Create a OneHotEncoder instance with its default settings.
encoder = OneHotEncoder()

In [8]:
# Fit the encoder on the 'color' column and encode it in one step.
# The double brackets select a one-column DataFrame rather than a Series.
# The result is displayed as a sparse-matrix summary, not the values themselves.
encoder.fit_transform(df[['color']])

<6x3 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [12]:
# Same fit/transform as above, converted with .toarray() so the individual
# one-hot values can be inspected as a dense array.
encoder.fit_transform(df[['color']]).toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [13]:
# Fit the encoder to the DataFrame and transform the categorical variable,
# keeping the result in `encoded` for the cells that follow.
encoded = encoder.fit_transform(df[['color']])

In [18]:
# Wrap the dense one-hot array in a DataFrame labelled with the encoder's
# feature names. Reusing df.index keeps the encoded rows aligned with df, so a
# later column-wise pd.concat (which aligns on index) cannot misalign rows if
# df ever carries a non-default index (e.g. after filtering or sorting).
encoded_df = pd.DataFrame(
    encoded.toarray(),
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

In [16]:
# Feature names produced by the fitted encoder; these are the labels used as
# the column names of encoded_df.
encoder.get_feature_names_out()

array(['color_blue', 'color_green', 'color_red'], dtype=object)

In [19]:
# Inspect the one-hot encoded frame.
encoded_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [20]:
# Place the original column and its one-hot columns side by side.
pd.concat([df, encoded_df], axis="columns")

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


In [21]:
import seaborn as sns

In [23]:
# Load the 'tips' example dataset through seaborn.
# NOTE: this rebinds `df`, so the color sample frame built earlier is no
# longer reachable under that name.
df = sns.load_dataset('tips')

In [25]:
# Preview the first rows of the tips data.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [26]:
# Fresh encoder for the tips columns; this rebinds `encoder`, discarding the
# instance that was fitted on 'color'.
encoder=OneHotEncoder()

In [27]:
# Fit and encode the four non-numeric columns (sex, smoker, day, time) together.
# The result is displayed as a sparse-matrix summary.
encoder.fit_transform(df[['sex','smoker','day','time']])

<244x10 sparse matrix of type '<class 'numpy.float64'>'
	with 976 stored elements in Compressed Sparse Row format>

In [28]:
# Same fit/transform, converted with .toarray() to view the one-hot values
# as a dense array.
encoder.fit_transform(df[['sex','smoker','day','time']]).toarray()

array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

In [29]:
# Keep the encoded tips columns in `encoded` for building a labelled DataFrame.
encoded = encoder.fit_transform(df[['sex','smoker','day','time']])

In [30]:
# Wrap the dense one-hot array in a DataFrame labelled with the encoder's
# feature names. Reusing df.index keeps the encoded rows aligned with df, so a
# later column-wise pd.concat (which aligns on index) cannot misalign rows if
# df ever carries a non-default index (e.g. after filtering or sorting).
encoded_df = pd.DataFrame(
    encoded.toarray(),
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

In [31]:
# Inspect the one-hot encoded tips columns.
encoded_df

Unnamed: 0,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [32]:
# Show the source columns next to their one-hot encodings.
pd.concat(
    [df[['sex','smoker','day','time']], encoded_df],
    axis="columns",
)

Unnamed: 0,sex,smoker,day,time,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,Female,No,Sun,Dinner,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,Male,No,Sun,Dinner,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,Male,No,Sun,Dinner,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,Male,No,Sun,Dinner,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,Female,No,Sun,Dinner,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,Male,No,Sat,Dinner,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,Female,Yes,Sat,Dinner,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,Male,Yes,Sat,Dinner,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,Male,No,Sat,Dinner,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
