In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Small toy frame: one numeric column plus two categorical columns to encode.
X = pd.DataFrame(
    data={
        "input1": [1, 2, 3, 4, 5],
        "input2": ["A", "A", "B", "B", "C"],
        "input3": ["X", "X", "X", "Y", "Y"],
    }
)
X

Unnamed: 0,input1,input2,input3
0,1,A,X
1,2,A,X
2,3,B,X
3,4,B,Y
4,5,C,Y


In [3]:
# Columns of X that hold categories (strings) and therefore need one-hot encoding.
categorical_vars = [
    "input2",
    "input3",
]


Normally, OneHotEncoder returns a sparse matrix (memory-efficient format).

👉 sparse_output=False (named sparse before scikit-learn 1.2) means:
	•	Output will be a normal NumPy array
	•	Easier to view, debug, and use directly.

  drop="first"

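This removes the first category column of each feature. That column is redundant: a row with 0 in all the remaining columns of a feature must belong to the dropped category.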

In [4]:
# Encoder configured for dense (plain NumPy) output, with the first category
# of every feature dropped.
one_hot_encoder = OneHotEncoder(
    drop="first",
    sparse_output=False,
)
one_hot_encoder

🔹 fit()

Learns categories from the data.

🔹 transform()

Converts categories into one-hot encoded numeric format.


**encoder_vars_array**

This variable stores:

👉 The encoded numeric array (NumPy array because sparse_output=False).


In [5]:
# Learn the categories of each categorical column, then encode those same columns.
encoder_vars_array = one_hot_encoder.fit(X[categorical_vars]).transform(X[categorical_vars])
encoder_vars_array

array([[0., 0., 0.],
       [0., 0., 0.],
       [1., 0., 0.],
       [1., 0., 1.],
       [0., 1., 1.]])

get_feature_names_out()

This method returns the names of the new columns created by OneHotEncoder after encoding.

In [6]:

# Names the encoder generates for the encoded output columns.
encoder_feature_names = one_hot_encoder.get_feature_names_out(
    input_features=categorical_vars
)
encoder_feature_names

array(['input2_B', 'input2_C', 'input3_Y'], dtype=object)

In [7]:
# Wrap the encoded array in a DataFrame, labelling each column with its feature name.
encoder_vars_df = pd.DataFrame(
    data=encoder_vars_array,
    columns=encoder_feature_names,
)
encoder_vars_df

Unnamed: 0,input2_B,input2_C,input3_Y
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,1.0
4,0.0,1.0,1.0


reset_index(drop=True)

VERY IMPORTANT.

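Sometimes:
	•	X and the encoded dataframe may have different index values.

  •	pd.concat with axis=1 matches rows by index label, not by position, so if the indexes don’t match, concat may misalign rows (and fill the gaps with NaN).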




In [8]:
# Give both frames a fresh 0..n-1 index so rows pair up by position,
# then join them side by side (column-wise).
X_new = pd.concat(
    [frame.reset_index(drop=True) for frame in (X, encoder_vars_df)],
    axis=1,
)
X_new

Unnamed: 0,input1,input2,input3,input2_B,input2_C,input3_Y
0,1,A,X,0.0,0.0,0.0
1,2,A,X,0.0,0.0,0.0
2,3,B,X,1.0,0.0,0.0
3,4,B,Y,1.0,0.0,1.0
4,5,C,Y,0.0,1.0,1.0


**axis = 1**

Specifies that columns (not rows) are dropped.

	•	axis=0 → drop rows

	•	axis=1 → drop columns


**inplace = True**

Means:

👉 Modify the dataframe directly instead of returning a new one.

In [9]:

# Remove the original categorical columns now that their encoded versions exist.
# The drop mutates X_new in place, so errors="ignore" keeps this cell re-runnable:
# without it, running the cell a second time raises KeyError because the
# columns are already gone.
X_new.drop(categorical_vars, axis = 1, inplace = True, errors = "ignore")
X_new

Unnamed: 0,input1,input2_B,input2_C,input3_Y
0,1,0.0,0.0,0.0
1,2,0.0,0.0,0.0
2,3,1.0,0.0,0.0
3,4,1.0,0.0,1.0
4,5,0.0,1.0,1.0
