OHE, or **One-Hot Encoding**, is a technique used to represent categorical data as binary vectors. Each unique category is assigned a vector where one element is `1` (indicating the presence of that category) and all other elements are `0`.

Example:


In [1]:
sentences = ["I love NLP", "my name is sourav", "I am a data science learner", "I am learning python", "I am learning NLP"]

In [2]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [3]:
tokens = [sentence.lower().split() for sentence in sentences]
flat_tokens = list(set(word for sentence in tokens for word in sentence))


In [4]:
type(flat_tokens)

list

In [5]:
# Step 2: Vocabulary
print("Vocabulary:", flat_tokens)

Vocabulary: ['nlp', 'learner', 'sourav', 'love', 'data', 'learning', 'python', 'a', 'am', 'science', 'my', 'is', 'name', 'i']


In [6]:
import pandas as pd

In [9]:
#Step 3: One-hot encoding
df = pd.DataFrame(flat_tokens, columns=["word"])
# encoder = OneHotEncoder(sparse_output=False)

# one_hot_encoded = encoder.fit_transform(df[categorical_columns])
encoder = OneHotEncoder(sparse_output=False)
one_hot_matrix = encoder.fit_transform(df[["word"]])

In [10]:
# Show result
encoded_df = pd.DataFrame(one_hot_matrix, columns=encoder.get_feature_names_out())
encoded_df.insert(0, "Word", df["word"])
print(encoded_df)

        Word  word_a  word_am  word_data  word_i  word_is  word_learner  \
0        nlp     0.0      0.0        0.0     0.0      0.0           0.0   
1    learner     0.0      0.0        0.0     0.0      0.0           1.0   
2     sourav     0.0      0.0        0.0     0.0      0.0           0.0   
3       love     0.0      0.0        0.0     0.0      0.0           0.0   
4       data     0.0      0.0        1.0     0.0      0.0           0.0   
5   learning     0.0      0.0        0.0     0.0      0.0           0.0   
6     python     0.0      0.0        0.0     0.0      0.0           0.0   
7          a     1.0      0.0        0.0     0.0      0.0           0.0   
8         am     0.0      1.0        0.0     0.0      0.0           0.0   
9    science     0.0      0.0        0.0     0.0      0.0           0.0   
10        my     0.0      0.0        0.0     0.0      0.0           0.0   
11        is     0.0      0.0        0.0     0.0      1.0           0.0   
12      name     0.0     