# One-Hot Encoding for Text Data in NLP

When working with text data in Natural Language Processing (NLP), transforming text into a format that machine learning models can understand is crucial. One of the simplest techniques for text representation is One-Hot Encoding.

In [2]:
sentences = ["Data is power, and power is knowledge."]

In [3]:
# Split sentence into words
words = " ".join(sentences).split()

In [4]:
words

['Data', 'is', 'power,', 'and', 'power', 'is', 'knowledge.']

In [5]:
# Create a unique set of words (vocabulary)
unique_words = list(set(words))

In [6]:
unique_words

['power,', 'power', 'is', 'knowledge.', 'Data', 'and']

In [7]:
# Create a word-to-index dictionary
word_to_index = {word: idx for idx, word in enumerate(unique_words)}

In [8]:
word_to_index

{'power,': 0, 'power': 1, 'is': 2, 'knowledge.': 3, 'Data': 4, 'and': 5}

In [9]:
# Function to perform One-Hot Encoding
def one_hot_encode_word(words, word_to_index):
    encoded_data = []
    for word in words:
        vector = [0] * len(word_to_index)  # Initialize a zero vector
        vector[word_to_index[word]] = 1  # Set the position corresponding to the word to 1
        encoded_data.append(vector)
    return encoded_data

In [10]:
# Apply One-Hot Encoding to sentences
encoded_words = one_hot_encode_word(words, word_to_index)

In [11]:
encoded_words

[[0, 0, 0, 0, 1, 0],
 [0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1],
 [0, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0],
 [0, 0, 0, 1, 0, 0]]

## Using sklearn OneHotEncoder API

In [13]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [14]:
# Create a numerical representation for One-Hot Encoding
numerical_data = [[word_to_index[word] for word in sentence.split()] for sentence in sentences]

In [15]:
numerical_data

[[4, 2, 0, 5, 1, 2, 3]]

In [17]:
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(np.array(numerical_data).reshape(-1, 1))

In [18]:
encoded

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.]])

## Character-Level One-Hot Encoding

In [19]:
word = "Data"

In [20]:
# Create a unique set of characters
unique_chars = sorted(set(word))
unique_chars

['D', 'a', 't']

In [21]:
# Create a character-to-index dictionary
char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
char_to_index

{'D': 0, 'a': 1, 't': 2}

In [22]:
# Function to perform One-Hot Encoding at character level
def one_hot_encode_char(word, char_to_index):
    encoded_data = []
    for char in word:
        vector = [0] * len(char_to_index)  # Initialize a zero vector
        vector[char_to_index[char]] = 1  # Set the position corresponding to the character to 1
        encoded_data.append(vector)
    return encoded_data

In [23]:
# Apply One-Hot Encoding to word
encoded_chars = one_hot_encode_char(word, char_to_index)

In [24]:
encoded_chars

[[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]]