## One-hot encoding of words
- - - -
One-hot encoding is the most common and most basic way to turn a token into a vector. It consists of associating a unique integer index with every word, then
turning this integer index i into a binary vector of size N (the size of the vocabulary) that is all zeros except for the i-th
entry, which is 1.

In [1]:
# Load packages
import numpy as np

In [2]:
# Toy corpus: two short example sentences
samples = [
    "The cat sat on the mat.",
    "The dog ate my homework.",
]

In [3]:
# Build the vocabulary: every distinct whitespace-separated token receives the
# next free integer id, in order of first appearance across the samples.
token_index = {}

for sentence in samples:
    # Naive tokenization: split on whitespace only, so punctuation stays attached
    for token in sentence.split():
        # setdefault inserts only when the token is new; known tokens keep their id
        token_index.setdefault(token, len(token_index))

print("Number of unique words in the mapping = {}".format(len(token_index)))

Number of unique words in the mapping = 10


In [4]:
# Let's view the word -> integer index mapping just created
token_index

{'The': 0,
 'cat': 1,
 'sat': 2,
 'on': 3,
 'the': 4,
 'mat.': 5,
 'dog': 6,
 'ate': 7,
 'my': 8,
 'homework.': 9}

In [5]:
max_num_words = 10  # Maximum number of words encoded per sample; extra words are dropped

# 3D array of zeros with shape (num_samples, max_num_words, vocabulary_size).
# Positions past the end of a short sample stay all-zero (padding).
one_hot_encoded = np.zeros(shape=(len(samples), max_num_words, len(token_index)))

# Iterate over samples and fill in the one-hot vectors
for i, sample in enumerate(samples):
    # Truncate to max_num_words so a longer sample cannot index past axis 1
    word_tokens = sample.split()[:max_num_words]
    for j, word in enumerate(word_tokens):
        # Column of this word in the vocabulary
        index = token_index[word]
        # Mark word position j of sample i with a single 1; everything else stays 0
        one_hot_encoded[i, j, index] = 1

In [6]:
# Show the first sample next to its one-hot matrix (one row per word position)
samples[0], one_hot_encoded[0]

('The cat sat on the mat.', array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]))

### Yeah!! Our one-hot word vectors!

## Let's do the same using the Keras API

In [7]:
# Load packages
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
tf.__version__

'1.13.1'

In [8]:
# Initiate keras tokenizer
# num_words=100 limits encoding to the most frequent words (and fixes the width
# of the matrix returned by texts_to_matrix); lower=False keeps the original
# casing, so 'The' and 'the' remain distinct tokens.
tokenizer = Tokenizer(num_words=100, lower=False)

tokenizer.fit_on_texts(samples) # Fit tokenizer on the entire data

In [9]:
# Convert each sample into a list of integer word indices (indices start at 1)
sequences = tokenizer.texts_to_sequences(samples)

print(sequences[0], sequences[1])

[1, 2, 3, 4, 5, 6] [1, 7, 8, 9, 10]


In [10]:
# Generate one binary vector per sample (not per word): with mode="binary",
# entry k is 1 if the word with index k occurs anywhere in the sample.
# This is a bag-of-words ("multi-hot") encoding, so word order is not kept.
one_hot_encoded_keras = tokenizer.texts_to_matrix(samples, mode="binary")

print(one_hot_encoded_keras.shape)

print(one_hot_encoded_keras[0])
print(one_hot_encoded_keras[1])

(2, 100)
[0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [11]:
# Now get the word-index mapping for further use
# (the tokenizer has stripped punctuation, and indices start at 1, not 0)
word_index_keras = tokenizer.word_index

print("Number of unique words in vocab = {}".format(len(word_index_keras)))

print(word_index_keras)

Number of unique words in vocab = 10
{'The': 1, 'cat': 2, 'sat': 3, 'on': 4, 'the': 5, 'mat': 6, 'dog': 7, 'ate': 8, 'my': 9, 'homework': 10}


### Hurray!! Done.