In [1]:
# load packages
import os
import sys
import numpy as np

In [2]:
# Two toy sentences used as the corpus throughout this notebook
samples = [
    "The cat sat on the mat.",
    "The dog ate my homework.",
]

# One-hot encoding of words
----

One-hot encoding is the most common and most basic way to turn a token into a vector. It consists of associating a unique integer index with every word, then
turning this integer index i into a binary vector of size N (the size of the vocabulary) that is all zeros except for the i-th
entry, which is 1.

In [3]:
# Give every distinct whitespace-separated token the next free integer id,
# in order of first appearance across all samples.
token_index = {}
for sentence in samples:
    for token in sentence.split():
        # setdefault inserts only when the token is new, so ids already assigned are kept
        token_index.setdefault(token, len(token_index))
"number of unique words = ", len(token_index)

('number of unique words = ', 10)

In [4]:
token_index

{'The': 0,
 'cat': 1,
 'sat': 2,
 'on': 3,
 'the': 4,
 'mat.': 5,
 'dog': 6,
 'ate': 7,
 'my': 8,
 'homework.': 9}

In [5]:
max_len = 10 # number of word slots per sample (second axis of one_hot_encoded); shorter samples leave trailing all-zero rows

In [6]:
# Fill a tensor with one one-hot vector per word position, using the word-index mapping.
# Shape: (number of samples, max_len, vocabulary size); positions past the end of a
# sample are left as all-zero rows.
one_hot_encoded = np.zeros(shape=(len(samples), max_len, len(token_index)))

for i, sample in enumerate(samples):
    # Only the first max_len words fit along axis 1; truncating here avoids an
    # IndexError for samples that contain more than max_len words.
    word_tokens = sample.split()[:max_len]
    for j, word in enumerate(word_tokens):
        index = token_index[word]
        one_hot_encoded[i, j, index] = 1

In [7]:
samples[0]

'The cat sat on the mat.'

In [8]:
one_hot_encoded[0]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

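There you have your one-hot encoded word vectors. The last four rows are all zeros because the sentence has only six words and each sample is padded to `max_len = 10` positions.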

## Let's do the same using Keras API

In [9]:
import tensorflow as tf
# Display the installed TensorFlow version so the outputs below can be tied to a specific release
tf.__version__

'1.12.0'

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [11]:
tokenizer = Tokenizer(num_words=100, lower=False) # consider only the most frequent words (capped by num_words=100); lower=False keeps the original casing, so 'The' and 'the' stay distinct

In [12]:
samples

['The cat sat on the mat.', 'The dog ate my homework.']

In [13]:
tokenizer.fit_on_texts(samples) # fit the tokenizer on the available data; this builds its word -> index vocabulary

In [14]:
sequences = tokenizer.texts_to_sequences(samples) # convert each sample into a list of integer word indices

In [15]:
sequences[0], sequences[1]

([1, 2, 3, 4, 5, 6], [1, 7, 8, 9, 10])

In [16]:
# NOTE: texts_to_matrix returns one row per sample (see the (2, 100) shape below), with a 1 at
# every word index that occurs in that sample -- a multi-hot / bag-of-words vector, not a
# per-word one-hot matrix like one_hot_encoded above.
one_hot_encoded_keras = tokenizer.texts_to_matrix(samples, mode="binary")

In [17]:
one_hot_encoded_keras[0]

array([0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [18]:
one_hot_encoded_keras.shape

(2, 100)

In [19]:
# Now let's get the word -> index mapping learned by the tokenizer, to look words up by index.
# Unlike token_index above, the indices start at 1 and punctuation has been stripped from the words.
word_index_keras = tokenizer.word_index
"number of unique words = ", len(word_index_keras)

('number of unique words = ', 10)

In [20]:
word_index_keras

{'The': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'the': 5,
 'mat': 6,
 'dog': 7,
 'ate': 8,
 'my': 9,
 'homework': 10}

That's the way you do it using the Keras APIs.