In [1]:
import keras
# Display the installed Keras version
keras.__version__

Using TensorFlow backend.


'2.0.8'

https://keras.io/preprocessing/text/

# Word-level one-hot encoding

In [2]:
from keras.preprocessing.text import Tokenizer

In [3]:
# Two short example sentences that every encoding below is applied to.
samples = [
    'The cat sat on the mat.',
    'The dog ate my homework.',
]

In [5]:
# Cap the vocabulary by frequency: only words whose index is below `num_words`
# (i.e. at most the 999 most common words) are kept when texts are vectorized.
tokenizer = Tokenizer(num_words=1000)

In [6]:
# Build the word index from the sample sentences
tokenizer.fit_on_texts(samples)

In [7]:
# Convert each string into a list of integer word indices
sequences = tokenizer.texts_to_sequences(samples)

In [8]:
sequences

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]

In [9]:
# Alternatively, get a binary matrix directly: one row per sample, with a 1 in the
# column of every word index that occurs in that sample.
# Besides 'binary', texts_to_matrix also supports the 'count', 'tfidf' and 'freq' modes.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

In [10]:
one_hot_results

array([[ 0.,  1.,  1., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.]])

In [11]:
# Show the docstring of texts_to_matrix with the built-in help(); unlike the
# IPython-only `?` prefix, this is plain Python and also works outside IPython.
help(tokenizer.texts_to_matrix)

In [12]:
# Retrieve the word -> integer index mapping that the tokenizer computed
word_index = tokenizer.word_index
word_index

{'ate': 7,
 'cat': 2,
 'dog': 6,
 'homework': 9,
 'mat': 5,
 'my': 8,
 'on': 4,
 'sat': 3,
 'the': 1}

# Character-level one-hot encoding

In [20]:
# Character-level variant: every character is treated as a token.
# Note: this rebinds `tokenizer`, replacing the word-level instance created earlier.
tokenizer = Tokenizer(num_words=1000, char_level=True)

In [21]:
# Build the character index from the same sample sentences
tokenizer.fit_on_texts(samples)

In [22]:
tokenizer.word_index

{' ': 1,
 '.': 9,
 'T': 8,
 'a': 5,
 'c': 10,
 'd': 13,
 'e': 2,
 'g': 14,
 'h': 4,
 'k': 18,
 'm': 7,
 'n': 12,
 'o': 6,
 'r': 17,
 's': 11,
 't': 3,
 'w': 16,
 'y': 15}

# Word-level one-hot encoding with the hashing trick

A variant of one-hot encoding is the so-called "one-hot hashing trick", which can be used when the number of unique tokens in your vocabulary is too large to handle explicitly. Instead of explicitly assigning an index to each word and keeping a reference of these indices in a dictionary, one may hash words into vectors of fixed size. This is typically done with a very lightweight hashing function. The main advantage of this method is that it does away with maintaining an explicit word index, which saves memory and allows online encoding of the data (starting to generate token vectors right away, before having seen all of the available data). The one drawback of this method is that it is susceptible to "hash collisions": two different words may end up with the same hash, and subsequently any machine learning model looking at these hashes won't be able to tell the difference between these words. The likelihood of hash collisions decreases when the dimensionality of the hashing space is much larger than the total number of unique tokens being hashed.

In [14]:
# Hashing-space settings.
# Each word is stored as a vector with 1000 slots. Once the number of distinct
# words gets close to 1000 (or exceeds it), hash collisions become frequent and
# the encoding loses accuracy.
# At most `max_length` words are encoded per sample.
max_length = 10
dimensionality = 1000

In [16]:
import numpy as np

In [18]:
import hashlib

# Hash every word into one of `dimensionality` buckets instead of keeping an
# explicit word index. The built-in hash() is salted per interpreter process for
# strings on Python 3, so its buckets change between kernel restarts; an MD5
# digest maps the same word to the same bucket on every run.
results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    # Only the first `max_length` words of each sample are encoded.
    for j, word in enumerate(sample.split()[:max_length]):
        digest = hashlib.md5(word.encode('utf-8')).hexdigest()
        index = int(digest, 16) % dimensionality
        results[i, j, index] = 1.

In [19]:
results

array([[[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

       [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]])