In [2]:
# Keras ships text_to_word_sequence() for splitting a string into a list of words.
# With its default arguments the function does three things:
#   1) splits the text on spaces,
#   2) strips punctuation,
#   3) lowercases every word.

import tensorflow as tf

from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Sample sentence; it is reused by the encoding examples in the following cells.
text = 'The quick brown fox jumped over the lazy dog.'
result = text_to_word_sequence(text)
print(result)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


In [6]:
# Integer-encode the sentence with one_hot().
# Despite its name, one_hot() returns one integer index per word (see the printed list),
# not one-hot vectors. Per the Keras docs it is a thin wrapper around hashing_trick()
# that uses Python's built-in hash(), so distinct words can collide on the same index
# and the indices are not guaranteed to be stable across interpreter sessions.

from tensorflow.keras.preprocessing.text import one_hot

# Count the distinct words in the sample sentence.
words = {token for token in text_to_word_sequence(text)}
vocab_size = len(words)
print(vocab_size)

# Size the hash space ~30% above the vocabulary, leaving headroom against collisions.
hash_space = round(vocab_size * 1.3)
result = one_hot(text, hash_space)
print(result)

8
[7, 3, 4, 6, 8, 6, 7, 1, 3]


In [8]:
# Hash encoding with hashing_trick()
#
# A limitation of integer- and count-based encodings is that they must maintain a
# vocabulary of words and their mapping to integers. An alternative is to use a one-way
# hash function to convert words to integers. This avoids the need to keep track of a
# vocabulary, which is faster and requires less memory.

from tensorflow.keras.preprocessing.text import hashing_trick

# Same hash-space sizing as the one_hot example: ~30% larger than the vocabulary.
hash_space = round(vocab_size * 1.3)
# 'md5' is selected explicitly as the hash function instead of the default.
result = hashing_trick(text, hash_space, hash_function='md5')
print(result)

[6, 4, 1, 2, 7, 5, 6, 2, 6]


So far we have looked at one-off convenience methods for preparing text with Keras. Keras
also provides a more sophisticated API for preparing text that can be fit once and then reused
to prepare multiple text documents. This may be the preferred approach for large projects. This
API is the `Tokenizer` class, which prepares text documents for deep learning.

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Five short example documents used to fit the tokenizer.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
]

# Create a Tokenizer with default settings and fit it on the documents;
# the statistics it learns are inspected in the cells that follow.
t = Tokenizer()
t.fit_on_texts(docs)

Once fit, the Tokenizer provides 4 attributes that you can use to query what has been
learned about your documents:
- `word_counts`: A dictionary of words and their counts.
- `word_docs`: A dictionary of words and how many documents each appeared in.
- `word_index`: A dictionary of words and their uniquely assigned integers.
- `document_count`: An integer count of the total number of documents that were used to fit
the Tokenizer.

In [11]:
# Print what the fitted tokenizer learned, one attribute per line, in this order:
# word -> count, number of documents, word -> integer index, word -> document frequency.
for learned in (t.word_counts, t.document_count, t.word_index, t.word_docs):
    print(learned)

OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)])
5
{'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8}
defaultdict(<class 'int'>, {'well': 1, 'done': 1, 'work': 2, 'good': 1, 'effort': 1, 'great': 1, 'nice': 1, 'excellent': 1})


Once the Tokenizer has been fit on training data, it can be used to encode documents in
the train or test datasets. The `texts_to_matrix()` function on the Tokenizer can be used to
create one vector per input document. The length of each vector is the total size of the
vocabulary plus one, because index 0 is reserved and never assigned to a word (8 words give
9 columns in the example below). This function provides a suite of standard bag-of-words
model text encoding schemes that can be selected via a `mode` argument to the function. The
modes available include:
- `binary`: Whether or not each word is present in the document. This is the default.
- `count`: The count of each word in the document.
- `tfidf`: The Term Frequency-Inverse Document Frequency (TF-IDF) scoring for each word
in the document.
- `freq`: The frequency of each word as a ratio of words within each document.

In [12]:
# Encode each document as a row of per-word counts.
# Column j corresponds to the word whose index in t.word_index is j; column 0 is unused.
encoded_docs = t.texts_to_matrix(texts=docs, mode='count')
print(encoded_docs)

[[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]
