#### This notebook guides you through the basic text-processing steps required for feeding text into a neural network architecture
It covers `tokenization`, converting into `sequences`, and `padding`.

In [13]:
# import libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Sample sentences

In [8]:
# Toy corpus used throughout the notebook: seven short texts, some of which
# contain several sentences in a single string.
sentences = [
    "Penny bought bright blue fishes.",
    "Penny bought bright blue and orange fish.",
    "The cat ate a fish at the store.",
    "Penny went to the store. Penny ate a bug. Penny saw a fish.",
    "It meowed once at the bug, it is still meowing at the bug and the fish",
    "The cat is at the fish store. The cat is orange. The cat is meowing at the fish.",
    "Penny is a fish",
]

In [9]:
# Tokenization: fit a vocabulary on the sample sentences.
# "<OOV>" is the placeholder token for out-of-vocabulary words.
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=100,
)
tokenizer.fit_on_texts(sentences)

# word_index maps each word to its integer id.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'the': 2, 'fish': 3, 'penny': 4, 'at': 5, 'is': 6, 'cat': 7, 'a': 8, 'store': 9, 'bug': 10, 'bought': 11, 'bright': 12, 'blue': 13, 'and': 14, 'orange': 15, 'ate': 16, 'it': 17, 'meowing': 18, 'fishes': 19, 'went': 20, 'to': 21, 'saw': 22, 'meowed': 23, 'once': 24, 'still': 25}


In [16]:
# Convert each text into a list of integer word ids using the fitted tokenizer.
# <OOV> stands for "out of vocabulary".
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[4, 11, 12, 13, 19], [4, 11, 12, 13, 14, 15, 3], [2, 7, 16, 8, 3, 5, 2, 9], [4, 20, 21, 2, 9, 4, 16, 8, 10, 4, 22, 8, 3], [17, 23, 24, 5, 2, 10, 17, 6, 25, 18, 5, 2, 10, 14, 2, 3], [2, 7, 6, 5, 2, 3, 9, 2, 7, 6, 15, 2, 7, 6, 18, 5, 2, 3], [4, 6, 8, 3]]


In [17]:
# Padding
# Before feeding sequences into a neural network, they should all have the
# same length. Here every sequence is brought to exactly 10 ids:
#   - shorter sequences are filled with zeros at the end (padding="post");
#   - longer sequences lose the ids beyond the 10th (truncating="post").
padded = pad_sequences(
    sequences,
    maxlen=10,
    padding="post",
    truncating="post",
)
print(padded)

[[ 4 11 12 13 19  0  0  0  0  0]
 [ 4 11 12 13 14 15  3  0  0  0]
 [ 2  7 16  8  3  5  2  9  0  0]
 [ 4 20 21  2  9  4 16  8 10  4]
 [17 23 24  5  2 10 17  6 25 18]
 [ 2  7  6  5  2  3  9  2  7  6]
 [ 4  6  8  3  0  0  0  0  0  0]]
