In [1]:
# Import modules
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus: three short sentences that are tokenized and padded below
sentences = [
    "I like eggs and fries.",
    "I love chocolate and ice-creams.",
    "I hate onions.",
]

In [3]:
# Fit a tokenizer on the corpus, then encode every sentence as a list of integers
MAX_VOCAB_SIZE = 20000  # upper bound on vocabulary size handed to the Tokenizer

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)

# One list of word indices per input sentence (lengths differ per sentence)
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[1, 3, 4, 2, 5], [1, 6, 7, 2, 8, 9], [1, 10, 11]]


In [4]:
# The word -> integer index mapping built by fit_on_texts is stored on the
# tokenizer object; displaying it shows which integer stands for which word.
tokenizer.word_index

{'and': 2,
 'chocolate': 7,
 'creams': 9,
 'eggs': 4,
 'fries': 5,
 'hate': 10,
 'i': 1,
 'ice': 8,
 'like': 3,
 'love': 6,
 'onions': 11}

### Different Padding techniques
Padding brings all sequences to the same length so they can be fed to an RNN/LSTM as a single batch.

In [5]:
# 1. Default padding
# With no extra arguments every sequence is brought to the length of the
# longest one; the printed result shows the zeros placed in front of the tokens.
data = pad_sequences(sequences)
print(data)

[[ 0  1  3  4  2  5]
 [ 1  6  7  2  8  9]
 [ 0  0  0  1 10 11]]


In [10]:
# 2. Explicit target length
# maxlen fixes the width of every row. 6 equals the length of the longest
# sequence here, so the result matches the default-padding output.
MAX_SEQUENCE_LENGTH = 6
data2 = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)
print(data2)

[[ 0  1  3  4  2  5]
 [ 1  6  7  2  8  9]
 [ 0  0  0  1 10 11]]


In [11]:
# 3. Padding position
# padding='post' puts the zeros after the tokens instead of before them.
# NOTE(review): this rebinds data2 from the previous cell; use a distinct name
# if both results need to stay available.
data2 = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)
print(data2)

[[ 1  3  4  2  5  0]
 [ 1  6  7  2  8  9]
 [ 1 10 11  0  0  0]]


In [9]:
# 4. Over padding
# maxlen (9) is larger than the longest sequence (6 tokens), so every row,
# including the longest one, receives extra leading zeros.
data3 = pad_sequences(
    sequences,
    maxlen=9,
)
print(data3)

[[ 0  0  0  0  1  3  4  2  5]
 [ 0  0  0  1  6  7  2  8  9]
 [ 0  0  0  0  0  0  1 10 11]]


In [15]:
# 5. Truncation
# maxlen (5) is shorter than the longest sequence (6 tokens), so that row has
# to lose a token; truncating='post' drops it from the end of the sequence.
# Rows shorter than maxlen are still zero-padded at the front.
data4 = pad_sequences(
    sequences,
    maxlen=5,
    truncating='post',
)
print(data4)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 0  0  1 10 11]]
