# Padding the sequences

Manipulate sequences so that they all have **equal length** by using padding.

## Import the APIs

In [6]:
##import the required APIs
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Define the training sentences

In [2]:
## four short example sentences of increasing length (3 to 6 words)
train_sentences = [
    "It will rain",
    "The weather is cloudy!",
    "Will it be raining today?",
    "It is a super hot day!",
]

## Train the tokenizer

In [4]:
##create a tokenizer with a vocabulary limit (num_words=100) and an
##explicit '<OOV>' token that stands in for out-of-vocabulary words
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)

##build the vocabulary from the training sentences
tokenizer.fit_on_texts(train_sentences)

##keep the word -> integer index mapping produced by fitting
word_index = tokenizer.word_index

## Create Sequences

In [5]:
##create sequences: encode each training sentence as a list of the
##integer indices that word_index assigns to its words
sequences = tokenizer.texts_to_sequences(train_sentences)

## Pad Sequences

In [7]:
##pad sequences with the default settings: zeros are added at the front
##of shorter sequences so every row is as long as the longest sequence
padded_seqs = pad_sequences(sequences)

In [8]:
##show the vocabulary, the raw sequences and their padded form, one per line
for label, value in (
    ('word_index: ', word_index),
    ('sequences: ', sequences),
    ('padded sequences: ', padded_seqs),
):
    print(label, value)

word_index:  {'<OOV>': 1, 'it': 2, 'will': 3, 'is': 4, 'rain': 5, 'the': 6, 'weather': 7, 'cloudy': 8, 'be': 9, 'raining': 10, 'today': 11, 'a': 12, 'super': 13, 'hot': 14, 'day': 15}
sequences:  [[2, 3, 5], [6, 7, 4, 8], [3, 2, 9, 10, 11], [2, 4, 12, 13, 14, 15]]
padded sequences:  [[ 0  0  0  2  3  5]
 [ 0  0  6  7  4  8]
 [ 0  3  2  9 10 11]
 [ 2  4 12 13 14 15]]


## Customising your padded sequence with parameters

In [9]:
##pad sequences with explicit settings:
##  maxlen=5          -> every row ends up with exactly 5 entries
##  padding='pre'     -> shorter sequences get zeros added at the front
##  truncating='post' -> longer sequences lose entries from the end
padded_seqs = pad_sequences(sequences, maxlen=5, padding='pre', truncating='post')

In [10]:
##show the padded sequences produced with the custom parameters
print(padded_seqs)

[[ 0  0  2  3  5]
 [ 0  6  7  4  8]
 [ 3  2  9 10 11]
 [ 2  4 12 13 14]]
