# Creating sequences of tokens

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer

## Define training sentences in a list

In [2]:
## Toy training corpus: the three sentences the tokenizer will be fitted on
train_sentences = [
    'It is a sunny day',
    'It is a cloudy day',
    'Will it rain today?',
]

## Train the tokenizer

In [3]:
## Set up the tokenizer.
## num_words=100 bounds how many of the most frequent words are used when
## encoding texts (see the Keras Tokenizer docs); this toy corpus has far fewer.
tokenizer = Tokenizer(num_words=100)

## Train the tokenizer on the training sentences: this builds its vocabulary.
tokenizer.fit_on_texts(train_sentences)

## Keep a handle on the learned word -> integer index mapping
## (indices start at 1, as the printed dictionary shows).
word_index = tokenizer.word_index

{'it': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'cloudy': 6, 'will': 7, 'rain': 8, 'today': 9}


## Create sequences

In [4]:
## Encode every training sentence as a list of integer word indices,
## using the vocabulary the tokenizer learned from those same sentences.
sequences = tokenizer.texts_to_sequences(train_sentences)

In [5]:
## Print the word -> index dictionary learned by the tokenizer
print('word_index: ', word_index)

word_index:  {'it': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'cloudy': 6, 'will': 7, 'rain': 8, 'today': 9}


In [6]:
## Show the first training sentence next to its encoded sequence, one per line
print(train_sentences[0], sequences[0], sep='\n')

It is a sunny day
[1, 2, 3, 5, 4]


## Tokenizing new data using the same tokenizer

In [7]:
## Sentences that were not part of the training corpus; some of their words
## ("be", "raining", "pleasant") do not occur in train_sentences
new_sentences = [
    'Will it be raining today?',
    'It is a pleasant day.',
]

In [8]:
## Encode the new sentences with the tokenizer already fitted on
## train_sentences (it is not refitted on the new data)
new_sequences = tokenizer.texts_to_sequences(new_sentences)

In [9]:
## Show the new sentences followed by their encoded sequences, one per line
print(new_sentences, new_sequences, sep='\n')

['Will it be raining today?', 'It is a pleasant day.']
[[7, 1, 9], [1, 2, 3, 4]]


**NOTE: we can see that the unseen words (here `be`, `raining` and `pleasant`) are lost when the texts are transformed into sequences. To overcome this, we can use an OOV (out-of-vocabulary) token: a unique placeholder word that replaces any unseen word.**

## Replacing newly encountered words with special values

In [10]:
## Set up the tokenizer again, this time with an oov_token: '<OOV>' is the
## placeholder used for words that were not seen during fitting.
## NOTE: this rebinds `tokenizer`, replacing the one created earlier.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')

## Train the new tokenizer on the same training sentences
tokenizer.fit_on_texts(train_sentences)

## Store the new word -> index mapping (rebinds `word_index` as well;
## '<OOV>' is part of this mapping, as the printed dictionary shows)
word_index = tokenizer.word_index

In [11]:
## Re-encode the new sentences with the current tokenizer, then show the
## vocabulary and the resulting sequences, one per line
new_sequences = tokenizer.texts_to_sequences(new_sentences)
print(word_index, new_sequences, sep='\n')

{'<OOV>': 1, 'it': 2, 'is': 3, 'a': 4, 'day': 5, 'sunny': 6, 'cloudy': 7, 'will': 8, 'rain': 9, 'today': 10}
[[8, 2, 1, 1, 10], [2, 3, 4, 1, 5]]


Now we can see that the unseen words in the new sentences are replaced by the OOV token (index 1) instead of being dropped.