In [None]:
# Best-effort runtime setup: run the Colab-only %tensorflow_version magic
# (presumably selecting the TensorFlow 2.x runtime) and carry on if it fails.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  # Deliberately ignore the failure so the rest of the notebook still runs
  # in environments where the magic is unavailable.
  pass


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

# Initial Tokenization

In [None]:
# Corpus used to fit the tokenizer: three short sentences with heavily
# overlapping vocabulary.
sentences = [
    "Today is a sunny day",
    "Today is a rainy day",
    "Is it sunny today?",
]

In [None]:
# First pass over the corpus: fit a tokenizer with no out-of-vocabulary token.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Keep the fitted vocabulary and the encoded corpus around for later cells.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

# Show the vocabulary first, then the encoded sentences, one per line.
print(word_index, sequences, sep="\n")

# Exploring Test Data with unseen words

In [None]:
# Sentences that were not part of the fitted corpus; several of their words
# (e.g. "snowy", "tomorrow") never appear in `sentences`.
test_data = [
    "Today is a snowy day",
    "Will it be rainy tomorrow?",
]

In [None]:
# Encode the unseen sentences with the tokenizer fitted above (no OOV token).
test_sequences = tokenizer.texts_to_sequences(test_data)

# Print the vocabulary next to the result so the two can be compared.
print(word_index, test_sequences, sep="\n")

# Adding OOV to improve test data sequences

In [None]:
# Refit on the same corpus, this time configuring "<OOV>" as the
# out-of-vocabulary token.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Re-encode both the corpus and the unseen sentences with the new tokenizer.
sequences = tokenizer.texts_to_sequences(sentences)
test_sequences = tokenizer.texts_to_sequences(test_data)

# Show the updated vocabulary followed by the encoded test sentences.
print(word_index, test_sequences, sep="\n")

# Exploring Padding

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Extended corpus: the last sentence (8 words) is longer than the others,
# so the encoded sequences will have different lengths.
sentences = [
    "Today is a sunny day",
    "Today is a rainy day",
    "Is it sunny today?",
    "I really enjoyed walking in the snow today",
]

In [None]:
# Fit a fresh tokenizer (with the "<OOV>" token) on the extended corpus.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode the corpus; these unequal-length sequences are what gets padded below.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

In [None]:
# Pad with the library defaults: no padding, truncating or maxlen arguments
# are passed here.
padded = pad_sequences(sequences)
print(padded)

In [None]:
# Same call, but ask for the padding to be placed after each sequence.
padded = pad_sequences(sequences, padding="post")
print(padded)

In [None]:
# Cap the length at 6, which is shorter than the longest sentence (8 words),
# so that sequence has to be cut. No truncating argument is given, so the
# library's default truncation side applies.
padded = pad_sequences(sequences, maxlen=6, padding="post")
print(padded)

In [None]:
# Cap the length at 6 again, now requesting both padding and truncation at
# the end of each sequence.
padded = pad_sequences(
    sequences,
    maxlen=6,
    padding="post",
    truncating="post",
)
print(padded)