# Part 1 - Tokenization

In [None]:
# Hello!

# Let's import the indispensable libraries that make all of this work!

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# First I will initialise a few sentences whose words will be tokenized.

# Small training corpus for the tokenizer. The last three entries end in
# punctuation ('!', '?', '.'), which the tokenizer is expected to ignore.
sentences = [
    "I love my mom",
    "I love my girlfriend",
    "I love my girlfriend!",
    "You love my mom?",
    "I like you.",
]

In [None]:
# Initialise the tokenizer
# Tokenizer() takes in some arguments. (To me, ) the important ones are:
# num_words   - max number of words to keep, ranked by frequency
#               (only the num_words - 1 most common words are used)
# filters     - which characters to ignore (! ? . and so on). Done for you!
# oov_token   - Out Of Vocabulary words will be set to that string

# Create the tokenizer and let it learn its vocabulary from the sentences.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# The learned vocabulary: a mapping from each word to its integer token.
word_index = tokenizer.word_index
print(word_index)

# Part 2 - Sentences to Data

In [None]:
# Let's see the sequences of tokens. These represent the sentences as tokens

# Encode every sentence as a list of integer tokens.
sequences = tokenizer.texts_to_sequences(sentences)

# Show the vocabulary first, then the encoded sentences, one per print call.
for shown in (word_index, sequences):
    print(shown)

In [None]:
# Now let us test what happens if we use our tokenizer on new sentences.
# One of them contains a word that was not seen before... What will happen?

# Sentences the tokenizer was not fitted on.
test_sentences = [
    "I like my girlfriend.",
    "I love her",  # "her" does not occur in the fitted sentences
]

# Encode them with the already-fitted tokenizer and inspect the result.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
print(test_sequences)

In [None]:
# Oh no! A whole word was lost... We will never even know it was there!
# However... Here's a trick! oov_token! Let's redefine the tokenizer

# Rebuild the tokenizer, this time reserving "<NOPE>" for unseen words,
# and fit it on the same sentences as before.
tokenizer = Tokenizer(num_words=100, oov_token="<NOPE>")
tokenizer.fit_on_texts(sentences)

# Refresh the vocabulary mapping so it reflects the new tokenizer.
word_index = tokenizer.word_index
print(word_index)

# Unknown words (such as "her" in the second test sentence) are now encoded
# as token 1 instead of being silently dropped.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
print(test_sequences)

In [None]:
# Now we'd want to pad our sequences.
# Why?
# In short, to make them look like matrices and be of the same shape and size.
# Zeros (0) will represent the padding.
# padding post/pre means to add zeros after or before the tokens
# truncating will remove tokens from the selected side if there are too many
# maxlen is the length every sequence is padded or truncated to

from tensorflow.keras.preprocessing.sequence import pad_sequences

# `sequences` was encoded by the first tokenizer. Since then `tokenizer` and
# `word_index` were rebuilt with an OOV token that takes index 1, so the old
# sequences no longer line up with the vocabulary printed below. Re-encode
# the sentences with the current tokenizer before padding.
sequences = tokenizer.texts_to_sequences(sentences)

# Pad with zeros at the end (and cut from the end if needed) so that every
# row has exactly 6 entries.
padded_sequences = pad_sequences(sequences, padding='post',
                                 truncating='post', maxlen=6)

print(word_index)
print(sequences)
print(padded_sequences)