# Part 1 - Tokenization

In [None]:
# Hello!

# Let's import the indispensable libraries that make all of this work!

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# A small toy corpus; the words of these sentences are what gets tokenized.
# The trailing punctuation is deliberate: '!', '?' and '.' are expected to be
# ignored by the tokenizer rather than become part of a token.

sentences = [
    "I love my mom",
    "I love my girlfriend",
    "I love my girlfriend!",
    "You love my mom?",
    "I like you.",
]

In [None]:
# Create the tokenizer and let it learn the vocabulary of `sentences`.
# Tokenizer() takes several arguments; the ones that matter most here are:
#   num_words - max number of tokens
#   filters   - which characters to ignore (! ? . and so on); the default
#               value already takes care of this
#   oov_token - the string that Out Of Vocabulary words are replaced with

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# word_index is the word -> token lookup built by fit_on_texts
word_index = tokenizer.word_index
print(word_index)

# Part 2 - Sentences to Data

In [None]:
# Encode every sentence as a sequence of tokens, then show the vocabulary
# followed by the encoded sentences so the two can be compared.

sequences = tokenizer.texts_to_sequences(sentences)

print(word_index, sequences, sep="\n")

In [None]:
# Now let us test what happens if we use our tokenizer on new sentences.
# One of them contains a word that was not seen before... What will happen?

test_sentences = ['I like my girlfriend.', 'I love her']

test_sequences = tokenizer.texts_to_sequences(test_sentences)
print(test_sequences)

In [None]:
# Oh no! A whole word was lost... We would never even know it was there!
# Here's the trick: oov_token. Rebuild the tokenizer with a placeholder
# string for unknown words and fit it on the same sentences again.

tokenizer = Tokenizer(num_words=100, oov_token="<NOPE>")
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

# Unknown words are now tokenized as 1, like "her" in the second test sentence

test_sequences = tokenizer.texts_to_sequences(test_sentences)
print(test_sequences)

In [None]:
# Now we want to pad our sequences.
# Why?
# In short, so that they form a matrix: every row has the same length.
# Zeros (0) represent the padding.
#   padding    - 'post' / 'pre': add the zeros after or before the tokens
#   truncating - 'post' / 'pre': which side tokens are removed from when a
#                sequence is too long
#   maxlen     - the length every sequence ends up with

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Re-encode the sentences with the current tokenizer first. `sequences` was
# produced before the tokenizer was rebuilt with an oov_token, so its tokens
# would not line up with the word_index printed below.
sequences = tokenizer.texts_to_sequences(sentences)

padded_sequences = pad_sequences(sequences, maxlen=6,
                                 padding='post', truncating='post')

print(word_index)
print(sequences)
print(padded_sequences)

# Part 3 - Training and Recognition

In [None]:
# For this part I will shamelessly borrow the sample data from TensorFlow's
# video: the sarcasm dataset is downloaded to /tmp/sarcasm.json.
# NOTE(review): --no-check-certificate switches off TLS certificate
# verification for this download - confirm it is really needed.

!wget --no-check-certificate \
    https://storage.googleapis.com/learning-datasets/sarcasm.json \
    -O /tmp/sarcasm.json

In [None]:
import json

# Load the downloaded dataset. The encoding is spelled out because JSON text
# is UTF-8, whereas open() would otherwise use the platform's locale encoding.
with open("/tmp/sarcasm.json", 'r', encoding="utf-8") as f:
    datastore = json.load(f)

# Pull the three fields of every record into parallel lists; all of them
# keep the order of the records in the file.
headlines = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
urls = [item['article_link'] for item in datastore]

In [None]:
# Fit a fresh tokenizer on all headlines; unknown words become "<NOPE>".
tokenizer = Tokenizer(oov_token="<NOPE>")
tokenizer.fit_on_texts(headlines)
word_index = tokenizer.word_index

# Encode the headlines and pad them with zeros after the tokens.
sequences = tokenizer.texts_to_sequences(headlines)
padded = pad_sequences(sequences, padding='post')

# Show the first padded headline and the shape of the whole array.
print(padded[0], padded.shape, sep="\n")

In [None]:
# Split the dataset in a somewhat simpler way: let scikit-learn divide the
# headlines and labels, then tokenize and pad the two parts.

from sklearn.model_selection import train_test_split

# Settings for the tokenizer, the padding and the split
vocab_size = 10000
embedding_dim = 16
max_length = 100
test_fraction = 0.25

(headlines_train, headlines_test,
 labels_train, labels_test) = train_test_split(
    headlines, labels, test_size=test_fraction)

# The vocabulary is learned from the training headlines only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<NOPE>")
tokenizer.fit_on_texts(headlines_train)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(headlines_train)
testing_sequences = tokenizer.texts_to_sequences(headlines_test)

# Both parts are padded / truncated at the end to exactly max_length tokens.
_pad_options = dict(maxlen=max_length, padding='post', truncating='post')
training_padded = pad_sequences(training_sequences, **_pad_options)
testing_padded = pad_sequences(testing_sequences, **_pad_options)

In [None]:
import numpy as np

# Turn features and labels into NumPy arrays before they are passed to
# model.fit().
training_padded, testing_padded = (np.array(training_padded),
                                   np.array(testing_padded))
labels_train, labels_test = np.array(labels_train), np.array(labels_test)

In [None]:
# Build the classifier layer by layer: an embedding over the token ids,
# a pooling layer, two hidden ReLU layers and a single sigmoid output unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                    input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(48, activation='relu'))
model.add(tf.keras.layers.Dense(48, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy goes with the single sigmoid output and the 0/1
# is_sarcastic labels; accuracy is tracked for the plots.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Print an overview of the model that was just defined and compiled.
model.summary()

In [None]:
# Define some useful functions
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output
from matplotlib.ticker import MaxNLocator

class PlotLossAccuracy(keras.callbacks.Callback):
    """Keras callback that redraws loss and accuracy charts after each epoch.

    The history is kept on the instance: ``x`` holds the epoch counter values,
    ``losses`` / ``val_losses`` and ``acc`` / ``val_acc`` hold the metric
    values read from the epoch logs, and ``logs`` keeps the raw log dicts.
    """

    def on_train_begin(self, logs=None):
        """Reset all recorded history at the start of a training run."""
        self.i = 0
        self.x = []
        self.acc = []
        self.losses = []
        self.val_losses = []
        self.val_acc = []
        self.logs = []

    def on_epoch_end(self, epoch, logs=None):
        """Record the metrics of the finished epoch and redraw both charts.

        Args:
            epoch: epoch index passed in by the caller. It is not used; the
                x axis is driven by the callback's own counter ``self.i``.
            logs: dict of metrics for the epoch. The keys 'loss', 'val_loss',
                'accuracy' and 'val_accuracy' are read with ``.get()``, so a
                missing key is recorded as None. ``None`` is treated as an
                empty dict.
        """
        # A fresh dict per call instead of a mutable default argument that
        # would be shared between calls.
        if logs is None:
            logs = {}

        self.logs.append(logs)
        self.x.append(int(self.i))
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        self.acc.append(logs.get('accuracy'))
        self.val_acc.append(logs.get('val_accuracy'))

        self.i += 1

        # Replace the previous output so only the latest charts are shown.
        clear_output(wait=True)
        plt.figure(figsize=(16, 6))
        self._draw_panel(121, self.losses, self.val_losses,
                         "train loss", "validation loss",
                         'loss', 'Model Loss')
        self._draw_panel(122, self.acc, self.val_acc,
                         "training accuracy", "validation accuracy",
                         'accuracy', 'Model Accuracy')
        plt.show()

    def _draw_panel(self, position, train_values, val_values,
                    train_label, val_label, ylabel, title):
        """Draw one training/validation chart in the given subplot position."""
        plt.subplot(position)
        plt.plot(self.x, train_values, label=train_label)
        plt.plot(self.x, val_values, label=val_label)
        # Epochs are whole numbers, so only integer ticks make sense.
        plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
        plt.ylabel(ylabel)
        plt.xlabel('epoch')
        plt.title(title)
        plt.legend()

In [None]:
# Train the model, validating on the test split; the callback redraws the
# loss and accuracy charts at the end of every epoch.
num_epochs = 30
pltCallBack = PlotLossAccuracy()

history = model.fit(
    training_padded,
    labels_train,
    validation_data=(testing_padded, labels_test),
    epochs=num_epochs,
    callbacks=[pltCallBack],
    verbose=2,
)

In [None]:
# Try the trained model on two new headlines. They go through the same
# tokenizer and the same padding settings as the training data.
test_sentence = [
    "granny starting to fear spiders in the garden might be real",
    "game of thrones season finale showing this sunday night",
]

test_sequences = tokenizer.texts_to_sequences(test_sentence)
test_padded = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# One sigmoid output per headline
# (presumably closer to 1 = sarcastic, matching the is_sarcastic labels).
print(model.predict(test_padded))