# GRU and LSTM Classification Example

In [5]:
# Install or upgrade TensorFlow. The `%pip` magic (rather than `!pip`)
# installs into the environment of the running kernel. If TensorFlow
# was upgraded, restart the kernel before running the cells below.
%pip install --upgrade tensorflow

Note: you may need to restart the kernel to use updated packages.


In [22]:
# Load the TensorBoard notebook extension. It provides the
# `%tensorboard` magic used further down to start TensorBoard and
# display its interface inside the Jupyter notebook.
%load_ext tensorboard

In [2]:
from datetime import datetime
from os import path # Filepath utilities
import shutil # Recursive directory removal
from urllib import request # HTTP(S) requests
import zlib # gzip (de)compression

import tensorflow as tf

In [9]:
# URL to the gzipped tab-separated values dataset file. It is streamed
# and decompressed on the fly by `ds_lines`.
URL_DS = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_00.tsv.gz'

# Zero-based indices of elements in TSV file above.
# `INDEX_BODY` is the column `ds_tuples` uses as the review text (the
# model input); `INDEX_RATING` is the column it parses as the numeric
# rating (the training target).
INDEX_BODY = 13
INDEX_RATING = 7

# TensorBoard logs parent directory. Every training run writes to its
# own sub-directory of this directory.
LOG_DIR_PARENT = 'logs'

In [10]:
# Delete old TensorBoard logs (if any exist). `ignore_errors=True`
# makes this a silent no-op when the directory does not exist yet
# (e.g. on the very first run), and `shutil.rmtree` behaves the same
# on every platform, whereas the `%rm` alias is only available on
# POSIX systems.
shutil.rmtree(LOG_DIR_PARENT, ignore_errors=True)

In [11]:
def _decompressed_chunks(response, chunk_size):
    """Yield the non-empty decompressed pieces of a gzipped byte stream.

    `response` is any object with a `read(n)` method returning bytes
    (empty bytes at end of stream).
    """
    # `32 + zlib.MAX_WBITS` makes zlib detect the gzip header itself
    decompressor = zlib.decompressobj(32 + zlib.MAX_WBITS)
    # `iter(callable, sentinel)` keeps reading until `read` returns b''
    for chunk in iter(lambda: response.read(chunk_size), b''):
        decompressed = decompressor.decompress(chunk)
        if decompressed: # Only act if new data is available
            yield decompressed
    # Emit whatever the decompressor is still buffering internally
    remainder = decompressor.flush()
    if remainder:
        yield remainder


def ds_lines(chunk_size=64 * 1024):
    """Stream the gzipped dataset at `URL_DS` and yield it line by line.

    The file is downloaded and decompressed incrementally, so it never
    has to fit in memory as a whole.

    Args:
        chunk_size: Number of compressed bytes to read from the
            connection at a time.

    Yields:
        Each line of the decompressed file as `bytes`, without the
        trailing newline. A final line that is not terminated by a
        newline is yielded as well.
    """
    # Pieces of the line currently being assembled. A line may span
    # several decompressed chunks, so the pieces are collected in a
    # list and joined once the terminating newline has been seen.
    parts = []

    # The `with` block closes the connection even when the consumer
    # stops reading early (e.g. `Dataset.take`) and the generator is
    # closed before the download finished.
    with request.urlopen(URL_DS) as response: # Send request
        for decompressed in _decompressed_chunks(response, chunk_size):
            # Everything before the first newline completes the pending
            # line; every later fragment starts a new one.
            first, *rest = decompressed.split(b'\n')
            parts.append(first)
            for fragment in rest:
                yield b''.join(parts) # Return the completed line
                parts = [fragment] # Start assembling the next line

    # Do not lose the last line if the file lacks a trailing newline
    last = b''.join(parts)
    if last:
        yield last


def ds_tuples(line):
    """Convert one line of the TSV dataset into a training example.

    Args:
        line: String tensor holding a single tab-separated record.

    Returns:
        A `(body, rating)` pair: the review text column and the rating
        column parsed as a number and shifted to start at zero.
    """
    columns = tf.strings.split(line, sep='\t') # Split on tabs

    # The ratings in the file are one-based, but
    # `tf.losses.SparseCategoricalCrossentropy` expects zero-based
    # class indices, hence the `- 1`.
    label = tf.strings.to_number(columns[INDEX_RATING]) - 1

    return columns[INDEX_BODY], label

In [21]:
def configure_for_performance(ds, size, is_train=True, batch_size=100):
    """Add caching, shuffling, batching and prefetching to a dataset.

    Args:
        ds: The dataset to configure.
        size: Shuffle buffer size. Only used when `is_train` is true.
        is_train: Whether `ds` is used for training. The validation
            data does not have to be shuffled.
        batch_size: Number of examples per batch.

    Returns:
        The configured dataset.
    """
    # Cache dataset in memory. You can pass a filename to `.cache`
    # if you prefer caching on disk.
    ds = ds.cache()
    if is_train:
        # Reshuffle dataset every epoch
        ds = ds.shuffle(size, reshuffle_each_iteration=True)
    # Split dataset into batches of `batch_size`
    ds = ds.batch(batch_size)
    # Prepare data (pass it through the input pipeline) before it is
    # requested by the training model
    return ds.prefetch(buffer_size=tf.data.AUTOTUNE)


def create_datasets(size, split):
    """Build the training and validation input pipelines.

    Args:
        size: Total number of dataset lines to use (header excluded).
        split: Fraction of `size` that will be used for validation.

    Returns:
        A `(train_ds, val_ds)` pair of configured datasets. The first
        `int(size * split)` lines form the validation set and the
        remaining lines form the training set.
    """
    # Create new input pipeline from source: one scalar string per
    # line, without the TSV headers, limited to `size` lines
    line_spec = tf.TensorSpec((), dtype=tf.string)
    lines = (
        tf.data.Dataset.from_generator(ds_lines, output_signature=line_spec)
        .skip(1)
        .take(size))

    n_val = int(size * split)

    def prepare(subset, is_train):
        # Convert lines of TSV to input and target data.
        # `num_parallel_calls=tf.data.AUTOTUNE` tells TensorFlow that
        # it may call `ds_tuples` more than once at a time, depending
        # on the CPU availability.
        examples = subset.map(ds_tuples, num_parallel_calls=tf.data.AUTOTUNE)
        return configure_for_performance(examples, size, is_train=is_train)

    # Skip `n_val` lines and train on the rest; validate on the first
    # `n_val` lines and ignore the rest
    train_ds = prepare(lines.skip(n_val), True)
    val_ds = prepare(lines.take(n_val), False)

    return train_ds, val_ds


# Create datasets from the first 10000 reviews, holding out 20% of
# them for validation: 0.2 * 10000 = 2000 validation examples and
# (1 - 0.2) * 10000 = 8000 training examples. If you are training your
# model on a low-end computer, you should consider lowering `size` to
# 1000 or 500 so training doesn't take too long.
train_ds, val_ds = create_datasets(size=10000, split=0.2)

In [22]:
# Build the vocabulary of the text vectoriser from the training set.
# `adapt` only needs the review texts, so the ratings are dropped
# from every `(text, rating)` example first.
encoder = tf.keras.layers.TextVectorization()
encoder.adapt(
    train_ds.map(
        lambda body, _rating: body,
        num_parallel_calls=tf.data.AUTOTUNE))

In [23]:
model = tf.keras.Sequential((
    encoder,
    # The `mask_zero` parameter below tells the embedder to interpret
    # zeroes in the output of the `TextVectorization` layer as
    # padding. In Keras, padding is used to indicate that certain
    # timesteps are not applicable. This model will be trained with
    # batches of varying data to descend the gradient of the loss
    # function in a controlled manner. However, this demands that each
    # text vector in the batch fed to the embedder has the same
    # length. To do this, the `TextVectorization` layer adds zeroes to
    # the end of each of the vectorised strings in the batch and we
    # notify the embedder of this by passing `mask_zero=True` to it.
    # The embedder then 'masks' these timesteps, as in, it ignores
    # them and tells all following layers to do the same.
    tf.keras.layers.Embedding(
        input_dim=encoder.vocabulary_size(),
        output_dim=64,
        mask_zero=True),
    # We are using a GRU because they are computationally more
    # efficient and train better on smaller datasets. Feel free to
    # replace the next code line with:

    # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),

    # if you would rather use a generic LSTM network.

    # Note that the GRU below is created with Keras' default options,
    # so it is NOT identical to the one we looked over in the post
    # (passing `reset_after=False` to the `GRU` constructor would
    # create that one). By default, Keras uses a variation of the GRU
    # where, in the candidate gate, the element-wise
    # product of the recurrent weights and the former cell state is
    # multiplied by the reset gate activation vector instead of the
    # former cell state being multiplied by the reset gate activation
    # vector and then the recurrent weights. When the `use_bias`
    # parameter is also true (which is the default), Keras also adds
    # a bias to the product of the recurrent weights and the old cell
    # state before multiplying it element-wise with the reset gate
    # activation vector. These default options are two of the current
    # requirements to use the cuDNN (optimised Nvidia GPU-accelerated
    # backend) implementation of the GRU, so we will leave them be.

    # We also pass the GRU layer through the Keras bidirectional layer
    # to duplicate it and use both copies to construct a bidirectional
    # (GRU) RNN.

    # Also, the first parameter to the `GRU` class indicates the
    # number of elements in the cell state vector and thus in the
    # output of the GRU layer. Since we are using a bidirectional RNN,
    # the number of output elements doubles, leading to 2 * 64 = 128
    # outputs to the following dense layer.
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
    # The `Dense` layer class is a generic layer found in feedforward
    # neural networks. The first parameter indicates the number of
    # neurons that should exist in the layer.
    tf.keras.layers.Dense(64, activation=tf.nn.relu),
    # Treating Alice's problem as a classification problem instead of
    # a regression problem (returning several values describing
    # different qualities of the input) worked better, so we are using
    # another dense layer with five outputs, one score per class (the
    # index of the highest score indicates the predicted class). The
    # layer has no activation, so the scores are unnormalised logits;
    # a softmax has to be applied to turn them into probabilities.
    tf.keras.layers.Dense(5),
    ))

In [24]:
# Configure training. The targets are integer class indices (the
# zero-based ratings produced by `ds_tuples`), hence the 'sparse' loss
# and metric. `from_logits=True` is required because the last `Dense`
# layer of the model has no softmax activation, so the loss has to
# normalise the outputs itself.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=(tf.keras.metrics.SparseCategoricalAccuracy(),))

In [31]:
# Give every training run its own timestamped log sub-directory so
# that TensorBoard can tell the runs apart. The timestamp is formatted
# explicitly because `str(datetime.now())` contains spaces and colons,
# and colons are not allowed in Windows paths.
log_dir = path.join(LOG_DIR_PARENT, datetime.now().strftime('%Y%m%d-%H%M%S'))
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

In [32]:
# Start TensorBoard and display interface inline. It is pointed at the
# parent log directory so that every run logged underneath it shows up.
# Requires the `tensorboard` extension to be loaded (`%load_ext`).
%tensorboard --logdir "$LOG_DIR_PARENT"

In [25]:
# Train the model for 20 passes over the training set, evaluating on
# the validation set after every epoch. The TensorBoard callback
# writes the training logs to `log_dir`.
model.fit(
    train_ds, validation_data=val_ds, epochs=20, callbacks=(tensorboard_cb,))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f0db1e625e0>

In [26]:
def predict_rating(text):
    """Print the rating the trained model predicts for a review.

    Args:
        text: The review text, as a single string.

    Prints the review followed by the predicted rating (as a row of
    asterisks) and how certain the model is of it. Returns nothing.
    """
    # `model.predict` works on batches and returns the predictions in
    # the order the inputs were passed in. We wrap the single review in
    # a one-element batch and only care about the first prediction, so
    # we index the first element with `[0]`.
    logits = model.predict(tf.constant([text]))[0]
    # Normalise `logits` such that sum of elements is 1
    probabilities = tf.nn.softmax(logits)
    # Convert the results to plain Python numbers so that formatting
    # and string repetition below do not rely on implicit tensor
    # coercion.
    certainty = float(tf.reduce_max(probabilities))
    percentage = certainty * 100
    index = int(tf.argmax(logits)) # Index of largest element
    rating = index + 1 # Switch back to one-based indexing
    # Use repeated asterisks for a neat representation of the rating
    stars = rating * '*'
    print(
        f'Review:     {text}\n'
        f'Prediction: {percentage:.2f}% certain product was rated {stars}.'
        )


# Try the trained model on a few hand-written reviews
for review in (
        'I enjoyed reading this book because the characters were very unique. At one point or the other, I did find that the author went a little off-track, but it was an entertaining read for the most part.',
        "This book definitely doesn't live up to the standards of the rest of the series. Not enough details and a very confusing storyline spoilt the whole thing.",
        'This explains how to create your own plant garden so well. The author was really thoughtful to include a couple seeds too! I gave it as a gift and the recipient started her own garden in less than a day.',
        ):
    predict_rating(review)

Review:     I enjoyed reading this book because the characters were very unique. At one point or the other, I did find that the author went a little off-track, but it was an entertaining read for the most part.
Prediction: 94.59% certain product was rated ****.
Review:     This book definitely doesn't live up to the standards of the rest of the series. Not enough details and a very confusing storyline spoilt the whole thing.
Prediction: 78.58% certain product was rated **.
Review:     This explains how to create your own plant garden so well. The author was really thoughtful to include a couple seeds too! I gave it as a gift and the recipient started her own garden in less than a day.
Prediction: 99.49% certain product was rated *****.