This text classification tutorial trains a recurrent neural network on the IMDB large movie review dataset for sentiment analysis.

Source: https://www.tensorflow.org/tutorials/text/text_classification_rnn



In [20]:
!pip3 install -q tensorflow_datasets
!pip3 install mlflow

Collecting mlflow
[?25l  Downloading https://files.pythonhosted.org/packages/aa/2d/7fa1f6e310ded489d943ea20cd7977a9867cb8d81b526d9c9460ce4a5b39/mlflow-1.11.0-py3-none-any.whl (13.9MB)
[K     |████████████████████████████████| 13.9MB 303kB/s 
[?25hCollecting databricks-cli>=0.8.7
[?25l  Downloading https://files.pythonhosted.org/packages/20/d9/01d0bda8d764be9e7448a41c34eec002e67e88e0d558b80d0ba922b4f7f0/databricks-cli-0.13.0.tar.gz (54kB)
[K     |████████████████████████████████| 61kB 6.8MB/s 
[?25hCollecting gorilla
  Downloading https://files.pythonhosted.org/packages/e3/56/5a683944cbfc77e429c6f03c636ca50504a785f60ffae91ddd7f5f7bb520/gorilla-0.3.0-py2.py3-none-any.whl
Collecting querystring-parser
  Downloading https://files.pythonhosted.org/packages/88/6b/572b2590fd55114118bf08bde63c0a421dcc82d593700f3e2ad89908a8a9/querystring_parser-1.2.4-py2.py3-none-any.whl
Collecting gitpython>=2.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/24/d1/a7f8fe3df258549b303415157

In [21]:
import numpy as np
import mlflow

import tensorflow_datasets as tfds
import tensorflow as tf

# Turn off the tensorflow_datasets progress bars for the rest of the notebook.
tfds.disable_progress_bar()

In [10]:
#config

NGROK_URL='http://ec2-3-133-154-208.us-east-2.compute.amazonaws.com:5001/'
EXPERIMENT='nlp/rnn/IMDB_movie_sentiment'
EPOCHS=10
VERSION=1
BUFFER_SIZE = 10000
BATCH_SIZE = 64
VOCAB_SIZE=1000

loss=tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer=tf.keras.optimizers.Adam(1e-4)

SAVE_PATH='saved_model'
%mkdir -p saved_model

In [22]:
#Mlflow settings

# Point the MLflow client at the tracking server from the config cell.
mlflow.set_tracking_uri(NGROK_URL)

# Resolve the experiment id explicitly. The return value of
# mlflow.set_experiment() must not be used as an id: depending on the MLflow
# version it is None or an Experiment object, never the id string that
# mlflow.start_run(experiment_id=...) expects.
experiment = mlflow.get_experiment_by_name(EXPERIMENT)
if experiment is not None:
    exp_id = experiment.experiment_id
else:
    # create_experiment returns the id of the newly created experiment.
    exp_id = mlflow.create_experiment(EXPERIMENT)

# Make it the active experiment as well, so calls that do not pass an
# experiment id end up in the same place.
mlflow.set_experiment(EXPERIMENT)

# Close a run left open by a previous (possibly interrupted) execution.
if mlflow.active_run():
    mlflow.end_run()

In [4]:
# Tags attached to the MLflow run via mlflow.set_tags.
tags = {'TYPE': 'NLP'}

In [11]:
#Import matplotlib and create a helper function to plot graphs:

import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  """Draw the training and validation curves of one metric with pyplot.

  Args:
    history: object whose `.history` attribute is a mapping from metric
      name to the sequence of values to plot.
    metric: name of the training metric; the validation curve is looked up
      under 'val_' + metric.
  """
  series = history.history
  plt.plot(series[metric])
  val_metric = 'val_'+metric
  plt.plot(series[val_metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  # Legend entries follow the order in which the two curves were drawn.
  plt.legend([metric, val_metric])


In [6]:
#Download the dataset using TFDS.

# as_supervised=True yields (text, label) tuples; with_info=True also
# returns the dataset metadata object.
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)
train_dataset = dataset['train']
test_dataset = dataset['test']

# Display the structure of one training element as the cell output.
train_dataset.element_spec

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m
Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteAI4361/imdb_reviews-train.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteAI4361/imdb_reviews-test.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteAI4361/imdb_reviews-unsupervised.tfrecord




[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [7]:
#Initially this returns a dataset of (text, label) pairs:

# take(1) limits the iteration to the first element yielded by train_dataset.
for example, label in train_dataset.take(1):
  # .numpy() converts each tensor to a plain value for printing.
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


In [12]:
# Build the batched pipelines from the raw splits in `dataset` rather than
# from train_dataset/test_dataset themselves: re-assigning a name from its own
# previous value would batch already-batched data if this cell is run twice.
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
# The test split is only batched and prefetched, not shuffled.
test_dataset = (
    dataset['test']
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [13]:
# Pull the first element yielded by train_dataset and show its first three
# texts and labels. `example` stays bound after the loop and is reused by the
# encoding cells further down.
for example, label in train_dataset.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

texts:  [b"This inept adaptation of arguably one of Martin Amis's weaker novels fails to even draw comparisons with other druggy oeuvres such as Requiem For A Dream or anything penned by Irvine Walsh as it struggles to decide whether it is a slap-stick cartoon or a hyper-realistic hallucination.<br /><br />Boringly directed by William Marsh in over-saturated hues, a group of public school drop-outs converge in a mansion awaiting the appearance of three American friends for a weekend of decadent drug-taking. And that's it. Except for the ludicrous sub-plot soon-to-be-the-main-plot nonsense about an extremist cult group who express themselves with the violent killings of the world's elite figures, be it political or pampered. Within the first reel you know exactly where this is going.<br /><br />What is a talented actor like Paul Bettany doing in this tiresome, badly written bore? Made prior to his rise to fame and Jennifer Connelly one can be assured that had he been offered this garbag

The raw text loaded by tfds needs to be processed before it can be used in a model. The simplest way to process text for training is using the experimental.preprocessing.TextVectorization layer. 

Create the layer, and pass the dataset's text to the layer's `.adapt` method.

The `.adapt` method sets the layer's vocabulary. Here are the first 20 tokens. After the padding and unknown tokens they're sorted by frequency.

Once the vocabulary is set, the layer can encode text into indices. The tensors of indices are 0-padded to the longest sequence in the batch (unless you set a fixed `output_sequence_length`)



In [14]:
# Text vectorizer limited to VOCAB_SIZE tokens.
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)

# Adapt the vocabulary on the review texts only (labels are dropped).
text_only_dataset = train_dataset.map(lambda text, label: text)
encoder.adapt(text_only_dataset)

# Show the first 20 vocabulary entries as the cell output.
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [15]:
# Encode the texts in `example` to token indices and keep the first three
# rows as a NumPy array.
encoded_batch = encoder(example)
encoded_example = encoded_batch[:3].numpy()
encoded_example

array([[ 11,   1,   1, ...,   0,   0,   0],
       [  1,   1, 233, ...,   0,   0,   0],
       [  2, 422,   1, ...,   0,   0,   0]])

In [16]:
# Compare each of the three original texts with its vocabulary round-trip.
for idx in range(3):
  original_text = example[idx].numpy()
  print("Original: ", original_text)
  # Map the token indices back to vocabulary strings.
  decoded_tokens = vocab[encoded_example[idx]]
  print("Round-trip: ", " ".join(decoded_tokens))
  print()

Original:  b"This inept adaptation of arguably one of Martin Amis's weaker novels fails to even draw comparisons with other druggy oeuvres such as Requiem For A Dream or anything penned by Irvine Walsh as it struggles to decide whether it is a slap-stick cartoon or a hyper-realistic hallucination.<br /><br />Boringly directed by William Marsh in over-saturated hues, a group of public school drop-outs converge in a mansion awaiting the appearance of three American friends for a weekend of decadent drug-taking. And that's it. Except for the ludicrous sub-plot soon-to-be-the-main-plot nonsense about an extremist cult group who express themselves with the violent killings of the world's elite figures, be it political or pampered. Within the first reel you know exactly where this is going.<br /><br />What is a talented actor like Paul Bettany doing in this tiresome, badly written bore? Made prior to his rise to fame and Jennifer Connelly one can be assured that had he been offered this garb

The diagram at the end of this section shows the model.

1. This model can be built as a tf.keras.Sequential.

2. The first layer is the encoder, which converts the text to a sequence of token indices.

3. After the encoder is an embedding layer. An embedding layer stores one vector per word. When called, it converts the sequences of word indices to sequences of vectors. These vectors are trainable. After training (on enough data), words with similar meanings often have similar vectors.

This index-lookup is much more efficient than the equivalent operation of passing a one-hot encoded vector through a tf.keras.layers.Dense layer.

4. A recurrent neural network (RNN) processes sequence input by iterating through the elements. RNNs pass the outputs from one timestep to their input on the next timestep.

The tf.keras.layers.Bidirectional wrapper can also be used with an RNN layer. This propagates the input forward and backwards through the RNN layer and then concatenates the final output.

* The main advantage to a bidirectional RNN is that the signal from the beginning of the input doesn't need to be processed all the way through every timestep to affect the output.

* The main disadvantage of a bidirectional RNN is that you can't efficiently stream predictions as words are being added to the end.

5. After the RNN has converted the sequence to a single vector the two layers.Dense do some final processing, and convert from this vector representation to a single logit as the classification output.

The code to implement this is below.


<center><img src="https://www.tensorflow.org/tutorials/text/images/bidirectional.png"></center>

In [17]:
# Layers are created in the same order in which they are stacked:
# encoder -> embedding -> bidirectional LSTM -> dense -> single-logit output.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True)
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
hidden_layer = tf.keras.layers.Dense(64, activation='relu')
logit_layer = tf.keras.layers.Dense(1)

model = tf.keras.Sequential([
    encoder,
    embedding_layer,
    recurrent_layer,
    hidden_layer,
    logit_layer,
])

In [18]:
# The model emits a raw logit (the loss is built with from_logits=True), so
# the decision boundary is 0.0. The plain 'accuracy' string would threshold
# the logit at 0.5 and misreport accuracy. The metric keeps the name
# 'accuracy' so the history keys ('accuracy', 'val_accuracy') are unchanged.
model.compile(loss=loss,
              optimizer=optimizer,
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0,
                                                       name='accuracy')])

In [None]:
with mlflow.start_run(run_id=None, experiment_id=exp_id, run_name=None, nested=False):
  # Train inside the run so everything logged below is attached to it.
  history = model.fit(train_dataset, epochs=EPOCHS,
                      validation_data=test_dataset,
                      validation_steps=30)
  # history.history maps each metric name to a list with one entry per epoch.
  hist = history.history

  #mlflow autolog (disabled)
  #mlflow.tensorflow.autolog()

  #Set tags
  mlflow.set_tags(tags)

  #mlflow logging

  # log parameters
  mlflow.log_param("epochs", EPOCHS)
  # Log the loss class name; str(loss) would record an object repr that
  # includes a memory address.
  mlflow.log_param("loss_function", type(loss).__name__)

  # log metrics: one point per epoch actually recorded, with the epoch as the
  # step so MLflow stores a curve instead of unordered values.
  for epoch in range(len(hist['loss'])):
    for metric_name in ("accuracy", "loss", "val_accuracy", "val_loss"):
      mlflow.log_metric(metric_name, hist[metric_name][epoch], step=epoch)

  #results=evaluate_model() #TODO
  #mlflow.log_metric("average_loss", results[0])
  #mlflow.log_metric("average_acc", results[1])

  #log model
  #model.save(os.path.join(BASE_DIR, "models", "{}.h5".format(int(t)))) #HDF5 format
  tf.saved_model.save(model, SAVE_PATH) #SavedModel format
  #mlflow.tensorflow.log_model(model, 'model') #TODO fix

  # log artifacts (matplotlib images for loss/accuracy)
  #mlflow.log_artifacts(model_folder)

  # No explicit mlflow.end_run(): leaving the `with` block ends the run.

Epoch 1/10

In [None]:
# model.evaluate returns the loss followed by the compiled metric.
evaluation = model.evaluate(test_dataset)
test_loss, test_acc = evaluation

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))