# Text classification with an RNN

Adapted from the TensorFlow tutorial at https://www.tensorflow.org/tutorials/text/text_classification_rnn

This text classification tutorial trains a recurrent neural network on the IMDB large movie review dataset for sentiment analysis.

# Setup

``` bash
# install the tensorflow_datasets package (used below to load the IMDB reviews)
pip3 install -q tensorflow_datasets
```

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

# Disable the tensorflow_datasets progress bar output for the rest of the notebook.
tfds.disable_progress_bar()

# Dataset

In [None]:
# Load the IMDB reviews dataset. with_info=True also returns the dataset
# metadata; as_supervised=True makes each element a (text, label) pair.
dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

# Print the dataset metadata and the element structure of the training split.
print(f"Description:\n\n{info.description}\n")
print(f"Features:\n\n{info.features}\n")
print(f"Train Element:\n\n{train_dataset.element_spec}\n")
# Label both counts so the train and test sizes can be told apart in the output.
print(f"{len(train_dataset)} train samples and {len(test_dataset)} test samples\n")

In [None]:
# Peek at one raw (text, label) element of the training split.
for review, sentiment in train_dataset.take(1):
    print('text: ', review.numpy())
    print('label: ', sentiment.numpy())

# Prepare data for training

In [None]:
BUFFER_SIZE = 10000  # size passed to Dataset.shuffle
BATCH_SIZE = 64      # number of elements per batch

# Training pipeline: shuffle, then batch, then prefetch.
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Test pipeline: batch and prefetch only (no shuffling).
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# Take one batch and report how many texts and labels it contains.
for example, label in train_dataset.take(1):
    # Name both counts; the second number previously had no noun attached.
    print(f"{len(example.numpy())} text and {len(label.numpy())} labels in a batch")

In [None]:
VOCAB_SIZE = 1000  # passed as max_tokens to the vectorization layer

# Create the text vectorization layer and fit its vocabulary on the training
# text only (the label of each (text, label) pair is dropped by the map).
# NOTE(review): newer TensorFlow releases expose this layer as
# tf.keras.layers.TextVectorization - confirm against the installed version.
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE
)
encoder.adapt(train_dataset.map(lambda review, sentiment: review))

In [None]:
# Pull the vocabulary out of the encoder and keep it as a NumPy array so that
# arrays of token ids can later be used to index into it.
vocab_tokens = encoder.get_vocabulary()
vocab = np.array(vocab_tokens)
print(f"vocabulary: {vocab[:5]}")

In [None]:
# Round-trip the first text of one batch: raw text -> token ids -> vocabulary
# entries looked up from those ids.
for batch_text, batch_labels in train_dataset.take(1):
    raw_review = batch_text.numpy()[0]
    token_ids = encoder(raw_review).numpy()
    decoded = vocab[token_ids]
    print("original\n", raw_review)
    print("\ntokenize\n", token_ids)
    print("\nrecovered\n", decoded)

# Model

In [None]:
# Reuse the `encoder` already adapted on the training text earlier in the
# notebook rather than adapting a second, identical layer: a second adapt()
# costs another full pass over the training set, and it would leave `vocab`
# derived from a different layer instance than the one inside the model.
model = tf.keras.Sequential([
    # Raw strings -> sequences of token ids.
    encoder,
    # Token ids -> 64-dimensional vectors; mask_zero=True enables masking of id 0.
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True,
    ),
    # The LSTM is wrapped in Bidirectional.
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64)
    ),
    tf.keras.layers.Dense(64, activation='relu'),
    # Single unit with no activation: the output is treated as a logit
    # (the loss is configured with from_logits=True when compiling).
    tf.keras.layers.Dense(1),
])

In [None]:
# List, in layer order, the supports_masking flag of every layer in the model.
print([lyr.supports_masking for lyr in model.layers])

In [None]:
# Predict on the sample text alone (a batch of one, so no padding is involved).
sample_text = (
    'The movie was cool. The animation and the graphics '
    'were out of this world. I would recommend this movie.'
)
single_batch = np.array([sample_text])
predictions = model.predict(single_batch)
print(predictions[0])

# Predict on the same sample text batched with a much longer text, so that the
# sample is processed with padding; only the sample's prediction is printed.
padding = "the " * 2000
padded_batch = np.array([sample_text, padding])
predictions = model.predict(padded_batch)
print(predictions[0])

# Train Model

In [None]:
# Configure training: Adam optimizer (constructed with 1e-4), binary
# cross-entropy computed on logits, and accuracy as the reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs, validating on the test dataset with validation_steps=30;
# the returned history object is kept for plotting.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    validation_steps=30,
    epochs=10,
)

# Evaluate Model

In [None]:
# Evaluate on the test dataset; with the single 'accuracy' metric configured
# at compile time, evaluate() returns the loss followed by the accuracy.
test_loss, test_acc = model.evaluate(test_dataset)

# f-strings for consistency with the rest of the notebook (output unchanged).
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    """Plot a training metric and its validation counterpart on the current axes.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            sequences of values (e.g. the return value of ``model.fit``).
        metric: Name of the metric to plot; the entry ``'val_' + metric``
            is plotted alongside it.
    """
    val_metric = 'val_' + metric
    curves = history.history

    plt.plot(curves[metric])
    plt.plot(curves[val_metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
In [None]:
# Draw accuracy (left) and loss (right) side by side in one 16x6 figure.
plt.figure(figsize=(16, 6))
for position, metric_name in enumerate(('accuracy', 'loss'), start=1):
    plt.subplot(1, 2, position)
    plot_graphs(history, metric_name)