# Import libraries

In [46]:
import io
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import TextVectorization, Dense, Embedding, GlobalAveragePooling1D



from sklearn.neighbors import NearestNeighbors
import numpy as np
import pickle


# Define helper methods

# Preprocess text

## Load text data

In [47]:
train_data_dir = "./data/train"
test_data_dir = "./data/test"

batch_size = 1024
seed = 123

''' 
Create batched datasets of the text and their labels. 
We'll use the commands validation_split and subset to split the training text into train and validation.
'''
train_ds = tf.keras.utils.text_dataset_from_directory(train_data_dir, batch_size=batch_size, validation_split=0.2, subset='training', seed=seed)
val_ds = tf.keras.utils.text_dataset_from_directory(train_data_dir, batch_size=batch_size, validation_split=0.2, subset='validation', seed=seed)
test_ds = tf.keras.utils.text_dataset_from_directory(test_data_dir, batch_size=batch_size)

Found 10 files belonging to 5 classes.
Using 8 files for training.
Found 10 files belonging to 5 classes.
Using 2 files for validation.
Found 5 files belonging to 5 classes.


## Vectorize Text

In [48]:
max_length = 600
max_tokens = 20000

text_vectorization = tf.keras.layers.TextVectorization(max_tokens=max_tokens, output_mode="int", output_sequence_length=max_length,)

# Prepare a dataset that only yeilds raw text, no labels.
text_only_train_ds = train_ds.map(lambda x, y: x)

# Use that dataset to index the dataset vocabulary via the adapt() method.
text_vectorization.adapt(text_only_train_ds)

# Prepare processed versions of our training, validation, & test dataset. Use num_parallel_calls to leverage multiple CPU cores.
int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

2022-12-09 03:32:47.735641: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


# Using Pre-trained Word Embeddings

We can use one-hot vectors to represent text for machine learning purposes, which are sparse vectors that are mostly full of zeros and have a dimension equal to the number of words in a dataset or vocabulary. But, this approach isn't ideal for representing words because it requires a large amount of storage space. An alternative method is using word embeddings, which are dense vectors with fewer dimensions that still contain sufficient information for the machine learning process. This approach is more efficient and can be customized to a specific problem. Since we don't have enough data to build our own embeddings from scratch, we'll use embeddings from a model that has already been trained on a large dataset.

## Create the model

In [49]:
inputs = tf.keras.Input(shape=(None,), dtype="int64")
embedded = tf.keras.layers.Embedding(input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs) 
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(embedded)
x = tf.keras.layers.Dropout(0.5)(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_10 (Embedding)    (None, None, 256)         5120000   
                                                                 
 bidirectional_10 (Bidirecti  (None, 64)               73984     
 onal)                                                           
                                                                 
 dropout_10 (Dropout)        (None, 64)                0         
                                                                 
 dense_10 (Dense)            (None, 1)                 65        
                                                                 
Total params: 5,194,049
Trainable params: 5,194,049
Non-trainable params: 0
________________________________________________

## Train the model

In [50]:
output_weights_filename = "embeddings_bidir_gru_with_masking.keras"

# Create callbacks to save our best weights during training and log information we can use to visualize the training process in Tensorboard.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(output_weights_filename, save_best_only=True),
    tf.keras.callbacks.TensorBoard(log_dir="logs")
]

# Train the model on the processesed training and validation datasets for 10 epochs.
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

Epoch 1/10


2022-12-09 03:32:50.991961: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-09 03:32:51.567320: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-09 03:32:51.740111: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-09 03:32:54.288788: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-09 03:32:54.446921: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-12-09 03:33:01.321502: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-09 03:33:01.538297: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-09 03:33:01.605120: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x17f7a46a0>

## Test the model

In [51]:
# Load embeddings we saved earlier and test it with our processed test data.
model = tf.keras.models.load_model(output_weights_filename) 
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

2022-12-09 03:33:11.731173: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-09 03:33:11.971125: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-09 03:33:12.044660: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Test acc: 0.200


## Visualize performance

Since we added a callback to Tensorboard to our training process, we can use the visualization capabilities of this tool to observe the results of our training. 

In [52]:
#docs_infra: no_execute
%load_ext tensorboard
%tensorboard --logdir logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 48205), started 0:12:36 ago. (Use '!kill 48205' to kill it.)

## Save the embedding

In [None]:
weights = model.get_layer('embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

# Sources

- Chapter 7: Deep Learning for Text, Deep Learning in Python by François Chollet
- Word embeddings, Accessed at: https://www.tensorflow.org/text/guide/word_embeddings
- Word2Vec, Accessed at: https://www.tensorflow.org/tutorials/text/word2vec