# Import libraries

In [26]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, TextVectorization

from sklearn.neighbors import NearestNeighbors
import numpy as np
import pickle



# Define helper methods

# Process text

## Load text data

In [None]:
# Directories holding the labelled training and test text files.
train_data_dir = "./data/train"
test_data_dir = "./data/test"

batch_size = 1024
seed = 123

# The training and validation subsets are both carved out of `train_data_dir`,
# so they share the batch size, the 80/20 split fraction, and the seed.
split_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

train_ds = tf.keras.utils.text_dataset_from_directory(
    train_data_dir, subset="training", **split_options
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    train_data_dir, subset="validation", **split_options
)

# The held-out test set lives in its own directory and is loaded whole.
test_ds = tf.keras.utils.text_dataset_from_directory(
    test_data_dir, batch_size=batch_size
)

In [29]:
max_length = 600
max_tokens = 20000

# Turns raw strings into integer token sequences: the vocabulary is capped at
# `max_tokens` entries and each output sequence has `max_length` positions.
text_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

# Prepare a dataset that only yields raw text, no labels.
text_only_train_ds = train_ds.map(lambda x, y: x)

# Use that dataset to index the dataset vocabulary via the adapt() method.
# Only the training split is passed in, so the vocabulary is built from training text alone.
text_vectorization.adapt(text_only_train_ds)


def vectorize_text(text, label):
    """Return the integer-encoded `text` batch paired with its unchanged `label`."""
    return text_vectorization(text), label


# Prepare processed versions of our training, validation, & test datasets.
# tf.data.AUTOTUNE lets tf.data choose the number of parallel calls at runtime
# instead of hard-coding a core count.
int_train_ds = train_ds.map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)
int_val_ds = val_ds.map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)
int_test_ds = test_ds.map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)

2022-12-08 18:02:21.676807: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


We can represent text for machine learning purposes using one-hot vectors: sparse vectors that are mostly zeros and whose dimension equals the total number of words in the dataset's vocabulary. This approach is not ideal, however. It requires a large amount of storage and processing, and because every one-hot vector is equally distant from every other, it cannot represent the relationships between words. An alternative is word embeddings, which are dense vectors with far fewer dimensions that still carry enough information for the machine learning process. This approach is more efficient and can be customized to a specific problem. Pre-trained embeddings are available from models such as Word2Vec, though creating a custom embedding may yield better results.

In [None]:
# File the best checkpoint is written to and later reloaded from.
# NOTE(review): the name mentions "gru" although the recurrent layer below is an LSTM;
# the name is kept unchanged so existing artifacts still match.
checkpoint_path = "embeddings_bidir_gru.keras"

# Layers of the binary classifier, listed in the order they are applied.
# NOTE(review): the embedding is created without mask_zero, so padded positions are
# presumably fed to the LSTM like any other token — confirm whether that is intended.
embedding_layer = tf.keras.layers.Embedding(input_dim=max_tokens, output_dim=256)
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))
dropout_layer = tf.keras.layers.Dropout(0.5)
output_layer = tf.keras.layers.Dense(1, activation="sigmoid")

# Wire the layers together: variable-length int64 token ids in, one sigmoid unit out.
inputs = tf.keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = dropout_layer(recurrent_layer(embedded))
outputs = output_layer(x)

model = tf.keras.Model(inputs, outputs)
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

# With save_best_only=True the checkpoint file is only overwritten when the
# monitored quantity improves.
best_checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True)
callbacks = [best_checkpoint]

model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=10,
    callbacks=callbacks,
)

# Evaluate the checkpointed model rather than the weights left after the final epoch.
model = tf.keras.models.load_model(checkpoint_path)
test_loss, test_acc = model.evaluate(int_test_ds)
print(f"Test acc: {test_acc:.3f}")

# Sources

- Chapter 7: Deep Learning for Text - Deep Learning with Python by François Chollet
- Word embeddings https://www.tensorflow.org/text/guide/word_embeddings