* What's the topic of this text? (text classification)
* Does this text contain abuse? (moderation)
* Does this text sound positive or negative? (sentiment analysis)
* What should be the next word in this incomplete sentence? (language modelling)
* How would you say this in Dutch? (translation)
* Produce a summary of this article in one paragraph. (summarization)

# What needs to be done to process text for neural networks?
* Standardizing; convert to lower case, remove punctuation
* Split the text into units (tokens), such as characters, words, groups of words, clauses in sentences, etc
* Convert all tokens to a tensor. This means (typically) indexing the tokens.

### Example
The cat sat on the mat.
the cat sat on the mat
["cat", "sat", "on", "mat"]
[2, 34, 53, 8]
(one-hot encoding very common)

é -> e
è -> e

# Three ways of handling tokens
## Word-level tokenization
Tokens are space-separated substrings (or puncuation-separated if appropriate). A variant also splits into subwords, which is especially important for agglutinating and composing lanugages, such as Finnish or Swedish. 
## N-gram tokenization
Tokens are groups of N consecutive words. For example, "the cat", "he was", "over there" -- these are 2-grams or "bigrams".
## Character-level tokenization
Each character is its own token. In practice, useful for languages with rich writing systems or pictographic writing (cyrillic, chinese)

Dataset to use:
https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [4]:
import os, pathlib, shutil, random
base_dir = pathlib.Path("../../Data/aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
    os.makedirs(val_dir / category, exist_ok=True)
    files = os.listdir(train_dir/category)
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname, val_dir/category/fname)

In [5]:
import keras
batch_size = 32

train_ds = keras.utils.text_dataset_from_directory(train_dir, batch_size=batch_size)
val_ds = keras.utils.text_dataset_from_directory(val_dir, batch_size=batch_size)
test_ds = keras.utils.text_dataset_from_directory(base_dir / "test", batch_size=batch_size)

Found 18957 files belonging to 3 classes.


I0000 00:00:1734091121.257079 1049404 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 17693 MB memory:  -> device: 0, name: NVIDIA RTX 4000 Ada Generation, pci bus id: 0000:02:00.0, compute capability: 8.9


Found 4662 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [6]:
for inputs, targets in train_ds:
    print(f"inputs: {inputs.shape}, {inputs.dtype}")
    print(f"targets: {targets.shape}, {targets.dtype}")
    break

inputs: (32,), <dtype: 'string'>
targets: (32,), <dtype: 'int32'>


In [7]:
from keras import layers
text_vectorization = layers.TextVectorization(max_tokens=20000, output_mode="multi_hot")
text_only_train_ds = train_ds.map(lambda x, _: x)
text_vectorization.adapt(text_only_train_ds)


2024-12-13 12:58:43.994604: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [8]:

binary_1gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
binary_1gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
binary_1gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))

In [9]:
def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation="relu")(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
    return model


In [10]:
model = get_model()
model.summary()

In [11]:
callbacks = [ 
    keras.callbacks.ModelCheckpoint("binary_1gram.keras", save_best_only=True) 
]
model.fit(binary_1gram_train_ds.cache(), validation_data=binary_1gram_val_ds.cache(), epochs=10, callbacks=callbacks)

Epoch 1/10


I0000 00:00:1734091124.679903 1051477 service.cc:148] XLA service 0x7887bc0021c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1734091124.679954 1051477 service.cc:156]   StreamExecutor device (0): NVIDIA RTX 4000 Ada Generation, Compute Capability 8.9
2024-12-13 12:58:44.689308: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1734091124.728801 1051477 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 55/593[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 3ms/step - accuracy: 0.5719 - loss: 0.6649

I0000 00:00:1734091124.974871 1051477 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7598 - loss: 0.4935 - val_accuracy: 0.8825 - val_loss: 0.2954
Epoch 2/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8985 - loss: 0.2754 - val_accuracy: 0.8887 - val_loss: 0.2911
Epoch 3/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9198 - loss: 0.2351 - val_accuracy: 0.8930 - val_loss: 0.2983
Epoch 4/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9282 - loss: 0.2154 - val_accuracy: 0.8889 - val_loss: 0.3152
Epoch 5/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9349 - loss: 0.2055 - val_accuracy: 0.8867 - val_loss: 0.3371
Epoch 6/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9369 - loss: 0.2069 - val_accuracy: 0.8878 - val_loss: 0.3403
Epoch 7/10
[1m593/593[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x7889145bb890>

In [12]:
model = keras.models.load_model("binary_1gram.keras")
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8830 - loss: 0.2981
Test acc: 0.884


In [13]:
for inputs, targets in binary_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'int64'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1 1 1 ... 0 0 0], shape=(20000,), dtype=int64)
targets[0]: tf.Tensor(0, shape=(), dtype=int32)


In [14]:
text_vectorization = layers.TextVectorization(ngrams=2, max_tokens=20000, output_mode="tf_idf")

text_vectorization.adapt(text_only_train_ds)

tfidf_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
tfidf_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
tfidf_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))


2024-12-13 12:59:26.798081: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [15]:
model = get_model()
callbacks = [keras.callbacks.ModelCheckpoint("tfidf_2gram.keras", save_best_only=True)]
model.fit(tfidf_2gram_train_ds.cache(), validation_data=tfidf_2gram_val_ds.cache(), epochs=10, callbacks=callbacks)

Epoch 1/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7190 - loss: 0.6353 - val_accuracy: 0.8835 - val_loss: 0.3132
Epoch 2/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8786 - loss: 0.3221 - val_accuracy: 0.8962 - val_loss: 0.2865
Epoch 3/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9005 - loss: 0.2736 - val_accuracy: 0.8904 - val_loss: 0.2925
Epoch 4/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9079 - loss: 0.2551 - val_accuracy: 0.8893 - val_loss: 0.3066
Epoch 5/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9131 - loss: 0.2386 - val_accuracy: 0.8859 - val_loss: 0.3168
Epoch 6/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9189 - loss: 0.2305 - val_accuracy: 0.8835 - val_loss: 0.3432
Epoch 7/10
[1m593/593[0m 

<keras.src.callbacks.history.History at 0x788914413320>

In [16]:
model = keras.models.load_model("tfidf_2gram.keras")
print(f"Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8910 - loss: 0.2952
Test acc: 0.891


In [17]:
max_length = 600
max_tokens = 20000

text_vectorization = layers.TextVectorization(max_tokens = max_tokens, output_mode="int",output_sequence_length=max_length)
text_vectorization.adapt(text_only_train_ds)

int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))


In [18]:
import tensorflow as tf

class MyLayer(keras.Layer):
    def call(self, x):
        return tf.one_hot(x, depth=max_tokens)

inputs = keras.Input(shape=(None,), dtype="int64")
embedded = MyLayer()(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x =  layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])


In [19]:
callbacks = [
    keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras", save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)
model = keras.models.load_model("one_hot_bidir_lstm.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 188ms/step - accuracy: 0.6202 - loss: 0.6359 - val_accuracy: 0.8243 - val_loss: 0.4280
Epoch 2/10
[1m352/593[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m40s[0m 168ms/step - accuracy: 0.8395 - loss: 0.4039

KeyboardInterrupt: 

In [23]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

In [24]:
callbacks = [
    keras.callbacks.ModelCheckpoint("embeddings_bidir_lstm.keras", save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

Epoch 1/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 35ms/step - accuracy: 0.6659 - loss: 0.5769 - val_accuracy: 0.8286 - val_loss: 0.3702
Epoch 2/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 36ms/step - accuracy: 0.8588 - loss: 0.3334 - val_accuracy: 0.8385 - val_loss: 0.3675
Epoch 3/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 35ms/step - accuracy: 0.9018 - loss: 0.2544 - val_accuracy: 0.8897 - val_loss: 0.2931
Epoch 4/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 35ms/step - accuracy: 0.9232 - loss: 0.2033 - val_accuracy: 0.8865 - val_loss: 0.2894
Epoch 5/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 35ms/step - accuracy: 0.9480 - loss: 0.1449 - val_accuracy: 0.8848 - val_loss: 0.2969
Epoch 6/10
[1m593/593[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 35ms/step - accuracy: 0.9560 - loss: 0.1252 - val_accuracy: 0.8887 - val_loss: 0.3393
Epoch 7/10
[1m5

<keras.src.callbacks.history.History at 0x78890c2f82f0>

In [25]:
model = keras.models.load_model("embeddings_bidir_lstm.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.8738 - loss: 0.3204
Test acc: 0.874
