**Preparing the data**

In [2]:
import random
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.utils import text_dataset_from_directory
from tensorflow.keras.layers import TextVectorization, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

In [3]:
batch_size = 8

In [9]:
train = text_dataset_from_directory("aclImdb/train", batch_size=batch_size)
validation = text_dataset_from_directory("aclImdb/val", batch_size=batch_size)
test = text_dataset_from_directory("aclImdb/test", batch_size=batch_size)
text_only_train = train.map(lambda x, y: x)

Found 70000 files belonging to 3 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


**Preparing integer sequence datasets**

In [10]:
max_length = 600
max_tokens = 20000

text_vectorization = TextVectorization(max_tokens=max_tokens, output_mode="int", output_sequence_length=max_length)
text_vectorization.adapt(text_only_train)

In [19]:
int_train_ds = train.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_val_ds = validation.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_test_ds = test.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

**A sequence model built on one-hot encoded vector sequences**

In [14]:
inputs = Input(shape=(None,), dtype="int64")
one_hot_embedded = tf.one_hot(inputs, depth=max_tokens)
x = Bidirectional(LSTM(32))(one_hot_embedded)
x = Dropout(0.5)(x)
outputs = Dense(1, activation="sigmoid")(x)
model = Model(inputs, outputs)

In [15]:
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 tf.one_hot_3 (TFOpLambda)   (None, None, 20000)       0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 64)               5128448   
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5,128,513
Trainable params: 5,128,513
Non-trainable params: 0
_________________________________________________

**Training a first basic sequence model**

In [7]:
callbacks = [ModelCheckpoint("one_hot_bidir_lstm.keras", save_best_only=True)]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25ef5e93fd0>

In [8]:
model = load_model("one_hot_bidir_lstm.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Test acc: 0.500
