**Preparing the data**

In [10]:
import random
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.utils import text_dataset_from_directory
from tensorflow.keras.layers import TextVectorization, Bidirectional, LSTM, Dropout, Dense, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

In [11]:
batch_size = 32

In [12]:
train = text_dataset_from_directory("aclImdb/train", batch_size=batch_size)
validation = text_dataset_from_directory("aclImdb/val", batch_size=batch_size)
test = text_dataset_from_directory("aclImdb/test", batch_size=batch_size)
text_only_train = train.map(lambda x, y: x)

Found 70000 files belonging to 3 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [13]:
max_length = 600
max_tokens = 20000

text_vectorization = TextVectorization(max_tokens=max_tokens, output_mode="int", output_sequence_length=max_length)
text_vectorization.adapt(text_only_train)

In [14]:
int_train_ds = train.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_val_ds = validation.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_test_ds = test.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

**Using an `Embedding` layer with masking enabled**

In [15]:
inputs = Input(shape=(None,), dtype="int64")
embedded = Embedding(input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)
x = Bidirectional(LSTM(32))(embedded)
x = Dropout(0.5)(x)
outputs = Dense(1, activation="sigmoid")(x)
model = Model(inputs, outputs)

In [16]:
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 256)         5120000   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               73984     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5,194,049
Trainable params: 5,194,049
Non-trainable params: 0
_________________________________________________

In [17]:
callbacks = [ModelCheckpoint("embedding_with_masking.keras", save_best_only=True)]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c5b09ef790>

In [18]:
model = load_model("embedding_with_masking.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Test acc: 0.500
