In [24]:
import random
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.utils import text_dataset_from_directory
from tensorflow.keras.layers import TextVectorization, Bidirectional, LSTM, Dropout, Dense, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.initializers import Constant
import numpy as np

In [25]:
batch_size = 16

In [26]:
train_ds = text_dataset_from_directory("aclImdb/train", batch_size=batch_size)
validation_ds = text_dataset_from_directory("aclImdb/val", batch_size=batch_size)
test_ds = text_dataset_from_directory("aclImdb/test", batch_size=batch_size)

Found 70000 files belonging to 3 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [27]:
def clean_text(text):
    # Convert to lowercase
    text = tf.strings.lower(text)
    
    # Remove non-UTF-8 characters (ignore decoding errors)
    text = tf.strings.regex_replace(text, r'[^\x00-\x7F]+', '')  
    
    # Keep only letters, numbers, spaces, and some punctuation
    text = tf.strings.regex_replace(text, r"[^a-zA-Z0-9\s.,!?']", "")
    
    return text

def preprocess_text(text, label):
    text = clean_text(text)  # Apply cleaning function
    return text, label

In [28]:
train = train_ds.map(preprocess_text)
validation = validation_ds.map(preprocess_text)
test = test_ds.map(preprocess_text)

In [29]:
sequence_length = 600
max_tokens = 20000

vectorizer = TextVectorization(max_tokens=max_tokens, output_mode="int", output_sequence_length=sequence_length)

In [30]:
train_text = train.map(lambda x, y: x)
vectorizer.adapt(train_text)

**Parsing the GloVe word-embeddings file**

In [14]:
path_to_glove_file = "./glove/glove.6B.100d.txt"

embeddings_index = {}

with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


In [15]:
embedding_dim = 100

vocabulary = vectorizer.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

In [16]:
int_train_ds = train.map(lambda x, y: (vectorizer(x), y), num_parallel_calls=4)
int_val_ds = validation.map(lambda x, y: (vectorizer(x), y), num_parallel_calls=4)
int_test_ds = test.map(lambda x, y: (vectorizer(x), y), num_parallel_calls=4)

**Preparing the GloVe word-embeddings matrix**

In [17]:
embedding_matrix = np.zeros((max_tokens, embedding_dim))

for word, i in word_index.items():
    if i < max_tokens:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [18]:
Embedder = Embedding(input_dim=max_tokens, output_dim=embedding_dim, embeddings_initializer=Constant(embedding_matrix), 
                     trainable=False, mask_zero=True
)

**Model that uses a pretrained Embedding layer**

In [19]:
inputs = Input(shape=(None,), dtype="int64")
embedder = Embedder(inputs)
x = Bidirectional(LSTM(32))(embedder)
x = Dropout(0.5)(x)
outputs = Dense(1, activation="sigmoid")(x)
model = Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         2000000   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               34048     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2,034,113
Trainable params: 34,113
Non-trainable params: 2,000,000
______________________________________________

In [20]:
callbacks = [ModelCheckpoint("pretrained_embedding_model.keras", save_best_only=True)]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20e9cce0880>

In [22]:
model = load_model("pretrained_embedding_model.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Test acc: 0.500
