In [29]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import re
import string
import os
import io

In [30]:
@tf.keras.utils.register_keras_serializable()
def _custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,
                                    '[%s]' % re.escape(string.punctuation), '')

ValueError: Custom>_custom_standardization has already been registered to <function _custom_standardization at 0x000001A667F76430>

In [37]:
model = tf.keras.models.load_model("artefacts/model/")

In [38]:
data_dir = "dataset/aclImdb"
batch_size = 1024
seed = 123
embedding_dim = 16
vocab_size = 10000
sequence_length = 100

In [39]:
train_dir = os.path.join(data_dir, 'train')

train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size, validation_split=0.2,
    subset='training', seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [40]:
vectorize_layer = TextVectorization(
    standardize=_custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_ds = train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

In [41]:
weights = model.get_layer('embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [42]:
out_v = io.open('artefacts/vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('artefacts/metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [43]:
len(vocab)

10000