In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

In [2]:
tf.__version__

'2.3.1'

In [77]:
import os
from glob import iglob

html_filenames = sorted(list(iglob('/Volumes/Seagate/generated-data/html/encoded/*.unescaped.encoded')))
json_filenames = sorted(list(iglob('/Volumes/Seagate/generated-data/expected_json/encoded/*.expected_json.encoded')))
assert(len(html_filenames) == len(json_filenames))
combined_filenames = zip(html_filenames, json_filenames)

for html_fn, json_fn in combined_filenames:
    # print(html_fn, '\n\t', json_fn)
    with open(html_fn, 'r') as f:
        html_data = f.read()
    with open(json_fn, 'r') as f:
        json_data = f.read()

    with open(os.path.join('/Volumes/Seagate/generated-data-combined-html-json',
                           html_fn.split(os.sep)[-1].split('.')[0] + '.combined'), 'w') as f:
        f.write(html_data + ' : ' + json_data)


def read_file(fn):
    with open(fn, 'r') as f:
        return f.read()


def write_file(fn, data):
    with open(fn, 'w') as f:
        f.write(data)


def copy_file(src, dst):
    write_file(dst, read_file(src))
    

copy_file('/Volumes/Seagate/generated-data/expected_json/encoded/max_encoded_file_token_len',
          '/Volumes/Seagate/generated-data-combined-html-json/max_encoded_file_token_len')
copy_file('/Volumes/Seagate/generated-data/tokens',
          '/Volumes/Seagate/generated-data-combined-html-json/tokens')

In [4]:
with open('/Volumes/Seagate/generated-data-combined-html-json/max_encoded_file_token_len', 'r') as f:
    line = f.read()
    key, value = line.split('=')
    assert(key == 'max_encoded_file_token_len')
    max_encoded_file_token_len = int(value)

In [54]:
# Shuffle the data:
#   - During training:
#     - We're planning on using 10K generated files.
#       Average file size around 9K
#       90M X 4 (for uint32 numbers) = 360MB full HTML training data.
#       Also some more memory needed to hold JSON training data.
#       We can decrease from 10K files to 5K generated files, 
#       or increase the memory reserved for this application to
#       hold this entire data in memory.
#     - So we can shuffle this data as a part of the model.
#       It is good to shuffle at least per epoch so the model
#       is not biased.
#     - You can specify:
#       dataset = dataset.shuffle(buffer_size=100,    # prefilled buffer to speed up shuffling
#                                 random_seed = 10,   # random seed set to ensure repeatability
#                                 reshuffle_each_iteration=True)  # True by default. Set to False for debugging.
#   - During validation/testing:
#     - No need to hold the entire dataset in memory to do this since
#       we can apply the model for validation testing on each file.

batch_size = 32
num_prefetch = 1
def get_datasets(filepath):
    def get_text_line_dataset(filepath):
       return tf.data.TextLineDataset(filepath)

    def get_combined(line):
        # print(type(line))
        return tf.strings.split(line, ':')
    
    def unicode_to_ascii(unicode):
        return tf.strings.to_number(unicode, out_type=tf.int32)

    def pad(ints):
        # print(type(ints))
        t = ints.to_tensor(shape=(2, max_encoded_file_token_len))
        return t

    def reverse(padded):
        # print(type(padded))
        return tf.reverse(padded, axis=[1])
    
    n_readers = 5
    dataset = tf.data.Dataset.list_files(filepath, seed=10) \
                             .interleave(get_text_line_dataset, cycle_length=n_readers) \
                             .map(get_combined) \
                             .map(tf.strings.split) \
                             .map(unicode_to_ascii) \
                             .map(int) \
                             .map(pad) \
                             .map(reverse) \
                             .batch(batch_size) \
                             .prefetch(num_prefetch)

    #for x in dataset:
    #    print(x)
    #    break
        
    return dataset

In [55]:
# At the end, we will batch and prefetch
# return dataset.batch(batch_size).prefetch(10)

In [87]:
def get_vocab_size(filename):
    # TODO:
    # Let's fix this for now. Later we should write this information
    # to a file and read it from there.
    # The vocab size is the number of unique tokens (so token size)
    return 485

combined_ds = get_datasets('/Volumes/Seagate/generated-data-combined-html-json/*.combined')
vocab_size = get_vocab_size('/Volumes/Seagate/generated-data-combined-html-json/tokens')

In [83]:
def dataset_len(ds):
    cardinality = tf.data.experimental.cardinality(ds)
    if cardinality == tf.data.experimental.INFINITE_CARDINALITY:
        print('INFINITE_CARDINALITY')
        return
    elif cardinality < 0:
        print(f'Negative cardinality: {cardinality}')
        
    count = 0
    for x in combined_ds:
        count += 1
    print(f'Counted dataset length: {count}')
    return count

In [84]:
def dataset_print(ds):
    dataset_len(ds)
    print('Dataset first element: \n')
    DS_HEAD_LEN = 1
    for x in ds.take(DS_HEAD_LEN):
        print(x)

In [101]:
t = tf.convert_to_tensor(list(combined_ds.as_numpy_iterator()))
encoder_embeddings = t[0, :, 0, :]
decoder_embeddings = t[0, :, 1, :]
t, encoder_embeddings, decoder_embeddings

(<tf.Tensor: shape=(1, 10, 2, 2408), dtype=int32, numpy=
 array([[[[  0,   0,   0, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         [[  0,   0,   0, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         [[  0,   0,   0, ..., 237, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         ...,
 
         [[431, 516, 345, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         [[  0,   0,   0, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         [[  0,   0,   0, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]]]], dtype=int32)>,
 <tf.Tensor: shape=(10, 2408), dtype=int32, numpy=
 array([[  0,   0,   0, ..., 217, 293, 263],
        [  0,   0,   0, ..., 217, 293, 263],
        [  0,   0,   0, ..., 237, 293, 263],
        ...,
        [431, 516, 345, ..., 217, 293, 263],
        [  0,   0,   0, ..., 217, 293, 263],
        [  0,   0,   0, ..., 217, 293, 26

In [102]:
encoder = keras.layers.LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

sampler = tfa.seq2seq.sampler.TrainingSampler()

decoder_cell = keras.layers.LSTMCell(512)
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler, 
                                                 output_layer=output_layer)
final_outputs, final_output_state, final_sequence_lengths = \
    decoder(decoder_embeddings, initial_state=encoder_state,
            sequence_length=sequence_lengths)

Y_proba = tf.nn.softmax(final_outputs.rnn_output)

model = keras.model(inputs=[encoder_inputs, decoder_inputs, sequence_lengths],
                    outputs=[Y_proba])

ValueError: Input 0 of layer lstm_6 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [10, 2408]

In [67]:
text_vectorizer = \
    preprocessing.TextVectorization(max_tokens=None, standardize=None,
                                   split="whitespace", ngrams=None,
                                   output_mode="int", output_sequence_length=max_data_len,
                                   pad_to_max_tokens=True)

In [68]:
adapted_data = text_vectorizer.adapt(data.batch(64))
print('adapted_data:', adapted_data)

AttributeError: 'tensorflow.python.framework.ops.EagerTensor' object has no attribute 'batch'

In [18]:
vocab = text_vectorizer.get_vocabulary()
vocab = sorted(vocab)
print("Vocabulary:", vocab)

inputs = keras.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = layers.Embedding(input_dim=len(vocab), output_dim=64)(x)
outputs = layers.LSTM(1)(x)
model = keras.Model(inputs, outputs)

test_data = tf.constant(["12 5 9 20 52"])
test_output = model(test_data)

print("test_output:", test_output)

Vocabulary: ['', '1', '10', '11', '12,', '13,', '14,', '15,', '16', '17,', '18,', '19,', '2', '20,', '21', '3', '4', '5', '6', '7', '8', '9', '[UNK]']
test_output: tf.Tensor([[0.0213149]], shape=(1, 1), dtype=float32)


In [4]:
# Define some text data to adapt the layer
data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
        "With ease and You beside",
    ]
)
# Instantiate TextVectorization with "int" output_mode
text_vectorizer = preprocessing.TextVectorization(output_mode="int")
# Index the vocabulary via `adapt()`
text_vectorizer.adapt(data)

# You can retrieve the vocabulary we indexed via get_vocabulary()
vocab = text_vectorizer.get_vocabulary()
print("Vocabulary:", vocab)

# Create an Embedding + LSTM model
inputs = keras.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = layers.Embedding(input_dim=len(vocab), output_dim=64)(x)
outputs = layers.LSTM(1)(x)
model = keras.Model(inputs, outputs)

# Call the model on test data (which includes unknown tokens)
test_data = tf.constant(["The Brain is deeper than the sea"])
test_output = model(test_data)


Vocabulary: ['', '[UNK]', 'the', 'side', 'you', 'with', 'will', 'wider', 'them', 'than', 'sky', 'put', 'other', 'one', 'is', 'for', 'ease', 'contain', 'by', 'brain', 'beside', 'and']
