In [521]:
import os
from glob import iglob
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

In [522]:
# Display the installed TensorFlow version so the run environment is recorded with the notebook.
tf.__version__

'2.3.1'

In [523]:
# Ask TensorFlow to execute tf.function code eagerly instead of as traced graphs,
# which makes stepping through and printing intermediate values easier.
tf.config.run_functions_eagerly(True)

In [524]:
import os
from glob import iglob

# Merge every encoded HTML file with its encoded expected-JSON counterpart into
# one '<html tokens> : <json tokens>' file named '<stem>.combined'.
html_filenames = sorted(iglob('/Volumes/Seagate/generated-data/html/encoded/*.unescaped.encoded'))
json_filenames = sorted(iglob('/Volumes/Seagate/generated-data/expected_json/encoded/*.expected_json.encoded'))
assert len(html_filenames) == len(json_filenames), \
    f'{len(html_filenames)} HTML files but {len(json_filenames)} JSON files'
combined_filenames = zip(html_filenames, json_filenames)

# open(..., 'w') does not create missing directories, so make sure the output
# directory exists before the first write (no-op when it is already there).
combined_dir = '/Volumes/Seagate/generated-data-combined-html-json'
os.makedirs(combined_dir, exist_ok=True)

for html_fn, json_fn in combined_filenames:
    # The two lists are paired purely by sorted position; verify that both
    # files share the same stem so a stray file cannot silently mis-pair
    # every following example.
    html_stem = html_fn.split(os.sep)[-1].split('.')[0]
    json_stem = json_fn.split(os.sep)[-1].split('.')[0]
    if html_stem != json_stem:
        raise ValueError(f'Mismatched input pair: {html_fn} vs {json_fn}')

    with open(html_fn, 'r') as f:
        html_data = f.read()
    with open(json_fn, 'r') as f:
        json_data = f.read()

    with open(os.path.join(combined_dir, html_stem + '.combined'), 'w') as f:
        f.write(html_data + ' : ' + json_data)


def read_file(fn):
    """Return the entire text content of the file at path ``fn``."""
    with open(fn, 'r') as source:
        contents = source.read()
    return contents


def write_file(fn, data):
    """Create (or overwrite) the file at path ``fn`` with the text ``data``."""
    with open(fn, mode='w') as sink:
        sink.write(data)


def copy_file(src, dst):
    """Copy the text content of ``src`` into ``dst`` using read_file and write_file."""
    contents = read_file(src)
    write_file(dst, contents)
    

# Copy the two metadata files (maximum token length and token table) next to
# the combined training files.
for src_path, dst_path in [
    ('/Volumes/Seagate/generated-data/expected_json/encoded/max_encoded_file_token_len',
     '/Volumes/Seagate/generated-data-combined-html-json/max_encoded_file_token_len'),
    ('/Volumes/Seagate/generated-data/tokens',
     '/Volumes/Seagate/generated-data-combined-html-json/tokens'),
]:
    copy_file(src_path, dst_path)

In [525]:
# The metadata file holds a single 'max_encoded_file_token_len=<int>' entry.
with open('/Volumes/Seagate/generated-data-combined-html-json/max_encoded_file_token_len', 'r') as f:
    line = f.read()

# Split into exactly one key and one value, check the key, then parse the number.
key, value = line.split('=')
assert key == 'max_encoded_file_token_len'
max_encoded_file_token_len = int(value)

In [526]:
# Display the parsed maximum token length; the pipeline below pads every sequence to it.
max_encoded_file_token_len

2408

In [527]:
# Shuffle the data:
#   - During training:
#     - We're planning on using 10K generated files.
#       Average file size around 9K
#       90M X 4 (for uint32 numbers) = 360MB full HTML training data.
#       Also some more memory needed to hold JSON training data.
#       We can decrease from 10K files to 5K generated files, 
#       or increase the memory reserved for this application to
#       hold this entire data in memory.
#     - So we can shuffle this data as a part of the model.
#       It is good to shuffle at least per epoch so the model
#       is not biased.
#     - You can specify:
#       dataset = dataset.shuffle(buffer_size=100,    # elements are sampled from a buffer of this size; use >= dataset size for a full shuffle
#                                 seed=10,            # random seed set to ensure repeatability
#                                 reshuffle_each_iteration=True)  # True by default. Set to False for debugging.
#   - During validation/testing:
#     - No need to hold the entire dataset in memory to do this since
#       we can apply the model for validation testing on each file.

# Examples per batch, and number of batches prefetched ahead of the consumer.
batch_size = 32
num_prefetch = 1
def get_datasets(filepath):
    """Build the batched tf.data pipeline over the '.combined' files.

    Each input line has the form '<html tokens> : <json tokens>', where tokens
    are whitespace-separated integers. Every line becomes one int32 tensor of
    shape (2, max_encoded_file_token_len): row 0 is the HTML sequence and row 1
    the JSON sequence, each padded and then reversed along the token axis.

    Args:
        filepath: file name or glob pattern passed to
            tf.data.Dataset.list_files.

    Returns:
        A tf.data.Dataset of batches of up to `batch_size` examples, with
        `num_prefetch` batches prefetched. Reads the module-level
        `max_encoded_file_token_len`, `batch_size` and `num_prefetch`.
    """
    def get_text_line_dataset(filepath):
       """Open one file as a dataset of its text lines."""
       return tf.data.TextLineDataset(filepath)

    def get_combined(line):
        """Split a combined line on ':' into its HTML and JSON parts."""
        # print(type(line))
        return tf.strings.split(line, ':')
    
    def unicode_to_ascii(unicode):
        """Parse string tokens into int32 numbers.

        Despite the name, no unicode/ASCII conversion happens here.
        """
        return tf.strings.to_number(unicode, out_type=tf.int32)

    def pad(ints):
        """Convert to a dense tensor of shape (2, max_encoded_file_token_len).

        Presumably `ints` is a RaggedTensor produced by the string splits
        above (it must provide `to_tensor`) -- TODO confirm.
        """
        # print(type(ints))
        t = ints.to_tensor(shape=(2, max_encoded_file_token_len))
        return t

    def reverse(padded):
        """Reverse along axis 1 (the token axis), which moves the padding to the front."""
        # print(type(padded))
        return tf.reverse(padded, axis=[1])
    
    # Number of files read concurrently by interleave.
    n_readers = 5
    # Stages: list files (seed fixed at 10) -> read lines -> split on ':' ->
    # split each part on whitespace -> parse to int32 -> int -> pad -> reverse
    # -> batch -> prefetch.
    # NOTE(review): the values are already int32 after unicode_to_ascii, so the
    # `.map(int)` stage looks redundant -- confirm before removing it.
    # NOTE(review): the JSON (decoder) row is reversed together with the HTML
    # (encoder) row -- confirm that this is intended.
    dataset = tf.data.Dataset.list_files(filepath, seed=10) \
                             .interleave(get_text_line_dataset, cycle_length=n_readers) \
                             .map(get_combined) \
                             .map(tf.strings.split) \
                             .map(unicode_to_ascii) \
                             .map(int) \
                             .map(pad) \
                             .map(reverse) \
                             .batch(batch_size) \
                             .prefetch(num_prefetch)

    #for x in dataset:
    #    print(x)
    #    break
        
    return dataset

In [528]:
# Build the dataset over every '.combined' file written earlier in the notebook.
combined_ds = get_datasets('/Volumes/Seagate/generated-data-combined-html-json/*.combined')

In [529]:
def dataset_len(ds):
    """Count the elements of ``ds`` by iterating over it once.

    The static cardinality is checked first: an infinite dataset is reported
    and not iterated; a negative (e.g. unknown) cardinality is reported and the
    dataset is then counted by iteration.

    Args:
        ds: the tf.data.Dataset to measure. For a batched dataset the elements
            counted are batches, not individual examples.

    Returns:
        The number of elements as an int, or None when the dataset is infinite.
    """
    cardinality = tf.data.experimental.cardinality(ds)
    if cardinality == tf.data.experimental.INFINITE_CARDINALITY:
        print('INFINITE_CARDINALITY')
        return None
    elif cardinality < 0:
        print(f'Negative cardinality: {cardinality}')

    count = 0
    # Iterate the dataset that was passed in (the previous version walked the
    # global `combined_ds` and therefore ignored its argument).
    for _ in ds:
        count += 1
    print(f'Counted dataset length: {count}')
    return count

In [530]:
def dataset_print(ds):
    """Report the length of ``ds`` (via dataset_len) and print its first element."""
    head_count = 1  # how many leading elements to show
    dataset_len(ds)
    print('Dataset first element: \n')
    for element in ds.take(head_count):
        print(element)

In [531]:
# Materialise every batch of the dataset into one tensor; the leading axis
# indexes batches.
t = tf.convert_to_tensor(list(combined_ds.as_numpy_iterator()))

# NOTE(review): only batch 0 is used below, so any further batches are ignored.
first_batch = t[0]

# Row 0 of each example is the HTML (encoder) sequence and row 1 the JSON
# (decoder) sequence; a trailing axis of size 1 is appended to each.
encoder_values = tf.cast(first_batch[:, 0, :][:, :, np.newaxis], dtype=tf.int32)
decoder_values = first_batch[:, 1, :][:, :, np.newaxis]

t, encoder_values, decoder_values

(<tf.Tensor: shape=(1, 10, 2, 2408), dtype=int32, numpy=
 array([[[[  0,   0,   0, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         [[  0,   0,   0, ..., 237, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         [[  0,   0,   0, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         ...,
 
         [[  0,   0,   0, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         [[  0,   0,   0, ..., 237, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         [[  0,   0,   0, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]]]], dtype=int32)>,
 <tf.Tensor: shape=(10, 2408, 1), dtype=int32, numpy=
 array([[[  0],
         [  0],
         [  0],
         ...,
         [217],
         [293],
         [263]],
 
        [[  0],
         [  0],
         [  0],
         ...,
         [237],
         [293],
         [263]],
 
        [[  0],
         [  0],
         [  0],
  

In [532]:
def get_sequence_lengths(embeddings):
    """Return the unpadded length of every front-padded sequence.

    A sequence's length is the distance from its first non-zero token to the
    end of the row, so zeros are assumed to be leading padding.

    Args:
        embeddings: array-like of shape (batch, max_len, 1) -- one value per
            token -- or (batch, max_len).

    Returns:
        A float64 NumPy array of shape (batch,). Rows that contain only zeros
        get length 0.

    Raises:
        ValueError: if ``embeddings`` has any other shape.
    """
    tokens = np.asarray(embeddings)
    if tokens.ndim == 3 and tokens.shape[2] == 1:
        # Drop only the trailing embedding axis. A bare np.squeeze would also
        # collapse a batch (or sequence) axis of size 1.
        tokens = tokens[:, :, 0]
    elif tokens.ndim != 2:
        raise ValueError(
            f'expected shape (batch, max_len, 1) or (batch, max_len), got {tokens.shape}')

    batch, max_len = tokens.shape
    sequence_lengths = np.zeros(batch)
    if max_len == 0:
        return sequence_lengths

    is_token = tokens != 0
    has_token = is_token.any(axis=1)
    # argmax over booleans gives the position of the first True in each row.
    first_token = is_token.argmax(axis=1)
    # Write each length at its own row index; all-zero rows keep length 0
    # instead of shifting the lengths of the rows that follow.
    sequence_lengths[has_token] = max_len - first_token[has_token]
    return sequence_lengths

In [533]:
def check_enc_dec(file_pattern, enc, dec):
    """Print, per file, the first tokens and lengths of the encoder/decoder data.

    Files are listed with the same fixed seed (10) that the input pipeline
    uses, and row i of ``enc``/``dec`` is reported against the i-th listed file.
    Rows are flipped before printing, which undoes the reversal applied by the
    pipeline so the tokens appear in file order.
    """
    def token_strings(embedding):
        # Squeeze size-1 axes, flip every row, and stringify the tokens.
        return [list(np.flip(row).astype(str)) for row in np.squeeze(embedding)]

    listed = list(tf.data.Dataset.list_files(file_pattern, seed=10).as_numpy_iterator())
    filenames = [raw.decode('utf-8') for raw in listed]

    enc_values = token_strings(enc)
    dec_values = token_strings(dec)

    enc_lengths = get_sequence_lengths(enc)
    dec_lengths = get_sequence_lengths(dec)

    print('Values:')
    print('Filename                                      First few bytes                                  lengths')
    for row, path in enumerate(filenames):
        base = path.split(os.sep)[-1]
        enc_head = ' '.join(enc_values[row][:10])
        dec_head = ' '.join(dec_values[row][:10])
        print('{}: {}:{}    {}:{}'.format(base, enc_head, dec_head,
                                          int(enc_lengths[row]), int(dec_lengths[row])))

# Cross-check the tensors extracted from the dataset against the file listing.
check_enc_dec('/Volumes/Seagate/generated-data-combined-html-json/*.combined',
              encoder_values, decoder_values)

Values:
Filename                                      First few bytes                                  lengths
7.combined: 263 293 217 253 569 570 366 156 521 298:306 152 368 402 120 298 509 120 230 120    938:1244
0.combined: 263 293 237 217 237 492 308 492 308 492:306 152 368 402 120 420 120 230 120 528    1322:805
4.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 513 120 230 120 191    1456:990
1.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 356 120 230 120 264    1467:1001
9.combined: 263 293 217 492 237 369 470 308 492 237:306 152 368 402 120 369 470 120 230 120    2408:1451
2.combined: 263 293 237 217 237 492 308 492 308 492:306 152 368 402 120 180 120 230 120 322    1321:804
3.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 417 473 120 230 120    1460:994
5.combined: 263 293 217 253 569 570 366 156 521 313:306 152 368 402 120 313 120 230 120 433    938:1244
8.combined: 263 293 237 217 237 492 308 492 308 492:306

In [534]:
def check_data_files(file_pattern):
    """Print the first tokens and token counts of every combined file on disk.

    Each matching file is expected to contain '<encoder tokens>:<decoder tokens>'
    with whitespace-separated tokens. Files are reported in sorted path order so
    the output is deterministic.

    Args:
        file_pattern: glob pattern selecting the files to inspect.

    Raises:
        ValueError: if a file does not contain exactly one ':' separator.
    """
    print('Filename                                      First few bytes                                  lengths')
    for fn in sorted(iglob(file_pattern)):
        with open(fn, 'r') as f:
            line = f.read()
        parts = line.split(':')
        if len(parts) != 2:
            raise ValueError(
                f'{fn}: expected exactly one ":" separator, found {len(parts) - 1}')
        # str.split() already yields strings, so no further conversion is needed.
        enc_tokens, dec_tokens = (part.split() for part in parts)
        filename = fn.split(os.sep)[-1]

        print(f'{filename}: {" ".join(enc_tokens[:10])}:{" ".join(dec_tokens[:10])}    '
              f'{len(enc_tokens)}:{len(dec_tokens)}')
        
# Inspect the combined files directly on disk, for comparison with check_enc_dec's output.
check_data_files('/Volumes/Seagate/generated-data-combined-html-json/*.combined')

Filename                                      First few bytes                                  lengths
0.combined: 263 293 237 217 237 492 308 492 308 492:306 152 368 402 120 420 120 230 120 528    1322:805
1.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 356 120 230 120 264    1467:1001
2.combined: 263 293 237 217 237 492 308 492 308 492:306 152 368 402 120 180 120 230 120 322    1321:804
3.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 417 473 120 230 120    1460:994
4.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 513 120 230 120 191    1456:990
5.combined: 263 293 217 253 569 570 366 156 521 313:306 152 368 402 120 313 120 230 120 433    938:1244
6.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 383 120 230 120 186    1460:994
7.combined: 263 293 217 253 569 570 366 156 521 298:306 152 368 402 120 298 509 120 230 120    938:1244
8.combined: 263 293 237 217 237 492 308 492 308 492:306 152 368 

In [535]:
# Display the computed sequence lengths for the encoder and decoder tensors.
get_sequence_lengths(encoder_values), get_sequence_lengths(decoder_values)

(array([ 938., 1322., 1456., 1467., 2408., 1321., 1460.,  938., 1324.,
        1460.]),
 array([1244.,  805.,  990., 1001., 1451.,  804.,  994., 1244.,  807.,
         994.]))

In [536]:
# Display the encoder and decoder tensors.
encoder_values, decoder_values

(<tf.Tensor: shape=(10, 2408, 1), dtype=int32, numpy=
 array([[[  0],
         [  0],
         [  0],
         ...,
         [217],
         [293],
         [263]],
 
        [[  0],
         [  0],
         [  0],
         ...,
         [237],
         [293],
         [263]],
 
        [[  0],
         [  0],
         [  0],
         ...,
         [217],
         [293],
         [263]],
 
        ...,
 
        [[  0],
         [  0],
         [  0],
         ...,
         [217],
         [293],
         [263]],
 
        [[  0],
         [  0],
         [  0],
         ...,
         [237],
         [293],
         [263]],
 
        [[  0],
         [  0],
         [  0],
         ...,
         [217],
         [293],
         [263]]], dtype=int32)>,
 <tf.Tensor: shape=(10, 2408, 1), dtype=int32, numpy=
 array([[[  0],
         [  0],
         [  0],
         ...,
         [368],
         [152],
         [306]],
 
        [[  0],
         [  0],
         [  0],
         ...,
         [

In [537]:
def build_vocabulary(enc, dec):
    """Return the sorted list of distinct token values found in ``enc`` and ``dec``.

    Both arguments are iterated row by row; each row is squeezed before its
    tokens are collected.
    """
    def distinct_tokens(rows):
        # Gather every token of every (squeezed) row into one set.
        return {token for row in rows for token in np.squeeze(row)}

    return sorted(distinct_tokens(enc) | distinct_tokens(dec))

# Drop the size-1 axes so both arrays are NumPy arrays of shape (examples, tokens).
encoder_values = np.squeeze(encoder_values)
decoder_values = np.squeeze(decoder_values)
# A single vocabulary is built over the encoder and decoder tokens together.
vocab = build_vocabulary(encoder_values, decoder_values)
vocab_size = len(vocab)
embed_size = 4  # 4 float32 values for each token of input
vocab_size, vocab

(882,
 [0,
  1,
  2,
  3,
  4,
  5,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  17,
  18,
  19,
  20,
  21,
  23,
  24,
  25,
  26,
  27,
  28,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  53,
  54,
  55,
  56,
  57,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  97,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,
  158,
  159,
  160,
  161,
  162,
  163,
  164,
  

In [538]:
# Confirm the container type, shape and dtype of the squeezed encoder values.
type(encoder_values), encoder_values.shape, encoder_values.dtype

(numpy.ndarray, (10, 2408), dtype('int32'))

In [539]:
# NumPy copy of the vocabulary list.
vocab_array = np.array(vocab)

def build_indices(values, vocabulary=None):
    """Map every token in ``values`` to its position in the vocabulary.

    The lookup is done for all tokens at once with a sorted search, and it
    targets ``vocab_array`` by default. The previous version compared each
    token against the Python list ``vocab``, which only worked through NumPy
    scalar broadcasting and scanned the whole vocabulary once per token.

    Args:
        values: array-like of token values, any shape.
        vocabulary: optional 1-D array-like of distinct token values; defaults
            to the module-level ``vocab_array``. It does not need to be sorted.

    Returns:
        An integer NumPy array with the same shape as ``values``, holding the
        vocabulary position of each token.

    Raises:
        ValueError: if any token in ``values`` is missing from the vocabulary.
    """
    if vocabulary is None:
        vocabulary = vocab_array
    vocabulary = np.asarray(vocabulary)
    values = np.asarray(values)

    if vocabulary.size == 0:
        if values.size:
            raise ValueError('cannot index tokens against an empty vocabulary')
        return np.zeros(values.shape, dtype=np.intp)

    # Search in sorted order and translate back, so an unsorted vocabulary
    # still yields positions in its original order.
    order = np.argsort(vocabulary, kind='stable')
    positions = np.searchsorted(vocabulary, values, sorter=order)
    # searchsorted returns len(vocabulary) for values above the maximum; clip
    # so the membership check below can index safely.
    positions = np.minimum(positions, vocabulary.size - 1)
    indices = order[positions]

    if not np.array_equal(vocabulary[indices], values):
        missing = np.setdiff1d(values, vocabulary)
        raise ValueError(f'tokens missing from vocabulary: {missing[:10].tolist()}')
    return indices

# Replace raw token values by their vocabulary positions, then report the
# type and shape of each result.
encoder_indices = build_indices(encoder_values)
decoder_indices = build_indices(decoder_values)
print(type(encoder_indices), encoder_indices.shape)
print(type(decoder_indices), decoder_indices.shape)

<class 'numpy.ndarray'> (10, 2408)
<class 'numpy.ndarray'> (10, 2408)


In [546]:
# All of this code is taken from Aurelien Geron's notebook which accompanies
# the book "Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow".
# You can find it here:
# https://github.com/ageron/handson-ml2/blob/master/16_nlp_with_rnns_and_attention.ipynb
#
# Model inputs: variable-length int32 token-index sequences for the encoder and
# the decoder, plus one scalar length per example.
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

# One Embedding layer is shared by the encoder and the decoder inputs.
# NOTE(review): the layer is created without mask_zero, so the leading zero
# padding is presumably processed like any other token -- confirm intent.
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

# Encoder: a 4-unit LSTM; return_state=True also returns its final hidden and
# cell states, which are used to initialise the decoder.
encoder = keras.layers.LSTM(4, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

sampler = tfa.seq2seq.sampler.TrainingSampler()

# Decoder: a 4-unit LSTM cell driven by the sampler, followed by a Dense layer
# that produces one value per vocabulary entry.
decoder_cell = keras.layers.LSTMCell(4)
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler,
                                                 output_layer=output_layer)
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings, initial_state=encoder_state,
    sequence_length=sequence_lengths)
# Turn the decoder outputs into probabilities over the vocabulary.
Y_proba = tf.nn.softmax(final_outputs.rnn_output)

model = keras.models.Model(
    inputs=[encoder_inputs, decoder_inputs, sequence_lengths],
    outputs=[Y_proba])

In [547]:
# The targets are integer class indices, hence the sparse categorical loss.
# run_eagerly=True makes Keras run the training step eagerly.
model.compile(loss="sparse_categorical_crossentropy", 
              optimizer="adam",
              run_eagerly=True)

In [548]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "functional_60"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_154 (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
input_153 (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_31 (Embedding)        (None, None, 4)      3528        input_153[0][0]                  
                                                                 input_154[0][0]                  
__________________________________________________________________________________________________
lstm_61 (LSTM)                  [(None, 4), (None, 4 144         embedding_31[0][0]   

In [550]:
# Decoder input for teacher forcing: the target sequence shifted right by one
# step, with index 0 prepended as the first input. The zero column uses the
# dtype of decoder_indices so that np.c_ does not promote the integer token
# indices to float64.
decoder_indices_shifted = np.c_[np.zeros((decoder_indices.shape[0], 1),
                                         dtype=decoder_indices.dtype),
                                decoder_indices[:, :-1]]
# NOTE(review): every example is given the full padded length here, so padded
# positions are treated as real steps -- confirm this is intended. This also
# rebinds `sequence_lengths`, which earlier named the model's Input layer.
sequence_lengths = np.full([decoder_indices.shape[0]], decoder_indices.shape[1])
model.fit([encoder_indices, decoder_indices_shifted, sequence_lengths],
          decoder_indices,
          epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fad310eba00>