In [160]:
import os
from glob import iglob
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from utils.environ import generated_data_dir
from utils.file import read_file

In [161]:
tf.__version__

'2.3.1'

In [162]:
# This script tokenizes the files in these directories:
#   HTML files: /Volumes/Seagate/generated-data/html/*.unescaped
#   JSON files: /Volumes/Seagate/generated-data/expected_json/*.expected_json
# and saves the tokens in the space-delimited files in the directory:
#   HTML files: /Volumes/Seagate/generated-data/html/tokenized/*.unescaped
#   JSON files: /Volumes/Seagate/generated-data/expected_json/tokenized/*.expected_json
%run preprocessing.py

html_fn: /Volumes/Seagate/generated-data/html/0.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/0.expected_json
html_fn: /Volumes/Seagate/generated-data/html/1.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/1.expected_json
html_fn: /Volumes/Seagate/generated-data/html/2.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/2.expected_json
html_fn: /Volumes/Seagate/generated-data/html/3.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/3.expected_json
html_fn: /Volumes/Seagate/generated-data/html/4.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/4.expected_json
html_fn: /Volumes/Seagate/generated-data/html/5.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/5.expected_json
html_fn: /Volumes/Seagate/generated-data/html/6.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/6.expected_json
html_fn: /Volumes/Seagate/generated-data/html/7.unescaped
json_fn: /Volumes/Seagate/generated-dat

In [163]:
%run train_set_max_token_len.py

def parse_max_token_len(filename):
    max_token_len = read_file(filename).strip().split(':')[1].strip()
    return int(max_token_len)

max_encoded_file_token_len = parse_max_token_len(os.path.join(generated_data_dir(), 'max_token_len'))
max_encoded_file_token_len

Getting filenames ... done


3359

In [164]:
tf.config.run_functions_eagerly(True)

In [165]:
with open('/Volumes/Seagate/generated-data/tokens', 'r') as f:
    vocab = f.read().split('\n')
    vocab = list(map(tf.strings.split, vocab))

vocab

[<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'['], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'"<sos>",'], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'"<pad>",'], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'"<eos>",'], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'"0",'], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'"1",'], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'"2",'], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'"3",'], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'"4",'], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'"5",'], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'"6",'], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'"7",'], dtype=object)>,
 <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'"

In [166]:
# Since we're specifying all the tokens, we don't really want any
# OOV buckets, but StaticVocabularyTable requires num_oov > 0.
# So we set it to 1, although it will never be used.
def create_vocab_table(vocab, num_oov=1):
    vocab_values = tf.range(tf.size(vocab, out_type=tf.int64), dtype=tf.int64)
    vocab_values = tf.reshape(vocab_values, [vocab_values.shape[0], 1])
    vocab = tf.convert_to_tensor(vocab)
    vocab = tf.reshape(vocab, [vocab.shape[0], 1])
    init = tf.lookup.KeyValueTensorInitializer(keys=vocab, values=vocab_values, 
                                               key_dtype=tf.string, value_dtype=tf.int64)
    vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov, lookup_key_dtype=tf.string)
    return vocab_table

vocab_table = create_vocab_table(vocab)
vocab_table

<tensorflow.python.ops.lookup_ops.StaticVocabularyTable at 0x7fb6e6378c40>

In [209]:
from utils.file import read_file

# Shuffle the data:
#   - During training:
#     - We're planning on using 10K generated files.
#       Average file size around 9K
#       90M X 4 (for uint32 numbers) = 360MB full HTML training data.
#       Also some more memory needed to hold JSON training data.
#       We can decrease from 10K files to 5K generated files, 
#       or increase the memory reserved for this application to
#       hold this entire data in memory.
#     - So we can shuffle this data as a part of the model.
#       It is good to shuffle at least per epoch so the model
#       is not biased.
#     - You can specify:
#       dataset = dataset.shuffle(buffer_size=100,    # prefilled buffer to speed up shuffling
#                                 random_seed = 10,   # random seed set to ensure repeatability
#                                 reshuffle_each_iteration=True)  # True by default. Set to False for debugging.
#   - During validation/testing:
#     - No need to hold the entire dataset in memory to do this since
#       we can apply the model for validation testing on each file.

import os
from glob import iglob

html_filenames = list(iglob('/Volumes/Seagate/generated-data/html/tokenized/*.tokenized'))
json_filenames = list(iglob('/Volumes/Seagate/generated-data/expected_json/tokenized/*.tokenized'))

batch_size = 32
num_prefetch = 1
def get_datasets():
    #     base_dir = '/Volumes/Seagate/generated-data/')
    #     relative_html_dir = 'html/tokenized'
    #     relative_json_dir = 'expected_json/tokenized'
    #     def input_filenames():
    #         for html_fn in iglob(os.path.join(base_dir,
    #                                           relative_html_dir,
    #                                           '*.tokenized')):
    #             json_fn = os.path.join(base_dir,
    #                                    relative_json_dir,
    #                                    html_fn.split(os.sep)[-1].split('.')[0])
    #             yield (html_fn, json_fn)
    
    def gen():
        for (html_fn, json_fn) in zip(html_filenames, json_filenames):
            yield (html_fn, json_fn)

    def get_dataset(html_fn, json_fn):
        html_string_tensor = tf.io.read_file(html_fn)
        
        json_string_tensor = tf.io.read_file(json_fn)
        json_string_tensor = tf.strings.format('<sos> {} <eos>', json_string_tensor)
        # When you format a string tensor, the string is shown
        # with quotes around it. Remove those quotes.
        json_string_tensor = tf.strings.regex_replace(json_string_tensor, '\"', '')
        
        html_data = tf.strings.split(html_string_tensor)
        print(f'before expanded dims html_data shape: {html_data.shape}')
        html_data = tf.expand_dims(html_data, axis=0)
        print(f'expanded dims html_data shape: {html_data.shape}')
        print(f'test: {max_encoded_file_token_len}')
        print(f'test2: {tf.shape(html_data)}')
        paddings = tf.constant([[0, 0], [0, max_encoded_file_token_len-len(html_data[0])]])
        html_data = tf.pad(html_data, paddings, 'CONSTANT')
        print(f'after padding html_data shape: {html_data.shape}')
        
        json_data = tf.strings.split(json_string_tensor)
        json_data = tf.expand_dims(json_data, axis=0)

        # Cannot concatenate along rows since the columns are different sizes.
        # combined_data = tf.concat([html_data, json_data], 0)
        # combined_data = tf.data.Dataset.from_tensors(combined_data)
        
        
        # print(f'combined_data: {combined_data}')
        # return combined_data
        return json_data

    def reverse(padded):
        html_padded, json_padded = padded
        return (tf.reverse(html_padded, axis=[1]),
                tf.reverse(json_padded, axis=[1]))
    
    n_readers = 5
    dataset = \
        tf.data.Dataset.from_generator(gen, (tf.string, tf.string)) \
                       .interleave(get_dataset, cycle_length=n_readers)

    #     dataset = \
    #         tf.data.Dataset.interleave(lambda x: tf.data.TextLineDataset(x)
    #                                                .map(get_dataset, num_parallel_calls=1), 
    #                                    cycle_length=n_readers)
    #     dataset = dataset.interleave(filenames
    #                              .map(add_sos_eos_tokens) \
    #                              .map(to_int) \
    #                              .map(pad) \
    #                              .map(reverse) \
    #                              .batch(batch_size) \
    #                              .prefetch(num_prefetch)

    for x in dataset:
       print(x)
       break
        
    return dataset


# ds1 = tf.data.Dataset.from_tensor_slices(list(html_filenames)) \
#         .interleave(test_lambda,
#                    cycle_length=4, block_length=16)

# combined_fns = tf.data.Dataset.from_tensor_slices(list(zip(html_filenames, json_filenames)))
combined_ds = get_datasets()

before expanded dims html_data shape: (None,)
expanded dims html_data shape: (1, None)
test: 3359
test2: Tensor("Shape:0", shape=(2,), dtype=int32)


TypeError: in user code:

    <ipython-input-209-95c50114d8e9>:63 get_dataset  *
        paddings = tf.constant([[0, 0], [0, max_encoded_file_token_len-len(html_data[0])]])
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/framework/constant_op.py:263 constant  **
        return _constant_impl(value, dtype, shape, name, verify_shape=False,
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/framework/constant_op.py:280 _constant_impl
        tensor_util.make_tensor_proto(
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/framework/tensor_util.py:456 make_tensor_proto
        _AssertCompatible(values, dtype)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/framework/tensor_util.py:333 _AssertCompatible
        raise TypeError("Expected any non-tensor type, got a tensor instead.")

    TypeError: Expected any non-tensor type, got a tensor instead.


In [206]:
# t = tf.constant([[1, 2, 3], [4, 5, 6]])
# print(t.shape)
# paddings = tf.constant([[1, 1,], [1, 2]])
# # 'constant_values' is 0.
# # rank of 't' is 2.
# tf.pad(t, paddings, "CONSTANT")

x_html_data = tf.constant(['this', 'is', 'something'])
x_html_data = tf.expand_dims(x_html_data, axis=0)
print(x_html_data.shape, x_html_data.numpy())
print(f'len: {len(x_html_data[0])}')
paddings = tf.constant([[0, 0], [0, 10-len(x_html_data[0])]])
x_html_data = tf.pad(x_html_data, paddings, 'CONSTANT')
x_html_data.shape, x_html_data.numpy(), tf.shape(x_html_data).numpy()[1]

(1, 3) [[b'this' b'is' b'something']]
len: 3


(TensorShape([1, 10]),
 array([[b'this', b'is', b'something', b'', b'', b'', b'', b'', b'', b'']],
       dtype=object),
 10)

In [148]:
html_string = tf.constant('html_string', dtype=tf.string)
json_string = tf.constant('json_string', dtype=tf.string)
j_str = tf.strings.format('<sos> {} <eos>', json_string)

# When you format a string tensor, the string is shown
# with quotes around it. Remove those quotes.
tf.strings.regex_replace(j_str, '\"', '')

<tf.Tensor: shape=(), dtype=string, numpy=b'<sos> json_string <eos>'>

In [30]:
dataset = tf.data.Dataset.from_tensor_slices(html_filenames) # (html_filenames, json_filenames))
list(dataset.as_numpy_iterator())[:5]

[(b'/Volumes/Seagate/generated-data/html/tokenized/0.tokenized',
  b'/Volumes/Seagate/generated-data/expected_json/tokenized/0.tokenized'),
 (b'/Volumes/Seagate/generated-data/html/tokenized/1.tokenized',
  b'/Volumes/Seagate/generated-data/expected_json/tokenized/1.tokenized'),
 (b'/Volumes/Seagate/generated-data/html/tokenized/2.tokenized',
  b'/Volumes/Seagate/generated-data/expected_json/tokenized/2.tokenized'),
 (b'/Volumes/Seagate/generated-data/html/tokenized/3.tokenized',
  b'/Volumes/Seagate/generated-data/expected_json/tokenized/3.tokenized'),
 (b'/Volumes/Seagate/generated-data/html/tokenized/4.tokenized',
  b'/Volumes/Seagate/generated-data/expected_json/tokenized/4.tokenized')]

In [29]:
json_filenames[:10]

['/Volumes/Seagate/generated-data/expected_json/tokenized/0.tokenized',
 '/Volumes/Seagate/generated-data/expected_json/tokenized/1.tokenized',
 '/Volumes/Seagate/generated-data/expected_json/tokenized/2.tokenized',
 '/Volumes/Seagate/generated-data/expected_json/tokenized/3.tokenized',
 '/Volumes/Seagate/generated-data/expected_json/tokenized/4.tokenized',
 '/Volumes/Seagate/generated-data/expected_json/tokenized/5.tokenized',
 '/Volumes/Seagate/generated-data/expected_json/tokenized/6.tokenized',
 '/Volumes/Seagate/generated-data/expected_json/tokenized/7.tokenized',
 '/Volumes/Seagate/generated-data/expected_json/tokenized/8.tokenized',
 '/Volumes/Seagate/generated-data/expected_json/tokenized/9.tokenized']

2408

In [528]:
combined_ds = get_datasets('/Volumes/Seagate/generated-data-combined-html-json/*.combined')

In [529]:
def dataset_len(ds):
    cardinality = tf.data.experimental.cardinality(ds)
    if cardinality == tf.data.experimental.INFINITE_CARDINALITY:
        print('INFINITE_CARDINALITY')
        return
    elif cardinality < 0:
        print(f'Negative cardinality: {cardinality}')
        
    count = 0
    for x in combined_ds:
        count += 1
    print(f'Counted dataset length: {count}')
    return count

In [530]:
def dataset_print(ds):
    dataset_len(ds)
    print('Dataset first element: \n')
    DS_HEAD_LEN = 1
    for x in ds.take(DS_HEAD_LEN):
        print(x)

In [531]:
t = tf.convert_to_tensor(list(combined_ds.as_numpy_iterator()))
encoder_values = t[0, :, 0, :]
encoder_values = encoder_values[:, :, np.newaxis]
encoder_values = tf.cast(encoder_values, dtype=tf.int32)
decoder_values = t[0, :, 1, :]
decoder_values = decoder_values[:, :, np.newaxis]

t, encoder_values, decoder_values

(<tf.Tensor: shape=(1, 10, 2, 2408), dtype=int32, numpy=
 array([[[[  0,   0,   0, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         [[  0,   0,   0, ..., 237, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         [[  0,   0,   0, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         ...,
 
         [[  0,   0,   0, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         [[  0,   0,   0, ..., 237, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]],
 
         [[  0,   0,   0, ..., 217, 293, 263],
          [  0,   0,   0, ..., 368, 152, 306]]]], dtype=int32)>,
 <tf.Tensor: shape=(10, 2408, 1), dtype=int32, numpy=
 array([[[  0],
         [  0],
         [  0],
         ...,
         [217],
         [293],
         [263]],
 
        [[  0],
         [  0],
         [  0],
         ...,
         [237],
         [293],
         [263]],
 
        [[  0],
         [  0],
         [  0],
  

In [532]:
# This function assumes the size of the embeddings is 1 per token
def get_sequence_lengths(embeddings):
    axis_removed_embeddings = np.squeeze(embeddings)
    sequence_lengths = np.zeros(embeddings.shape[0])
    max_len = embeddings.shape[1]
    index = 0
    for xs in axis_removed_embeddings:
        for i, y in enumerate(xs):
            if y != 0:
                sequence_lengths[index] = max_len - i
                index += 1
                break

    return sequence_lengths

In [533]:
def check_enc_dec(file_pattern, enc, dec):
    
    filenames = list(tf.data.Dataset.list_files(file_pattern, seed=10).as_numpy_iterator())
    filenames = [fn.decode('utf-8') for fn in filenames]

    def embedding_values(e):
        return np.squeeze(e)
            
    enc_values = embedding_values(enc)
    enc_values = [np.flip(xs) for xs in enc_values]
    enc_values = [list(xs.astype(str)) for xs in enc_values]
    
    dec_values = embedding_values(dec)
    dec_values = [np.flip(xs) for xs in dec_values]
    dec_values = [list(xs.astype(str)) for xs in dec_values]

    enc_lengths = get_sequence_lengths(enc)
    dec_lengths = get_sequence_lengths(dec)
    
    print('Values:')
    print('Filename                                      First few bytes                                  lengths')
    for i, filename in enumerate(filenames):
        fn = filename.split(os.sep)[-1]
        print('{}: {}:{}    {}:{}'.format(fn, ' '.join(enc_values[i][:10]), ' '.join(dec_values[i][:10]),
                                          int(enc_lengths[i]), int(dec_lengths[i])))

check_enc_dec('/Volumes/Seagate/generated-data-combined-html-json/*.combined',
              encoder_values, decoder_values)

Values:
Filename                                      First few bytes                                  lengths
7.combined: 263 293 217 253 569 570 366 156 521 298:306 152 368 402 120 298 509 120 230 120    938:1244
0.combined: 263 293 237 217 237 492 308 492 308 492:306 152 368 402 120 420 120 230 120 528    1322:805
4.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 513 120 230 120 191    1456:990
1.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 356 120 230 120 264    1467:1001
9.combined: 263 293 217 492 237 369 470 308 492 237:306 152 368 402 120 369 470 120 230 120    2408:1451
2.combined: 263 293 237 217 237 492 308 492 308 492:306 152 368 402 120 180 120 230 120 322    1321:804
3.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 417 473 120 230 120    1460:994
5.combined: 263 293 217 253 569 570 366 156 521 313:306 152 368 402 120 313 120 230 120 433    938:1244
8.combined: 263 293 237 217 237 492 308 492 308 492:306

In [534]:
def check_data_files(file_pattern):
    enc_lengths = dec_lengths = []
    print('Filename                                      First few bytes                                  lengths')
    for i, fn in enumerate(iglob(file_pattern)):
        with open(fn, 'r') as f:
            line = f.read()
        parts = line.split(':')
        values = [xs.split() for xs in parts]
        values = [[str(x) for x in xs] for xs in values]
        enc_len, dec_len = [len(x) for x in values]
        enc_lengths.append(enc_len)
        dec_lengths.append(dec_len)
        filename = fn.split(os.sep)[-1]
        
        print(f'{filename}: {" ".join(values[0][:10])}:{" ".join(values[1][:10])}    {enc_len}:{dec_len}')
        
check_data_files('/Volumes/Seagate/generated-data-combined-html-json/*.combined')

Filename                                      First few bytes                                  lengths
0.combined: 263 293 237 217 237 492 308 492 308 492:306 152 368 402 120 420 120 230 120 528    1322:805
1.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 356 120 230 120 264    1467:1001
2.combined: 263 293 237 217 237 492 308 492 308 492:306 152 368 402 120 180 120 230 120 322    1321:804
3.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 417 473 120 230 120    1460:994
4.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 513 120 230 120 191    1456:990
5.combined: 263 293 217 253 569 570 366 156 521 313:306 152 368 402 120 313 120 230 120 433    938:1244
6.combined: 263 293 217 253 569 378 269 253 366 156:306 152 368 402 120 383 120 230 120 186    1460:994
7.combined: 263 293 217 253 569 570 366 156 521 298:306 152 368 402 120 298 509 120 230 120    938:1244
8.combined: 263 293 237 217 237 492 308 492 308 492:306 152 368 

In [535]:
get_sequence_lengths(encoder_values), get_sequence_lengths(decoder_values)

(array([ 938., 1322., 1456., 1467., 2408., 1321., 1460.,  938., 1324.,
        1460.]),
 array([1244.,  805.,  990., 1001., 1451.,  804.,  994., 1244.,  807.,
         994.]))

In [536]:
encoder_values, decoder_values

(<tf.Tensor: shape=(10, 2408, 1), dtype=int32, numpy=
 array([[[  0],
         [  0],
         [  0],
         ...,
         [217],
         [293],
         [263]],
 
        [[  0],
         [  0],
         [  0],
         ...,
         [237],
         [293],
         [263]],
 
        [[  0],
         [  0],
         [  0],
         ...,
         [217],
         [293],
         [263]],
 
        ...,
 
        [[  0],
         [  0],
         [  0],
         ...,
         [217],
         [293],
         [263]],
 
        [[  0],
         [  0],
         [  0],
         ...,
         [237],
         [293],
         [263]],
 
        [[  0],
         [  0],
         [  0],
         ...,
         [217],
         [293],
         [263]]], dtype=int32)>,
 <tf.Tensor: shape=(10, 2408, 1), dtype=int32, numpy=
 array([[[  0],
         [  0],
         [  0],
         ...,
         [368],
         [152],
         [306]],
 
        [[  0],
         [  0],
         [  0],
         ...,
         [

In [537]:
def build_vocabulary(enc, dec):
    def build_vocab(values):
        values_set = set()        
        for v in values:
            values_set.update(np.squeeze(v))
        return values_set
    
    enc_set = build_vocab(enc)
    dec_set = build_vocab(dec)
    values_set = enc_set | dec_set
    
    return sorted(list(values_set))

encoder_values = np.squeeze(encoder_values)
decoder_values = np.squeeze(decoder_values)
vocab = build_vocabulary(encoder_values, decoder_values)
vocab_size = len(vocab)
embed_size = 4  # 4 float32 values for each token of input
vocab_size, vocab

(882,
 [0,
  1,
  2,
  3,
  4,
  5,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  17,
  18,
  19,
  20,
  21,
  23,
  24,
  25,
  26,
  27,
  28,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  53,
  54,
  55,
  56,
  57,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  97,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,
  158,
  159,
  160,
  161,
  162,
  163,
  164,
  

In [538]:
type(encoder_values), encoder_values.shape, encoder_values.dtype

(numpy.ndarray, (10, 2408), dtype('int32'))

In [539]:
vocab_array = np.array(vocab)

def build_indices(values):
    return np.squeeze(np.array([[np.where(vocab == x) 
                                    for x in value] 
                                for value in values]))

encoder_indices = build_indices(encoder_values)
decoder_indices = build_indices(decoder_values)
print(type(encoder_indices), encoder_indices.shape)
print(type(decoder_indices), decoder_indices.shape)

<class 'numpy.ndarray'> (10, 2408)
<class 'numpy.ndarray'> (10, 2408)


In [546]:
# All of this code is taken from Aurelien Geron's
# notebook which accompanies the book
# Handson Machine Learning with Scikit-Learn and Tensorflow.
# You can find it here:
# https://github.com/ageron/handson-ml2/blob/master/16_nlp_with_rnns_and_attention.ipynb
#
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

encoder = keras.layers.LSTM(4, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

sampler = tfa.seq2seq.sampler.TrainingSampler()

decoder_cell = keras.layers.LSTMCell(4)
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler,
                                                 output_layer=output_layer)
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings, initial_state=encoder_state,
    sequence_length=sequence_lengths)
Y_proba = tf.nn.softmax(final_outputs.rnn_output)

model = keras.models.Model(
    inputs=[encoder_inputs, decoder_inputs, sequence_lengths],
    outputs=[Y_proba])

In [547]:
model.compile(loss="sparse_categorical_crossentropy", 
              optimizer="adam",
              run_eagerly=True)

In [548]:
model.summary()

Model: "functional_60"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_154 (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
input_153 (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_31 (Embedding)        (None, None, 4)      3528        input_153[0][0]                  
                                                                 input_154[0][0]                  
__________________________________________________________________________________________________
lstm_61 (LSTM)                  [(None, 4), (None, 4 144         embedding_31[0][0]   

In [550]:
decoder_indices_shifted = np.c_[np.zeros((decoder_indices.shape[0], 1)),
                                decoder_indices[:, :-1]]
# print(encoder_indices.shape)
# print(decoder_indices.shape)
sequence_lengths = np.full([decoder_indices.shape[0]], decoder_indices.shape[1])
# print(sequence_lengths.shape)
# print(sequence_lengths[:5])
# print(type(sequence_lengths))
model.fit([encoder_indices, decoder_indices_shifted, sequence_lengths], 
          decoder_indices,
          epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fad310eba00>

In [1]:
import torch

In [2]:
x = torch.tensor([[1., -1.], [1., 1.]], requires_grad=True)
x

tensor([[ 1., -1.],
        [ 1.,  1.]], requires_grad=True)

In [3]:
y = x.pow(2).sum()
y

tensor(4., grad_fn=<SumBackward0>)

In [4]:
y.backward()
x.grad

tensor([[ 2., -2.],
        [ 2.,  2.]])

In [5]:
t = torch.rand(4, 4)  # 4x4 random tensor
t.dtype, t

(torch.float32,
 tensor([[0.7328, 0.7336, 0.2755, 0.2053],
         [0.1659, 0.9661, 0.1790, 0.5412],
         [0.4934, 0.8503, 0.3691, 0.7660],
         [0.8465, 0.3042, 0.8909, 0.3216]]))

In [6]:
tv = t.view(2, 8)
tv

tensor([[0.7328, 0.7336, 0.2755, 0.2053, 0.1659, 0.9661, 0.1790, 0.5412],
        [0.4934, 0.8503, 0.3691, 0.7660, 0.8465, 0.3042, 0.8909, 0.3216]])

In [7]:
t.storage().data_ptr() == tv.storage().data_ptr()

True

In [8]:
tv[0][0] = 3.14
t[0][0], tv[0][0]

(tensor(3.1400), tensor(3.1400))

In [9]:
x = np.array([[0, 1], [1, 1], [2, 2]])
x

array([[0, 1],
       [1, 1],
       [2, 2]])

In [10]:
x.sum(0)

array([3, 4])

In [11]:
x = torch.tensor([[1], [3], [5]])
x

tensor([[1],
        [3],
        [5]])

In [12]:
x.expand(3, 7)

tensor([[1, 1, 1, 1, 1, 1, 1],
        [3, 3, 3, 3, 3, 3, 3],
        [5, 5, 5, 5, 5, 5, 5]])

In [13]:
x.expand_as(torch.rand(3, 7))

tensor([[1, 1, 1, 1, 1, 1, 1],
        [3, 3, 3, 3, 3, 3, 3],
        [5, 5, 5, 5, 5, 5, 5]])

In [14]:
x = torch.tensor([[1, 2], [3, 4], [5, 6]])
torch.movedim(x, 1, 0)

tensor([[1, 3, 5],
        [2, 4, 6]])

In [15]:
torch.__version__

'1.7.0'

In [16]:
x.t()

tensor([[1, 3, 5],
        [2, 4, 6]])

In [18]:
(torch.movedim(x, 1, 0) == x.t()).all()

tensor(True)