In [25]:
import os
import pandas as pd
import torch
import torchtext
from glob import iglob
import numpy as np
from functools import reduce
from torch.nn.utils.rnn import pad_sequence
from utils.environ import generated_data_dir, tokenized_dir, tables_extracted_split_tables_dir
from utils.file import read_file, get_json_from_file
from torch.utils.data import DataLoader, Dataset

In [8]:
tokenized_file_list = os.path.join(tokenized_dir(), 'separate_files', 'file_list')
tokens_file = os.path.join(tokenized_dir(), 'tokens')

In [4]:
torch.__version__

'1.7.0'

In [5]:
# This script tokenizes the files in these directories:
#   HTML files: /Volumes/Seagate/generated-data/html/*.unescaped
#   JSON files: /Volumes/Seagate/generated-data/expected_json/*.expected_json
# and saves the tokens in the space-delimited files in the directory:
#   HTML files: /Volumes/Seagate/generated-data/html/tokenized/*.unescaped
#   JSON files: /Volumes/Seagate/generated-data/expected_json/tokenized/*.expected_json
# It also combines the HTML and JSON tokenized files into a single file:
#   /Volumes/Seagate/generated-data/tokenized
#   This file includes the names of each HTML/JSON file and their tokenizations:
#     html_filename_1^html tokenized string^json_filename_1^json tokenized string\n
#     html_filename_2^html tokenized string^json_filename_2^json tokenized string\n
#     ...
%run preprocessing.py

html_fn: /Volumes/Seagate/generated-data/html/0.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/0.expected_json
html_fn: /Volumes/Seagate/generated-data/html/1.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/1.expected_json
html_fn: /Volumes/Seagate/generated-data/html/2.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/2.expected_json
html_fn: /Volumes/Seagate/generated-data/html/3.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/3.expected_json
html_fn: /Volumes/Seagate/generated-data/html/4.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/4.expected_json
html_fn: /Volumes/Seagate/generated-data/html/5.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/5.expected_json
html_fn: /Volumes/Seagate/generated-data/html/6.unescaped
json_fn: /Volumes/Seagate/generated-data/expected_json/6.expected_json
html_fn: /Volumes/Seagate/generated-data/html/7.unescaped
json_fn: /Volumes/Seagate/generated-dat

In [6]:
%run train_set_max_token_len.py

def parse_max_token_len(filename):
    max_token_len = read_file(filename).strip().split(':')[1].strip()
    return int(max_token_len)

max_encoded_file_token_len = parse_max_token_len(os.path.join(generated_data_dir(), 'max_token_len'))
max_encoded_file_token_len  # This is the maximum number of tokens in any html/json file

Getting filenames ... done


3359

In [26]:
# The problems with an Iterable dataset are:
#   - Cannot shuffle the instances. This might cause the network to learn
#     the sequence of instances. For our case, that is not true.
#     Iterable dataset should be used , for example, when you have a time series,
#     i.e. only where there is dependence between consecutive instances.
#
# class TransformerDataset(torch.utils.data.IterableDataset):
#     def __init__(self, type_='train'):
#         super(TransformerDataset).__init__()
#         if type_ == 'train':  # training data
#             tokenized_fn = os.path.join(generated_data_dir(), 'tokenized')
#         else:                 # testing data
#             tokenized_fn = os.path.join(tables_extracted_split_tables_dir(), 'tokenized')
            
#         self.file_handle = open(tokenized_fn, 'r')

#     def __iter__(self):
#         # Return a generator. A generator is also an iterator.
#         def gen():
#             for line in self.file_handle:
#                 parts = line.rstrip('\n').split('^')
#                 t = (parts[1].split(), parts[3].split())
#                 yield t
                    
#         return iter(gen())

# def worker_init_fn(worker_id):
#     worker_info = torch.utils.data.get_worker_info()
#     dataset = worker_info.dataset  # the dataset copy in this worker process
#     overall_start = dataset.start
#     overall_end = dataset.end
#     # configure the dataset to only process the split workload
#     per_worker = int(math.ceil((overall_end - overall_start) / float(worker_info.num_workers)))
#     worker_id = worker_info.id
#     dataset.start = overall_start + worker_id * per_worker
#     dataset.end = min(dataset.start + per_worker, overall_end)

class Vocabulary:
    def __init__(self, tokens_file):
        self.tokens = get_json_from_file(tokens_file)
        self.itos = {index: token for index, token in enumerate(self.tokens)}
        self.stoi = {token: index for index, token in enumerate(self.tokens)}
        #         self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        #         self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        #         self.freq_threshold = freq_threshold

    def __len__(self):
        return len(self.itos)
    
    def tokenizer(self, text):
        return [word.strip() for word in text.split()]

    def numericalize(self, text):
        tokenized_text = self.tokenizer(text)
        return [self.stoi[token] for token in tokenized_text]
    
    
class TransformerDataset(Dataset):
    def __init__(self, file_listings, tokens_file, type_='train'):
        super(TransformerDataset, self).__init__()
        self.type_ = type_
        
        # List of all filenames with tokenized data in them.
        # Storage format: html_filename^json_filename\n
        self.filenames = self.get_filenames(file_listings)
        self.vocab = Vocabulary(tokens_file)

    def get_filenames(self, file_listings):
        with open(file_listings, 'r') as f:
            all_filenames = f.read()
            both_filenames_together = all_filenames.split('\n')
            
            both_filenames_split = []
            for filename in both_filenames_together:
                html_fn, json_fn = filename.split('^')
                both_filenames_split.append((html_fn, json_fn))
        return both_filenames_split

    def __len__(self):
        return len(self.filenames)
    
    def __getitem__(self, index):
        html_fn, json_fn = self.filenames[index]
        html_str = read_file(html_fn)
        json_str = read_file(json_fn)
        
        html_nums = self.vocab.numericalize(html_str)
        json_nums = self.vocab.numericalize(json_str)

        return torch.tensor(html_nums), torch.tensor(json_nums)
    

class Collate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        html = [item[0] for item in batch]
        html = pad_sequence(html, batch_first=False, padding_value=self.pad_idx)
        json = [item[1] for item in batch]
        json = pad_sequence(json, batch_first=False, padding_value=self.pad_idx)

        return html, json

    
def get_loader_dataset(
    batch_size=4,
    num_workers=2 if torch.cuda.is_available() else 0,
    shuffle=True,
    pin_memory=True,
):
    dataset = TransformerDataset(tokenized_file_list, tokens_file, type_='train')

    pad_idx = dataset.vocab.stoi["<pad>"]

    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=Collate(pad_idx=pad_idx),
    )

    return loader, dataset

loader, dataset = get_loader_dataset()
for index, (html, json) in enumerate(loader):
    print(html.shape, json.shape)

torch.Size([3850, 4]) torch.Size([1081, 4])
torch.Size([3850, 4]) torch.Size([1080, 4])
torch.Size([1573, 4]) torch.Size([1030, 4])
torch.Size([1953, 4]) torch.Size([1026, 4])
torch.Size([3850, 4]) torch.Size([1079, 4])


In [7]:
dataset = TransformerDataset(type_='train')

def collate_fn(batch):
    len_all_batch_html = np.array([len(batch[i][0]) for i in range(len(batch))])
    len_all_batch_json = np.array([len(batch[i][1]) for i in range(len(batch))])
    
    max_html_size = len(batch[np.argmax(len_all_batch_html)][0])
    
    # We add 2 here to accomodate <sos> and <eos> which all target sequences need.
    max_json_size = len(batch[np.argmax(len_all_batch_json)][1]) + 2
    
    print(f'max_html_size: {max_html_size}, max_json_size: {max_json_size}')
    
    def split_batch(batch_item):
        for i in range(len(batch_item)):
            yield (batch_item[i][0], batch_item[i][1])
    
    result_batch = []
    for i, (html, json) in enumerate(split_batch(batch)):

        html.extend(['<pad>'] * (max_html_size - len(html)))  # pad html
        html = html[::-1]                      # reverse html

        json.append('<eos>')      # append <eos> to json
        json = json[::-1]
        json.append('<sos>') # prepend <sos> to json
        json = json[::-1]
        json.extend(['<pad>'] * (max_json_size - len(json)))

        # print(f'max_html_size: {max_html_size}\nmax_json_size: {max_json_size}')
        # print(f'len of html: {len(html)}\nlen of json: {len(json)}')
        
        # Convert tokens to numbers
        html = [[vocabulary[token]] for token in html]
        json = [[vocabulary[token]] for token in json]

        print(f'i: {i}\nhtml: {html}\n\njson: {json}\n\n\n')

        result_batch.append((html, json))
        
    return result_batch

# Maybe worker_init_fn should be used only when we have GPU to process
# multiple workers. But, shouldn't this work with the 4-cores on this laptop?
#
# print(list(torch.utils.data.DataLoader(dataset, num_workers=2, worker_init_fn=worker_init_fn)))
dataloader = torch.utils.data.DataLoader(dataset, 
                                         batch_size=4, 
                                         collate_fn=collate_fn,
                                         # shuffle=True,  # Cannot set shuffle=True for Iterable dataset
                                         # num_workers=2,   # (error is cannot pickle 
                                                            #  '_io.TextIOWrapper' object)
                                         worker_init_fn=worker_init_fn,
                                         prefetch_factor=2)

list(dataloader)
pass

# All datasets are subclasses of torchtext.data.Dataset, 
# which inherits from torch.utils.data.Dataset 
# i.e, they have split and iters methods implemented.

max_html_size: 3850, max_json_size: 1081
i: 0
html: [[688], [723], [726], [724], [18], [1028], [964], [850], [596], [979], [724], [1009], [21], [1028], [964], [935], [596], [979], [724], [4], [18], [1028], [964], [850], [596], [979], [724], [42], [1028], [964], [850], [596], [979], [724], [366], [21], [1028], [964], [935], [596], [979], [724], [4], [18], [1028], [964], [850], [596], [979], [724], [42], [1028], [964], [850], [596], [979], [724], [719], [867], [963], [21], [1028], [964], [935], [596], [979], [724], [719], [4], [963], [18], [1028], [964], [850], [596], [979], [724], [42], [1028], [964], [979], [724], [705], [128], [1028], [964], [850], [596], [979], [987], [726], [724], [18], [1028], [964], [850], [596], [979], [724], [759], [21], [1028], [964], [935], [596], [979], [724], [4], [18], [1028], [964], [850], [596], [979], [724], [42], [1028], [964], [850], [596], [979], [724], [1049], [21], [1028], [964], [935], [596], [979], [724], [4], [18], [1028], [964], [850], [596], [9

In [8]:
# This paragraph makes each of our runs deterministic.
seed = 0
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [9]:
train_dataset, validation_dataset, test_dataset = dataset.split(split_ratio=[0.5, 0.25, 0.25])

AttributeError: 'TransformerDataset' object has no attribute 'split'

In [15]:
with open('/Volumes/Seagate/generated-data/tokens', 'r') as f:
    vocab = f.read().split('\n')
    vocab = list(map(tf.strings.split, vocab))

vocab

NameError: name 'tf' is not defined

In [None]:
# Since we're specifying all the tokens, we don't really want any
# OOV buckets, but StaticVocabularyTable requires num_oov > 0.
# So we set it to 1, although it will never be used.
def create_vocab_table(vocab, num_oov=1):
    vocab_values = tf.range(tf.size(vocab, out_type=tf.int64), dtype=tf.int64)
    vocab_values = tf.reshape(vocab_values, [vocab_values.shape[0], 1])
    vocab = tf.convert_to_tensor(vocab)
    vocab = tf.reshape(vocab, [vocab.shape[0], 1])
    init = tf.lookup.KeyValueTensorInitializer(keys=vocab, values=vocab_values, 
                                               key_dtype=tf.string, value_dtype=tf.int64)
    vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov, lookup_key_dtype=tf.string)
    return vocab_table

vocab_table = create_vocab_table(vocab)
vocab_table

In [None]:
# Shuffle the data:
#   - During training:
#     - We're planning on using 10K generated files.
#       Average file size around 9K
#       90M X 4 (for uint32 numbers) = 360MB full HTML training data.
#       Also some more memory needed to hold JSON training data.
#       We can decrease from 10K files to 5K generated files, 
#       or increase the memory reserved for this application to
#       hold this entire data in memory.
#     - So we can shuffle this data as a part of the model.
#       It is good to shuffle at least per epoch so the model
#       is not biased.
#     - You can specify:
#       dataset = dataset.shuffle(buffer_size=100,    # prefilled buffer to speed up shuffling
#                                 random_seed = 10,   # random seed set to ensure repeatability
#                                 reshuffle_each_iteration=True)  # True by default. Set to False for debugging.
#   - During validation/testing:
#     - No need to hold the entire dataset in memory to do this since
#       we can apply the model for validation testing on each file.

import os
from glob import iglob
from utils.file import read_file

html_filenames = list(iglob('/Volumes/Seagate/generated-data/html/tokenized/*.tokenized'))
json_filenames = list(iglob('/Volumes/Seagate/generated-data/expected_json/tokenized/*.tokenized'))

batch_size = 32
num_prefetch = 1
def get_datasets():
    #     base_dir = '/Volumes/Seagate/generated-data/')
    #     relative_html_dir = 'html/tokenized'
    #     relative_json_dir = 'expected_json/tokenized'
    #     def input_filenames():
    #         for html_fn in iglob(os.path.join(base_dir,
    #                                           relative_html_dir,
    #                                           '*.tokenized')):
    #             json_fn = os.path.join(base_dir,
    #                                    relative_json_dir,
    #                                    html_fn.split(os.sep)[-1].split('.')[0])
    #             yield (html_fn, json_fn)
    
    def gen():
        for (html_fn, json_fn) in zip(html_filenames, json_filenames):
            yield (html_fn, json_fn)

    def get_dataset(html_fn, json_fn):
        html_string_tensor = tf.io.read_file(html_fn)
        
        json_string_tensor = tf.io.read_file(json_fn)
        json_string_tensor = tf.strings.format('<sos> {} <eos>', json_string_tensor)
        # When you format a string tensor, the string is shown
        # with quotes around it. Remove those quotes.
        json_string_tensor = tf.strings.regex_replace(json_string_tensor, '\"', '')
        
        html_data = tf.strings.split(html_string_tensor)
        print(f'before expanded dims html_data shape: {html_data.shape}')
        html_data = tf.expand_dims(html_data, axis=0)
        print(f'expanded dims html_data shape: {html_data.shape}')
        print(f'test: {max_encoded_file_token_len}')
        print(f'test2: {tf.shape(html_data)}')
        paddings = tf.constant([[0, 0], [0, max_encoded_file_token_len-len(html_data[0])]])
        html_data = tf.pad(html_data, paddings, 'CONSTANT')
        print(f'after padding html_data shape: {html_data.shape}')
        
        json_data = tf.strings.split(json_string_tensor)
        json_data = tf.expand_dims(json_data, axis=0)

        # Cannot concatenate along rows since the columns are different sizes.
        # combined_data = tf.concat([html_data, json_data], 0)
        # combined_data = tf.data.Dataset.from_tensors(combined_data)
        
        
        # print(f'combined_data: {combined_data}')
        # return combined_data
        return json_data

    def reverse(padded):
        html_padded, json_padded = padded
        return (tf.reverse(html_padded, axis=[1]),
                tf.reverse(json_padded, axis=[1]))
    
    n_readers = 5
    dataset = \
        tf.data.Dataset.from_generator(gen, (tf.string, tf.string)) \
                       .interleave(get_dataset, cycle_length=n_readers)

    #     dataset = \
    #         tf.data.Dataset.interleave(lambda x: tf.data.TextLineDataset(x)
    #                                                .map(get_dataset, num_parallel_calls=1), 
    #                                    cycle_length=n_readers)
    #     dataset = dataset.interleave(filenames
    #                              .map(add_sos_eos_tokens) \
    #                              .map(to_int) \
    #                              .map(pad) \
    #                              .map(reverse) \
    #                              .batch(batch_size) \
    #                              .prefetch(num_prefetch)

    for x in dataset:
       print(x)
       break
        
    return dataset


# ds1 = tf.data.Dataset.from_tensor_slices(list(html_filenames)) \
#         .interleave(test_lambda,
#                    cycle_length=4, block_length=16)

# combined_fns = tf.data.Dataset.from_tensor_slices(list(zip(html_filenames, json_filenames)))
combined_ds = get_datasets()

In [None]:
# t = tf.constant([[1, 2, 3], [4, 5, 6]])
# print(t.shape)
# paddings = tf.constant([[1, 1,], [1, 2]])
# # 'constant_values' is 0.
# # rank of 't' is 2.
# tf.pad(t, paddings, "CONSTANT")

x_html_data = tf.constant(['this', 'is', 'something'])
x_html_data = tf.expand_dims(x_html_data, axis=0)
print(x_html_data.shape, x_html_data.numpy())
print(f'len: {len(x_html_data[0])}')
paddings = tf.constant([[0, 0], [0, 10-len(x_html_data[0])]])
x_html_data = tf.pad(x_html_data, paddings, 'CONSTANT')
x_html_data.shape, x_html_data.numpy(), tf.shape(x_html_data).numpy()[1]

In [None]:
html_string = tf.constant('html_string', dtype=tf.string)
json_string = tf.constant('json_string', dtype=tf.string)
j_str = tf.strings.format('<sos> {} <eos>', json_string)

# When you format a string tensor, the string is shown
# with quotes around it. Remove those quotes.
tf.strings.regex_replace(j_str, '\"', '')

In [None]:
dataset = tf.data.Dataset.from_tensor_slices(html_filenames) # (html_filenames, json_filenames))
list(dataset.as_numpy_iterator())[:5]

In [None]:
json_filenames[:10]

In [None]:
combined_ds = get_datasets('/Volumes/Seagate/generated-data-combined-html-json/*.combined')

In [None]:
def dataset_len(ds):
    cardinality = tf.data.experimental.cardinality(ds)
    if cardinality == tf.data.experimental.INFINITE_CARDINALITY:
        print('INFINITE_CARDINALITY')
        return
    elif cardinality < 0:
        print(f'Negative cardinality: {cardinality}')
        
    count = 0
    for x in combined_ds:
        count += 1
    print(f'Counted dataset length: {count}')
    return count

In [None]:
def dataset_print(ds):
    dataset_len(ds)
    print('Dataset first element: \n')
    DS_HEAD_LEN = 1
    for x in ds.take(DS_HEAD_LEN):
        print(x)

In [None]:
t = tf.convert_to_tensor(list(combined_ds.as_numpy_iterator()))
encoder_values = t[0, :, 0, :]
encoder_values = encoder_values[:, :, np.newaxis]
encoder_values = tf.cast(encoder_values, dtype=tf.int32)
decoder_values = t[0, :, 1, :]
decoder_values = decoder_values[:, :, np.newaxis]

t, encoder_values, decoder_values

In [None]:
# This function assumes the size of the embeddings is 1 per token
def get_sequence_lengths(embeddings):
    axis_removed_embeddings = np.squeeze(embeddings)
    sequence_lengths = np.zeros(embeddings.shape[0])
    max_len = embeddings.shape[1]
    index = 0
    for xs in axis_removed_embeddings:
        for i, y in enumerate(xs):
            if y != 0:
                sequence_lengths[index] = max_len - i
                index += 1
                break

    return sequence_lengths

In [None]:
def check_enc_dec(file_pattern, enc, dec):
    
    filenames = list(tf.data.Dataset.list_files(file_pattern, seed=10).as_numpy_iterator())
    filenames = [fn.decode('utf-8') for fn in filenames]

    def embedding_values(e):
        return np.squeeze(e)
            
    enc_values = embedding_values(enc)
    enc_values = [np.flip(xs) for xs in enc_values]
    enc_values = [list(xs.astype(str)) for xs in enc_values]
    
    dec_values = embedding_values(dec)
    dec_values = [np.flip(xs) for xs in dec_values]
    dec_values = [list(xs.astype(str)) for xs in dec_values]

    enc_lengths = get_sequence_lengths(enc)
    dec_lengths = get_sequence_lengths(dec)
    
    print('Values:')
    print('Filename                                      First few bytes                                  lengths')
    for i, filename in enumerate(filenames):
        fn = filename.split(os.sep)[-1]
        print('{}: {}:{}    {}:{}'.format(fn, ' '.join(enc_values[i][:10]), ' '.join(dec_values[i][:10]),
                                          int(enc_lengths[i]), int(dec_lengths[i])))

check_enc_dec('/Volumes/Seagate/generated-data-combined-html-json/*.combined',
              encoder_values, decoder_values)

In [None]:
def check_data_files(file_pattern):
    enc_lengths = dec_lengths = []
    print('Filename                                      First few bytes                                  lengths')
    for i, fn in enumerate(iglob(file_pattern)):
        with open(fn, 'r') as f:
            line = f.read()
        parts = line.split(':')
        values = [xs.split() for xs in parts]
        values = [[str(x) for x in xs] for xs in values]
        enc_len, dec_len = [len(x) for x in values]
        enc_lengths.append(enc_len)
        dec_lengths.append(dec_len)
        filename = fn.split(os.sep)[-1]
        
        print(f'{filename}: {" ".join(values[0][:10])}:{" ".join(values[1][:10])}    {enc_len}:{dec_len}')
        
check_data_files('/Volumes/Seagate/generated-data-combined-html-json/*.combined')

In [None]:
get_sequence_lengths(encoder_values), get_sequence_lengths(decoder_values)

In [None]:
encoder_values, decoder_values

In [None]:
def build_vocabulary(enc, dec):
    def build_vocab(values):
        values_set = set()        
        for v in values:
            values_set.update(np.squeeze(v))
        return values_set
    
    enc_set = build_vocab(enc)
    dec_set = build_vocab(dec)
    values_set = enc_set | dec_set
    
    return sorted(list(values_set))

encoder_values = np.squeeze(encoder_values)
decoder_values = np.squeeze(decoder_values)
vocab = build_vocabulary(encoder_values, decoder_values)
vocab_size = len(vocab)
embed_size = 4  # 4 float32 values for each token of input
vocab_size, vocab

In [None]:
type(encoder_values), encoder_values.shape, encoder_values.dtype

In [None]:
vocab_array = np.array(vocab)

def build_indices(values):
    return np.squeeze(np.array([[np.where(vocab == x) 
                                    for x in value] 
                                for value in values]))

encoder_indices = build_indices(encoder_values)
decoder_indices = build_indices(decoder_values)
print(type(encoder_indices), encoder_indices.shape)
print(type(decoder_indices), decoder_indices.shape)

In [None]:
# All of this code is taken from Aurelien Geron's
# notebook which accompanies the book
# Handson Machine Learning with Scikit-Learn and Tensorflow.
# You can find it here:
# https://github.com/ageron/handson-ml2/blob/master/16_nlp_with_rnns_and_attention.ipynb
#
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

encoder = keras.layers.LSTM(4, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

sampler = tfa.seq2seq.sampler.TrainingSampler()

decoder_cell = keras.layers.LSTMCell(4)
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler,
                                                 output_layer=output_layer)
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings, initial_state=encoder_state,
    sequence_length=sequence_lengths)
Y_proba = tf.nn.softmax(final_outputs.rnn_output)

model = keras.models.Model(
    inputs=[encoder_inputs, decoder_inputs, sequence_lengths],
    outputs=[Y_proba])

In [None]:
model.compile(loss="sparse_categorical_crossentropy", 
              optimizer="adam",
              run_eagerly=True)

In [None]:
model.summary()

In [None]:
decoder_indices_shifted = np.c_[np.zeros((decoder_indices.shape[0], 1)),
                                decoder_indices[:, :-1]]
# print(encoder_indices.shape)
# print(decoder_indices.shape)
sequence_lengths = np.full([decoder_indices.shape[0]], decoder_indices.shape[1])
# print(sequence_lengths.shape)
# print(sequence_lengths[:5])
# print(type(sequence_lengths))
model.fit([encoder_indices, decoder_indices_shifted, sequence_lengths], 
          decoder_indices,
          epochs=2)

In [None]:
import torch

In [None]:
x = torch.tensor([[1., -1.], [1., 1.]], requires_grad=True)
x

In [None]:
y = x.pow(2).sum()
y

In [None]:
y.backward()
x.grad

In [None]:
t = torch.rand(4, 4)  # 4x4 random tensor
t.dtype, t

In [None]:
tv = t.view(2, 8)
tv

In [None]:
t.storage().data_ptr() == tv.storage().data_ptr()

In [None]:
tv[0][0] = 3.14
t[0][0], tv[0][0]

In [None]:
x = np.array([[0, 1], [1, 1], [2, 2]])
x

In [None]:
x.sum(0)

In [None]:
x = torch.tensor([[1], [3], [5]])
x

In [None]:
x.expand(3, 7)

In [None]:
x.expand_as(torch.rand(3, 7))

In [None]:
x = torch.tensor([[1, 2], [3, 4], [5, 6]])
torch.movedim(x, 1, 0)

In [None]:
torch.__version__

In [None]:
x.t()

In [None]:
(torch.movedim(x, 1, 0) == x.t()).all()