### Generate examples depending on window size

In [1]:
from google.cloud.storage import Client
import tensorflow as tf
import os
import numpy as np

In [2]:
class TimeSeriesGenerator():
    """Yield fixed-size windows cut from per-record time series in TFRecords.

    Records are discovered under a bucket/prefix, parsed with a feature spec,
    filtered to those whose "year" feature has at least one non-zero entry,
    and lightly shuffled. `generate_window` then produces a generator function
    (suitable for `tf.data.Dataset.from_generator`) that slices every feature
    of each record into windows of a requested length.
    """

    def __init__(self, client, bucket, prefix, spec, shuffle_buffer=8):
        """Locate the TFRecord files and build the parsed, filtered dataset.

        Args:
            client: Object exposing `list_blobs(bucket, prefix=...)` whose
                results carry a `name` attribute.
            bucket: Bucket name; used both for listing and to build the
                `gs://<bucket>/<blob name>` paths.
            prefix: Only blobs whose name starts with this prefix are listed.
            spec: Feature spec handed to `tf.io.parse_single_example`. It must
                contain a "year" entry, which drives filtering and windowing.
            shuffle_buffer: Buffer size passed to `Dataset.shuffle`
                (defaults to 8, the previously hard-coded value).

        Raises:
            AssertionError: If no blob name under the prefix contains
                ".tfrecord".
        """
        # Find all .tfrecords in the bucket with the given prefix
        self._records = [
            "gs://{}/{}".format(bucket, blob.name)
            for blob in client.list_blobs(bucket, prefix=prefix)
            if ".tfrecord" in blob.name
        ]

        assert len(self._records) > 0, (
            "no .tfrecord files found in bucket {!r} with prefix {!r}".format(
                bucket, prefix
            )
        )

        # Make a TFRecordDataset, parse, drop empty records, shuffle
        self._spec = spec
        self._ds = (
            tf.data.TFRecordDataset(self._records)
            .map(self._decode_record)
            .filter(self._is_nonempty_record)
            .shuffle(shuffle_buffer)
        )

    def _is_nonempty_record(self, example):
        """Filter predicate: True when "year" has at least one non-zero entry."""
        return tf.math.count_nonzero(example["year"]) > 0

    # Backward-compatible alias: the original name suggested the opposite of
    # what the predicate returns (it is True for records that are KEPT).
    _empty_record = _is_nonempty_record

    def _decode_record(self, proto):
        """Parse one serialized example according to the stored feature spec."""
        return tf.io.parse_single_example(proto, self._spec)

    @staticmethod
    def _check_window_size(window_size):
        """Reject window sizes that cannot produce a meaningful window."""
        if window_size < 1:
            raise ValueError(
                "window_size must be a positive integer, got {!r}".format(window_size)
            )

    def _generate_examples(self, example, window_size, rng=None):
        """Yield every fully-populated window of `example`, in random order.

        A window is emitted only when all `window_size` consecutive "year"
        values are greater than zero. Each yielded dict has the same keys as
        `example`, with every value sliced to the window.

        Args:
            example: Mapping of feature name to a 1-D sequence; must contain
                "year". All features are sliced with the same indices.
            window_size: Number of consecutive steps per window (>= 1).
            rng: Optional object with an in-place `shuffle(array)` method
                (e.g. `np.random.default_rng(seed)`). When None, the global
                `np.random` state is used, as before.

        Raises:
            ValueError: If `window_size` < 1, or (raised by NumPy) if it is
                larger than the length of "year".
        """
        self._check_window_size(window_size)
        years = np.asarray(example["year"])
        windows = np.lib.stride_tricks.sliding_window_view(years, window_size)
        # Keep only the start positions whose whole window has valid years.
        start_idxs = np.where(np.all(windows > 0, axis=1))[0]
        # Shuffle so sequences are not chronological
        shuffler = np.random if rng is None else rng
        shuffler.shuffle(start_idxs)
        for idx in start_idxs:
            start = int(idx)
            stop = start + window_size
            yield {key: example[key][start:stop] for key in example}

    def generate_window(self, window_size, seed=None):
        """Return a generator function yielding windows of `window_size`.

        Args:
            window_size: Number of consecutive steps per window (>= 1).
            seed: Optional seed for the per-record window shuffle. When given,
                every invocation of the returned function starts from a fresh
                `np.random.default_rng(seed)`, so the window order within each
                record is reproducible. When None, the global `np.random`
                state is used.

        Returns:
            A zero-argument callable; calling it iterates the record dataset
            and yields one dict per window.

        Raises:
            ValueError: If `window_size` < 1 (checked immediately, not on
                first iteration).
        """
        self._check_window_size(window_size)

        # TODO consider holding several Examples in memory so that one batch
        # comes from many pixels.
        def gen():
            rng = None if seed is None else np.random.default_rng(seed)
            for example in self._ds:
                yield from self._generate_examples(example, window_size, rng)
        return gen

In [3]:
# Length of every fixed-length feature vector stored in a record.
FEATURE_LEN = 21
client = Client(project="forest-lst")

# Parsing spec: every feature is a FixedLenFeature of FEATURE_LEN values;
# "elevation" and "year" are int64, everything else is float32.
spec = {
    feature_name: tf.io.FixedLenFeature(
        [FEATURE_LEN],
        dtype=tf.int64 if feature_name in ("elevation", "year") else tf.float32,
    )
    for feature_name in (
        "EVI_p5",
        "EVI_p50",
        "EVI_p95",
        "dT_p5",
        "dT_p50",
        "dT_p95",
        "spei30d_p5",
        "spei30d_p50",
        "spei30d_p95",
        "winter_tmin",
        "prcp",
        "latitude",
        "longitude",
        "elevation",
        "year",
        "pct_mortality",
    )
}

tsg = TimeSeriesGenerator(
    client,
    "forest-lst-test-export",
    "ca_dense_tensors_v3",
    spec,
)

In [4]:
window_size = 4

# Output signature for the windowed dataset: same keys and dtypes as the
# parsing spec, but each tensor holds only `window_size` steps.
windowed_spec = {
    feature_name: tf.TensorSpec(
        shape=[window_size],
        dtype=feature.dtype,
        name=feature_name,
    )
    for feature_name, feature in spec.items()
}

# Wrap the Python window generator as a tf.data.Dataset.
window_generator = tsg.generate_window(window_size)
windowed_ds = tf.data.Dataset.from_generator(
    window_generator,
    output_signature=windowed_spec,
)

In [6]:
%%time
# Smoke test: pull the first batch of 8 windows from the dataset and display
# its "year" tensor; each row should have `window_size` entries.
next(iter(windowed_ds.batch(8)))["year"]

CPU times: total: 203 ms
Wall time: 4.14 s


<tf.Tensor: shape=(8, 4), dtype=int64, numpy=
array([[2006, 2007, 2008, 2009],
       [2011, 2012, 2013, 2014],
       [2008, 2009, 2010, 2011],
       [2010, 2011, 2012, 2013],
       [2007, 2008, 2009, 2010],
       [2009, 2010, 2011, 2012],
       [2016, 2017, 2018, 2019],
       [2014, 2015, 2016, 2017]], dtype=int64)>