In [1]:
###Time Windows

#Imports
#Setup

import tensorflow as tf

In [2]:
###Time Windows
# Goal: train a model that forecasts the next step given the previous 20 steps,
# which means we need a dataset of 20-step windows for training. To see how the
# windowing works, start with a tiny toy dataset holding the integers 0..9.

dataset = tf.data.Dataset.range(10)
# Iterating the dataset directly yields tensors; as_numpy_iterator() yields
# plain NumPy values instead, which print as readable numbers.
for step in dataset.as_numpy_iterator():
    print(step)

0
1
2
3
4
5
6
7
8
9


In [3]:
dataset = tf.data.Dataset.range(10)
dataset = dataset.window(5, shift=1)
# Each element produced by window() is itself a dataset, so two loops are
# needed: the outer loop walks the windows, the inner loop prints the values of
# one window on a single line. Without drop_remainder, the trailing windows
# hold fewer than 5 values.
#
# The loop variable is called `window`, not `window_dataset`: that name belongs
# to the helper function defined later in this notebook, and rebinding it here
# would overwrite the function if this cell were re-run afterwards.
for window in dataset:
    for val in window:
        print(val.numpy(), end=" ")
    print()

0 1 2 3 4 
1 2 3 4 5 
2 3 4 5 6 
3 4 5 6 7 
4 5 6 7 8 
5 6 7 8 9 
6 7 8 9 
7 8 9 
8 9 
9 


In [5]:
# drop_remainder=True discards the shorter trailing windows printed by the
# previous cell, so every window that remains holds exactly 5 values.

dataset = tf.data.Dataset.range(10)
dataset = dataset.window(5, shift=1, drop_remainder=True)
# The loop variable is called `window`, not `window_dataset`: that name belongs
# to the helper function defined later in this notebook, and rebinding it here
# would overwrite the function if this cell were re-run afterwards.
for window in dataset:
    for val in window:
        print(val.numpy(), end=" ")
    print()

0 1 2 3 4 
1 2 3 4 5 
2 3 4 5 6 
3 4 5 6 7 
4 5 6 7 8 
5 6 7 8 9 


In [6]:
# A dataset of datasets is awkward to work with; we would rather have a single
# dataset whose elements are regular tensors. flat_map() gets us there.
#
# flat_map() runs a function on every dataset inside a nested dataset (i.e. a
# dataset that contains other datasets) and flattens the results into one
# dataset. The lambda used here calls window.batch(5): it takes 5 elements
# from a window and builds a tensor out of them. Since each window is 5 items
# long, every window dataset is converted into a single tensor, so the
# resulting dataset simply contains tensors of length five.

dataset = (
    tf.data.Dataset.range(10)
    .window(5, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(5))
)
# Each iteration now produces one length-5 tensor, printed as a NumPy array.
for batch in dataset.as_numpy_iterator():
    print(batch)

[0 1 2 3 4]
[1 2 3 4 5]
[2 3 4 5 6]
[3 4 5 6 7]
[4 5 6 7 8]
[5 6 7 8 9]


In [7]:
# ML models need input features and labels. For a forecasting model, the first
# four elements of each window can serve as the features and the last element
# as the label. map() applies a lambda to each window tensor and splits it
# into two tensors: one containing all elements except the last, and one
# containing only the last element.
dataset = (
    tf.data.Dataset.range(10)
    .window(5, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(5))
    .map(lambda window: (window[:-1], window[-1:]))
)
for features, label in dataset.as_numpy_iterator():
    print(features, label)

[0 1 2 3] [4]
[1 2 3 4] [5]
[2 3 4 5] [6]
[3 4 5 6] [7]
[4 5 6 7] [8]
[5 6 7 8] [9]


In [8]:
# When training an ML model, the instances in the dataset are shuffled so that
# they are (approximately) independent and identically distributed (IID). This
# matters especially with gradient descent, which is usually what is used.
# To do that, call shuffle() and specify a buffer size.
#
# Only the order of the windows is shuffled. Because this is a time series,
# the elements inside each window keep their original order.
windows = (
    tf.data.Dataset.range(10)
    .window(5, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(5))
)
pairs = windows.map(lambda window: (window[:-1], window[-1:]))
dataset = pairs.shuffle(buffer_size=10)
for x, y in dataset:
    print(x.numpy(), y.numpy())

[3 4 5 6] [7]
[5 6 7 8] [9]
[0 1 2 3] [4]
[4 5 6 7] [8]
[1 2 3 4] [5]
[2 3 4 5] [6]


In [9]:
# batch() groups the windows into batches -- here 2 windows per training
# iteration. prefetch(1) lets TensorFlow load the next batch of data while it
# is still working on the current one, so the pipeline never runs out of data
# and the processor is kept as busy as possible.
dataset = (
    tf.data.Dataset.range(10)
    .window(5, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(5))
    .map(lambda window: (window[:-1], window[-1:]))
    .shuffle(buffer_size=10)
    .batch(2)
    .prefetch(1)
)
for x, y in dataset.as_numpy_iterator():
    print("x =", x)
    print("y =", y)

x = [[2 3 4 5]
 [3 4 5 6]]
y = [[6]
 [7]]
x = [[4 5 6 7]
 [5 6 7 8]]
y = [[8]
 [9]]
x = [[0 1 2 3]
 [1 2 3 4]]
y = [[4]
 [5]]


In [10]:
#Wrap everything in a little function called window_dataset
def window_dataset(series, window_size, batch_size=32,
                   shuffle_buffer=1000):
    """Convert a time series into a tf.data.Dataset of (features, label) batches.

    The series is cut into sliding windows of ``window_size + 1`` consecutive
    values (shift of 1, incomplete trailing windows dropped). The windows are
    shuffled, then each one is split into its first ``window_size`` values
    (the features) and its final value (the label). The pairs are batched and
    one batch is prefetched.

    Args:
        series: The time series values, passed to
            ``tf.data.Dataset.from_tensor_slices``.
        window_size: Number of consecutive steps used as input features.
        batch_size: Number of (features, label) pairs per batch.
        shuffle_buffer: Buffer size handed to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` that can be loaded and used by TensorFlow models.
    """
    # Each window carries the inputs plus the one value to predict.
    span = window_size + 1
    windows = (
        tf.data.Dataset.from_tensor_slices(series)
        .window(span, shift=1, drop_remainder=True)
        .flat_map(lambda w: w.batch(span))
        .shuffle(shuffle_buffer)  # shuffles whole windows, not their contents
    )
    # w[-1] (not w[-1:]) makes each label a scalar rather than a length-1 tensor.
    pairs = windows.map(lambda w: (w[:-1], w[-1]))
    return pairs.batch(batch_size).prefetch(1)