<a href="https://colab.research.google.com/github/rahuljungbahadur/hands_on_ml_book/blob/main/chp13_tf_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to parallelize TensorFlow datasets so that the complete data is never loaded into RAM



In [1]:
import tensorflow as tf

In [6]:
## Toy tf.data source used throughout: a dataset of the integers 0..9

x = tf.data.Dataset.range(10)
x  ## the repr shows the element shape and dtype, not the values themselves

<RangeDataset shapes: (), types: tf.int64>

In [5]:
list(x)  ## iterating the dataset yields its 10 scalar tensors, 0 through 9

[<tf.Tensor: shape=(), dtype=int64, numpy=0>,
 <tf.Tensor: shape=(), dtype=int64, numpy=1>,
 <tf.Tensor: shape=(), dtype=int64, numpy=2>,
 <tf.Tensor: shape=(), dtype=int64, numpy=3>,
 <tf.Tensor: shape=(), dtype=int64, numpy=4>,
 <tf.Tensor: shape=(), dtype=int64, numpy=5>,
 <tf.Tensor: shape=(), dtype=int64, numpy=6>,
 <tf.Tensor: shape=(), dtype=int64, numpy=7>,
 <tf.Tensor: shape=(), dtype=int64, numpy=8>,
 <tf.Tensor: shape=(), dtype=int64, numpy=9>]

In [10]:
## Build a dataset that re-reads the source several times and groups it into batches.
## repeat(3) chains three passes over 0..9 (30 elements); batch(7) then cuts that
## stream into consecutive groups of 7, so a batch can straddle two passes.
## drop_remainder=True discards the 2 leftover elements ([8, 9]) that cannot fill a
## whole batch, so every batch has shape (7,); without it a final short batch of
## shape (2,) would also be emitted.
x_repAndBatch = x.repeat(3).batch(7, drop_remainder=True)

In [11]:
list(x_repAndBatch)  ## batches of up to 7 elements, cut from 3 consecutive passes over the data

[<tf.Tensor: shape=(7,), dtype=int64, numpy=array([0, 1, 2, 3, 4, 5, 6])>,
 <tf.Tensor: shape=(7,), dtype=int64, numpy=array([7, 8, 9, 0, 1, 2, 3])>,
 <tf.Tensor: shape=(7,), dtype=int64, numpy=array([4, 5, 6, 7, 8, 9, 0])>,
 <tf.Tensor: shape=(7,), dtype=int64, numpy=array([1, 2, 3, 4, 5, 6, 7])>]

In [15]:
## map() transforms one dataset element at a time; the elements here are batches,
## so the lambda receives a whole batch tensor and squares it element-wise.

x_squared = x_repAndBatch.map(lambda batch: batch ** 2)

In [16]:
list(x_squared)  ## every value inside each batch has been squared

[<tf.Tensor: shape=(7,), dtype=int64, numpy=array([ 0,  1,  4,  9, 16, 25, 36])>,
 <tf.Tensor: shape=(7,), dtype=int64, numpy=array([49, 64, 81,  0,  1,  4,  9])>,
 <tf.Tensor: shape=(7,), dtype=int64, numpy=array([16, 25, 36, 49, 64, 81,  0])>,
 <tf.Tensor: shape=(7,), dtype=int64, numpy=array([ 1,  4,  9, 16, 25, 36, 49])>]

In [18]:
## apply() hands the whole dataset to a function and uses the dataset that function
## returns (map(), by contrast, is called once per element).
## Dataset.unbatch() is used here because tf.data.experimental.unbatch() is deprecated.
x_applied = x_repAndBatch.apply(lambda dataset: dataset.unbatch())


In [20]:
list(x_applied)  ## unbatching splits every batch back into its individual scalar elements

[<tf.Tensor: shape=(), dtype=int64, numpy=0>,
 <tf.Tensor: shape=(), dtype=int64, numpy=1>,
 <tf.Tensor: shape=(), dtype=int64, numpy=2>,
 <tf.Tensor: shape=(), dtype=int64, numpy=3>,
 <tf.Tensor: shape=(), dtype=int64, numpy=4>,
 <tf.Tensor: shape=(), dtype=int64, numpy=5>,
 <tf.Tensor: shape=(), dtype=int64, numpy=6>,
 <tf.Tensor: shape=(), dtype=int64, numpy=7>,
 <tf.Tensor: shape=(), dtype=int64, numpy=8>,
 <tf.Tensor: shape=(), dtype=int64, numpy=9>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>,
 <tf.Tensor: shape=(), dtype=int64, numpy=1>,
 <tf.Tensor: shape=(), dtype=int64, numpy=2>,
 <tf.Tensor: shape=(), dtype=int64, numpy=3>,
 <tf.Tensor: shape=(), dtype=int64, numpy=4>,
 <tf.Tensor: shape=(), dtype=int64, numpy=5>,
 <tf.Tensor: shape=(), dtype=int64, numpy=6>,
 <tf.Tensor: shape=(), dtype=int64, numpy=7>,
 <tf.Tensor: shape=(), dtype=int64, numpy=8>,
 <tf.Tensor: shape=(), dtype=int64, numpy=9>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>,
 <tf.Tensor: shape=(), dtype=int64

In [23]:
## Simple filter: keep only the elements for which the predicate is true

x_filtered = x_applied.filter(lambda value: value < 5)  ## the predicate sees scalars because the input is unbatched

In [26]:
list(x_filtered)  ## only the values below 5 remain

[<tf.Tensor: shape=(), dtype=int64, numpy=0>,
 <tf.Tensor: shape=(), dtype=int64, numpy=1>,
 <tf.Tensor: shape=(), dtype=int64, numpy=2>,
 <tf.Tensor: shape=(), dtype=int64, numpy=3>,
 <tf.Tensor: shape=(), dtype=int64, numpy=4>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>,
 <tf.Tensor: shape=(), dtype=int64, numpy=1>,
 <tf.Tensor: shape=(), dtype=int64, numpy=2>,
 <tf.Tensor: shape=(), dtype=int64, numpy=3>,
 <tf.Tensor: shape=(), dtype=int64, numpy=4>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>,
 <tf.Tensor: shape=(), dtype=int64, numpy=1>,
 <tf.Tensor: shape=(), dtype=int64, numpy=2>,
 <tf.Tensor: shape=(), dtype=int64, numpy=3>,
 <tf.Tensor: shape=(), dtype=int64, numpy=4>]