# Load and parse data with TensorFlow 2.0 (tf.data)

A TensorFlow 2.0 example to build input pipelines for loading data efficiently.


- Numpy Arrays
- Images
- CSV file
- Custom data from a Generator

For more information about creating and loading TensorFlow's `TFRecords` data format, see: [tfrecords.ipynb](tfrecords.ipynb)

- Author: Aymeric Damien
- Project: https://github.com/aymericdamien/TensorFlow-Examples/

In [1]:
from __future__ import absolute_import, division, print_function

import numpy as np
import random
import requests
import string
import tarfile
import tensorflow as tf

### Load Numpy Arrays

Build a data pipeline over numpy arrays.

In [2]:
# Toy dataset: the 50 even and the 50 odd integers below 100,
# labelled 0 (even) and 1 (odd) respectively.
evens = 2 * np.arange(50, dtype=np.int32)
odds = evens + 1
evens_label = np.zeros_like(evens)
odds_label = np.ones_like(odds)
# Stack evens first, then odds, keeping features and labels aligned.
features = np.concatenate((evens, odds))
labels = np.concatenate((evens_label, odds_label))

# Build the input pipeline as one chain of tf.data transformations:
#   from_tensor_slices -> one (feature, label) record per array element
#   repeat             -> refill the data indefinitely
#   shuffle            -> shuffle records through a 100-element buffer
#   batch              -> aggregate 4 records together
#   prefetch           -> keep 1 batch pre-loaded for faster consumption
# NOTE: because repeat() comes before shuffle(), the shuffle buffer can hold
# records from consecutive passes, so a batch may contain the same record twice.
data = (
    tf.data.Dataset.from_tensor_slices((features, labels))
    .repeat()
    .shuffle(buffer_size=100)
    .batch(batch_size=4)
    .prefetch(buffer_size=1)
)

In [4]:
# Print the first five (features, labels) batches produced by the pipeline.
first_batches = data.take(5)
for batch_x, batch_y in first_batches:
    print(batch_x, batch_y)

tf.Tensor([83 56 91 17], shape=(4,), dtype=int32) tf.Tensor([1 0 1 1], shape=(4,), dtype=int32)
tf.Tensor([84 25 78 69], shape=(4,), dtype=int32) tf.Tensor([0 1 0 1], shape=(4,), dtype=int32)
tf.Tensor([22 54 61 98], shape=(4,), dtype=int32) tf.Tensor([0 0 1 0], shape=(4,), dtype=int32)
tf.Tensor([14 24  2  7], shape=(4,), dtype=int32) tf.Tensor([0 0 0 1], shape=(4,), dtype=int32)
tf.Tensor([16 90 32 29], shape=(4,), dtype=int32) tf.Tensor([0 0 0 1], shape=(4,), dtype=int32)


In [4]:
# Note: if you plan to fetch batches over several separate calls,
# create an explicit iterator; it keeps its position between calls.
ite_data = iter(data)
# Two consecutive rounds of five batches each, drawn from the same iterator.
for _ in range(2):
    for i in range(5):
        batch_x, batch_y = next(ite_data)
        print(batch_x, batch_y)

tf.Tensor([34 83 75 51], shape=(4,), dtype=int32) tf.Tensor([0 1 1 1], shape=(4,), dtype=int32)
tf.Tensor([37 26 10 10], shape=(4,), dtype=int32) tf.Tensor([1 0 0 0], shape=(4,), dtype=int32)
tf.Tensor([33 41 28 52], shape=(4,), dtype=int32) tf.Tensor([1 1 0 0], shape=(4,), dtype=int32)
tf.Tensor([ 7 29 60 16], shape=(4,), dtype=int32) tf.Tensor([1 1 0 0], shape=(4,), dtype=int32)
tf.Tensor([90 68 98  0], shape=(4,), dtype=int32) tf.Tensor([0 0 0 0], shape=(4,), dtype=int32)
tf.Tensor([97 18 74  4], shape=(4,), dtype=int32) tf.Tensor([1 0 0 0], shape=(4,), dtype=int32)
tf.Tensor([ 6 62 57  3], shape=(4,), dtype=int32) tf.Tensor([0 0 1 1], shape=(4,), dtype=int32)
tf.Tensor([12 59 89 27], shape=(4,), dtype=int32) tf.Tensor([0 1 1 1], shape=(4,), dtype=int32)
tf.Tensor([50 17 88 85], shape=(4,), dtype=int32) tf.Tensor([0 1 0 1], shape=(4,), dtype=int32)
tf.Tensor([70 44 95 65], shape=(4,), dtype=int32) tf.Tensor([0 0 1 1], shape=(4,), dtype=int32)


### Load CSV files

Build a data pipeline from features stored in a CSV file. For this example, the Titanic dataset, stored in CSV format, will be used as a toy dataset.

#### Titanic Dataset



survived|pclass|name|sex|age|sibsp|parch|ticket|fare
--------|------|----|---|---|-----|-----|------|----
1|1|"Allen, Miss. Elisabeth Walton"|female|29|0|0|24160|211.3375
1|1|"Allison, Master. Hudson Trevor"|male|0.9167|1|2|113781|151.5500
0|1|"Allison, Miss. Helen Loraine"|female|2|1|2|113781|151.5500
0|1|"Allison, Mr. Hudson Joshua Creighton"|male|30|1|2|113781|151.5500
...|...|...|...|...|...|...|...|...

In [6]:
# Download Titanic dataset (in csv format).
d = requests.get("https://raw.githubusercontent.com/tflearn/tflearn.github.io/master/resources/titanic_dataset.csv")
# Fail fast on an HTTP error instead of saving an error page as the CSV file.
d.raise_for_status()
with open("titanic_dataset.csv", "wb") as f:
    f.write(d.content)

In [7]:
# Load Titanic dataset.
# Columns in the file: survived,pclass,name,sex,age,sibsp,parch,ticket,fare
# Columns read here:   survived,pclass,name,sex,age,fare (indices below).
column_to_use = [0, 1, 2, 3, 4, 8]
# One dtype per selected column, in the same order as `column_to_use`.
record_defaults = [tf.int32, tf.int32, tf.string, tf.string, tf.float32, tf.float32]

# Parse the CSV file record by record (header=True is passed for the header
# row), then refill the data indefinitely, shuffle through a 1000-record
# buffer, aggregate records into batches of 2 and keep 1 batch pre-loaded.
data = (
    tf.data.experimental.CsvDataset(
        "titanic_dataset.csv",
        record_defaults,
        header=True,
        select_cols=column_to_use,
    )
    .repeat()
    .shuffle(buffer_size=1000)
    .batch(batch_size=2)
    .prefetch(buffer_size=1)
)

In [8]:
# Print every column of one batch as a NumPy array, one column per line.
for survived, pclass, name, sex, age, fare in data.take(1):
    for column in (survived, pclass, name, sex, age, fare):
        print(column.numpy())

[0 1]
[2 2]
[b'Milling, Mr. Jacob Christian'
 b'Drew, Mrs. James Vivian (Lulu Thorne Christian)']
[b'male' b'female']
[48. 34.]
[13.  32.5]


### Load Images

Build a data pipeline by loading images from disk. For this example, the Oxford 17 Flowers dataset will be used.

In [3]:
# Download Oxford 17 flowers dataset
d = requests.get("http://www.robots.ox.ac.uk/~vgg/data/flowers/17/17flowers.tgz")
# Fail fast on an HTTP error instead of saving an error page as the archive,
# which would only surface later as an obscure tarfile error.
d.raise_for_status()
with open("17flowers.tgz", "wb") as f:
    f.write(d.content)
# Extract archive.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run it on archives obtained from a trusted source.
with tarfile.open("17flowers.tgz") as t:
    t.extractall()

KeyboardInterrupt: 

In [None]:
# Write the image index as "image_path,label_id" lines: 1360 images numbered
# from 1, where every run of 80 consecutive images shares one label (0, 1, ...).
with open('jpg/dataset.csv', 'w') as f:
    f.writelines(
        "jpg/image_%04i.jpg,%i\n" % (i + 1, i // 80) for i in range(1360)
    )

In [None]:
# Load Images
# Read the index file and split it into its "image_path,label_id" lines.
with open("jpg/dataset.csv") as f:
    raw_index = f.read()
dataset_file = raw_index.splitlines()

# One dataset element per index line, refilled indefinitely and shuffled
# through a 1000-element buffer.
data = (
    tf.data.Dataset.from_tensor_slices(dataset_file)
    .repeat()
    .shuffle(buffer_size=1000)
)

# Load and pre-process images.
def load_image(path):
    """Read a JPEG file and return it as a 256x256 image scaled to [-1, 1].

    Args:
        path: Path of the JPEG file, passed to `tf.io.read_file`.

    Returns:
        The decoded 3-channel image, resized to 256x256, with pixel value 0
        mapped to -1 and pixel value 255 mapped to 1.
    """
    # Read image from path.
    image = tf.io.read_file(path)
    # Decode the jpeg image to array [0, 255]. Three channels are requested
    # explicitly so that grayscale files yield the same shape as colour ones
    # and can be batched together with them.
    image = tf.image.decode_jpeg(image, channels=3)
    # Resize images to a common size of 256x256.
    image = tf.image.resize(image, [256, 256])
    # Rescale values to [-1, 1], keeping the intensity order (0 -> -1, 255 -> 1).
    # The previous `1. - image / 127.5` produced the inverted (negative) image.
    image = image / 127.5 - 1.
    return image
# Decode each line from the dataset file.
def parse_records(line):
    """Turn one "image_path,label_id" csv line into an (image, label) pair."""
    # `decode_csv` requires one default value per column (a string for the
    # path, an integer for the label); the defaults themselves are never used.
    column_defaults = ["", 0]
    image_path, image_label = tf.io.decode_csv(line, column_defaults)
    # Load and pre-process the image the path points to.
    return load_image(image_path), image_label
# Apply `parse_records` to every index line with 4 parallel calls, then
# aggregate the image arrays into batches of 2 and keep 1 batch pre-loaded
# for faster consumption.
data = (
    data.map(parse_records, num_parallel_calls=4)
    .batch(batch_size=2)
    .prefetch(buffer_size=1)
)

In [None]:
# Print one batch of pre-processed images together with their labels.
one_batch = data.take(1)
for batch_x, batch_y in one_batch:
    print(batch_x, batch_y)

tf.Tensor(
[[[[-0.90260804 -0.9550551  -0.9444355 ]
   [-0.9538603  -0.9715073  -0.9136642 ]
   [-0.41687727 -0.37570083 -0.25462234]
   ...
   [ 0.4617647   0.422549    0.3754902 ]
   [ 0.4934436   0.45422792  0.4071691 ]
   [ 0.5530829   0.5138672   0.46680838]]

  [[-0.9301815  -0.98563874 -0.9595933 ]
   [-0.9379289  -0.95557594 -0.89773285]
   [-0.68581116 -0.6446346  -0.5305033 ]
   ...
   [ 0.46960783  0.43039215  0.38333333]
   [ 0.5009191   0.46170342  0.4146446 ]
   [ 0.56071925  0.52150357  0.4744447 ]]

  [[-0.9480392  -0.9862745  -0.96889937]
   [-0.93367803 -0.9485103  -0.8916054 ]
   [-0.9224341  -0.9033165  -0.7915518 ]
   ...
   [ 0.48045343  0.44123775  0.39417893]
   [ 0.51623774  0.47702205  0.42996323]
   [ 0.5740809   0.5348652   0.48780638]]

  ...

  [[ 0.0824219   0.37201285  0.5615885 ]
   [ 0.09744179  0.3858226   0.57758886]
   [ 0.1170305   0.4023859   0.59906554]
   ...
   [ 0.02599955  0.65661     0.7460593 ]
   [-0.0751493   0.6735256   0.7022212 ]
   [-

### Load data from a Generator

In [6]:
# Create a dummy generator.
def generate_features():
    """Yield a single random sample: (4-letter string, 4-value vector, int).

    The generator produces exactly one item each time it is called.
    """
    # Random string made of 4 ASCII letters.
    letters = [random.choice(string.ascii_letters) for _ in range(4)]
    text = ''.join(letters)
    # Random vector of 4 uniformly drawn values.
    vector = np.random.uniform(size=4)
    # Random integer between 0 and 10, both bounds included.
    number = random.randint(0, 10)
    yield text, vector, number

In [7]:
# Build a dataset from the Python generator using the tf data api with
# `from_generator`; the declared output types follow the
# (string, vector, int) layout of the items the generator yields.
# Then refill the data indefinitely, shuffle through a 100-element buffer,
# aggregate records into batches of 4 and keep 1 batch pre-loaded.
data = (
    tf.data.Dataset.from_generator(
        generate_features,
        output_types=(tf.string, tf.float32, tf.int32),
    )
    .repeat()
    .shuffle(buffer_size=100)
    .batch(batch_size=4)
    .prefetch(buffer_size=1)
)

In [8]:
# Display the first five batches: strings, vectors and integers.
generated_batches = data.take(5)
for batch_str, batch_vector, batch_int in generated_batches:
    print(batch_str, batch_vector, batch_int)

tf.Tensor([b'wHaT' b'cZkq' b'UIZO' b'oZOk'], shape=(4,), dtype=string) tf.Tensor(
[[0.24114357 0.11343051 0.33091062 0.97158015]
 [0.05516724 0.67957413 0.42812943 0.9246092 ]
 [0.07513198 0.17574777 0.18582867 0.540566  ]
 [0.82032245 0.41548973 0.69706964 0.14527854]], shape=(4, 4), dtype=float32) tf.Tensor([3 0 7 4], shape=(4,), dtype=int32)
tf.Tensor([b'Svvj' b'wwRV' b'PyVT' b'utMe'], shape=(4,), dtype=string) tf.Tensor(
[[0.6694141  0.082927   0.04718021 0.46317402]
 [0.8352948  0.25139913 0.99965566 0.38631207]
 [0.07003514 0.80388176 0.60053045 0.6691096 ]
 [0.51109725 0.62122715 0.21071441 0.72263306]], shape=(4, 4), dtype=float32) tf.Tensor([10  5  5  5], shape=(4,), dtype=int32)
tf.Tensor([b'yFUh' b'UTjz' b'moom' b'aisC'], shape=(4,), dtype=string) tf.Tensor(
[[8.9750880e-01 9.6311384e-01 5.2880429e-02 1.9961230e-01]
 [3.5408661e-02 1.1448821e-01 2.2397154e-04 8.5254920e-01]
 [9.1813570e-01 7.9013294e-01 2.3340425e-02 5.3032752e-02]
 [7.9523420e-01 2.4931072e-01 3.4661773e-01