# Quick ways of data loading in TensorFlow

This is the supporting Colab notebook for the blog post [Quick ways of data loading in TensorFlow](https://rahulbakshee.github.io/iWriteHere/2022/01/16/Quick-ways-of-data-loading-in-TensorFlow.html).

### Download raw data

In [1]:
import tensorflow as tf

# download raw data
import pathlib
dataset_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"

# Fetch the archive (Keras caches it locally), unpack it, and keep the
# resulting location as a Path so it can be globbed and joined below.
data_dir = pathlib.Path(
    tf.keras.utils.get_file(
        fname='flower_photos',
        origin=dataset_url,
        untar=True,
    )
)

# Count the .jpg files that sit one directory level below data_dir
image_count = sum(1 for _ in data_dir.glob('*/*.jpg'))

Downloading data from https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz


# 1. image_dataset_from_directory

In [2]:
import tensorflow as tf

# Batch size, target image size and the seed used for the train/validation split
batch_size = 32
img_height, img_width = 150, 150
seed = 42

# Arguments common to both loaders. The training and validation calls are
# given the same seed and validation_split; only `subset` differs.
split_kwargs = dict(
    validation_split=0.2,
    seed=seed,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

# Build the datasets from the image files on disk with the Keras utility
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, subset="training", **split_kwargs)

val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, subset="validation", **split_kwargs)


Found 3670 files belonging to 5 classes.
Using 2936 files for training.
Found 3670 files belonging to 5 classes.
Using 734 files for validation.


# 2. tf.data

In [3]:
import os
import numpy as np
import tensorflow as tf

img_height, img_width = 150, 150
AUTOTUNE = tf.data.AUTOTUNE
seed = 42

# List every file below the class sub-directories. shuffle=False keeps the
# listing in a deterministic order, which means it is grouped by class
# directory.
list_ds = tf.data.Dataset.list_files(str(data_dir/'*/*'), shuffle=False)

# Shuffle ONCE before splitting. Without this, take()/skip() below would cut
# the class-ordered listing and the validation set would consist almost
# entirely of the first class directory. reshuffle_each_iteration=False is
# required: it fixes the order across iterations so the skip() and take()
# subsets stay disjoint instead of being re-drawn every epoch.
list_ds = list_ds.shuffle(image_count, seed=seed, reshuffle_each_iteration=False)

# Hold out 20% of the files for validation; the rest is used for training
val_size = int(image_count * 0.2)
train_ds = list_ds.skip(val_size)
val_ds = list_ds.take(val_size)

# Class names are the entries of data_dir, excluding the LICENSE.txt file,
# sorted so that the integer label of each class is stable
class_names = np.array(sorted([item.name for item in data_dir.glob('*') if item.name != "LICENSE.txt"]))

print("Using {} files for training.".format(len(train_ds)))
print("Using {} files for validation.".format(len(val_ds)))

def get_label(file_path):
  """Return the integer class index for an image path.

  The class is taken from the name of the directory that contains the file
  and looked up in the module-level `class_names` array.
  """
  # Break the path into its components; the second-to-last one is the
  # class directory
  path_components = tf.strings.split(file_path, os.path.sep)
  class_dir = path_components[-2]
  # Boolean mask over class_names, True where the directory name matches
  matches = class_dir == class_names
  # Position of the match is the integer-encoded label
  return tf.argmax(matches)

def decode_img(img):
  """Decode JPEG bytes into a 3-channel image resized to the target size.

  The target size comes from the module-level `img_height` and `img_width`.
  """
  # Compressed string -> 3D uint8 tensor with three colour channels
  decoded = tf.io.decode_jpeg(img, channels=3)
  # Bring every image to the same height and width
  target_size = [img_height, img_width]
  return tf.image.resize(decoded, target_size)

def process_path(file_path):
  """Turn an image file path into an `(image, label)` pair."""
  # The label is derived from the path alone
  label = get_label(file_path)
  # Read the file contents as a raw string and decode/resize them
  raw_bytes = tf.io.read_file(file_path)
  return decode_img(raw_bytes), label

# Convert each dataset of file paths into a dataset of (image, label) pairs
# with Dataset.map. `num_parallel_calls` is set so that multiple images are
# loaded/processed in parallel.
train_ds, val_ds = (
    paths.map(process_path, num_parallel_calls=AUTOTUNE)
    for paths in (train_ds, val_ds)
)

Using 2936 files for training.
Using 734 files for validation.


# 3. tensorflow_datasets

In [4]:
import tensorflow as tf
import tensorflow_datasets as tfds

# Load tf_flowers through TensorFlow Datasets. The dataset's 'train' split is
# sliced into its first 80% (training) and last 20% (validation).
(train_ds, val_ds), info = tfds.load(
    name='tf_flowers',
    split=['train[:80%]', 'train[80%:]'],
    as_supervised=True,
    with_info=True,
)

# Report how many examples ended up in each part
print("Using {} files for training.".format(len(train_ds)))
print("Using {} files for validation.".format(len(val_ds)))

[1mDownloading and preparing dataset tf_flowers/3.0.1 (download: 218.21 MiB, generated: 221.83 MiB, total: 440.05 MiB) to /root/tensorflow_datasets/tf_flowers/3.0.1...[0m


local data directory. If you'd instead prefer to read directly from our public
GCS bucket (recommended if you're running on GCP), you can instead pass
`try_gcs=True` to `tfds.load` or set `data_dir=gs://tfds-data/datasets`.



Dl Completed...:   0%|          | 0/5 [00:00<?, ? file/s]


[1mDataset tf_flowers downloaded and prepared to /root/tensorflow_datasets/tf_flowers/3.0.1. Subsequent calls will reuse this data.[0m
Using 2936 files for training.
Using 734 files for validation.
