<a href="https://colab.research.google.com/github/rjarun8/ComputerVisionPyImageSearch/blob/main/A_gentle_introduction_to_tf_data_with_TensorFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# A gentle introduction to tf.data with TensorFlow

https://www.pyimagesearch.com/2021/06/14/a-gentle-introduction-to-tf-data-with-tensorflow/

In [1]:
!wget 'https://pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com/tfdata-intro/tfdata-intro.zip'

--2021-08-07 11:02:06--  https://pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com/tfdata-intro/tfdata-intro.zip
Resolving pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com (pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com)... 52.218.137.41
Connecting to pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com (pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com)|52.218.137.41|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 586970111 (560M) [binary/octet-stream]
Saving to: ‘tfdata-intro.zip’


2021-08-07 11:02:24 (31.2 MB/s) - ‘tfdata-intro.zip’ saved [586970111/586970111]



In [3]:
# !unzip '/content/tfdata-intro.zip'

In [4]:
import os
# Make the extracted tutorial folder the working directory; later cells import
# from the `pyimagesearch` package and run `reading_from_memory.py` by relative
# name (presumably both live in this folder -- verify against the archive).
os.chdir('/content/tfdata-intro')

In [5]:
# import the necessary packages
import time
def benchmark(datasetGen, numSteps):
	"""Measure how long it takes to pull ``numSteps`` batches from an iterator.

	Args:
		datasetGen: an iterator; ``next()`` is called on it once per step and
			the returned batch is discarded.
		numSteps: number of batches to draw.

	Returns:
		Elapsed time in seconds (float).

	Raises:
		StopIteration: if ``datasetGen`` is exhausted before ``numSteps``
			batches have been drawn.
	"""
	# perf_counter() is a monotonic, high-resolution clock intended for timing
	# short durations; time.time() can jump if the system clock is adjusted
	start = time.perf_counter()
	# draw the requested number of batches; nothing is done with the data
	# because only the retrieval cost is being measured
	for _ in range(numSteps):
		next(datasetGen)
	# return the elapsed time in seconds
	return time.perf_counter() - start

In [6]:
# import the necessary packages
from pyimagesearch.helpers import benchmark
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.datasets import cifar100
from tensorflow.data import AUTOTUNE
import tensorflow as tf

In [7]:
# initialize the batch size (samples per batch) and the number of steps
# (batches drawn per benchmark run)
BS = 64
NUM_STEPS = 5000
# load the CIFAR-100 dataset as (train, test) pairs of image and label arrays
print("[INFO] loading the cifar100 dataset...")
((trainX, trainY), (testX, testY)) = cifar100.load_data()

[INFO] loading the cifar100 dataset...
Downloading data from https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz


In [8]:
# create a standard image generator object (no augmentation or rescaling
# arguments are passed, so it only batches the data)
print("[INFO] creating a ImageDataGenerator object...")
imageGen = ImageDataGenerator()
# iterate over the training arrays in shuffled batches of BS samples
dataGen = imageGen.flow(
	x=trainX, y=trainY,
	batch_size=BS, shuffle=True)

[INFO] creating a ImageDataGenerator object...


In [9]:
# wrap the training arrays in a tf.data.Dataset (one element per sample)
dataset = tf.data.Dataset.from_tensor_slices((trainX, trainY))
# build the data input pipeline
print("[INFO] creating a tf.data input pipeline..")
dataset = (dataset
	# cache() replays exactly the elements it stored, so it has to come
	# BEFORE shuffle(); shuffling first would freeze a single ordering
	# that is then repeated for every epoch
	.cache()
	.shuffle(1024)
	# repeat indefinitely so the benchmark can draw any number of batches
	.repeat()
	.batch(BS)
	# overlap batch preparation with consumption
	.prefetch(AUTOTUNE)
)

[INFO] creating a tf.data input pipeline..


In [10]:
# benchmark the image data generator and display the number of data
# points generated, along with the time taken to perform the
# operation
totalTime = benchmark(dataGen, NUM_STEPS)
# the two literals are concatenated; only the first one carries the
# separating space so the message is not printed with a double space
print("[INFO] ImageDataGenerator generated {} images in "
	"{:.2f} seconds...".format(
	BS * NUM_STEPS, totalTime))

[INFO] ImageDataGenerator generated 320000 images in  6.67 seconds...


In [11]:
# create a dataset iterator, benchmark the tf.data pipeline, and
# display the number of data points generated along with the time taken
# (iter() is needed because benchmark() pulls batches with next())
datasetGen = iter(dataset)
totalTime = benchmark(datasetGen, NUM_STEPS)
print("[INFO] tf.data generated {} images in {:.2f} seconds...".format(
	BS * NUM_STEPS, totalTime))

[INFO] tf.data generated 320000 images in 1.27 seconds...


In [12]:
# run the script shipped with the tutorial archive; its logged output reports
# the same two benchmarks (ImageDataGenerator vs. tf.data) as the cells above
!python reading_from_memory.py

2021-08-07 11:13:12.136862: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[INFO] loading the cifar100 dataset...
[INFO] creating a ImageDataGenerator object...
2021-08-07 11:13:16.306101: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-08-07 11:13:16.317981: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-08-07 11:13:16.318039: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (1c86dac841a5): /proc/driver/nvidia/version does not exist
[INFO] creating a tf.data input pipeline..
[INFO] ImageDataGenerator generated 320000 images in  6.45 seconds...
[INFO] tf.data generated 320000 images in 1.21 seconds...


In [13]:
from pyimagesearch.helpers import benchmark
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.data import AUTOTUNE
from imutils import paths
import tensorflow as tf
import numpy as np
import argparse
import os

In [14]:
def load_images(imagePath):
	"""Load one image file and derive its integer class label from the path.

	Args:
		imagePath: string tensor holding the path of an image whose parent
			directory name is the class label.

	Returns:
		A ``(image, encodedLabel)`` tuple: the decoded 3-channel image resized
		to 96x96 and divided by 255.0, and the index of the parent directory
		name within the module-level ``classNames`` array.
	"""
	# load the raw file contents and decode them into a 3-channel image
	rawBytes = tf.io.read_file(imagePath)
	pixels = tf.image.decode_png(rawBytes, channels=3)
	# resize to the fixed input size and scale intensities to [0, 1]
	pixels = tf.image.resize(pixels, (96, 96)) / 255.0
	# the class name is the second-to-last path component
	folderName = tf.strings.split(imagePath, os.path.sep)[-2]
	# comparing against every class name gives a boolean mask; argmax picks
	# the position of the match, i.e. the integer label
	classIndex = tf.argmax(folderName == classNames)
	return (pixels, classIndex)

In [15]:
# initialize batch size and number of steps
BS = 64
NUM_STEPS = 1000
# root folder of the image dataset; each class lives in its own sub-directory
DATASET_DIR = r'/content/tfdata-intro/fruits'
# grab the list of images in our dataset directory and grab all
# unique class names
print("[INFO] loading image paths...")
imagePaths = list(paths.list_images(DATASET_DIR))
# keep only sub-directories: a stray file in the dataset root (e.g. a hidden
# metadata file) would otherwise be treated as a class and shift label indices
classNames = np.array(sorted(
	entry for entry in os.listdir(DATASET_DIR)
	if os.path.isdir(os.path.join(DATASET_DIR, entry))))

[INFO] loading image paths...


In [16]:
print("[INFO] creating a tf.data input pipeline..")
dataset = tf.data.Dataset.from_tensor_slices(imagePaths)
dataset = (dataset
	# decode/resize the images in parallel
	.map(load_images, num_parallel_calls=AUTOTUNE)
	# cache the decoded images so the files are only read once; cache()
	# replays exactly the elements it stored, so shuffle() must come AFTER
	# it -- shuffling first would freeze one ordering for every epoch
	.cache()
	.shuffle(1024)
	# repeat indefinitely so the benchmark can draw any number of batches
	.repeat()
	.batch(BS)
	# overlap batch preparation with consumption
	.prefetch(AUTOTUNE)
)

[INFO] creating a tf.data input pipeline..


In [18]:
# create a standard image generator object; rescale=1.0/255 corresponds to
# the /255.0 scaling applied in load_images for the tf.data pipeline
print("[INFO] creating a ImageDataGenerator object...")
imageGen = ImageDataGenerator(rescale=1.0/255)
# read images from the class sub-directories, resized to 96x96 RGB, in
# batches of BS
# NOTE(review): class_mode="categorical" presumably yields one-hot labels while
# load_images returns integer labels -- irrelevant for timing, but confirm
# before reusing either pipeline for training
dataGen = imageGen.flow_from_directory(
	r'/content/tfdata-intro/fruits',
	target_size=(96, 96),
	batch_size=BS,
	class_mode="categorical",
	color_mode="rgb")

[INFO] creating a ImageDataGenerator object...
Found 6688 images belonging to 7 classes.


In [19]:
# benchmark the image data generator and display the number of data
# points generated, along with the time taken to perform the
# operation
totalTime = benchmark(dataGen, NUM_STEPS)
# the two literals are concatenated; only the first one carries the
# separating space so the message is not printed with a double space
print("[INFO] ImageDataGenerator generated {} images in "
	"{:.2f} seconds...".format(
	BS * NUM_STEPS, totalTime))

[INFO] ImageDataGenerator generated 64000 images in  328.71 seconds...


In [20]:
# create a dataset iterator, benchmark the tf.data pipeline, and
# display the number of data points generated, along with the time
# taken (iter() is needed because benchmark() pulls batches with next())
datasetGen = iter(dataset)
totalTime = benchmark(datasetGen, NUM_STEPS)
print("[INFO] tf.data generated {} images in {:.2f} seconds...".format(
	BS * NUM_STEPS, totalTime))

[INFO] tf.data generated 64000 images in 15.18 seconds...


In [None]:
python reading_from_disk.py --dataset fruits

# Data pipelines with tf.data and TensorFlow

https://www.pyimagesearch.com/2021/06/21/data-pipelines-with-tf-data-and-tensorflow/

In [26]:
!wget 'https://pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com/tfdata-pipelines/tfdata-pipelines.zip'

--2021-08-07 11:26:28--  https://pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com/tfdata-pipelines/tfdata-pipelines.zip
Resolving pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com (pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com)... 52.218.217.225
Connecting to pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com (pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com)|52.218.217.225|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46749 (46K) [application/zip]
Saving to: ‘tfdata-pipelines.zip’


2021-08-07 11:26:28 (554 KB/s) - ‘tfdata-pipelines.zip’ saved [46749/46749]



In [28]:
os.chdir(r'/content')
!unzip 'tfdata-pipelines.zip'

Archive:  tfdata-pipelines.zip
   creating: tfdata-pipelines/
  inflating: tfdata-pipelines/build_dataset.py  
   creating: tfdata-pipelines/datasets/
  inflating: tfdata-pipelines/plot.png  
   creating: tfdata-pipelines/pyimagesearch/
 extracting: tfdata-pipelines/pyimagesearch/__init__.py  
  inflating: tfdata-pipelines/pyimagesearch/cancernet.py  
  inflating: tfdata-pipelines/pyimagesearch/config.py  
  inflating: tfdata-pipelines/train_model.py  


In [29]:
# work from the extracted project folder so the relative `datasets/...` paths
# used by the following shell commands resolve inside it
os.chdir(r'/content/tfdata-pipelines')

In [30]:
!mkdir datasets

mkdir: cannot create directory ‘datasets’: File exists


In [31]:
!mkdir datasets/orig

In [32]:
os.chdir('/content/tfdata-pipelines/datasets/orig')

!wget 'https://www.kaggle.com/paultimothymooney/breast-histopathology-images/download'

--2021-08-07 11:29:12--  https://www.kaggle.com/paultimothymooney/breast-histopathology-images/download
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /account/login?titleType=dataset-downloads&showDatasetDownloadSkip=False&messageId=datasetsWelcome&returnUrl=%2Fpaultimothymooney%2Fbreast-histopathology-images%3Fresource%3Ddownload [following]
--2021-08-07 11:29:12--  https://www.kaggle.com/account/login?titleType=dataset-downloads&showDatasetDownloadSkip=False&messageId=datasetsWelcome&returnUrl=%2Fpaultimothymooney%2Fbreast-histopathology-images%3Fresource%3Ddownload
Reusing existing connection to www.kaggle.com:443.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘download’

download                [<=>                 ]       0  --.-KB/s               download                [ <=>          

# Data augmentation with tf.data and TensorFlow

https://www.pyimagesearch.com/2021/06/28/data-augmentation-with-tf-data-and-tensorflow/

In [None]:
!wget 'https://pyimagesearch-code-downloads.s3-us-west-2.amazonaws.com/tfdata-data-augmentation/tfdata-data-augmentation.zip'