# [Part 1] Data Augmentation Techniques With (tf.data)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import zipfile
import shutil

## Extracting and Exploring the (Training) and (Testing) Data

In [None]:
# Unpack the two competition archives, training data first and testing
# data second, each into its own working directory.
for archive, extracted in (
    ('../input/dogs-vs-cats/train.zip', './train'),
    ('../input/dogs-vs-cats/test1.zip', './test'),
):
    with zipfile.ZipFile(archive, 'r') as zip_ref:
        zip_ref.extractall(extracted)

In [None]:
# Exploring Training Data: show the first six entries of './train/train'
# (presumably the folder train.zip unpacks into — verify after extraction).
os.listdir('./train/train')[:6]

In [None]:
# Exploring Testing Data: show the first six entries of './test/test1'
# (presumably the folder test1.zip unpacks into — verify after extraction).
os.listdir('./test/test1')[:6]

## Organizing The (Training) And (Testing) Data

In [None]:
# Folder the training images are read from, and the root under which the
# per-label sub-folders are created when the data is re-organised below.
ORGN_TRAIN_PATH = './train/train'
DIST_TRAIN_PATH = './train'

# Folder the testing images are read from, and the folder they are moved to.
ORGN_TEST_PATH = './test/test1'
DIST_TEST_PATH = './test'

In [None]:
# Sort the flat training folder into one sub-folder per class label.
# File names are expected to look like "<label>.<rest>" (e.g. "cat.0.jpg"
# becomes "<DIST_TRAIN_PATH>/cat/0.jpg").
for image in os.listdir(ORGN_TRAIN_PATH):
    # Split on the first dot only, so the label prefix may have any length
    # (the previous `image[4:]` silently assumed a 3-character label).
    label, _, fileName = image.partition('.')

    labelPath = os.path.join(DIST_TRAIN_PATH, label)
    imgPath = os.path.join(labelPath, fileName)

    # Create the label folder on first use; no-op when it already exists.
    os.makedirs(labelPath, exist_ok=True)

    # Move instead of copy + delete: same end state, but avoids duplicating
    # every image on disk when source and destination share a filesystem.
    shutil.move(os.path.join(ORGN_TRAIN_PATH, image), imgPath)

# The source folder must be empty by now; rmdir raises if anything is left.
os.rmdir(ORGN_TRAIN_PATH)
os.listdir(DIST_TRAIN_PATH)

In [None]:
# Flatten the testing data: move every file from the nested source folder
# up into DIST_TEST_PATH, then drop the now-empty source folder.
for image in os.listdir(ORGN_TEST_PATH):
    imgPath = os.path.join(DIST_TEST_PATH, image)

    # Move instead of copy + delete: same end state without writing each
    # image twice when both folders live on the same filesystem.
    shutil.move(os.path.join(ORGN_TEST_PATH, image), imgPath)

# rmdir raises if the source folder is unexpectedly not empty.
os.rmdir(ORGN_TEST_PATH)
os.listdir(DIST_TEST_PATH)[:3]

## Implementing data augmentation with tf.data and TensorFlow

In [None]:
!pip install imutils

In [None]:
# import the necessary packages
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.data import AUTOTUNE
from imutils import paths
import matplotlib.pyplot as plt
import tensorflow as tf
import argparse
import os

In [None]:
def load_images(imagePath):
    """Load one image from disk and derive its label from the path.

    Args:
        imagePath: string tensor holding the path of a JPEG file whose
            parent directory name is the class label.

    Returns:
        Tuple ``(image, label)``: the decoded 3-channel image converted to
        float32 and resized to 156x156, and the parent-directory name as a
        string tensor.
    """
    raw_bytes = tf.io.read_file(imagePath)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    pixels = tf.image.convert_image_dtype(pixels, dtype=tf.float32)
    pixels = tf.image.resize(pixels, (156, 156))

    # ".../<label>/<file>" -> the second-to-last path component is the label
    path_parts = tf.strings.split(imagePath, os.path.sep)
    label = path_parts[-2]

    return (pixels, label)

In [None]:
# Layer-based augmentation pipeline: random flips, a random zoom, and a
# random rotation.
aug = tf.keras.Sequential([
    preprocessing.RandomFlip("horizontal_and_vertical"),
    # Factor tuples are documented as (lower, upper); the bounds were
    # previously written in reverse order. Negative values zoom in.
    preprocessing.RandomZoom(
        height_factor=(-0.15, -0.05),
        width_factor=(-0.15, -0.05)),
    preprocessing.RandomRotation(0.3)
])

In [None]:
def augment_using_layers(images, labels):
    """Run images through the Keras-layer augmentation pipeline ``aug``.

    Args:
        images: image tensor(s) produced by the input pipeline.
        labels: matching labels; passed through untouched.

    Returns:
        Tuple ``(augmented_images, labels)``.
    """
    # Request training mode explicitly so the random layers stay active
    # when this function is traced inside ``Dataset.map`` (outside of fit),
    # rather than relying on each layer's default for ``training``.
    images = aug(images, training=True)
    return (images, labels)

def augment_using_ops(images, labels):
    """Augment images with plain ``tf.image`` operations.

    Applies a random left/right flip, then a random up/down flip, then a
    ``tf.image.rot90`` rotation (always applied, not random).

    Args:
        images: image tensor(s) produced by the input pipeline.
        labels: matching labels; passed through untouched.

    Returns:
        Tuple ``(augmented_images, labels)``.
    """
    flipped = tf.image.random_flip_up_down(
        tf.image.random_flip_left_right(images))
    rotated = tf.image.rot90(flipped)
    return (rotated, labels)

In [None]:
# set the batch size
BATCH_SIZE = 8
# grabs all image paths
imagePaths = list(paths.list_images(DIST_TRAIN_PATH))
if not imagePaths:
    # shuffle() needs a positive buffer size; fail early with a clear message
    raise ValueError("no images found under {!r}".format(DIST_TRAIN_PATH))
# build our dataset and data input pipeline
print("[INFO] loading the dataset...")


def _build_pipeline(image_paths, batch_size, augment_fn=None):
    """Build shuffle -> load -> cache -> batch -> [augment] -> prefetch.

    The augmentation is mapped *after* ``cache()`` and ``batch()``:
    caching after a random transform would store one fixed set of
    augmented images and replay it on every later pass, and the augment
    helpers are written to operate on batches.

    Args:
        image_paths: list of image file paths.
        batch_size: number of images per batch.
        augment_fn: optional ``(images, labels) -> (images, labels)``
            function applied to each batch; ``None`` means no augmentation.

    Returns:
        The assembled ``tf.data.Dataset``.
    """
    ds = (tf.data.Dataset.from_tensor_slices(image_paths)
        .shuffle(len(image_paths), seed=42)
        .map(load_images, num_parallel_calls=AUTOTUNE)
        .cache()
        .batch(batch_size)
    )
    if augment_fn is not None:
        ds = ds.map(augment_fn, num_parallel_calls=AUTOTUNE)
    return ds.prefetch(AUTOTUNE)


# Original Dataset
orgDS = _build_pipeline(imagePaths, BATCH_SIZE)

# Augmented Dataset Using Layer augmentation
layersDS = _build_pipeline(imagePaths, BATCH_SIZE, augment_using_layers)

# Augmented Dataset Using (Manually Created Function) augmentation
opsDS = _build_pipeline(imagePaths, BATCH_SIZE, augment_using_ops)

In [None]:
# pull the first batch out of each of the three pipelines, in order
orgBatch, layersBatch, opsBatch = (
    next(iter(ds)) for ds in (orgDS, layersDS, opsDS)
)

In [None]:
# initialize a figure
print("[INFO] visualizing the first batch of the dataset...")
def plotting (batch, Type):
    """Show every image of one batch in a grid, titled with its label.

    Args:
        batch: ``(images, labels)`` pair as yielded by the datasets above;
            ``labels`` are byte-string tensors.
        Type: text inserted into the figure title, e.g. ``'(Layers)'``.
    """
    images, labels = batch[0], batch[1]
    # Size the grid from the batch itself rather than the global BATCH_SIZE,
    # which is rebound later in the notebook and may not match a short batch.
    count = int(images.shape[0])
    if count == 0:
        return
    cols = min(4, count)
    rows = -(-count // cols)  # ceiling division

    title = "With data augmentation {} applied".format(Type)
    # An 8-image batch keeps the previous 2x4 grid on an 8x8-inch figure.
    fig = plt.figure(figsize=(2 * cols, 4 * rows))
    fig.suptitle(title)
    for i in range(count):
        # one subplot per image, titled with the decoded label
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.imshow(images[i].numpy())
        ax.set_title(labels[i].numpy().decode("UTF-8"))
        ax.axis("off")
    # show the plot
    plt.tight_layout()
    plt.show()

In [None]:
# original batch first, then the layer-augmented one
for _batch, _tag in ((orgBatch, '(Original)'), (layersBatch, '(Layers)')):
    plotting(_batch, _tag)

In [None]:
# original batch first, then the ops-augmented one
for _batch, _tag in ((orgBatch, '(Original)'), (opsBatch, '(Ops)')):
    plotting(_batch, _tag)

# [Part 2] Implementing Our Data Augmentation Training Script With tf.data

In [None]:
# import the necessary packages
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dropout
from tensorflow.keras import Sequential
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.layers.experimental import preprocessing
import matplotlib.pyplot as plt
import tensorflow as tf
import argparse

In [None]:
# training hyperparameters (BATCH_SIZE is rebound here for Part 2)
BATCH_SIZE, EPOCHS = 64, 10
# fetch the CIFAR-10 train/test splits
print("[INFO] loading training data...")
(trainX, trainLabels), (testX, testLabels) = cifar10.load_data()

In [None]:
# initialize our sequential data augmentation pipeline for training:
# rescale pixels by 1/255, then random flip, zoom and rotation
trainAug = Sequential([
    preprocessing.Rescaling(scale=1.0 / 255),
    preprocessing.RandomFlip("horizontal_and_vertical"),
    # Factor tuples are documented as (lower, upper); the bounds were
    # previously written in reverse order. Negative values zoom in.
    preprocessing.RandomZoom(
        height_factor=(-0.15, -0.05),
        width_factor=(-0.15, -0.05)),
    preprocessing.RandomRotation(0.3)
])
# initialize a second data augmentation pipeline for testing (this
# one will only do pixel intensity rescaling)
testAug = Sequential([
    preprocessing.Rescaling(scale=1.0 / 255)
])

In [None]:
# prepare the training data pipeline: shuffle, batch, then apply the
# augmentation layers to each batch
trainDS = tf.data.Dataset.from_tensor_slices((trainX, trainLabels))
trainDS = (
    trainDS
    .shuffle(BATCH_SIZE * 100)
    .batch(BATCH_SIZE)
    # training=True is passed explicitly so the random layers stay active
    # when traced inside Dataset.map, independent of per-layer defaults
    .map(lambda x, y: (trainAug(x, training=True), y),
         num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)
# create our testing data pipeline (notice this time that we do *not*
# apply data augmentation, only the rescaling in testAug)
testDS = tf.data.Dataset.from_tensor_slices((testX, testLabels))
testDS = (
    testDS
    .batch(BATCH_SIZE)
    .map(lambda x, y: (testAug(x), y),num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# build a small CNN: one CONV block, one double-CONV block, a single
# fully-connected block, and a 10-way softmax classifier
print("[INFO] initializing model...")
model = Sequential([
    # CONV => RELU => BN => POOL => DROPOUT
    Conv2D(32, (3, 3), padding="same",input_shape=(32, 32, 3)),
    Activation("relu"),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # (CONV => RELU => BN) * 2 => POOL => DROPOUT
    Conv2D(64, (3, 3), padding="same"),
    Activation("relu"),
    BatchNormalization(),
    Conv2D(64, (3, 3), padding="same"),
    Activation("relu"),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    # FC => RELU => BN => DROPOUT
    Flatten(),
    Dense(256),
    Activation("relu"),
    BatchNormalization(),
    Dropout(0.5),
    # softmax classifier
    Dense(10),
    Activation("softmax"),
])

In [None]:
# configure loss, optimizer and the metric to track
print("[INFO] compiling model...")
model.compile(
    optimizer="sgd",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# fit on the training pipeline, validating on the test pipeline each epoch
print("[INFO] training model...")
H = model.fit(trainDS, epochs=EPOCHS, validation_data=testDS)
# report the final accuracy on the testing set
loss, accuracy = model.evaluate(testDS)
print("[INFO] accuracy: {:.2f}%".format(accuracy * 100))

In [None]:
def plot_training(H, N, plotPath):
    """Plot training/validation accuracy and loss side by side.

    Args:
        H: object with a ``history`` dict holding the "loss", "val_loss",
            "accuracy" and "val_accuracy" series (as returned by
            ``model.fit``).
        N: number of epochs, i.e. the length of each series.
        plotPath: file path to save the figure to. ``None`` (or the string
            ``"None"``, which the existing call in this notebook passes)
            means the figure is not saved.
    """
    # The style has to be selected before the axes are created to affect
    # them; backend selection (%matplotlib) is left to the notebook level.
    plt.style.use("ggplot")
    fig, (accuracy, loss) = plt.subplots(1, 2)
    fig.suptitle('Training Loss and Accuracy')

    epochs = np.arange(0, N)

    loss.plot(epochs, H.history["loss"], label="train_loss")
    loss.plot(epochs, H.history["val_loss"], label="val_loss")
    loss.set_title('Training Loss')
    loss.set_xlabel('Epoch #')
    loss.set_ylabel('Loss')
    loss.legend(loc="upper right")

    accuracy.plot(epochs, H.history["accuracy"], label="train_acc")
    accuracy.plot(epochs, H.history["val_accuracy"], label="val_acc")
    accuracy.set_title('Training Accuracy')
    accuracy.set_xlabel('Epoch #')
    accuracy.set_ylabel('Accuracy')
    accuracy.legend(loc="lower right")

    # Lay out once everything is drawn, then widen the gap between panels.
    fig.tight_layout()
    fig.subplots_adjust(wspace = 0.5)

    # Save only when a real path was supplied.
    if plotPath is not None and plotPath != "None":
        fig.savefig(plotPath)

In [None]:
plot_training(H=H, N=EPOCHS, plotPath="None")