# Setup

In [None]:
# Import packages and frameworks

import tensorflow as tf
import os
import shutil
import matplotlib.pyplot as plt

# Print the installed TensorFlow version; this notebook expects 2.11.
# If it is 2.7, then logging errors will show "Cleanup called..."
print(tf.__version__)

In [None]:
# The scratch directory is a part of the .gitignore to ensure it is not committed to git.
# %env stores the location in the SCRATCH environment variable so that both
# the shell lines ("${SCRATCH}") and the Python code below can read it.
%env SCRATCH=../scratch
# Create the scratch directory only if nothing exists at that path yet.
! [ -e "${SCRATCH}" ] || mkdir -p "${SCRATCH}"

# Python-side copy of the same location; './scratch' is only used as a
# fallback when SCRATCH is not set in the environment.
scratch_path = os.environ.get('SCRATCH', './scratch')

## Cleanup original training data

In [None]:
# Location of the original training data inside the scratch directory.
path = scratch_path + '/train'

# os.path.isdir() is already False for a path that does not exist, so a
# separate existence check is not needed.
if not os.path.isdir(path):
    print("Directory does not exist.")
else:
    # Delete the directory together with everything below it.
    print("Directory exists. Removing...")
    shutil.rmtree(path)

# Split the data into Train, Validation and Test

The Keras utility `image_dataset_from_directory` generates a dataset in `tf.data.Dataset` format from image files in a directory and infers the labels from the names of the parent folders. Here it returns a `tf.data.Dataset` that yields batches of images from the subdirectories `left` and `right`:

```
train_lr/
├── left/
│   ├── a_image_1.jpg
│   └── a_image_2.jpg
└── right/
    ├── b_image_1.jpg
    └── b_image_2.jpg
```

In [None]:
# Shared settings, defined once so the dataset cells stay consistent.

# desired image height and width
img_height = 96
img_width = 96

# number of inputs per batch
batch_size = 32

# must be the same for train_ds and val_ds
seed_train_validation = 42

# fraction of the data moved into validation (30%)
validation_split = 0.3

## Create some new directories to save our prepared datasets

In [None]:
# Create one output directory per split (train, validate, test) under $SCRATCH/tf_datasets.
# NOTE(review): the {train,validate,test} brace expansion is a bash feature —
# confirm the shell that runs `!` lines in this environment supports it.
! mkdir -p "${SCRATCH}"/tf_datasets/{train,validate,test}

## Create Train

Train is the sample of data used to fit the model. Let's generate a tf.data.Dataset from the processed training examples and infer the labels from the directory structure.

In [None]:
# In order for Keras to infer the labels, the image directory must not contain
# any "extra" subdirectories that do not match the expected labels.
#
# This used to be a `!rm -rf scratch_path + '...'` shell line. A `!` line is
# handed to the shell as text, so the Python expression was never evaluated:
# rm received the literal words "scratch_path", "+" and the absolute path
# "/train_lr/.ipynb_checkpoints". Remove the directory from Python instead;
# ignore_errors=True keeps the "fine if it is already gone" behaviour of rm -rf.
shutil.rmtree(scratch_path + '/train_lr/.ipynb_checkpoints', ignore_errors=True)

The dataset is saved in multiple file "shards". By default, the dataset output is divided into shards in a round-robin fashion, but custom sharding can be specified via the `shard_func` argument of `save()`. For example, a `shard_func` that returns the same shard index for every element saves the dataset as a single shard.

In [None]:
# Create the training dataset from the images under <scratch>/train_lr.
# subset='training' selects the training share of the validation_split.
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    scratch_path + '/train_lr',
    labels='inferred',                    # labels come from the subdirectory names
    label_mode='categorical',
    class_names=['left', 'right'],
    color_mode='grayscale',
    batch_size=batch_size,
    image_size=(img_height, img_width),
    shuffle=True,
    seed=seed_train_validation,           # must match the seed used for validation
    validation_split=validation_split,
    subset='training',
)

# Save the dataset. The target path gets a descriptive name instead of `dir`,
# which would shadow the Python builtin for the rest of the notebook session.
# Optional save() arguments such as compression= and shard_func= are left at
# their defaults.
train_save_dir = scratch_path + '/tf_datasets/train'
train_ds.save(train_save_dir)

## Create Validation

Validation is the sample of data used to provide an unbiased evaluation of a model fit on the training dataset while tuning model hyperparameters. The evaluation becomes more biased as skill on the validation dataset is incorporated into the model configuration.

In [None]:
# Create the validation dataset from the same images under <scratch>/train_lr.
# subset='validation' selects the held-out share of the validation_split.
validation_ds = tf.keras.preprocessing.image_dataset_from_directory(
    scratch_path + '/train_lr',
    labels='inferred',                    # labels come from the subdirectory names
    label_mode='categorical',
    class_names=['left', 'right'],
    color_mode='grayscale',
    batch_size=batch_size,
    image_size=(img_height, img_width),
    shuffle=True,
    seed=seed_train_validation,           # must match the seed used for training
    validation_split=validation_split,
    subset='validation',
)

# Save the dataset. The target path gets a descriptive name instead of `dir`,
# which would shadow the Python builtin for the rest of the notebook session.
# Optional save() arguments such as compression= and shard_func= are left at
# their defaults.
validate_save_dir = scratch_path + '/tf_datasets/validate'
validation_ds.save(validate_save_dir)

## Create Test

Test is the sample of data used to provide an unbiased evaluation of a final model fit on the training dataset.

In [None]:
# Create the test dataset by carving it out of the validation dataset: the
# first 16 batches become test data and validation keeps everything after them.
# NOTE(review): the validation snapshot on disk was written by the validation
# cell, before this split, so it still includes these 16 batches — confirm
# whether the saved validation data should be regenerated after the split.
# NOTE(review): validation_ds was created with shuffle=True; confirm that
# take() and skip() see the same batch order on every pass, otherwise the two
# sets can overlap.
test_ds = validation_ds.take(16)
validation_ds = validation_ds.skip(16)

# Save the test dataset. The target path gets a descriptive name instead of
# `dir`, which would shadow the Python builtin for the rest of the session.
# Optional save() arguments such as compression= and shard_func= are left at
# their defaults.
test_save_dir = scratch_path + '/tf_datasets/test'
test_ds.save(test_save_dir)

You now have a train, validation, and test dataset written to a directory. tf.data.Dataset.save() is used to save the dataset to the specified save_dir. Make sure to provide a valid path to the directory where you want to save the dataset. The dataset will be saved in a sharded file format.

Later, if you want to load the saved dataset, you can use tf.data.Dataset.load()

## Print the Dataset batches

In [None]:
# Report how many batches ended up in each split and their share of the total.
# The shares are computed from the datasets themselves: the previous labels
# hard-coded 70% / 20% / 10%, which does not follow from a validation split of
# 0.3 with a fixed 16 batches taken out for testing.
# Assumes every cardinality is known (non-negative) — TODO confirm; tf.data
# reports negative sentinel values for unknown or infinite datasets.
split_batches = {
    'training': int(train_ds.cardinality()),
    'validating': int(validation_ds.cardinality()),
    'testing': int(test_ds.cardinality()),
}
total_batches = sum(split_batches.values())

for split_name, n_batches in split_batches.items():
    # Guard against an empty dataset so the report never divides by zero.
    share = n_batches / total_batches if total_batches > 0 else 0.0
    print(f'{share:.0%} for {split_name} -->', n_batches, 'batches')

## Print Inferred Dataset Classes

In [None]:
# Display the class names attached to the training dataset. They were passed
# explicitly as class_names=['left', 'right'] when train_ds was created.
class_names = train_ds.class_names
print(class_names)