# Setup

In [33]:
# Install packages and frameworks

# uncomment below if using a notebook with a sagemaker notebook instance lifecycle config
#! pip install -U pip --quiet
#! pip install -r ../requirements.txt --quiet

import tensorflow as tf
import os
import shutil

# debugging code "Cleanup Called..." gets displayed if get_logger is not set
# the below code suppresses the "Cleanup Called..." output
tf.get_logger().setLevel('INFO')

# expecting 2.11
# if 2.7, than logging errors will show "Cleanup called..."
print(tf.__version__)

2.11.1


In [39]:
# scratch directory is apart of the .gitignore to ensure it is not committed to git
%env SCRATCH=../scratch
! [ -e "${SCRATCH}" ] || mkdir -p "${SCRATCH}"

scratch_path = os.environ.get('SCRATCH', './scratch')

env: SCRATCH=../scratch


## Cleanup original training data

In [43]:
# path
path = scratch_path + '/train'

if os.path.exists(path) and os.path.isdir(path):
    # Directory exists, execute your code here
    print("Directory exists. Removing...")
    shutil.rmtree(path)
    # Your code goes here
else:
    print("Directory does not exist.")

Directory exists. Removing...


# Split the data into Train, Validation and Test

Keras utility generates a dataset in tf.data.Dataset format from image files in a directory and infers the labels based on the parent folder. This utility will return a tf.data.Dataset that yields batches of images from the subdirectories left and right

```
train_lr/
├── left/
│   ├── a_image_1.jpg
│   └── a_image_2.jpg
└── right/
    ├── b_image_1.jpg
    └── b_image_2.jpg
```

In [44]:
# set variables for consistency
img_height = 96              # desired height
img_width = 96               # desired width
batch_size = 32              # batch inputs in 32
seed_train_validation = 42   # Must be same for train_ds and val_ds
validation_split = 0.3       # move 30% of the data into validation

## Create some new directories to save our prepared datasets

In [45]:
! mkdir -p "${SCRATCH}"/tf_datasets/{train,validate,test}

## Create Train

Train is the sample of data used to fit the model. Let's generate a tf.data.Dataset from the processed training examples and infer the labels from the directory structure.

In [46]:
# in order for keras to infer the labels, you cannot have any "extra" subdirectories that do not match your expected labels

!rm -rf scratch_path + '/train_lr/.ipynb_checkpoints'

In [54]:
# create the training dataset
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    scratch_path + '/train_lr',
    labels='inferred',
    label_mode = "categorical", 
    class_names=['left','right'],
    color_mode="grayscale",
    batch_size=batch_size,
    image_size=(img_height, img_width),
    shuffle=True, 
    seed=seed_train_validation,
    validation_split=validation_split,
    subset='training'
)

# save the dataset
dir = scratch_path + "/tf_datasets/train"
tf.data.Dataset.save(train_ds, dir, compression=None, shard_func=None, checkpoint_args=None)

Found 17955 files belonging to 2 classes.
Using 12569 files for training.


## Create Validation

Validation is the sample of data used to provide an unbiased evaluation of a model fit on the training dataset while tuning model hyperparameters. The evaluation becomes more biased as skill on the validation dataset is incorporated into the model configuration.

In [50]:
# create the validation dataset
validation_ds = tf.keras.preprocessing.image_dataset_from_directory(
    scratch_path + '/train_lr',
    labels='inferred',
    label_mode = "categorical", 
    class_names=['left','right'],
    color_mode="grayscale",
    batch_size=batch_size,
    image_size=(img_height, img_width),
    shuffle=True, 
    seed=seed_train_validation,
    validation_split=validation_split,
    subset='validation'
)

# save the dataset
dir = scratch_path + "/tf_datasets/validate"
tf.data.Dataset.save(validation_ds, dir, compression=None, shard_func=None, checkpoint_args=None)

Found 17955 files belonging to 2 classes.
Using 5386 files for validation.


## Create Test

The sample of data used to provide an unbiased evaluation of a final model fit on the training dataset.

In [51]:
# create the test dataset
test_ds = validation_ds.take(16)
validation_ds = validation_ds.skip(16)

# save the datasets
dir = scratch_path + "/tf_datasets/test"
tf.data.Dataset.save(test_ds, dir, compression=None, shard_func=None, checkpoint_args=None)

You now have a train, validation, and test dataset written to a directory. tf.data.Dataset.save() is used to save the dataset to the specified save_dir. Make sure to provide a valid path to the directory where you want to save the dataset. The dataset will be saved in a sharded file format.

Later, if you want to load the saved dataset, you can use tf.data.Dataset.load()

## Print the Dataset batches

In [52]:
# reserves 393 batches training
print('70% for training -->', train_ds.cardinality())
# reserves 164 batches validation
print('20% for validating -->', validation_ds.cardinality())
# reserves 5 batches testing
print('10% for testing -->', test_ds.cardinality())

70% for training --> tf.Tensor(393, shape=(), dtype=int64)
20% for validating --> tf.Tensor(153, shape=(), dtype=int64)
10% for testing --> tf.Tensor(16, shape=(), dtype=int64)


## Print Inferred Dataset Classes

In [53]:
# display the class names inferred from the training dataset
class_names = train_ds.class_names
print(class_names)

['left', 'right']


# Apply augmentation
When you don't have a large image dataset or when your images are all set in a single direction like ours are, it's a good practice to artificially introduce sample diversity by applying random, yet realistic, transformations to the training images, such as rotation and horizontal flipping. This helps expose the model to different aspects of the training data and reduce over-fitting.

Learn more https://www.tensorflow.org/tutorials/images/data_augmentation

In [21]:
data_augmentation = tf.keras.Sequential([

  # randomly rotates images during training
  tf.keras.layers.RandomRotation(
    # a float represented as fraction of 2 Pi, or a tuple of size 2 representing lower and upper bound for rotating clockwise and counter-clockwise. 
    0.2,                     # A positive values means rotating counter clock-wise, while a negative value means clock-wise. 
    fill_mode='constant',    # Points outside the boundaries of the input are filled according to the given mode (one of {"constant", "reflect", "wrap", "nearest"}).
    interpolation='nearest', # Supported values: "nearest", "bilinear".
    seed=None,               # Integer. Used to create a random seed.
    fill_value=0.0           # the value to be filled outside the boundaries when fill_mode="constant".
),
])

for image, _ in train_ds.take(1):
  plt.figure(figsize=(10, 10))
  first_image = image[2]
  for i in range(10):
    ax = plt.subplot(5, 5, i + 1)
    augmented_image = data_augmentation(tf.expand_dims(first_image, 0))
    plt.imshow(augmented_image[0] / 1, cmap='gray')
    plt.axis('off')
    
clear_output()