Reference- "Input Pipeline for Images using Keras and TensorFlow - Guide to creating an input pipeline for custom image dataset for deep learning models using Keras and TensorFlow" by Renu Khandelwal (Aug 21)
https://towardsdatascience.com/input-pipeline-for-images-using-keras-and-tensorflow-c5e107b6d7b9

### Import modules

In [None]:
#%matplotlib inline

import numpy as np
import os
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn import metrics
from PIL import Image
import cv2

### Set up GPU memory growth

In [None]:
### Taken from- https://github.com/tensorflow/tensorflow/issues/34695
### Workaround for the error: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above. [Op:Conv2D]
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Every physical GPU must get the same memory-growth setting.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs")
    except RuntimeError as err:
        # Raised when memory growth is requested after the GPUs were initialized.
        print(err)


### Set parameters

In [None]:
# Full dataset
# Root directories of the training and test images; each is expected to
# contain one sub-directory per class (see the '/*/*' patterns used later).
train_data_dir=r'data_mip3_aug/train'
test_data_dir=r'data_mip3_aug/test'

# Calculation of max batch size-
# Max batch size= available GPU memory bytes / 4 / (size of tensors + trainable parameters)
# size of tensors = batch_size*100*100 *4 if using 64 bit integers
# NOTE(review): a factor of 4 bytes per value matches 32-bit values, not
# 64-bit ones — confirm which element size was intended here.

batch_size = 256  # batch size for the training and validation pipelines
test_batch_size = 256  # batch size for the test pipeline
img_height = 100  # images are resized to img_height x img_width
img_width = 100
# Constants used by standardize_dataset as (image - mean) / std.
# Presumably dataset-wide pixel statistics — TODO confirm how they were computed.
mean = 157.1
std = 64.6
AUTOTUNE = tf.data.experimental.AUTOTUNE  # lets tf.data choose parallelism/buffer sizes

### Create filelist datasets
### tf.data.Dataset.list_files() creates a dataset from a directory list of files using a matching pattern.

In [None]:
# File-path dataset over everything two levels below the training directory,
# listed without shuffling.
train_list_ds = tf.data.Dataset.list_files(f"{train_data_dir}/*/*", shuffle=False)
# Count the image files by summing the number of entries in each class directory.
train_image_count = sum(
    len(os.listdir(os.path.join(train_data_dir, class_dir)))
    for class_dir in os.listdir(train_data_dir)
)
# Shuffle once with a buffer covering the whole list; the order is then kept
# fixed across iterations so later skip()/take() splits stay disjoint.
train_list_ds = train_list_ds.shuffle(train_image_count, reshuffle_each_iteration=False)
print(train_image_count)

In [None]:
# Show the first five file paths of the shuffled training file list.
for x in train_list_ds.take(5):
    print(str(x.numpy(), 'utf-8'))

In [None]:
# File-path dataset over everything two levels below the test directory,
# listed without shuffling.
test_list_ds = tf.data.Dataset.list_files(f"{test_data_dir}/*/*", shuffle=False)
# Count the image files in the test directory, one class directory at a time.
test_image_count = sum(
    len(os.listdir(os.path.join(test_data_dir, class_dir)))
    for class_dir in os.listdir(test_data_dir)
)
# Shuffle once; the order is kept fixed across iterations of this dataset.
# NOTE(review): no seed is given, so the order may differ between sessions —
# confirm this is compatible with any on-disk cache built from this list.
test_list_ds = test_list_ds.shuffle(test_image_count, reshuffle_each_iteration=False)
print(test_image_count)

In [None]:
# Use this function to list files from a filelist dataset, such as test_list_ds.
# This may be useful later
def list_files_from_tfdataset(tfd):
    """Return the elements of `tfd` as a list of UTF-8 decoded strings.

    Each element must provide a ``numpy()`` method returning ``bytes``
    (as the string tensors of a file-list dataset do).
    """
    decoded_paths = []
    for element in tfd:
        decoded_paths.append(element.numpy().decode('utf-8'))
    return decoded_paths

In [None]:
#l = list_files_from_tfdataset(test_list_ds)
#print(len(l))
#del(l)

In [None]:
#test_ds_filepaths = test_list_ds.take(test_image_count)

### Creating class labels from the directory name

In [None]:
# Every entry of the training directory is treated as a class name; sorting
# makes the index of each class stable between runs.
class_names = np.array(sorted(os.listdir(train_data_dir)))
class_names

### Splitting the dataset into training and validation sets
The validation set is 30% of the images in the training directory, and the training set is the remaining 70%.

In [None]:
# Hold out the first 30% of the shuffled file list for validation and keep
# the remainder for training.
val_size = int(train_image_count * 0.3)
val_ds = train_list_ds.take(val_size)
train_ds = train_list_ds.skip(val_size)
print(train_image_count - val_size, val_size)

In [None]:
# A different way of creating the training and validation datasets
# This is useful to select only small subset of the full data
#val_size = int(train_image_count * 0.01)
#train_size = int(train_image_count * 0.03)
#train_ds = train_list_ds.take(train_size)
#remaining_list_ds = train_list_ds.skip(train_size)
#val_ds = remaining_list_ds.take(val_size)
#print(train_size, val_size)

In [None]:
# Show the first five file paths of the training split.
for x in train_ds.take(5):
    print(str(x.numpy(), 'utf-8'))

In [None]:
# The test dataset is the complete test file list (take() of the full count).
test_ds = test_list_ds.take(count=test_image_count)

### Creating input pipeline components for a single training/validation example representing a pair of tensors to represent the image and its corresponding label.

In [None]:
# To process the label
def get_label(filepath):
    """Return the integer class index encoded in `filepath`.

    The name of the file's parent directory is compared against the global
    `class_names` array; the position of the match is returned as a tensor.
    """
    # Split the path on the OS separator; the second-to-last component is
    # the class directory.
    path_parts = tf.strings.split(filepath, os.path.sep)
    class_dir = path_parts[-2]
    # Boolean mask over class_names, cast to integers so argmax yields the
    # index of the matching class.
    matches = class_dir == class_names
    return tf.argmax(tf.cast(matches, tf.int32))

    #label = tf.strings.split(filepath, sep='/')
    #label = tf.strings.split(label[-1], sep='.')

# To process the image
def decode_img(img):
    """Decode PNG bytes into a single-channel image and resize it.

    The result is resized to the global (img_height, img_width).
    """
    # Alternative for JPEG input: tf.image.decode_jpeg(img, channels=3)
    decoded = tf.image.decode_png(img, channels=1)
    target_size = [img_height, img_width]
    return tf.image.resize(decoded, target_size)

# To create a single training or validation example with the image and its corresponding label
def process_path(filepath):
    """Turn a file path into an (image, label) pair.

    The file is read as raw bytes and decoded with decode_img(); the label
    is derived from the path by get_label().
    """
    raw_bytes = tf.io.read_file(filepath)
    image = decode_img(raw_bytes)
    return image, get_label(filepath)

### Set the AUTOTUNE; this will help to delegate the decision on the level of parallelism to use to the tf.data at runtime to optimize the CPU/GPU utilization.

In [None]:
# Convert file paths to (image, label) pairs; `num_parallel_calls` lets
# multiple images be loaded/processed in parallel.
train_ds, val_ds = (
    split.map(process_path, num_parallel_calls=AUTOTUNE)
    for split in (train_ds, val_ds)
)

In [None]:
# Convert the test file paths to (image, label) pairs, in parallel.
test_ds = test_ds.map(map_func=process_path, num_parallel_calls=AUTOTUNE)

In [None]:
# Sanity check: inspect one element of the mapped training dataset and try
# simple arithmetic on its pixel values.
for images, labels in train_ds.take(1):
    print(images.shape, labels.shape)
    label_value = labels.numpy()
    img = images[0].numpy()
    print(img.min(), img.max(), img.dtype, label_value)
    # Testing mathematical operations on the extracted array
    img = (img - 5) / 2
    print(img.min(), img.max(), img.dtype, label_value)

### Set augmentation and normalization functions

In [None]:
def augment(image, label):
    """Rotate `image` by 90 degrees, then mirror it left-to-right.

    The label is passed through unchanged.
    """
    rotated = tf.image.rot90(image)
    # Optional clipping, currently disabled: tf.clip_by_value(img, 0.0, 1.0)
    return tf.image.flip_left_right(rotated), label

def standardize_per_image(image, label):
    """Apply tf.image.per_image_standardization to `image`; keep `label` as is."""
    return tf.image.per_image_standardization(image), label

def standardize_dataset(image, label):
    """Standardize `image` with the global `mean` and `std`; keep `label` as is.

    NOTE: `mean` and `std` have to be defined globally. `std` is not
    validated here (to avoid per-element overhead), so make sure it is
    non-zero.
    """
    centered = image - mean
    return centered / std, label

### Configure data source for Performance
To configure the data source for performance, use prefetching.
Prefetching in tf.data allows the preprocessing of the data and model execution of a training step to overlap.
While the model is executing a training step 100, the input pipeline is reading the data for step 101.

In [None]:
def configure_for_performance(ds, cache_filename):
    """Build the cached, shuffled, standardized and batched input pipeline.

    Args:
        ds: dataset of (image, label) pairs.
        cache_filename: file path handed to ``ds.cache()``; use ``ds.cache()``
            without an argument for an in-memory cache instead.

    Returns:
        The dataset batched with the global `batch_size` and prefetched.
    """
    # Augmentation (augment) is skipped because the images are pre-augmented.
    # standardize_per_image is the alternative to standardize_dataset; use
    # only one of the two.
    return (
        ds.cache(cache_filename)
        .shuffle(buffer_size=1000)
        .map(standardize_dataset, num_parallel_calls=AUTOTUNE)
        .batch(batch_size)
        .prefetch(buffer_size=AUTOTUNE)
    )

# Each split gets its own on-disk cache file.
train_ds = configure_for_performance(ds=train_ds, cache_filename='/home/harsh/tensorflow_cache/train_v5.train_ds')
val_ds = configure_for_performance(ds=val_ds, cache_filename='/home/harsh/tensorflow_cache/train_v5.val_ds')

In [None]:
def configure_for_performance_forTestDataset(ds, cache_filename):
    """Build the cached, standardized and batched pipeline for test data.

    Unlike the training pipeline, no shuffle step is applied, so elements
    keep the order of `ds`.

    Args:
        ds: dataset of (image, label) pairs.
        cache_filename: file path handed to ``ds.cache()``.

    Returns:
        The dataset batched with the global `test_batch_size` and prefetched.
    """
    # standardize_per_image is the alternative to standardize_dataset; use
    # only one of the two.
    cached = ds.cache(cache_filename)
    standardized = cached.map(standardize_dataset, num_parallel_calls=AUTOTUNE)
    batched = standardized.batch(test_batch_size)
    return batched.prefetch(buffer_size=AUTOTUNE)

# The test split gets its own on-disk cache file.
test_ds = configure_for_performance_forTestDataset(ds=test_ds, cache_filename='/home/harsh/tensorflow_cache/train_v5.test_ds')

In [None]:
# Check one training batch: the pixel values should be standardized and the
# first image should display correctly.
for images, labels in train_ds.take(1):
    print(images.shape, labels.shape)
    first_label = labels[0].numpy()
    img = images[0].numpy()
    print(img.min(), img.max(), img.dtype, first_label)
    plt.imshow(img, cmap="gray")

In [None]:
# Check the test dataset. Its pipeline has no shuffle step,
# so the same values should be printed if this cell is run again.
for images, labels in test_ds.take(1):
    print(images.shape, labels.shape)
    for sample_index in (0, 1):
        sample = images[sample_index].numpy()
        print(sample.min(), sample.max())

In [None]:
# Build the list of test file paths (decoded to str) so that file names can
# be matched against the output of the test dataset.
test_ds_filelist = [path.numpy().decode('utf-8') for path in test_list_ds]

In [None]:
# Show the third image of the first test batch together with its label and
# the third entry of the test file list.
for images, labels in test_ds.take(1):
    print(images.shape, labels.shape)
    sample_img = images[2].numpy()
    sample_label = labels[2].numpy()
    plt.imshow(sample_img[:,:], cmap=plt.get_cmap("gray"))
    print(sample_img.shape, sample_label.shape)
    print(sample_label)
    print(test_ds_filelist[2])

In [None]:
# Plot one batch of images from the training dataset in a square grid.
plt.figure(figsize=(30, 30))
cnt = 0
# Grid side sized for a full batch; a smaller batch simply leaves cells empty.
grid_side = int(np.sqrt(batch_size)) + 1
for images, labels in train_ds.take(1):
    # Iterate over the images actually present: a batch can hold fewer than
    # batch_size elements (small dataset or final partial batch), in which
    # case indexing up to batch_size would raise an IndexError.
    for i in range(images.shape[0]):
        ax = plt.subplot(grid_side, grid_side, i + 1)
        plt.imshow(images[i].numpy(), cmap=plt.get_cmap("gray"))
        #plt.title(class_names[labels[i]]+"\n"+test_ds_filelist[i+cnt*batch_size].split("/")[-1].replace(".png",""))
        plt.axis("off")
    cnt += 1

### Create keras model
The input to the model is tf.data.Dataset

In [None]:
# Confirm current working directory
# (the relative data and model paths in this notebook are resolved against it)
os.getcwd()

In [None]:
# Create callbacks
# This part is not complete. Needs to be done for later.

# Reference for saving checkpoints and restarting from latest checkpoint
# https://www.tensorflow.org/tutorials/keras/save_and_load

#checkpoint_path = ""
#checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_weights_only=True, verbose=1)

In [None]:
#Define model
# Convolutional part: four Conv2D(3x3, relu) layers with 32, 64, 128 and 256
# filters, each followed by 2x2 max pooling.
feature_layers = []
for n_filters in (32, 64, 128, 256):
    feature_layers.append(tf.keras.layers.Conv2D(n_filters, (3, 3), activation='relu'))
    feature_layers.append(tf.keras.layers.MaxPool2D((2, 2)))

# Classifier head: dropout, one 512-unit dense layer with batch
# normalization, and a single sigmoid output unit.
# (Extra Dense(1024)/Dense(256) + BatchNormalization stages were tried and
# are currently left out.)
head_layers = [
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]

model = tf.keras.Sequential(
    [tf.keras.layers.InputLayer(input_shape=(img_height, img_width, 1))]
    + feature_layers
    + head_layers
)

opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

#Compile the model
# Alternatives tried: optimizer='adam', loss='categorical_crossentropy',
# metrics=['accuracy', 'f1score', 'precision', 'recall'].
# The metric name 'acc' determines the keys stored in the training history.
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['acc'])

#Print model summary
model.summary()

In [None]:
#Visualize model graphically
#tf.keras.utils.plot_model(model)

### Fit model to data

In [None]:
# Scratch
#epochs = 10000
#steps_per_epoch = 1
#num_batches = steps_per_epoch * epochs
#print("Available images",image_count-val_size)
#print("Number of images required", num_batches * batch_size)
#(image_count-val_size)/68

In [None]:
#Fitting the model
# steps_per_epoch / validation_steps are left unset, so each epoch iterates
# over the datasets as provided.
history = model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=100,
    verbose=1,
)

### Save model to file

In [None]:
# Previously used name:
#model_filename = "models/model.10dec2020.data_mip3_aug.per_image_dataset"
model_filename = "models/model4.data_mip3_aug.std_dataset"
# Write the trained model to model_filename.
model.save(filepath=model_filename)

### Assess model performance

In [None]:
# Plot training and validation accuracy per epoch.
num_epochs = 100  # upper bound on the number of epochs to plot
acc_train = history.history['acc'][:num_epochs]
acc_val = history.history['val_acc'][:num_epochs]
# Size the x-axis from the recorded values: if fewer than num_epochs epochs
# were recorded, a fixed range(1, num_epochs+1) would not match the y data
# and plt.plot would fail.
epochs = range(1, len(acc_train) + 1)
plt.plot(epochs, acc_train, 'orange', label='training accuracy')
plt.plot(epochs, acc_val, 'b', label='validation accuracy')
plt.title('Training and Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Plot training and validation loss per epoch.
num_epochs = 100  # upper bound on the number of epochs to plot
loss_train = history.history['loss'][:num_epochs]
loss_val = history.history['val_loss'][:num_epochs]
# Size the x-axis from the recorded values: if fewer than num_epochs epochs
# were recorded, a fixed range(1, num_epochs+1) would not match the y data
# and plt.plot would fail.
epochs = range(1, len(loss_train) + 1)
plt.plot(epochs, loss_train, 'orange', label='training loss')
plt.plot(epochs, loss_val, 'b', label='validation loss')
# The title previously said "accuracy" although this figure shows the loss.
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

### Plot images with true and predicted class labels

In [None]:
# Scratch
#a = val_ds.list_files("*.png")
#len(list(val_ds.list_files("*")))
#dataset_length = [i for i,_ in enumerate(val_ds)][-1] + 1
#dataset_length*batch_size
# Print the shape of the first test batch and the label of its first element.
for images, labels in test_ds.take(1):
    first_label = labels[0].numpy()
    print(images.shape, first_label)


In [None]:
def classification_report(true_labels, pred_labels):
    """Print a confusion matrix, per-class F1 scores and sklearn's report.

    Args:
        true_labels: array-like of true class names.
        pred_labels: array-like of predicted class names, same length.

    The rows and columns of the printed matrix follow the global
    `class_names` order. Nothing is returned; all output goes to stdout.
    """
    print("---Classification report---\n")
    # Pin the label order to class_names so row/column i always refers to
    # class_names[i]. Without it the matrix only covers labels that actually
    # occur, and indexing it by class position can be misaligned or raise
    # IndexError when a class is absent.
    confusion_matrix = metrics.confusion_matrix(
        true_labels, pred_labels, labels=class_names
    )
    print("\t\t\tClassified as")
    print("\t\t"+"\t".join(class_names)+"\tsum")
    for name, row in zip(class_names, confusion_matrix):
        print(name+"\t"+
              "\t\t".join([str(x) for x in row])+
              "\t\t"+str(np.sum(row)))
    print("==========================================================\n")
    # Same label order, so the i-th score belongs to class_names[i].
    print("F1 scores:",metrics.f1_score(true_labels, pred_labels, labels=class_names, average=None))
    print("==========================================================\n")
    print("metrics.classification_report:\n",metrics.classification_report(true_labels, pred_labels))

In [None]:
def get_true_pred_labels(model, dataset, num_batches):
    """Collect true and predicted class names for up to `num_batches` batches.

    Args:
        model: object whose ``predict(images)`` returns one score per image
            in column 0; a score above 0.5 selects ``class_names[1]``,
            otherwise ``class_names[0]``.
        dataset: batched dataset yielding (images, labels) pairs, where the
            labels are integer indices into the global `class_names`.
        num_batches: maximum number of batches to take from `dataset`.

    Returns:
        Tuple ``(true_labels, pred_labels)`` of NumPy arrays of class names.

    Side effects: draws a histogram of the predictions scaled to 0..10 and
    prints the array shapes and the per-class counts.
    """
    score_chunks = []
    true_chunks = []
    pred_chunks = []
    for images, labels in dataset.take(num_batches):
        pred = model.predict(images)
        # Scores scaled to 0..10 and truncated to int, for the histogram below.
        score_chunks.append((pred * 10).astype(int))
        # Vectorised lookups replace the per-sample Python loop.
        true_chunks.append(class_names[labels.numpy()])
        pred_chunks.append(class_names[(pred[:, 0] > 0.5).astype(int)])
        # TODO: saving misclassified images (e.g. 'connection' predicted as
        # 'no_connection') to disk is still under review.

    if true_chunks:
        pred_1_to_10 = np.concatenate(score_chunks)
        true_labels = np.concatenate(true_chunks)
        pred_labels = np.concatenate(pred_chunks)
    else:
        # No batch was produced: return empty arrays instead of failing in
        # np.concatenate.
        pred_1_to_10 = np.array([])
        true_labels = np.array([])
        pred_labels = np.array([])

    plt.hist(pred_1_to_10, bins=10) # This is to see histogram of predictions
    print(true_labels.shape, pred_labels.shape)
    print("Unique counts in true_labels:", np.unique(true_labels, return_counts=True))
    print("Unique counts in pred_labels:", np.unique(pred_labels, return_counts=True))
    print()
    return(true_labels, pred_labels)

In [None]:
# Classification report for test dataset (at most 200 batches are evaluated)
true_labels, pred_labels = get_true_pred_labels(model=model, dataset=test_ds, num_batches=200)
classification_report(true_labels=true_labels, pred_labels=pred_labels)

In [None]:
# Classification report for training dataset
#true_labels, pred_labels = get_true_pred_labels(model, train_ds, 100)
#classification_report(true_labels, pred_labels)

In [None]:
# Classification report for validation dataset
#true_labels, pred_labels = get_true_pred_labels(model, val_ds, 100)
#classification_report(true_labels, pred_labels)