# Introduction  
To improve the model's classification accuracy on the test dataset, this notebook explores the following:
* Input image size
* Pretrained model and number of trainable parameters of final model
* Data augmentation
* Regularization techniques
* Use of learning rate schedule


# Import Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed on Kaggle
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
import os
import numpy as np 
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import regularizers      # mitigate overfitting 
from kaggle_datasets  import KaggleDatasets    # import kaggle data files
# Stop training when a monitored metric has stopped improving
from tensorflow.keras.callbacks import EarlyStopping   
print("Tensorflow version " + tf.__version__)  # confirm the installed TensorFlow version is 2.x

# Detect Hardware

In [None]:
# Choose a tf.distribute strategy that matches the available hardware.
try:
    # TPUClusterResolver() takes no arguments when the TPU_NAME environment
    # variable is set, which is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # TPU detection failed: continue without one.
    tpu = None

if not tpu:
    # Default TensorFlow distribution strategy; works on CPU and a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

# Data Directories

In [None]:
# Input data files live in the read-only "kaggle/input/" directory.
#   The images are stored as TFRecords; each file holds a sequence of
#   records that can only be read sequentially.

TFRec_selected = '512x512'
# List every file found in a directory whose path mentions the selected resolution.
for dirpath, _, filenames in os.walk('/kaggle/input'):
    if TFRec_selected not in dirpath:
        continue
    for filename in filenames:
        print(os.path.join(dirpath, filename))

In [None]:
# Up to 20GB can be written to the current directory (/kaggle/working/); its contents
# are preserved as output when a version is created with "Save & Run All".
# List whatever is currently stored there.
for dirpath, _, filenames in os.walk('/kaggle/working'):
    for filename in filenames:
        print(os.path.join(dirpath, filename))     

# Temporary files can also be written to /kaggle/temp/, but they are not saved outside
# of the current session. Copy the sample submission there (IPython shell escape),
# then list the directory to confirm the copy.
!gsutil cp /kaggle/input/tpu-getting-started/sample_submission.csv /kaggle/temp/temp1.csv
for dirpath, _, filenames in os.walk('/kaggle/temp'):
    for filename in filenames:
        print(os.path.join(dirpath, filename))     
        

# Set up data path

In [None]:
# Base path used below to glob the train/val/test TFRecord files.
# NOTE(review): presumably the Google Cloud Storage location of the attached
# Kaggle dataset — confirm against the kaggle_datasets documentation.
GCS_DS_PATH = KaggleDatasets().get_gcs_path() 
# you can list the bucket with "!gsutil ls $GCS_DS_PATH"
print(GCS_DS_PATH)
# IPython shell escape: show the top-level contents of that path
!gsutil ls $GCS_DS_PATH

# Set up parameters

In [None]:
# Parameters for the tfrecords-jpeg-<HEIGHT>x<WIDTH> TFRecord files.
# IMAGE_SIZE is the single source of truth for the resolution: the TFRecord
# folder below is derived from it, so the files that are read always match the
# shape decode_image() reshapes every image to.
IMAGE_SIZE        = [512, 512]
HEIGHT            = IMAGE_SIZE[0]
WIDTH             = IMAGE_SIZE[1]
EPOCHS            = 20
# global batch size: 16 images per replica
BATCH_SIZE        = 16 * strategy.num_replicas_in_sync
NUM_TRAIN_IMAGES  = 12753
NUM_VAL_IMAGES    = 3712
NUM_TEST_IMAGES   = 7382
# the training dataset repeats forever, so the epoch length is fixed here
STEPS_PER_EPOCH   = NUM_TRAIN_IMAGES // BATCH_SIZE
AUTO              = tf.data.experimental.AUTOTUNE
# e.g. <GCS_DS_PATH>/tfrecords-jpeg-512x512 for IMAGE_SIZE = [512, 512]
TFREC_DIR         = GCS_DS_PATH + '/tfrecords-jpeg-{}x{}'.format(HEIGHT, WIDTH)
TRAIN_FILENAMES   = tf.io.gfile.glob(TFREC_DIR + '/train/*.tfrec')
VAL_FILENAMES     = tf.io.gfile.glob(TFREC_DIR + '/val/*.tfrec')
TEST_FILENAMES    = tf.io.gfile.glob(TFREC_DIR + '/test/*.tfrec')

# Functions to handle data

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 RGB image with values in [0, 1].

    The result is reshaped to [*IMAGE_SIZE, 3] because an explicit size is
    needed for TPU.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])

def read_labeled_tfrecord(example):
    """Parse one serialized labeled record into an (image, label) pair.

    The image is decoded with decode_image(); the "class" feature is cast
    from int64 to int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # bytestring
        "class": tf.io.FixedLenFeature([], tf.int64),   # shape [] = single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['class'], tf.int32)

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled record into an (image, id) pair.

    There is no "class" feature: the flower class of these test images is
    what the model has to predict.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # bytestring
        "id": tf.io.FixedLenFeature([], tf.string),     # shape [] = single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['id']

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of parsed records from TFRecord files.

    Args:
        filenames: TFRecord files to read; reads are interleaved across files.
        labeled: if True yield (image, label) pairs, otherwise (image, id) pairs.
        ordered: if False, deterministic ordering is switched off so records
            are used as soon as they stream in (faster).

    Returns:
        An unbatched tf.data dataset of parsed pairs.
    """
    read_options = tf.data.Options()
    if not ordered:
        # trade ordering for speed
        read_options.experimental_deterministic = False

    parse_record = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord

    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    return records.with_options(read_options).map(parse_record, num_parallel_calls=AUTO)

def get_validation_dataset(filenames):
    """Return the labeled validation pipeline: cached, shuffled, batched, prefetched."""
    return (
        load_dataset(filenames, labeled=True, ordered=False)
        .cache()
        .shuffle(buffer_size=2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)  # prefetch buffer size is autotuned
    )

def get_test_dataset(filenames, ordered=False):
    """Return the unlabeled test pipeline of batched (image, id) pairs.

    Pass ordered=True when the record order must be reproducible, e.g. to
    submit predictions to Kaggle.
    """
    return (
        load_dataset(filenames, labeled=False, ordered=ordered)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)  # prefetch buffer size is autotuned
    )

# Functions to get training dataset with data augmentation option

In [None]:
# image augmentation
def data_augment(image, label):
    """Apply random flip, contrast, brightness and saturation to one image.

    The label is passed through unchanged so the function can be used
    directly with dataset.map() on (image, label) pairs.
    """
    # Disabled alternative, kept for reference: pad with a black 3-pixel
    # border, then randomly crop back to the original size.
    #   image = tf.image.resize_with_crop_or_pad(image, HEIGHT + 6, WIDTH + 6)
    #   image = tf.image.random_crop(image, size=[*IMAGE_SIZE,3])
    flipped = tf.image.random_flip_left_right(image)
    contrasted = tf.image.random_contrast(flipped, 0.8, 1.2)
    brightened = tf.image.random_brightness(contrasted, 0.1)
    augmented = tf.image.random_saturation(brightened, 0.7, 1.3)
    return augmented, label

# get training dataset with augmentation option
def get_training_dataset(filenames, augmentation=False):
    """Return the endlessly repeating, shuffled, batched training pipeline.

    Args:
        filenames: training TFRecord files.
        augmentation: if True, data_augment() is mapped over every example.

    The final prefetch lets the input pipeline prepare the next batch while
    the current one is being trained on.
    """
    pipeline = load_dataset(filenames, labeled=True, ordered=False)
    if augmentation:
        pipeline = pipeline.map(data_augment, num_parallel_calls=AUTO)
    # repeat: the training dataset must last for several epochs
    return (
        pipeline
        .repeat()
        .shuffle(buffer_size=2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

# Functions to visualize images in batches

In [None]:
# keep printed arrays short
np.set_printoptions(linewidth=80, threshold=15)


def batch_to_numpy_images_and_labels(data):
    """Convert an (images, labels) batch of tensors to NumPy.

    When the labels have dtype object they are binary strings, i.e. image
    IDs rather than classes (the test-data case); a list of None with one
    entry per image is returned in their place.
    """
    image_batch, label_batch = data
    numpy_images, numpy_labels = image_batch.numpy(), label_batch.numpy()
    if numpy_labels.dtype != object:
        return numpy_images, numpy_labels
    return numpy_images, [None] * len(numpy_images)

def show_images(databatch, row=6, col=8):
    """Plot the images of a batch in a row x col grid, titled with their labels.

    Args:
        databatch: an (images, labels) or (images, ids) batch of tensors.
        row: number of grid rows.
        col: number of grid columns.

    At most row*col images are drawn. If the batch holds fewer images, only
    those are drawn and the remaining grid cells stay empty. Images without a
    class label (ID-only batches) are titled "None".
    """
    images, num_labl = batch_to_numpy_images_and_labels(databatch)
    # 2 inches per grid cell in both directions
    plt.figure(figsize=(2 * col, 2 * row))
    # A batch can be smaller than the grid (e.g. a final, partial batch);
    # clamp the count instead of raising IndexError.
    for j in range(min(row * col, len(images))):
        plt.subplot(row, col, j + 1)
        plt.axis('off')
        plt.title(num_labl[j])
        plt.imshow(images[j])
    plt.show()

# Explore image augmentation

In [None]:
# Un-augmented training pipeline, used only to fetch reference images
ori_train_set = get_training_dataset(TRAIN_FILENAMES, augmentation=False)
# Re-batch to 16 images and take the first batch as the sample set
ori_image_batch = next(iter(ori_train_set.unbatch().batch(16)))
images, _ = batch_to_numpy_images_and_labels(ori_image_batch)

# function to show image with random data augmentation
def show_aug(image):
    """Show one image next to five randomly augmented variants in a 1x6 row.

    Slot 1 is the original, slot 2 the full data_augment() combination, and
    slots 3-6 a single random flip, contrast, brightness and saturation.
    """
    def draw(slot, picture, caption):
        # Render one panel of the 1x6 strip without axes.
        plt.subplot(1, 6, slot)
        plt.imshow(picture)
        plt.title(caption)
        plt.axis('off')

    plt.figure(figsize=(12, 2))
    draw(1, image, 'no augmentation')
    draw(3, tf.image.random_flip_left_right(image), 'rdm flip L/R')
    draw(4, tf.image.random_contrast(image, 0.90, 0.99), 'rdm contrast')
    draw(5, tf.image.random_brightness(image, 0.1), 'rdm brightness')
    draw(6, tf.image.random_saturation(image, 0.8, 0.9), 'rdm saturation')
    # data_augment returns (image, label); only the image is needed here
    combined, _ = data_augment(image, None)
    draw(2, combined, 'rdm aug combo')
    plt.show()

# Show every sampled training image beside its augmented variants
print('Training Dataset', 'Sample Images: Original versus w/ Random Augmentation', sep='\n')
for im in images:
    show_aug(im)

# Select train, val and test datasets 

In [None]:
# Get the training dataset WITH data augmentation to fit the model
training_dataset   = get_training_dataset(TRAIN_FILENAMES, augmentation=True)
validation_dataset = get_validation_dataset(VAL_FILENAMES)
# The test set is read in a fixed order: predictions and image ids are later
# extracted in two separate passes and must line up for the Kaggle submission.
test_dataset       = get_test_dataset(TEST_FILENAMES, ordered=True)

print('training dataset:   ', training_dataset)
print('validation dataset: ', validation_dataset)
print('test dataset:       ', test_dataset)

# Visualize image samples

In [None]:
# Re-run this cell to draw different samples from the image sets.
# NOTE(review): the test dataset is shuffled while still batched, so the shuffle
# buffer holds whole batches of images — confirm the memory cost is acceptable.
for caption, sample_source in (
    ('Training Images with random data augmentation in data pipeline', training_dataset),
    ('Validation Images', validation_dataset),
    ('Test Images - shuffled', test_dataset.shuffle(buffer_size=80)),
):
    print(caption)
    # re-batch to 56 images to fill a 7x8 grid
    show_images(next(iter(sample_source.unbatch().batch(56))), row=7, col=8)

> # Build the model

In [None]:
# Alternatively, data augmentation may be done by creating image preprocessing
# layers and making them part of the model, as shown below (SEED would have to
# be defined first):
#
#   data_augmentation = tf.keras.Sequential([
#      tf.keras.layers.experimental.preprocessing.RandomFlip("horizontal_and_vertical", seed = SEED),
#      tf.keras.layers.experimental.preprocessing.RandomRotation(0.2, seed = SEED)
#   ])

# With pretrained model: DenseNet201 (ImageNet weights, without its top classifier)
with strategy.scope():
    pretrained_model = tf.keras.applications.DenseNet201(
        weights='imagenet',
        include_top=False,
        input_shape=[*IMAGE_SIZE, 3]
    )
    pretrained_model.trainable = True  # fine-tune all base-model weights
    model = tf.keras.Sequential([
        pretrained_model,
        tf.keras.layers.Dropout(0.2),  # applied to the feature maps, before pooling
        tf.keras.layers.GlobalAveragePooling2D(),
        # 104 output classes, with L2 weight regularization on the classifier
        tf.keras.layers.Dense(104,
            kernel_regularizer=regularizers.l2(0.00011),
            activation='softmax')
    ])

    # Compile inside the strategy scope so the optimizer and metric variables
    # are created under the same distribution strategy as the model.
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',  # labels are integer class ids
        metrics=['sparse_categorical_accuracy']
    )

In [None]:
# Display the layer-by-layer summary of the pretrained DenseNet201 base model
# (the first layer of `model`).
print('############ pretrained DenseNet201 base model summary ############')
pretrained_model.summary()

In [None]:
# Display the summary of the full model: pretrained base, dropout,
# global average pooling and the final dense classifier.
print('######################### my model summary ########################')
model.summary()

# Learning rate schedule and EarlyStopping

In [None]:
# Learning Rate Schedule for Fine Tuning
def exponential_lr(epoch,
                  start_lr=0.00001, min_lr=0.00001, max_lr=0.00005,
                  rampup_epochs=5, sustain_epochs=0,
                  exp_decay=0.8):
    """Return the learning rate for a given epoch.

    The schedule has three phases:
      1. linear ramp-up from start_lr to max_lr over rampup_epochs epochs,
      2. max_lr held for sustain_epochs epochs,
      3. exponential decay towards min_lr by a factor exp_decay per epoch.

    Args:
        epoch: zero-based epoch index.
        start_lr: learning rate at epoch 0.
        min_lr: floor the decay converges to.
        max_lr: peak learning rate.
        rampup_epochs: length of the linear ramp-up phase.
        sustain_epochs: number of epochs spent at max_lr.
        exp_decay: per-epoch decay factor of the final phase.
    """
    if epoch < rampup_epochs:
        # linear increase from start_lr towards max_lr
        return (max_lr - start_lr) / rampup_epochs * epoch + start_lr

    if epoch < rampup_epochs + sustain_epochs:
        return max_lr

    epochs_into_decay = epoch - rampup_epochs - sustain_epochs
    return (max_lr - min_lr) * exp_decay ** epochs_into_decay + min_lr

# Keras callback that applies exponential_lr at every epoch (verbose output on)
lr_callback = tf.keras.callbacks.LearningRateScheduler(exponential_lr, verbose=True)

# learning rate chart (plotted over EPOCHS+31 epochs)
epoch_rng = list(range(EPOCHS + 31))
y = list(map(exponential_lr, epoch_rng))
plt.plot(epoch_rng, y)
plt.xlim(-1, EPOCHS + 32)

print("Learning rate schedule: start = {:.3g}; peak = {:.3g}; end = {:.3g}".format(y[0], max(y), y[-1]))

In [None]:
# Stop training when val_loss has not improved for 2 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; otherwise the weights kept for prediction would be the ones from
# `patience` epochs after the best epoch.
earlystopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train the model

In [None]:
# Fit the model and keep the per-epoch metrics in `historical`
historical = model.fit(
    training_dataset,
    validation_data=validation_dataset,
    epochs=EPOCHS,
    # the training dataset repeats, so the epoch length is given explicitly
    steps_per_epoch=STEPS_PER_EPOCH,
    # lr_callback sets the learning rate each epoch;
    # earlystopping watches val_loss and stops training when it stalls
    callbacks=[lr_callback, earlystopping],
)

# Plots: accuracy and loss metrics

In [None]:
# Plot accuracy and loss on the training and validation sets, one panel each.

metrics_log = historical.history
acc, val_acc = (metrics_log['sparse_categorical_accuracy'],
                metrics_log['val_sparse_categorical_accuracy'])
loss, val_loss = metrics_log['loss'], metrics_log['val_loss']

# epochs are numbered from 1 on the x axis
epochs_range = range(1, len(metrics_log['loss']) + 1)

plt.figure(figsize=(14, 14))
panels = (
    (acc, 'Training Accuracy', val_acc, 'Validation Accuracy',
     'lower right', 'Training and Validation Accuracy'),
    (loss, 'Training Loss', val_loss, 'Validation Loss',
     'upper right', 'Training and Validation Loss'),
)
for slot, (train_curve, train_label, val_curve, val_label, legend_loc, title) in enumerate(panels, start=1):
    plt.subplot(2, 1, slot)
    plt.plot(epochs_range, train_curve, label=train_label)
    plt.plot(epochs_range, val_curve, label=val_label)
    plt.legend(loc=legend_loc)
    plt.title(title)
    plt.xlabel('Epoch')
plt.show()

# Compute predictions on the test set

In [None]:
print('Computing predictions...')
# First pass over the test set: images only
test_images_ds = test_dataset.map(lambda image, idnum: image)
probabilities = model.predict(test_images_ds)
# the predicted class is the index of the highest probability
predictions = probabilities.argmax(axis=-1)
print(predictions)

print('Generating submission.csv file...')
# Second pass over the test set: ids only, gathered into one single batch
test_ids_ds = test_dataset.map(lambda image, idnum: idnum).unbatch()
all_ids_batch = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES)))
test_ids = all_ids_batch.numpy().astype('U')  # bytes -> unicode strings
submission_rows = np.rec.fromarrays([test_ids, predictions])
np.savetxt('submission.csv', submission_rows,
           fmt=['%s', '%d'], delimiter=',', header='id,label', comments='')