In [None]:
import math, re, os
import numpy as np
import tensorflow as tf

**Distribution Strategy**

In [None]:
# Use a TPU when the cluster resolver can find one; otherwise keep TensorFlow's current strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    # The resolver raised ValueError; this notebook treats that as "no TPU available".
    tpu = None

if not tpu:
    # Fallback: whatever distribution strategy TensorFlow currently has active.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy on top of it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

**Load Data**

In [None]:
from kaggle_datasets import KaggleDatasets

# Look up the GCS path of the 'tpu-getting-started' dataset; the TFRecord globs below are built on it.
path = KaggleDatasets().get_gcs_path('tpu-getting-started')

In [None]:
# Input resolution, dataset folder and tf.data autotune handle shared by the rest of the notebook.
image_size = [512, 512]
gcsP = f'{path}/tfrecords-jpeg-512x512'
auto = tf.data.experimental.AUTOTUNE

# TFRecord shards of each split, found by globbing the dataset folder.
trainingFiles = tf.io.gfile.glob(f'{gcsP}/train/*.tfrec')
validationFiles = tf.io.gfile.glob(f'{gcsP}/val/*.tfrec')
testFiles = tf.io.gfile.glob(f'{gcsP}/test/*.tfrec')

# Human-readable class names; the list index is the integer class id stored in the TFRecords.
# Entry 29 was truncated to 'swee' and is restored to 'sweet william'.
# The trailing spaces in 'cyclamen ' and 'hippeastrum ' are kept exactly as they were.
flowers = [
    # 00 - 09
    'pink primrose', 'hard-leaved pocket orchid', 'canterbury bells', 'sweet pea', 'wild geranium',
    'tiger lily', 'moon orchid', 'bird of paradise', 'monkshood', 'globe thistle',
    # 10 - 19
    'snapdragon', "colt's foot", 'king protea', 'spear thistle', 'yellow iris',
    'globe-flower', 'purple coneflower', 'peruvian lily', 'balloon flower', 'giant white arum lily',
    # 20 - 29
    'fire lily', 'pincushion flower', 'fritillary', 'red ginger', 'grape hyacinth',
    'corn poppy', 'prince of wales feathers', 'stemless gentian', 'artichoke', 'sweet william',
    # 30 - 39
    'carnation', 'garden phlox', 'love in the mist', 'cosmos', 'alpine sea holly',
    'ruby-lipped cattleya', 'cape flower', 'great masterwort', 'siam tulip', 'lenten rose',
    # 40 - 49
    'barberton daisy', 'daffodil', 'sword lily', 'poinsettia', 'bolero deep blue',
    'wallflower', 'marigold', 'buttercup', 'daisy', 'common dandelion',
    # 50 - 59
    'petunia', 'wild pansy', 'primula', 'sunflower', 'lilac hibiscus',
    'bishop of llandaff', 'gaura', 'geranium', 'orange dahlia', 'pink-yellow dahlia',
    # 60 - 69
    'cautleya spicata', 'japanese anemone', 'black-eyed susan', 'silverbush', 'californian poppy',
    'osteospermum', 'spring crocus', 'iris', 'windflower', 'tree poppy',
    # 70 - 79
    'gazania', 'azalea', 'water lily', 'rose', 'thorn apple',
    'morning glory', 'passion flower', 'lotus', 'toad lily', 'anthurium',
    # 80 - 89
    'frangipani', 'clematis', 'hibiscus', 'columbine', 'desert-rose',
    'tree mallow', 'magnolia', 'cyclamen ', 'watercress', 'canna lily',
    # 90 - 99
    'hippeastrum ', 'bee balm', 'pink quill', 'foxglove', 'bougainvillea',
    'camellia', 'mallow', 'mexican petunia', 'bromelia', 'blanket flower',
    # 100 - 103
    'trumpet creeper', 'blackberry lily', 'common tulip', 'wild rose',
]

**Helper Methods**

In [None]:
def decode_image(data):
    """Decode JPEG bytes into a float32 image tensor with values scaled by 1/255.

    The result is reshaped to ``[*image_size, 3]`` so that the tensor has a
    fully defined static shape (needed for TPU execution).
    """
    pixels = tf.image.decode_jpeg(data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    return tf.reshape(scaled, [*image_size, 3])

def read_labeled_tfrecord(single):
    """Parse one serialized labeled example into an ``(image, label)`` pair.

    The record must carry a JPEG string under "image" and an int64 class id
    under "class"; the label is returned as int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "class": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(single, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['class'], tf.int32)

def read_unlabeled_tfrecord(single):
    """Parse one serialized unlabeled example into an ``(image, id)`` pair.

    The record must carry a JPEG string under "image" and a string under "id";
    the id is passed through unchanged.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "id": tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(single, feature_spec)
    return decode_image(parsed['image']), parsed['id']

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a tf.data pipeline over TFRecord files.

    Yields ``(image, label)`` pairs when ``labeled`` is True and ``(image, id)``
    pairs otherwise. Unless ``ordered`` is True, deterministic ordering is
    switched off so records can be read as fast as they arrive.
    """
    read_options = tf.data.Options()
    if not ordered:
        read_options.experimental_deterministic = False

    parse_record = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord
    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=auto)
        .with_options(read_options)
        .map(parse_record, num_parallel_calls=auto)
    )

**Pipelines**

In [None]:
def data_augment(image, label):
    """Randomly mirror the image left-to-right; the label is returned untouched."""
    return tf.image.random_flip_left_right(image), label

def get_training_dataset():
    """Return the training pipeline: augment, repeat forever, shuffle, batch, prefetch.

    Reads the notebook-level ``batchSize``, so it must be called after that
    variable has been set.
    """
    return (
        load_dataset(trainingFiles, labeled=True)
        .map(data_augment, num_parallel_calls=auto)
        .repeat()
        .shuffle(2048)
        .batch(batchSize)
        .prefetch(auto)
    )

def get_validation_dataset(ordered=False):
    """Return the validation pipeline: batch, cache, prefetch.

    ``ordered`` is forwarded to ``load_dataset``; reads the notebook-level ``batchSize``.
    """
    return (
        load_dataset(validationFiles, labeled=True, ordered=ordered)
        .batch(batchSize)
        .cache()
        .prefetch(auto)
    )

def get_test_dataset(ordered=False):
    """Return the unlabeled test pipeline of ``(image, id)`` batches: batch, prefetch.

    ``ordered`` is forwarded to ``load_dataset``; reads the notebook-level ``batchSize``.
    """
    return (
        load_dataset(testFiles, labeled=False, ordered=ordered)
        .batch(batchSize)
        .prefetch(auto)
    )

def count_data_items(filenames):
    """Sum the item counts encoded in TFRecord file names.

    Each name is expected to contain ``-<count>.`` (for example
    ``00-512x512-798.tfrec`` holds 798 items); the first such match in the
    name is used.

    Args:
        filenames: iterable of file name / path strings.

    Returns:
        The total as a NumPy int64 scalar; 0 (still an integer) for no files.

    Raises:
        ValueError: if a file name carries no ``-<count>.`` component.
    """
    # Compile once instead of once per file name.
    count_pattern = re.compile(r"-([0-9]*)\.")

    counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None or match.group(1) == "":
            raise ValueError(
                "cannot read an item count from file name {!r}".format(filename)
            )
        counts.append(int(match.group(1)))

    # An explicit integer dtype keeps the empty case at 0 rather than float 0.0,
    # so later `//` arithmetic and batch sizes stay integral.
    return np.sum(counts, dtype=np.int64)

# Number of examples in each split, read from the shard file names.
numTraining, numValidation, numTest = map(
    count_data_items, (trainingFiles, validationFiles, testFiles)
)


In [None]:
# Global batch size: 16 examples for every replica of the distribution strategy.
batchSize = strategy.num_replicas_in_sync * 16

# Build the three input pipelines (they read batchSize, so it has to be set first).
ds_train, ds_valid, ds_test = (
    get_training_dataset(),
    get_validation_dataset(),
    get_test_dataset(),
)

In [None]:
# Print the shapes of the first three training batches, then the labels of the last of them.
np.set_printoptions(threshold=15, linewidth=80)

print("Training data shapes:")
for image, label in ds_train.take(3):
    print(f"{image.numpy().shape} {label.numpy().shape}")
print("Training data label examples:", label.numpy())

In [None]:
# Print the shapes of the first three test batches, then the ids of the last of them.
print("Test data shapes:")
for image, idnum in ds_test.take(3):
    print(f"{image.numpy().shape} {idnum.numpy().shape}")
print("Test data IDs:", idnum.numpy().astype('U'))  # 'U' converts the ids to unicode strings

**Display Data**

In [None]:
from matplotlib import pyplot as plt

def batch_to_numpy_images_and_labels(data):
    """Convert an ``(images, labels)`` batch of tensors to NumPy.

    Returns ``(numpy_images, numpy_labels)``. When the label array has dtype
    ``object`` (the test pipeline passes string ids in the label position),
    the labels are replaced by a list of ``None``, one per image.
    """
    images, labels = data
    image_array, label_array = images.numpy(), labels.numpy()
    if label_array.dtype != object:
        return image_array, label_array
    return image_array, [None] * len(image_array)

def title_from_label_and_target(label, correct_label):
    """Build a plot title for a prediction and report whether it is correct.

    Returns ``(title, correct)``. Without a ground-truth label the title is
    just the predicted flower name and the prediction counts as correct.
    Otherwise the title is ``"<predicted> [OK]"`` or ``"<predicted> [NO→<actual>]"``.
    """
    predicted_name = flowers[label]
    if correct_label is None:
        return predicted_name, True

    correct = (label == correct_label)
    if correct:
        return "{} [{}{}{}]".format(predicted_name, 'OK', '', ''), correct
    return "{} [{}{}{}]".format(predicted_name, 'NO', u"\u2192", flowers[correct_label]), correct

def display_one_flower(image, title, subplot, red=False, titlesize=16):
    """Draw one image in the given subplot slot and return the next slot.

    ``subplot`` is a ``(rows, cols, index)`` triple. A non-empty ``title`` is
    drawn in black, or in red at a slightly smaller size when ``red`` is True.
    """
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)
    if len(title) > 0:
        if red:
            font_size, font_color = int(titlesize/1.2), 'red'
        else:
            font_size, font_color = int(titlesize), 'black'
        plt.title(title, fontsize=font_size, color=font_color,
                  fontdict={'verticalalignment':'center'}, pad=int(titlesize/1.5))
    grid_rows, grid_cols, slot = subplot[0], subplot[1], subplot[2]
    return (grid_rows, grid_cols, slot + 1)
    
def display_batch_of_images(databatch, predictions=None):
    """Show a batch of images in a roughly square grid.

    Args:
        databatch: ``(images, labels)`` pair of tensors; the labels may be
            string ids (unlabeled data), in which case no class title is drawn.
        predictions: optional sequence of predicted class ids. When given,
            each title shows the prediction and is drawn in red if it
            disagrees with the label.

    An empty batch draws nothing. Images that do not fit into the
    ``rows * cols`` grid are not shown.
    """
    images, labels = batch_to_numpy_images_and_labels(databatch)
    if labels is None:
        labels = [None] * len(images)

    if len(images) == 0:
        # rows would be 0 and the column computation would divide by zero.
        return

    rows = int(math.sqrt(len(images)))
    cols = len(images)//rows

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot = (rows, cols, 1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE, FIGSIZE/cols*rows))
    else:
        plt.figure(figsize=(FIGSIZE/rows*cols, FIGSIZE))

    shown_images = images[:rows*cols]
    shown_labels = labels[:rows*cols]
    # magic formula tested to work from 1x1 to 10x10 images (same for every cell of the grid)
    dynamic_titlesize = FIGSIZE*SPACING/max(rows, cols)*40+3

    # display
    for i, (image, label) in enumerate(zip(shown_images, shown_labels)):
        title = '' if label is None else flowers[label]
        correct = True
        if predictions is not None:
            title, correct = title_from_label_and_target(predictions[i], label)
        subplot = display_one_flower(image, title, subplot, not correct, titlesize=dynamic_titlesize)

    # layout: with no titles at all the images can be packed without gaps
    plt.tight_layout()
    untitled = predictions is None and all(label is None for label in shown_labels)
    if untitled:
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()


def display_training_curves(training, validation, title, subplot):
    """Plot training and validation curves of one metric into a subplot.

    ``subplot`` is a three-digit matplotlib subplot code (e.g. 211). A new
    figure is created when the code's last digit is 1, i.e. on the first panel.
    """
    is_first_panel = (subplot % 10 == 1)
    if is_first_panel:
        plt.subplots(figsize=(10,10), facecolor='#F0F0F0')
        plt.tight_layout()

    ax = plt.subplot(subplot)
    ax.set_facecolor('#F8F8F8')
    for curve in (training, validation):
        ax.plot(curve)
    ax.set_title('model ' + title)
    ax.set_ylabel(title)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'valid.'])

In [None]:
# Re-batch the training stream into groups of 20 images and display the first group.
ds_iter = iter(ds_train.unbatch().batch(20))
one_batch = next(ds_iter)
display_batch_of_images(one_batch)

**Define Model**

In [None]:
# Number of training epochs (assigned again, to the same value, in the Fit Model cell).
numEpochs = 20

# Build the model inside the strategy scope so it is created under the chosen distribution strategy.
with strategy.scope():
    # ImageNet-pretrained VGG16 without its classification head, frozen so only the new layers train.
    # NOTE(review): decode_image scales pixels by 1/255; ImageNet VGG16 weights are normally paired
    # with tf.keras.applications.vgg16.preprocess_input — confirm this input scaling is intended.
    pretrained_model = tf.keras.applications.VGG16(
        weights='imagenet',
        include_top=False ,
        input_shape=[*image_size, 3]
    )
    pretrained_model.trainable = False
    
    # Classification head stacked on the frozen base; every Dense layer is len(flowers) units wide
    # and the final softmax layer emits one probability per flower class.
    # NOTE(review): the Flatten layer sits after GlobalAveragePooling2D and Dense layers, so its
    # input is presumably already 2-D and the layer has no effect — confirm before removing.
    model = tf.keras.Sequential([
        pretrained_model,
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(len(flowers), activation='relu'),
        tf.keras.layers.Dropout(0.00625),
        tf.keras.layers.Dense(len(flowers), activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(len(flowers), activation='relu'),
        tf.keras.layers.Dropout(0.003125),
        tf.keras.layers.Dense(len(flowers), activation='softmax')
    ])

In [None]:
# Labels are integer class ids (read_labeled_tfrecord casts 'class' to int32), hence the
# "sparse" variants of the cross-entropy loss and the accuracy metric.
model.compile(
    optimizer='adam',
    loss = 'sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

# Print the layer-by-layer overview of the model.
model.summary()

**Training**

In [None]:
def exponential_lr(epoch,
                   startLearningRate = 0.0001, minimumLearningRate = 0.0001, maxLearningRate = 0.0007,
                   rampup_epochs = 20, sustain_epochs = 0,
                   exp_decay = 0.8):
    """Learning-rate schedule: linear ramp-up, optional plateau, exponential decay.

    * ``epoch < rampup_epochs``: rises linearly from ``startLearningRate``
      towards ``maxLearningRate``.
    * the next ``sustain_epochs`` epochs: stays at ``maxLearningRate``.
    * afterwards: decays towards ``minimumLearningRate``, with the gap to the
      minimum multiplied by ``exp_decay`` each epoch.

    Returns the learning rate for the given epoch index.
    """
    if epoch < rampup_epochs:
        return ((maxLearningRate - startLearningRate) /
                rampup_epochs * epoch + startLearningRate)

    if epoch < rampup_epochs + sustain_epochs:
        return maxLearningRate

    return ((maxLearningRate - minimumLearningRate) *
            exp_decay**(epoch - rampup_epochs - sustain_epochs) +
            minimumLearningRate)

# Keras callback that takes each epoch's learning rate from exponential_lr; verbose=True logs the value.
lr_callback = tf.keras.callbacks.LearningRateScheduler(exponential_lr, verbose=True)


**Fit Model**

In [None]:
# Epoch count (same value as in the Define Model cell) and steps per epoch.
# steps_per_epoch has to be given explicitly because the training dataset repeats indefinitely.
numEpochs = 20
numSteps = numTraining // batchSize

# Train with the learning-rate schedule and evaluate on the validation set after every epoch.
history = model.fit(
    ds_train,
    validation_data=ds_valid,
    epochs=numEpochs,
    steps_per_epoch=numSteps,
    callbacks=[lr_callback],
)

In [None]:
# Plot loss (top panel) and accuracy (bottom panel) for training versus validation.
display_training_curves(
    training=history.history['loss'],
    validation=history.history['val_loss'],
    title='loss',
    subplot=211,
)
display_training_curves(
    training=history.history['sparse_categorical_accuracy'],
    validation=history.history['val_sparse_categorical_accuracy'],
    title='accuracy',
    subplot=212,
)

**Evaluation**

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

def display_confusion_matrix(cmat, score, precision, recall):
    """Draw the confusion matrix with flower names on both axes.

    ``score`` (F1), ``precision`` and ``recall`` are printed on the plot,
    each formatted to three decimals; any of them may be None to leave it out.
    """
    plt.figure(figsize=(15,15))
    ax = plt.gca()
    ax.matshow(cmat, cmap='Reds')

    tick_positions = range(len(flowers))
    tick_font = {'fontsize': 7}
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(flowers, fontdict=tick_font)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="left", rotation_mode="anchor")
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(flowers, fontdict=tick_font)
    plt.setp(ax.get_yticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # One text fragment per metric that was supplied, in the order f1, precision, recall.
    metric_templates = (
        ('f1 = {:.3f} ', score),
        ('\nprecision = {:.3f} ', precision),
        ('\nrecall = {:.3f} ', recall),
    )
    titlestring = "".join(
        template.format(value) for template, value in metric_templates if value is not None
    )
    if len(titlestring) > 0:
        ax.text(101, 1, titlestring, fontdict={'fontsize': 18, 'horizontalalignment':'right', 'verticalalignment':'top', 'color':'#804040'})
    plt.show()

# NOTE: this re-defines display_training_curves, which the "Display Data" cell already
# defines with the same body; from here on this later definition is the one in effect.
def display_training_curves(training, validation, title, subplot):
    """Plot training and validation curves of one metric into a subplot.

    ``subplot`` is a three-digit matplotlib subplot code (e.g. 211). A new
    figure is created when the code's last digit is 1, i.e. on the first panel.
    """
    is_first_panel = (subplot % 10 == 1)
    if is_first_panel:
        plt.subplots(figsize=(10,10), facecolor='#F0F0F0')
        plt.tight_layout()

    ax = plt.subplot(subplot)
    ax.set_facecolor('#F8F8F8')
    for curve in (training, validation):
        ax.plot(curve)
    ax.set_title('model ' + title)
    ax.set_ylabel(title)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'valid.'])

**Confusion Matrix**

In [None]:
# Build the confusion matrix on the validation set.
# ordered=True keeps the record order stable, so the labels collected from one pass line up
# with the predictions produced by a second pass over the same dataset.
cmdataset = get_validation_dataset(ordered=True)
images_ds = cmdataset.map(lambda image, label: image)
labels_ds = cmdataset.map(lambda image, label: label).unbatch()

# Collect every validation label in a single batch of size numValidation.
cm_correct_labels = next(iter(labels_ds.batch(numValidation))).numpy()
cm_probabilities = model.predict(images_ds)
cm_predictions = np.argmax(cm_probabilities, axis=-1)

labels = range(len(flowers))
cmat = confusion_matrix(
    cm_correct_labels,
    cm_predictions,
    labels=labels,
)

# Normalize each row by the number of true samples of that class. Rows of classes with no
# validation samples stay all-zero instead of turning into NaN through a division by zero.
row_totals = cmat.sum(axis=1, keepdims=True)
cmat = np.divide(
    cmat,
    row_totals,
    out=np.zeros(cmat.shape, dtype=float),
    where=row_totals != 0,
)

In [None]:
# Macro-averaged F1, precision and recall over all flower classes, then the annotated plot.
score = f1_score(y_true=cm_correct_labels, y_pred=cm_predictions, labels=labels, average='macro')
precision = precision_score(y_true=cm_correct_labels, y_pred=cm_predictions, labels=labels, average='macro')
recall = recall_score(y_true=cm_correct_labels, y_pred=cm_predictions, labels=labels, average='macro')

display_confusion_matrix(cmat, score, precision, recall)

**Visual**

In [None]:
# Show 20 validation images together with the model's predictions, both right and wrong.
dataset = get_validation_dataset().unbatch().batch(20)
batch = iter(dataset)

images, labels = next(batch)
probabilities = model.predict(images)
predictions = probabilities.argmax(axis=-1)
display_batch_of_images((images, labels), predictions)

**Predictions**

In [None]:
# Predict a class id for every test image; ordered=True keeps the order reproducible so the
# ids read from test_ds afterwards match these predictions.
test_ds = get_test_dataset(ordered=True)
test_images_ds = test_ds.map(lambda image, idnum: image)
probabilities = model.predict(test_images_ds)
predictions = probabilities.argmax(axis=-1)
print(predictions)

In [None]:

# Write the competition submission file: one "id,label" row per test image.
test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
# Pull all ids out as a single batch of size numTest and convert them to unicode strings.
test_ids = next(iter(test_ids_ds.batch(numTest))).numpy().astype('U')
np.savetxt(
    'submission.csv',
    np.rec.fromarrays([test_ids, predictions]),
    header='id,label',
    comments='',
    delimiter=',',
    fmt=['%s', '%d'],
)