In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from kaggle_datasets import KaggleDatasets
import tensorflow as tf
import tensorflow_addons as tfa
import os
import re
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [None]:
# Look up the storage paths of the two Kaggle datasets this notebook reads:
# 'tpu-getting-started' (train/val/test splits) and 'oxford-flowers-tfrecords'
# (used further down as extra training data).
GCS_DS_PATH = KaggleDatasets().get_gcs_path('tpu-getting-started')
GCS_DS_PATH2 = KaggleDatasets().get_gcs_path('oxford-flowers-tfrecords')
# Shell out to list what is stored under the second dataset path.
!gsutil ls $GCS_DS_PATH2


In [None]:
# Hardware detection: try to resolve a TPU cluster; if the resolver raises
# ValueError, fall back to TensorFlow's default distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # Default strategy in TensorFlow; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the cluster and initialise the TPU system before building
    # the strategy object on top of it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Flower class names. The notebook indexes this list with integer labels
# (CLASSES[label]), so the position of a name is its class id; the trailing
# comment on each row gives the index range of that row (104 names in total).
# NOTE(review): 'cyclamen ' and 'hippeastrum ' carry a trailing space — confirm
# whether that is intentional before using the names as lookup keys.
CLASSES = ['pink primrose',    'hard-leaved pocket orchid', 'canterbury bells', 'sweet pea',     'wild geranium',     'tiger lily',           'moon orchid',              'bird of paradise', 'monkshood',        'globe thistle',         # 00 - 09
           'snapdragon',       "colt's foot",               'king protea',      'spear thistle', 'yellow iris',       'globe-flower',         'purple coneflower',        'peruvian lily',    'balloon flower',   'giant white arum lily', # 10 - 19
           'fire lily',        'pincushion flower',         'fritillary',       'red ginger',    'grape hyacinth',    'corn poppy',           'prince of wales feathers', 'stemless gentian', 'artichoke',        'sweet william',         # 20 - 29
           'carnation',        'garden phlox',              'love in the mist', 'cosmos',        'alpine sea holly',  'ruby-lipped cattleya', 'cape flower',              'great masterwort', 'siam tulip',       'lenten rose',           # 30 - 39
           'barberton daisy',  'daffodil',                  'sword lily',       'poinsettia',    'bolero deep blue',  'wallflower',           'marigold',                 'buttercup',        'daisy',            'common dandelion',      # 40 - 49
           'petunia',          'wild pansy',                'primula',          'sunflower',     'lilac hibiscus',    'bishop of llandaff',   'gaura',                    'geranium',         'orange dahlia',    'pink-yellow dahlia',    # 50 - 59
           'cautleya spicata', 'japanese anemone',          'black-eyed susan', 'silverbush',    'californian poppy', 'osteospermum',         'spring crocus',            'iris',             'windflower',       'tree poppy',            # 60 - 69
           'gazania',          'azalea',                    'water lily',       'rose',          'thorn apple',       'morning glory',        'passion flower',           'lotus',            'toad lily',        'anthurium',             # 70 - 79
           'frangipani',       'clematis',                  'hibiscus',         'columbine',     'desert-rose',       'tree mallow',          'magnolia',                 'cyclamen ',        'watercress',       'canna lily',            # 80 - 89
           'hippeastrum ',     'bee balm',                  'pink quill',       'foxglove',      'bougainvillea',     'camellia',             'mallow',                   'mexican petunia',  'bromelia',         'blanket flower',        # 90 - 99
           'trumpet creeper',  'blackberry lily',           'common tulip',     'wild rose'] # 100 - 103

In [None]:
# Training files: the competition's 331x331 JPEG training shards, extended
# with the 331x331 PNG shards from the second dataset.
TRAINING_FILENAMES = tf.io.gfile.glob(GCS_DS_PATH + '/tfrecords-jpeg-331x331/train/*.tfrec')
TRAINING_FILENAMES += tf.io.gfile.glob(GCS_DS_PATH2 + '/tfrecords-png-331x331/*.tfrec')

# Validation and test files come from the first dataset only.
VALIDATION_FILENAMES = tf.io.gfile.glob(GCS_DS_PATH + '/tfrecords-jpeg-331x331/val/*.tfrec')
TEST_FILENAMES = tf.io.gfile.glob(GCS_DS_PATH + '/tfrecords-jpeg-331x331/test/*.tfrec')

In [None]:
def count_data_items(filenames):
    """Return the total number of records held by a list of .tfrec files.

    The record count is written into each file name between a '-' and the
    following '.', e.g. ``flowers00-230.tfrec`` holds 230 data items. The
    first such ``-<digits>.`` group found in the name is used.

    Args:
        filenames: iterable of file names or paths.

    Returns:
        The summed count as a plain Python ``int`` (0 for an empty iterable).

    Raises:
        ValueError: if a name does not contain a ``-<digits>.`` group.
    """
    # Compile once instead of once per file name; require at least one digit
    # so an empty capture can never reach int().
    count_pattern = re.compile(r"-([0-9]+)\.")
    total = 0
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            raise ValueError(
                "cannot read the item count from file name: {!r}".format(filename))
        total += int(match.group(1))
    return total

# Let tf.data pick parallelism / buffer sizes wherever AUTO is passed.
AUTO = tf.data.experimental.AUTOTUNE

# Number of examples per split, derived from the counts in the file names.
NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES = (
    count_data_items(split_files)
    for split_files in (TRAINING_FILENAMES, VALIDATION_FILENAMES, TEST_FILENAMES)
)

print(NUM_TRAINING_IMAGES)

In [None]:
# Training configuration.
IMAGE_SIZE = [331, 331]  # height/width the decoded images are reshaped to
EPOCHS = 50
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 examples per replica

# The training dataset repeats indefinitely, so a floor division is enough
# to define how many batches make up one epoch.
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
# The validation dataset is finite and keeps its final partial batch. Round
# up (ceil division) so every validation example is evaluated and the cached
# validation dataset is read to its end on each pass.
V_STEPS_PER_EPOCH = -(-NUM_VALIDATION_IMAGES // BATCH_SIZE)

In [None]:
def random_erasing(img, sl=0.1, sh=0.2, rl=0.4, p=0.3):
    """Randomly zero out one rectangular region of an image.

    Args:
        img: image tensor indexed as (height, width, channels).
        sl: factor used, together with ``rl``, for the lower bound of the
            erased rectangle's side length: round(sqrt(area * sl * rl)).
        sh: factor used, together with ``rl``, for the upper bound of the
            side length: round(sqrt(area * sh / rl)), clamped to the image
            height / width.
        rl: ratio factor that enters both bounds above.
        p: the erased image is returned when a uniform [0, 1) draw is <= p,
            otherwise the input image is returned.

    Returns:
        A tensor with the dtype of ``img``: either the input itself or the
        input with one rectangle multiplied by zero.
    """
    h = tf.shape(img)[0]
    w = tf.shape(img)[1]
    c = tf.shape(img)[2]
    origin_area = tf.cast(h*w, tf.float32)

    # Smallest and largest side length allowed for the erased rectangle.
    e_size_l = tf.cast(tf.round(tf.sqrt(origin_area * sl * rl)), tf.int32)
    e_size_h = tf.cast(tf.round(tf.sqrt(origin_area * sh / rl)), tf.int32)

    # The rectangle must not exceed the image in either dimension.
    e_height_h = tf.minimum(e_size_h, h)
    e_width_h = tf.minimum(e_size_h, w)

    # Height and width are drawn independently from the same lower bound.
    erase_height = tf.random.uniform(shape=[], minval=e_size_l, maxval=e_height_h, dtype=tf.int32)
    erase_width = tf.random.uniform(shape=[], minval=e_size_l, maxval=e_width_h, dtype=tf.int32)

    # Block of zeros with the size of the region to erase.
    erase_area = tf.zeros(shape=[erase_height, erase_width, c])
    erase_area = tf.cast(erase_area, tf.uint8)

    # Random vertical placement: split the remaining rows into top/bottom padding.
    pad_h = h - erase_height
    pad_top = tf.random.uniform(shape=[], minval=0, maxval=pad_h, dtype=tf.int32)
    pad_bottom = pad_h - pad_top

    # Random horizontal placement: split the remaining columns into left/right padding.
    pad_w = w - erase_width
    pad_left = tf.random.uniform(shape=[], minval=0, maxval=pad_w, dtype=tf.int32)
    pad_right = pad_w - pad_left

    # Pad the zero block with ones up to the full image size; the result is a
    # mask that is 0 inside the rectangle and 1 everywhere else. The block is
    # wrapped in a list (adding a leading axis) for the pad and squeezed back.
    erase_mask = tf.pad([erase_area], [[0,0],[pad_top, pad_bottom], [pad_left, pad_right], [0,0]], constant_values=1)
    erase_mask = tf.squeeze(erase_mask, axis=0)
    erased_img = tf.multiply(tf.cast(img,tf.float32), tf.cast(erase_mask, tf.float32))

    # Keep the original when the draw exceeds p, otherwise return the erased image.
    return tf.cond(tf.random.uniform([], 0, 1) > p, lambda: tf.cast(img, img.dtype), lambda:  tf.cast(erased_img, img.dtype))

In [None]:
def onehot(image,label):
    """Replace the integer label of an (image, label) pair by a one-hot vector.

    The vector has one entry per name in CLASSES; the image passes through
    unchanged.
    """
    num_classes = len(CLASSES)
    encoded_label = tf.one_hot(label, num_classes)
    return image, encoded_label

def data_augmentation(image, label):
    """Apply the training-time augmentations to one (image, label) pair.

    The image gets a random horizontal flip followed by random erasing; the
    label is returned untouched.
    """
    flipped = tf.image.random_flip_left_right(image)
    augmented = random_erasing(flipped)
    return augmented, label

def decode_image(image_data):
    """Decode an encoded image byte string into a float32 image tensor.

    The result has three channels, values scaled to the [0, 1] range, and a
    static shape of [*IMAGE_SIZE, 3].
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    # Convert the decoded pixels to floats in the [0, 1] range.
    scaled = tf.cast(pixels, tf.float32) / 255.0
    # An explicit size is needed for TPU.
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])

def read_labeled_tfrecord(example):
    """Parse one serialized labeled example into an (image, label) pair.

    Returns:
        image: decoded float image (see decode_image).
        label: the example's "class" value cast to int32.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string), # tf.string means bytestring
        "class": tf.io.FixedLenFeature([], tf.int64),  # shape [] means single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), tf.cast(parsed['class'], tf.int32)

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled example into an (image, id) pair.

    These records carry no "class" feature; the id is returned as the raw
    string tensor stored in the record.
    """
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string), # tf.string means bytestring
        "id": tf.io.FixedLenFeature([], tf.string),  # shape [] means single element
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    return decode_image(parsed['image']), parsed['id']

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of parsed examples from TFRecord files.

    Files are read in parallel and records are parsed in parallel, which is
    what keeps the accelerator fed when the data lives in remote storage.

    Args:
        filenames: list of TFRecord file paths.
        labeled: parse records as (image, label) when True, otherwise as
            (image, id).
        ordered: when False, deterministic ordering is switched off so
            records are used as soon as they stream in. When True the
            default deterministic ordering is kept, so repeated passes over
            the dataset yield the records in the same sequence.

    Returns:
        A tf.data.Dataset of (image, label) pairs if ``labeled`` is True, or
        (image, id) pairs otherwise.
    """
    read_options = tf.data.Options()
    if not ordered:
        read_options.experimental_deterministic = False # disable order, increase speed

    # num_parallel_reads is what actually interleaves reads from several
    # files; without it the files are consumed strictly one after another.
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    dataset = dataset.with_options(read_options)
    parse_fn = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord
    # Decode/parse several records concurrently instead of one at a time.
    dataset = dataset.map(parse_fn, num_parallel_calls=AUTO)
    return dataset

def get_training_dataset(do_onehot):
    """Build the augmented, shuffled, infinitely repeating training dataset.

    Args:
        do_onehot: when True, integer labels are converted to one-hot
            vectors (needed for the categorical cross-entropy loss);
            when False they stay integer class ids.

    Returns:
        A tf.data.Dataset yielding batches of BATCH_SIZE (image, label) pairs.
    """
    # TRAINING_FILENAMES is already an expanded list of files, so it is passed
    # straight through instead of being globbed a second time.
    dataset = load_dataset(TRAINING_FILENAMES, labeled=True)
    dataset = dataset.map(data_augmentation, num_parallel_calls=AUTO)
    if do_onehot:
        dataset = dataset.map(onehot, num_parallel_calls=AUTO)
    dataset = dataset.repeat() # the training dataset must repeat for several epochs
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(BATCH_SIZE)
    # Prepare the next batch while the current one is being trained on.
    dataset = dataset.prefetch(AUTO)
    return dataset

def get_validation_dataset(do_onehot):
    """Build the batched, cached validation dataset (no augmentation).

    Args:
        do_onehot: when True, integer labels are converted to one-hot
            vectors; when False they stay integer class ids.

    Returns:
        A tf.data.Dataset yielding batches of up to BATCH_SIZE
        (image, label) pairs.
    """
    # VALIDATION_FILENAMES is already an expanded list of files, so it is
    # passed straight through instead of being globbed a second time.
    dataset = load_dataset(VALIDATION_FILENAMES, labeled=True, ordered=False)
    if do_onehot:
        dataset = dataset.map(onehot, num_parallel_calls=AUTO)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.cache()
    # Prepare the next batch while the current one is being evaluated.
    dataset = dataset.prefetch(AUTO)
    return dataset

def get_test_dataset(ordered=False):
    """Build the batched test dataset of (image, id) pairs.

    Args:
        ordered: forwarded to load_dataset; pass True when the dataset is
            iterated more than once and the passes must line up (e.g.
            predictions from one pass, ids from another).

    Returns:
        A tf.data.Dataset yielding batches of up to BATCH_SIZE (image, id) pairs.
    """
    # TEST_FILENAMES is already an expanded list of files, so it is passed
    # straight through instead of being globbed a second time.
    dataset = load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
    dataset = dataset.batch(BATCH_SIZE)
    # Prepare the next batch while the current one is being consumed.
    dataset = dataset.prefetch(AUTO)
    return dataset

# Datasets with integer labels, used below to preview a few training images.
training_dataset = get_training_dataset(False)
validation_dataset = get_validation_dataset(False)


In [None]:
row_count = 2
column_count = 4

# Take one training batch and convert it to NumPy for plotting.
for batch_images, batch_labels in training_dataset.take(1):
    sample_images = batch_images.numpy()
    sample_labels = batch_labels.numpy()

# Show the first row_count * column_count images of the batch, titled with
# their class names; the axes grid is walked in row-major order.
fig, axs = plt.subplots(row_count, column_count, figsize=(20,20))
for sample_idx, ax in enumerate(axs.flat):
    ax.set_title(CLASSES[sample_labels[sample_idx]], fontsize=16)
    ax.imshow(sample_images[sample_idx])
    ax.axis('off')
plt.show()

In [None]:
# Install the `efficientnet` package, which the model cell below imports as
# `efficientnet.tfkeras`.
# NOTE(review): the version is not pinned, so re-runs may pick up a different release.
!pip install -q efficientnet

In [None]:
# Reset Keras' global state before the models are built.
tf.keras.backend.clear_session()

In [None]:
from tensorflow import keras
from tensorflow.keras import Model
import efficientnet.tfkeras as efn
from tensorflow.keras.optimizers import Adam

# Build and compile the first model (EfficientNetB7 backbone + classification
# head) inside the distribution strategy scope.
with strategy.scope():  
    
    # Backbone without its classification top, loaded with the
    # 'noisy-student' weights.
    pre_trained_model = efn.EfficientNetB7(input_shape = [*IMAGE_SIZE, 3],
                                include_top = False,
                                pooling='avg',
                                weights = 'noisy-student')



    # All backbone layers are fine-tuned.
    pre_trained_model.trainable = True
        
    # The head is attached to the output of the 'top_activation' layer rather
    # than to the backbone's final output.
    last_layer = pre_trained_model.get_layer('top_activation')

    last_output=last_layer.output

    def create_model(last_output):
        """Attach the classification head to `last_output` and compile the model.

        The head is global average pooling, dropout (0.2) and a 104-way
        softmax layer. The model runs from `pre_trained_model.input` (taken
        from the enclosing scope) to that softmax output and is compiled with
        Adam, categorical cross-entropy and a macro-averaged F1 metric.
        """
        x=keras.layers.GlobalAveragePooling2D()(last_output)
        x=keras.layers.Dropout(0.2)(x)
        x=keras.layers.Dense(104, activation='softmax')(x)

        model = Model(pre_trained_model.input, x)

        # categorical_crossentropy expects one-hot labels.
        model.compile(optimizer = Adam(learning_rate=0.0005),
                      loss = 'categorical_crossentropy',
                      metrics = [tfa.metrics.F1Score(len(CLASSES), average='macro')])
        return model



    model=create_model(last_output)



In [None]:
# Learning-rate schedule parameters: linear ramp-up from LR_START to LR_MAX,
# optional plateau, then exponential decay towards LR_MIN.
LR_START = 1e-5
LR_MAX = 5e-5 * strategy.num_replicas_in_sync  # peak rate scales with the replica count
LR_MIN = 1e-5
LR_RAMPUP_EPOCHS = 5
LR_SUSTAIN_EPOCHS = 0
LR_EXP_DECAY = 0.8

def lrfn(epoch):
    """Return the learning rate for the given (0-based) epoch index.

    Three phases, driven by the module-level LR_* constants:
      * ramp-up: linear increase from LR_START to LR_MAX during the first
        LR_RAMPUP_EPOCHS epochs;
      * sustain: LR_MAX for the next LR_SUSTAIN_EPOCHS epochs;
      * decay: exponential decay (factor LR_EXP_DECAY per epoch) from LR_MAX
        towards LR_MIN.
    """
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX
    return (LR_MAX - LR_MIN) * LR_EXP_DECAY**(epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS) + LR_MIN
    
# Keras callback that asks lrfn for the learning rate at the start of each epoch.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=1)

# Plot the schedule over at least 25 epochs (or EPOCHS, if larger).
rng = list(range(max(25, EPOCHS)))
y = list(map(lrfn, rng))
plt.plot(rng, y)
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(y[0], max(y), y[-1]))


# Save the model to disk whenever the checkpoint's monitored quantity improves.
checkpoint_path = "modelEfficientB7.h5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_best_only=True,
)

# Stop training once validation F1 has not improved for 5 epochs.
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='val_f1_score',
    patience=5,
    mode='max',
)

callbacks = [lr_callback, checkpoint, earlystop]

In [None]:
# Train the EfficientNet model on one-hot labels (required by the categorical
# cross-entropy loss) and keep the per-epoch metrics in `history`.
history = model.fit(
    x=get_training_dataset(do_onehot=True),
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    validation_data=get_validation_dataset(do_onehot=True),
    validation_steps=V_STEPS_PER_EPOCH,
    callbacks=callbacks,
)
       
                              

In [None]:
# Learning curves of the first training run, read from `history`.
f1 = history.history['f1_score']
val_f1 = history.history['val_f1_score']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(f1))

fig, axs = plt.subplots(2,1, figsize=(10,10), facecolor='#F0F0F0')
plt.tight_layout()
f1_ax, loss_ax = axs

# Top panel: F1 score on training and validation data.
f1_ax.set_title('Training and validation F1 Score')
f1_ax.plot(epochs, f1, label='Training F1')
f1_ax.plot(epochs, val_f1, label='Validation F1')
f1_ax.legend()

# Bottom panel: loss on training and validation data.
loss_ax.set_title('Training and validation loss')
loss_ax.plot(epochs, loss, label='Training loss')
loss_ax.plot(epochs, val_loss, label='Validation loss')
loss_ax.legend()


In [None]:
from tensorflow.keras.applications import DenseNet201

# Build and compile the second model (DenseNet201 backbone + classification
# head) inside the distribution strategy scope.
# NOTE: this cell rebinds `pre_trained_model`, `last_layer`, `last_output` and
# `create_model` that were defined for the first model.
with strategy.scope():  
    
    # Backbone without its classification top, loaded with ImageNet weights.
    pre_trained_model = DenseNet201(input_shape = [*IMAGE_SIZE, 3],
                                include_top = False,
                                pooling='avg',
                                weights = 'imagenet')



    # All backbone layers are fine-tuned.
    pre_trained_model.trainable = True
        
    # The head is attached to the output of the 'avg_pool' layer.
    last_layer = pre_trained_model.get_layer('avg_pool')

    last_output=last_layer.output

    def create_model(last_output):
        """Attach the classification head to `last_output` and compile the model.

        The head is a flatten layer, dropout (0.2) and a 104-way softmax
        layer. The model runs from `pre_trained_model.input` (taken from the
        enclosing scope) to that softmax output and is compiled with Adam,
        categorical cross-entropy and a macro-averaged F1 metric.
        """
        x=keras.layers.Flatten()(last_output)
        x=keras.layers.Dropout(0.2)(x)
        x=keras.layers.Dense(104, activation='softmax')(x)

        model = Model(pre_trained_model.input, x)

        # categorical_crossentropy expects one-hot labels.
        model.compile(optimizer = Adam(learning_rate=0.0005),
                      loss = 'categorical_crossentropy',
                      metrics = [tfa.metrics.F1Score(len(CLASSES), average='macro')])
        return model



    model_2=create_model(last_output)


In [None]:
# Callbacks for the DenseNet run. The checkpoint must be written to its own
# file (checkpoint_path_2); pointing it at the first model's path would
# overwrite the EfficientNet checkpoint saved earlier.
checkpoint_path_2 = "modelDenseNet.h5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path_2, save_best_only=True)

callbacks=[lr_callback, checkpoint, earlystop]

In [None]:
# Train the DenseNet model with the same data pipeline and step counts as the
# first run; per-epoch metrics are kept in `history2`.
history2 = model_2.fit(
    x=get_training_dataset(do_onehot=True),
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    validation_data=get_validation_dataset(do_onehot=True),
    validation_steps=V_STEPS_PER_EPOCH,
    callbacks=callbacks,
)

In [None]:
# Learning curves of the second (DenseNet) training run. They must be read
# from `history2`; reading `history` would just redraw the first model's curves.
f1=history2.history['f1_score']
val_f1=history2.history['val_f1_score']
loss=history2.history['loss']
val_loss=history2.history['val_loss']
epochs = range(len(f1))
fig, axs = plt.subplots(2,1, figsize=(10,10), facecolor='#F0F0F0')
plt.tight_layout()
# Top panel: F1 score on training and validation data.
axs[0].set_title('Training and validation F1 Score')
axs[0].plot(epochs, f1, label='Training F1')
axs[0].plot(epochs, val_f1, label='Validation F1')
axs[0].legend()

# Bottom panel: loss on training and validation data.
axs[1].set_title('Training and validation loss')
axs[1].plot(epochs, loss, label='Training loss')
axs[1].plot(epochs, val_loss, label='Validation loss')
axs[1].legend()


In [None]:
# Ensemble prediction on the test set and submission file.
# The dataset is requested with ordered=True because it is iterated several
# times below (once per model for the images, once for the ids) and the
# passes must line up element by element.
test_dataset = get_test_dataset(ordered=True)

# Keep only the images, then average the class probabilities of both models
# and pick the most probable class per image.
test_images = test_dataset.map(lambda image, idnum: image)
probs1 = model.predict(test_images)
probs2 = model_2.predict(test_images)
probabilities = (probs1 + probs2)/2
predictions = np.argmax(probabilities, axis=-1)




# Collect all ids in a single batch of NUM_TEST_IMAGES elements and convert
# them to a NumPy array of unicode strings.
test_ids = test_dataset.map(lambda image, idnum: idnum).unbatch()
test_ids = next(iter(test_ids.batch(NUM_TEST_IMAGES))).numpy().astype('U')


# Write one "id,label" row per test image.
np.savetxt('submission.csv', np.rec.fromarrays([test_ids, predictions]), fmt=['%s', '%d'], delimiter=',', header='id,label', comments='')


In [None]:
# Rebuild the ordered test dataset and unbatch it so it yields single
# (image, id) elements. NOTE: this rebinds `test_dataset`, which held the
# batched dataset in the previous cell.
test_dataset = get_test_dataset(ordered=True).unbatch()



In [None]:
# NumPy copies of the first four test images (element 0 of each dataset item).
test_images_sample = [item[0].numpy() for item in test_dataset.take(4)]

In [None]:
# One row per sampled test image, titled with the class name predicted for
# the test element at the same position.
fig, axs = plt.subplots(len(test_images_sample), 1, figsize=(20,20))
for sample_idx, sample_image in enumerate(test_images_sample):
    ax = axs[sample_idx]
    ax.set_title(CLASSES[predictions[sample_idx]], fontsize=16)
    ax.imshow(sample_image)
    ax.axis('off')