In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, json, cv2, math, re
from PIL import Image
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import random

#model imports (keras/tensorflow)
import tensorflow as tf
import keras
from keras import layers, models
from keras.optimizers import Adam
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from keras.layers import Dense, Dropout, Flatten
from keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow.keras.backend as K
import tensorflow_addons as tfa

from kaggle_datasets import KaggleDatasets
from functools import partial

print("Tensorflow version " + tf.__version__)

os.system('pip install /kaggle/input/kerasapplications -q')
os.system('pip install /kaggle/input/efficientnet-keras-source-code/ -q --no-deps')

import efficientnet.tfkeras as efn

### Notes

- If you want to learn more about Tensorflow + Computer Vision check out [@dimitreoliveira](https://www.kaggle.com/dimitreoliveira) I learned a ton of tips/tricks from his notebooks.
- Baseline model trained on TPUs using resized 1200 x 1200 images stored as TFRecords.

- Find a custom loss function that maximizes F1-score - [TF Add-ons](https://stackoverflow.com/questions/59496936/how-to-use-tensorflow-addons-metrics-correctly-in-functional-api), [Kaggle Notebook](https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric)

- Next up I need to edit the load_dataset function as I no longer need to perform augmentations in this notebook. (they are done during tfrecord creation)

### Versions

- V1: Baseline
- V2: Changed Seed
- V5: Leaderboard Rescore and using val_f1_score for CV (CV: 0.60391 -- LB: 0.630)
- V7: Pre-Augment Data on Tfrec creation, Heavy Augs (CV: 0.70128 -- LB: 0.785)
- V9: Efficientnet B4 five-fold (LB: 0.745)
- V11: seed test (CV: 0.689)

### Seed

- Setting the seed for reproducibility.

In [None]:
SEED = 1002


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer used for Python's hash seed, the `random` module,
            NumPy's global generator and TensorFlow's global seed.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # the three seeders are independent of each other, so apply them in one pass
    for apply_seed in (np.random.seed, tf.random.set_seed, random.seed):
        apply_seed(seed)


seed_everything(SEED)

### Detecting TPUs

- The number of replicas will be 8 if the TPU is correctly initialized, but will be 1 if it is not correctly connected.

In [None]:
# Try to attach to a TPU; if that fails, fall back to TensorFlow's default
# distribution strategy so the notebook still runs.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # `Exception` instead of a bare `except` lets KeyboardInterrupt / SystemExit
    # propagate, and the message records why the fallback was taken.
    print('TPU not available, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

### Variables

In [None]:
# --- input pipeline ---
AUTOTUNE = tf.data.experimental.AUTOTUNE
GCS_PATH = KaggleDatasets().get_gcs_path('plant-pathology-2021-tfrecords-1200-x-1200')

# --- image geometry (square model input) ---
TARGET_SIZE = 512
IMAGE_SIZE = [TARGET_SIZE, TARGET_SIZE]

# --- labels: class ids '0' .. '11' ---
CLASSES = [str(class_id) for class_id in range(12)]
NUM_OF_CLASSES = len(CLASSES)

# --- training hyper-parameters ---
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 images per replica
AUG_BATCH = BATCH_SIZE
EPOCHS = 15
DROPOUT_RATE = 0.2

### Splitting TFRecords

The number on the end of each tfrecord file corresponds to the number of images in that tfrecord.

Example: 'gs://kds-100c2bc3bab7e1f77f19378980a417f43e62119932994bd622dc7cb4/Id_train01-1427.tfrec' (1427 imgs)

In [None]:
#this function counts number of images in all TFRecords
def count_data_items(filenames):
    """Return the total number of images stored in the given TFRecord files.

    The count is read from each file name, which must contain ``-<count>.``
    (for example ``Id_train01-1427.tfrec`` holds 1427 images).

    Args:
        filenames: Iterable of TFRecord paths / URIs.

    Returns:
        Integer total (NumPy int64); ``0`` for an empty iterable.

    Raises:
        ValueError: If a file name does not contain a ``-<count>.`` marker.
    """
    count_pattern = re.compile(r'-([0-9]+)\.')  # compiled once, not once per file
    counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            raise ValueError(
                "Cannot read the image count from TFRecord name: {!r}".format(filename))
        counts.append(int(match.group(1)))
    # explicit integer dtype: np.sum([]) would otherwise return the float 0.0,
    # which then leaks into the steps-per-epoch arithmetic
    return np.sum(counts, dtype=np.int64)

#every .tfrec file found directly under the dataset's GCS path
ALL_TRAINING_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/*.tfrec')
#total image count, summed from the per-file counts embedded in the file names
NUM_ALL_TRAINING_IMAGES = count_data_items(ALL_TRAINING_FILENAMES)

#reading train metadata (its `validation` and `file` columns drive the train/validation split)
train = pd.read_csv('../input/plant-pathology-2021-tfrecords-1200-x-1200/train.csv')

print(f'GCS: train images: {NUM_ALL_TRAINING_IMAGES}')

### Train_test_split

Using this train_test_split to train baseline model. Once other model types and variables have been experimented with I plan to see if five-fold cross validation combination increases performance. 

Note: When I created the TFrecords, I split them into 50 files so that I could experiment with different sizes of Training and Validation datasets. All tfrecords are created using a stratified split (equal distribution of classes in each TFrecord).

Something to note here is that, since I have pre-augmented the images, it looks like I have very few images in the validation set compared to the training set. I will have to experiment to see if this discrepancy makes a difference.

In [None]:
def split_validation_set(filenames=None, metadata=None):
    """Split TFRecord file names into training and validation lists.

    A file belongs to the validation set when its file number (the
    second-to-last run of digits in its name; the last run is the image count)
    appears in the `file` column of a metadata row with `validation == 1`.

    Args:
        filenames: File names to split. Defaults to the module-level
            ALL_TRAINING_FILENAMES.
        metadata: DataFrame with `validation` and `file` columns. Defaults to
            the module-level `train` frame.

    Returns:
        Tuple `(training_filenames, validation_filenames)`, each a list that
        preserves the input order.
    """
    if filenames is None:
        filenames = ALL_TRAINING_FILENAMES
    if metadata is None:
        metadata = train

    #file numbers flagged for validation; built once instead of once per file
    validation_file_numbers = set(
        np.unique(metadata.loc[metadata.validation == 1].file.values).tolist())

    TRAINING_FILENAMES = []
    VALIDATION_FILENAMES = []

    for file_name in filenames:
        #int() already ignores leading zeros ("01" -> 1); stripping them by hand
        #turned the file number "0" into an empty string and crashed
        file_number = int(re.findall('[0-9]+', file_name)[-2])

        if file_number in validation_file_numbers:
            VALIDATION_FILENAMES.append(file_name)
        else:
            TRAINING_FILENAMES.append(file_name)

    return TRAINING_FILENAMES,VALIDATION_FILENAMES
            
#assigning the training / validation file-name lists to module-level variables
TRAINING_FILENAMES,VALIDATION_FILENAMES = split_validation_set()

In [None]:
# Alternative (disabled): random 90/10 split of the TFRecord files instead of
# the metadata-driven split.
# TRAINING_FILENAMES, VALIDATION_FILENAMES = train_test_split(
#     ALL_TRAINING_FILENAMES,
#     train_size= 0.90, test_size=0.10,
#     random_state=SEED,
# )

NUM_VALIDATION_IMAGES = count_data_items(VALIDATION_FILENAMES)
NUM_TRAINING_IMAGES = count_data_items(TRAINING_FILENAMES)

print("Training Images: {}  Validation Image: {}".format(NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES))
# `{:.2%}` prints a real percentage (e.g. 90.00%); the previous `{:.2f}` printed
# the raw fraction (0.90) under a "Percent" label.
print("Training Percent: {:.2%}  Validation Percent: {:.2%}".format(
    NUM_TRAINING_IMAGES / NUM_ALL_TRAINING_IMAGES,
    NUM_VALIDATION_IMAGES / NUM_ALL_TRAINING_IMAGES))

# number of full batches per pass; a trailing partial batch is not counted
STEPS_PER_EPOCH =  NUM_TRAINING_IMAGES // BATCH_SIZE
VALID_STEPS = NUM_VALIDATION_IMAGES // BATCH_SIZE

### Functions

The following functions are how I read the data from the TFRecords. Casting each pixel value to a floating point number and dividing it by 255 (normalizing to the 0-1 range) is a great way to speed up training.

NOTE: Need to always be wary of order of color channels. RGB or BGR? 

Read more here --> [image-read-and-resize-with-opencv-tensorflow-and-pil](https://towardsdatascience.com/image-read-and-resize-with-opencv-tensorflow-and-pil-3e0f29b992be)

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 image tensor of shape IMAGE_SIZE + [3].

    Args:
        image_data: Scalar string tensor holding a JPEG-encoded image.

    Returns:
        float32 tensor whose pixel values were divided by 255 and whose spatial
        size was resized to IMAGE_SIZE.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)  # uint8, 3 channels
    scaled = tf.cast(pixels, tf.float32) / 255.0  # floats are needed to normalize
    resized = tf.image.resize(scaled, list(IMAGE_SIZE))
    # reshape pins the static shape (height, width, channels) on the tensor
    # TODO: confirm the channel order expected downstream (RGB vs BGR)
    return tf.reshape(resized, [*IMAGE_SIZE, 3])

In [None]:
def read_tfrecord(example, labeled=True):
    """Parse one serialized TFRecord example into an (image, label-or-name) pair.

    Args:
        example: Serialized `tf.train.Example` string tensor.
        labeled: When True the record is read with an int64 `target` feature;
            otherwise with a string `image_name` feature.

    Returns:
        `(image, label)` with the label cast to int32 when `labeled` is True,
        else `(image, image_name)`. The image is decoded by `decode_image`.
    """
    # both layouts carry the encoded image; only the second feature differs
    feature_spec = {'image': tf.io.FixedLenFeature([], tf.string)}
    if labeled:
        feature_spec['target'] = tf.io.FixedLenFeature([], tf.int64)
    else:
        feature_spec['image_name'] = tf.io.FixedLenFeature([], tf.string)

    parsed = tf.io.parse_single_example(example, feature_spec)
    image = decode_image(parsed['image'])

    if labeled:
        return image, tf.cast(parsed['target'], tf.int32)
    return image, parsed['image_name']

In [None]:
def load_dataset(filenames, labeled=True, ordered=False):
    """Build a `tf.data` dataset of parsed examples from TFRecord files.

    Args:
        filenames: TFRecord paths to read.
        labeled: Forwarded to `read_tfrecord`; selects (image, label) vs
            (image, image_name) elements.
        ordered: When False, deterministic ordering is switched off so records
            can be consumed as soon as they are available.

    Returns:
        An unbatched `tf.data.Dataset` of parsed examples.
    """
    read_options = tf.data.Options()
    if not ordered:
        read_options.experimental_deterministic = False

    def _parse(serialized_example):
        return read_tfrecord(serialized_example, labeled=labeled)

    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
        .with_options(read_options)
        .map(_parse, num_parallel_calls=AUTOTUNE)
    )

### Data augmentation

I will experiment with many different augmentation types, but for a baseline I am going to use simple flips and rotations. 

Maybe some cutmix/mixup in later experiments. 

In [None]:
def simple_data_augmenter(image, label):
    """Randomly rotate (by a multiple of 90 degrees) and flip one image.

    Args:
        image: Image tensor to augment.
        label: Label belonging to the image; returned unchanged.

    Returns:
        Tuple `(augmented_image, label)`.
    """
    # Thanks to the dataset.prefetch(AUTO) statement in the following function this happens essentially for free on TPU. 
    # Data pipeline code is executed on the "CPU" part of the TPU while the TPU itself is computing gradients.
    
    p_rotate = tf.random.uniform([], 0, 1.0, dtype=tf.float32) # uniform random float in [0, 1) that selects the rotation
    
    if p_rotate > .75:
        image = tf.image.rot90(image, k=3) # rotate 270 degrees
    elif p_rotate > .5:
        image = tf.image.rot90(image, k=2) # rotate 180 degrees
    elif p_rotate > .25:
        image = tf.image.rot90(image, k=1) # rotate 90 degrees
    # p_rotate <= .25: the image is left unrotated
    
    image = tf.image.random_flip_up_down(image)
    image = tf.image.random_flip_left_right(image)
    
    
    return image, label

### Getting Dataset Functions

In [None]:
def get_training_dataset(dataset, do_aug=True, do_onehot=False):
    """Turn a parsed (image, label) dataset into an endless training pipeline.

    Args:
        dataset: Unbatched dataset of (image, label) elements.
        do_aug: Unused; kept so existing callers keep working. The
            `simple_data_augmenter` map is intentionally not applied here.
        do_onehot: When True, labels are one-hot encoded with `onehot`.

    Returns:
        A repeating dataset, shuffled with a 2048-element buffer, batched to
        BATCH_SIZE and prefetched.
    """
    dataset = dataset.repeat() # the training dataset must repeat for several epochs
    if do_onehot:
        # One-hot encode per example. The former batch(AUG_BATCH) -> map ->
        # unbatch round trip yielded exactly the same elements but assembled
        # and took apart a full batch of images for no benefit.
        dataset = dataset.map(onehot, num_parallel_calls=AUTOTUNE)
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTOTUNE) # prefetch next batch while training (autotune prefetch buffer size)
    return dataset

def get_validation_dataset(dataset, do_onehot=True):
    """Batch, optionally one-hot encode, cache and prefetch a validation dataset.

    Args:
        dataset: Unbatched dataset of (image, label) elements.
        do_onehot: When True (default), labels are one-hot encoded with `onehot`.

    Returns:
        The batched, cached and prefetched dataset.
    """
    batched = dataset.batch(BATCH_SIZE)
    if do_onehot:
        # labels must be one-hot so they match the format of the training labels
        batched = batched.map(onehot, num_parallel_calls=AUTOTUNE)
    # cache the prepared batches, then prefetch the next one while the model runs
    return batched.cache().prefetch(AUTOTUNE)

def onehot(image, label):
    """Return the image unchanged together with its one-hot encoded label.

    The one-hot depth is NUM_OF_CLASSES.
    """
    encoded_label = tf.one_hot(label, NUM_OF_CLASSES)
    return image, encoded_label

### Building Model

- I am going to start by training some small EfficientNet models as these are relatively small but powerful models that often perform well in Computer Vision Problems.

In [None]:
# Categorical cross-entropy with label smoothing; it expects probabilities
# (from_logits=False) and one-hot encoded labels. This simple loss has worked
# well for me in the past, so I am starting with it.
catcross_loss = tf.keras.losses.CategoricalCrossentropy(
    from_logits=False,
    label_smoothing=0.1,
    name='categorical_crossentropy',
)

In [None]:
def create_model():
    """Build and compile the EfficientNet-B5 classifier.

    The network is an EfficientNetB5 backbone ('noisy-student' weights, no top)
    followed by global average pooling, dropout and a softmax layer with
    NUM_OF_CLASSES outputs. It is compiled with the Adam optimizer, the
    label-smoothed `catcross_loss`, and accuracy plus weighted F1 as metrics.

    Returns:
        The compiled Keras model.
    """
    backbone = efn.EfficientNetB5(
        include_top=False,
        weights='noisy-student',
        input_shape=(TARGET_SIZE, TARGET_SIZE, 3),
    )
    classifier = models.Sequential([
        backbone,
        layers.GlobalAveragePooling2D(),
        layers.Dropout(DROPOUT_RATE),
        layers.Dense(NUM_OF_CLASSES, activation="softmax"),  # one probability per class
    ])

    weighted_f1 = tfa.metrics.F1Score(num_classes=NUM_OF_CLASSES, average='weighted')
    classifier.compile(
        optimizer='adam',
        loss=catcross_loss,  # use sparse_categorical_crossentropy if labels are not one-hot
        metrics=["acc", weighted_f1],
    )
    return classifier

- Creating the model inside strategy.scope(), as I am training the model on TPUs.

In [None]:
# Build and compile the model inside the distribution strategy's scope.
with strategy.scope():
    model = create_model()

# Save the model to an HDF5 file before any training in this notebook has taken place.
model.save('./EffNet_untrained_TPU_model.h5')

### Model Callbacks

- Using a custom learning rate scheduler that is a cosine decay w/ a warmup period. I used a custom LR scheduler because I wanted to be able to update on every step rather than every epoch. This is beneficial for the warmup period.

In [None]:
# LR_START = 0.000007
# LR_MAX = 0.00007
# LR_RAMPUP_EPOCHS = 3
# WARMUP_STEPS = LR_RAMPUP_EPOCHS * (NUM_TRAINING_IMAGES//BATCH_SIZE)
# TOTAL_STEPS = EPOCHS * (NUM_TRAINING_IMAGES//BATCH_SIZE)

# def lrfn_step(step):
#     if step < WARMUP_STEPS:
#         lr = (LR_MAX - LR_START) / WARMUP_STEPS * step + LR_START
#     else:
#         progress = (step - WARMUP_STEPS) / (TOTAL_STEPS - WARMUP_STEPS)
#         lr = LR_MAX * (0.5 * (1.0 + tf.math.cos(np.pi * ((1.0 * progress) % 1.0))))
#     return lr



# class CustomCallback(keras.callbacks.Callback):
#     def __init__(self, schedule):
#         super(CustomCallback, self).__init__()
#         self.schedule = schedule
#         self.epoch = 0
        
#     def on_train_batch_begin(self, batch, logs=None):
#         actual_step = (self.epoch*STEPS_PER_EPOCH) + batch
#         # Call schedule function to get the scheduled learning rate.
#         scheduled_lr = self.schedule(actual_step)
#         # Set the value back to the optimizer before this epoch starts
#         tf.keras.backend.set_value(self.model.optimizer.lr, scheduled_lr)
#         if batch == 0:
#             print("--Learning Rate: {:.6f} --".format(scheduled_lr))
        
#     def on_epoch_end(self, epoch, logs=None):
#         self.epoch+=1
        
    

# #visualizing Learning Rate Schedule
# rng = [i for i in range(TOTAL_STEPS)]
# y = [lrfn_step(tf.cast(x, tf.float32)) for x in rng]

# sns.set(style='whitegrid')
# fig, ax = plt.subplots(figsize=(20, 6))
# plt.plot(rng, y)

# print(f'{TOTAL_STEPS} total steps and {NUM_TRAINING_IMAGES//BATCH_SIZE} steps per epoch')
# print(f'Learning rate schedule: {y[0]:.3g} to {max(y):.3g} to {y[-1]:.3g}')

### Experimental Cosine Annealing LR Scheduler

Note V14: this is specific to the following for a single_fold model trained on all data.

- 9825 total steps 
- 1700 rampup steps
- LR_MAX = 7

Note V15: Specific to a five-fold model notebook
- 1965 total steps 
- 348 rampup steps
- LR_MAX = 7



In [None]:
TOTAL_STEPS = EPOCHS * (NUM_TRAINING_IMAGES//BATCH_SIZE)
LR_RAMPUP_STEPS = 348
LR_START = 1
LR_MAX = 7
LR_DECAY_STEP_SCALE = 105  # training steps per radian of the cosine phase
LR_UNIT = 100000           # schedule values are expressed in units of 1/LR_UNIT

def lrfn_step(step):
    """Learning rate for a given global training step.

    The schedule ramps linearly from LR_START to LR_MAX over LR_RAMPUP_STEPS,
    then follows a half-cosine that restarts every pi * LR_DECAY_STEP_SCALE
    steps, with each restart beginning one unit lower than the previous one.

    Args:
        step: Global step index (0-based).

    Returns:
        The learning rate: a Python float during warm-up, a TensorFlow scalar
        afterwards. It is never negative.
    """
    if step <= LR_RAMPUP_STEPS:
        lr = (LR_MAX - LR_START) / LR_RAMPUP_STEPS * step + LR_START
    else:
        phase = step / LR_DECAY_STEP_SCALE
        cycle = tf.math.floor(phase / np.pi)  # number of completed half-cosine cycles
        lr = LR_MAX + tf.math.cos(phase - np.pi * cycle) - cycle
        # Without this clamp the schedule turns negative once `cycle` reaches
        # LR_MAX (about 2,470 steps with the constants above), which would
        # make the optimizer step against the gradient.
        lr = tf.math.maximum(lr, 0.0)
    return lr / LR_UNIT

class CustomCallback(keras.callbacks.Callback):
    """Callback that sets the optimizer's learning rate before every batch.

    The global step is derived from the number of finished epochs and the
    module-level STEPS_PER_EPOCH, then passed to the schedule function.
    """

    def __init__(self, schedule):
        """Store the schedule, a callable mapping a global step to a learning rate."""
        super().__init__()
        self.schedule = schedule
        self.epoch = 0  # number of epochs completed so far

    def on_train_batch_begin(self, batch, logs=None):
        """Compute the learning rate for this batch and write it to the optimizer."""
        global_step = (self.epoch * STEPS_PER_EPOCH) + batch
        new_lr = self.schedule(global_step)
        tf.keras.backend.set_value(self.model.optimizer.lr, new_lr)
        # report the rate once per epoch, on its first batch
        if batch == 0:
            print("--Learning Rate: {:.6f} --".format(new_lr))

    def on_epoch_end(self, epoch, logs=None):
        """Advance the finished-epoch counter."""
        self.epoch = self.epoch + 1
        
#visualizing Learning Rate Schedule
# rng = [i for i in range(TOTAL_STEPS)]
# y = [lrfn_step(tf.cast(x, tf.float32)) for x in rng]

# sns.set(style='whitegrid')
# fig, ax = plt.subplots(figsize=(20, 6))
# plt.plot(rng, y)

# print(f'{TOTAL_STEPS} total steps and {NUM_TRAINING_IMAGES//BATCH_SIZE} steps per epoch')
# print(f'Learning rate schedule: {y[0]:.3g} to {max(y):.3g} to {y[-1]:.3g}')

- Using a ModelCheckpoint callback that saves best_weights_only, this reduces time taken between each epoch as file to save is much smaller

- Also created an early_stopping callback that will stop the training cycle if the validation accuracy does not improve for six consecutive epochs.

In [None]:
# Checkpoint: write weights only, and only when val_f1_score reaches a new maximum.
model_save = ModelCheckpoint(
    './Effnet_TPU_Model_best_weights.h5',
    monitor='val_f1_score',  #note: set to val_f1_score
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Early stopping: halt when val_acc has not improved by at least 0.001 for 6 epochs.
my_early_stopper = EarlyStopping(
    monitor='val_acc',
    mode='max',
    min_delta=0.001,
    patience=6,
    restore_best_weights=False,
    verbose=1,
)

### Training Model

- NOTE: Setting do_onehot to "True" as we need to one_hot_encode the labels for the categorical crossentropy function that we are using.

In [None]:
# history = model.fit(x = get_training_dataset(load_dataset(TRAINING_FILENAMES), do_onehot=True),
#                     epochs = EPOCHS,
#                     steps_per_epoch = STEPS_PER_EPOCH,
#                     validation_steps = VALID_STEPS,
#                     validation_data = get_validation_dataset(load_dataset(VALIDATION_FILENAMES)),
#                     callbacks = [CustomCallback(lrfn_step), model_save, my_early_stopper],
#                     verbose = 1,
#                    )

### Visualizing Model History

- We can visualize the accuracy of the model on the training and validation datasets, as well as their loss over each epoch cycle.

In [None]:
# plt.figure(figsize=(13, 5))
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title("Model Loss")
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend(['Train', 'Test'])
# plt.ylim(ymax = 2, ymin = 0)
# plt.grid()
# plt.show()

In [None]:
# plt.figure(figsize=(13, 5))
# plt.plot(history.history['acc'])
# plt.plot(history.history['val_acc'])
# plt.title('Model Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend(['Train','Test'])
# plt.grid()
# plt.show()

 ### Training a five_fold model
 
 Second option: Training a five-fold model.

In [None]:
TRAIN_ROUND = 0
MODELS = []

with strategy.scope():
    for val in range(0,5):

        #nine consecutive training files starting at index `val`
        TRAINING_FILENAMES_SPLIT = TRAINING_FILENAMES[val:val+9]
        #had to add extra square brackets as there is only one tfrec file
        VALIDATION_FILENAMES_SPLIT = [VALIDATION_FILENAMES[val]]

        #the cosine annealing function is not super reproducible yet so...
        NUM_VALIDATION_IMAGES_SPLIT = count_data_items(VALIDATION_FILENAMES_SPLIT)
        NUM_TRAINING_IMAGES_SPLIT = count_data_items(TRAINING_FILENAMES_SPLIT)

        #CustomCallback reads the module-level STEPS_PER_EPOCH, so refresh it for every fold
        STEPS_PER_EPOCH =  NUM_TRAINING_IMAGES_SPLIT // BATCH_SIZE
        VALID_STEPS = NUM_VALIDATION_IMAGES_SPLIT // BATCH_SIZE
        #NOTE(review): WARMUP_STEPS is not read by lrfn_step; LR_RAMPUP_STEPS is already
        #a step count, so this product looks like a leftover -- confirm before relying on it
        WARMUP_STEPS = LR_RAMPUP_STEPS * (STEPS_PER_EPOCH)
        TOTAL_STEPS = EPOCHS * (STEPS_PER_EPOCH)

        #fitting each model fold
        print("TRAINING MODEL: {}".format(TRAIN_ROUND))

        #Per-fold checkpoint so the "*_best_weights.h5" file really holds the best epoch
        #(by val_f1_score). Previously the file received whatever weights the model had
        #when fit() returned, i.e. the last epoch.
        fold_weights_path = 'Model_{}_best_weights.h5'.format(TRAIN_ROUND)
        if os.path.exists(fold_weights_path):
            os.remove(fold_weights_path)  #never reload a checkpoint left over from an earlier run
        fold_checkpoint = ModelCheckpoint(fold_weights_path,
                                          save_best_only = True,
                                          save_weights_only = True,
                                          monitor = 'val_f1_score',
                                          mode = 'max',
                                          verbose = 1)

        fold_model = create_model()
        MODELS.append(fold_model)

        fold_model.fit(x=get_training_dataset(load_dataset(TRAINING_FILENAMES_SPLIT), do_onehot=True),
                       epochs = EPOCHS,
                       steps_per_epoch = STEPS_PER_EPOCH,
                       validation_steps = VALID_STEPS,
                       validation_data=get_validation_dataset(load_dataset(VALIDATION_FILENAMES_SPLIT)),
                       callbacks = [CustomCallback(lrfn_step), fold_checkpoint, my_early_stopper],
                       verbose=1,
                      )

        if os.path.exists(fold_weights_path):
            #keep the in-memory model consistent with the saved best weights
            fold_model.load_weights(fold_weights_path)
        else:
            #no checkpoint was written (e.g. the monitored metric was unavailable):
            #fall back to saving the final weights, as before
            fold_model.save_weights(fold_weights_path)
        TRAIN_ROUND+=1

#### Feel free to comment below with any questions/concerns! Constructive criticism is welcomed!