In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os, json, cv2, math, re
from PIL import Image
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import random

#model imports (keras/tensorflow)
import tensorflow as tf
import keras
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
from keras import layers, models
from keras.optimizers import Adam
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from keras.layers import Dense, Dropout, Flatten
from keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow.keras.backend as K

from kaggle_datasets import KaggleDatasets
from functools import partial

print("Tensorflow version " + tf.__version__)

os.system('pip install /kaggle/input/kerasapplications -q')
os.system('pip install /kaggle/input/efficientnet-keras-source-code/ -q --no-deps')

import efficientnet.tfkeras as efn

### Notes
 
- Everyone here is in the top 100, might be worth checking out: [link](https://www.kaggle.com/c/cassava-leaf-disease-classification/discussion/212411#1162195)

- vit models? [vit-keras](https://github.com/faustomorales/vit-keras)
    - #! pip install vit-keras
    - #from vit_keras import vit, utils

- Oof cross val [M.Innat's Notebook](https://www.kaggle.com/ipythonx/tf-keras-cassava-leaf-advanced-augmentation)
    
- Try training on a EfficientNetB6 or EfficientNetB7 because their input sizes are closer to 512x512? [link](https://github.com/brendanartley/keras-applications)

- Going to see if I can try using a ResNet50 / SE-ResNeXt50 model. I am able to build that model, but it seems I need some sort of pipeline to get validation accuracy

- Train without mixup and cutmix on first two or three epochs [Custom Training Loop](https://www.kaggle.com/c/cassava-leaf-disease-classification/discussion/212347)

#### Denoising

- See distributions of the imgs that were wrongly predicted. If the wrongly predicted images have a low prob threshold we could change something on submission?
- denoise strat 1st place solution [link](https://www.kaggle.com/c/prostate-cancer-grade-assessment/discussion/169143)
- awesome discussion [link](https://www.kaggle.com/c/cassava-leaf-disease-classification/discussion/202673)

### Updates

- V4: Normalized test images on submission (prior to this all TPU submission were <.60) 
- V9: New Transform option + effnetb3 - LB: .870
- V14: Increase train size - LB: .870
- V27: Manually executing notebook via run all as TPU Commit is not working
- V28: add dropout layer + cosine decay LR - LB: .877
- V30: Lowered learning rate / lowest val_loss score (val_loss - 0.34114) - LB: 0.883
- V31: EffNetB4 - LB: 0.881
- V35: Coarse Dropout/My TFrecord Files - LB: 0.881
- V36: Effnetb3 new seed (val_loss 0.30586) - LB: 0.876 
- V41: Fixed TFrecords + Normalized by channel mean-std - LB: 0.882
- V46: simple_data_augmentor w/ cutmix and mixup + one-hot-encoded labels (val_loss 0.29366) - LB: 0.884
- V48: seed 14 - LB: 0.887
- V58: 2019 + 2020 data, Custom Loss LB: 0.887
- V61: Label Smoothing + no cutmix LB: 0.888
- V65: EffnetB7 + CCE w/ Label Smoothing LB: 0.891
- V78: five-fold EffNetB3 LB: 0.891

### Notebooks that I found very useful

- [dimitreoliveira's notebook](https://www.kaggle.com/dimitreoliveira/flower-classification-with-tpus-eda-and-baseline)

- [Xhlulu's flower competition notebook](https://www.kaggle.com/xhlulu/flowers-tpu-concise-efficientnet-b7)

- [Getting Started with TPU's](https://www.kaggle.com/jessemostipak/getting-started-tpus-cassava-leaf-disease) -- [TPU Docs](https://www.kaggle.com/docs/tpu) -- [TFRecords Basics](https://www.kaggle.com/ryanholbrook/tfrecords-basics)

DATA Aug
- [TF Data Augmentation Docs](https://www.tensorflow.org/tutorials/images/data_augmentation)
- [TF Image Docs](https://www.tensorflow.org/api_docs/python/tf/image)

### Setting a Seed

Doing this for reproducibility.

In [None]:
SEED = 15

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets the seed of Python's `random` module, NumPy's global generator and
    TensorFlow's global generator, and exports the value as the
    PYTHONHASHSEED environment variable.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(SEED)

# Detecting TPU

The following cell isn't necessary, but it is a nice way to double-check that we are going to be using the TPU. If we have everything set up correctly the number of replicas should be 8. If we do not have the TPU turned on we will see a value of 1.

ADD Google Cloud Software Development Kit (SDK) to Notebook if using a private dataset (add-ons tab). Pretty much always using public data though.

In [None]:
# Try to attach to a TPU; if anything in the setup fails, fall back to the
# default (single-replica) strategy so the notebook still runs.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # `Exception` (rather than a bare `except:`) keeps the best-effort fallback
    # but no longer swallows KeyboardInterrupt / SystemExit, and the reason for
    # the fallback is reported instead of being silently discarded.
    print('TPU not available, falling back to the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

### Variables

- note: actual file path for data is 'cassava-leaf-disease-tfrecords-center-512x512'

In [None]:
# Let tf.data choose parallelism / prefetch buffer sizes.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Bucket that holds the TFRecord files of the attached Kaggle dataset.
GCS_PATH = KaggleDatasets().get_gcs_path('cldtfrecords512x512')
#GCS_PATH = KaggleDatasets().get_gcs_path('v2cldtfrecordswnoisyremoved') #dataset with some removed

# Image geometry
TARGET_SIZE = 512
IMAGE_SIZE = [512, 512]

# Labels
CLASSES = ['0', '1', '2', '3', '4']
NUM_OF_CLASSES = len(CLASSES)

# Batching: 16 images per replica; augmentation batches use the same size
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
AUG_BATCH = BATCH_SIZE

# Training hyper-parameters
EPOCHS = 15
DROPOUT_RATE = 0.4 # rate of the Dropout layer in create_model

# Splitting TFRecords


The number on the end of the tfrecord file corresponds to the number of images in that tfrecord.

Example: 'gs://kds-100c2bc3bab7e1f77f19378980a417f43e62119932994bd622dc7cb4/Id_train01-1427.tfrec' (1427 imgs)


In [None]:
#this function counts number of images in all TFRecords
def count_data_items(filenames):
    """Return the total number of images across a list of TFRecord files.

    The count is read from each file name: the digits between a '-' and the
    following '.' (e.g. 'Id_train01-1427.tfrec' -> 1427). The records
    themselves are never opened.

    Args:
        filenames: iterable of TFRecord file names / paths.

    Returns:
        Integer sum of the per-file counts (0 for an empty list).

    Raises:
        ValueError: if a file name does not contain a '-<digits>.' group.
    """
    count_pattern = re.compile(r'-([0-9]*)\.')  # compiled once instead of once per file
    counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            raise ValueError("No image count found in TFRecord file name: {!r}".format(filename))
        counts.append(int(match.group(1)))
    # An explicit integer dtype keeps the result integral even when `counts` is empty
    # (np.sum of an empty list would otherwise be the float 0.0).
    return np.sum(counts, dtype=np.int64)

# Every .tfrec file found under the dataset bucket.
ALL_TRAINING_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/*.tfrec')
# Total image count, derived from the counts embedded in the file names.
NUM_ALL_TRAINING_IMAGES = count_data_items(ALL_TRAINING_FILENAMES)

#reading train metadata
train = pd.read_csv('../input/cldtfrecords512x512/train.csv')
#print(f'Train samples: {len(train)}')

print(f'GCS: train images: {NUM_ALL_TRAINING_IMAGES}')

### Train_test_split

We only use this train_test_split if we plan to train a single model. If we are doing a five-fold split I have included that in the training loop at the bottom of the notebook.

In [None]:
# Split at the TFRecord-file level: 90% of the files for training, 10% for validation.
TRAINING_FILENAMES, VALIDATION_FILENAMES = train_test_split(
    ALL_TRAINING_FILENAMES,
    test_size=0.10,
    train_size=0.90,
    random_state=SEED,
)

NUM_TRAINING_IMAGES = count_data_items(TRAINING_FILENAMES)
NUM_VALIDATION_IMAGES = count_data_items(VALIDATION_FILENAMES)

print("Training Images: {}  Validation Image: {}".format(NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES))
print("Training Percent: {:.2f}  Validation Percent: {:.2f}".format(
    NUM_TRAINING_IMAGES / NUM_ALL_TRAINING_IMAGES,
    NUM_VALIDATION_IMAGES / NUM_ALL_TRAINING_IMAGES,
))

# Whole batches per pass; a trailing partial batch is not counted.
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE
VALID_STEPS = NUM_VALIDATION_IMAGES // BATCH_SIZE

I am printing out the validation TFrecs so that when I use cross-validation model strategies I can ensure no data leakage between train and validation datasets.

In [None]:
# Print the number that follows 'train' in each validation file name.
for filename in VALIDATION_FILENAMES:
    file_number = re.compile(r'train([0-9]*)').search(filename).group(1)
    print("File: {}".format(int(file_number)))

# Functions

The following functions are how I am reading the data from the TF records. 

In [None]:
def decode_image(image_data):
    """Decode a JPEG byte string into a standardized float32 image.

    The image is decoded to 3 channels, scaled by 1/255, standardized per
    channel with fixed mean / std constants, then resized and reshaped to
    IMAGE_SIZE + [3].
    """
    # Per-channel statistics, in channel order 0, 1, 2.
    channel_mean = tf.constant([0.4303, 0.4967, 0.3134], dtype=tf.float32)
    channel_std = tf.constant([0.2142, 0.2191, 0.1954], dtype=tf.float32)

    pixels = tf.image.decode_jpeg(image_data, channels=3)  # uint8 tensor
    pixels = tf.cast(pixels, tf.float32) / 255.0
    # Broadcasting over the last axis applies each channel's own mean / std.
    pixels = (pixels - channel_mean) / channel_std
    pixels = tf.image.resize(pixels, [*IMAGE_SIZE])  # precautionary, images should already match
    return tf.reshape(pixels, [*IMAGE_SIZE, 3])  # pins the static shape

In [None]:
def read_tfrecord(example, labeled=True):
    """Parse one serialized TFRecord example.

    Args:
        example: serialized example to parse.
        labeled: when True the record is read with a 'target' feature,
            otherwise with an 'image_name' feature.

    Returns:
        (image, label) for labeled records, where label is the 'target'
        cast to int32; (image, image_name) otherwise. The image is produced
        by decode_image.
    """
    feature_spec = {'image': tf.io.FixedLenFeature([], tf.string)}
    if labeled:
        feature_spec['target'] = tf.io.FixedLenFeature([], tf.int64)
    else:
        feature_spec['image_name'] = tf.io.FixedLenFeature([], tf.string)

    parsed = tf.io.parse_single_example(example, feature_spec)
    image = decode_image(parsed['image'])

    if labeled:
        return image, tf.cast(parsed['target'], tf.int32)
    return image, parsed['image_name']

In [None]:
def load_dataset(filenames, labeled=True, ordered=False):
    """Build a tf.data pipeline that yields parsed examples from TFRecord files.

    Args:
        filenames: TFRecord files to read.
        labeled: forwarded to read_tfrecord.
        ordered: when False, deterministic ordering is switched off in the
            dataset options.

    Returns:
        A dataset of whatever read_tfrecord returns for each record.
    """
    options = tf.data.Options()
    if not ordered:
        options.experimental_deterministic = False

    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE).with_options(options)
    return records.map(
        lambda record: read_tfrecord(record, labeled=labeled),
        num_parallel_calls=AUTOTUNE,
    )

### Data augmentation

I have a few options for data augmentation here.

* The simple_data_augmenter provides some simple rotation and flipping augmentation.

* The dropout function provides random dropout areas in the image of a specified size and number of dropout areas. 

* The transform_one function is similar to the simple_data_augmentor, but provides more variation in terms of structural manipulation. Random rotations/zooms/shears. 

* The transform_two function provides cut_mix and mix_up transformations on the images. 

NOTE: I will not use all the augmentation options when training the model, but I have included them all here for testing.

#### simple_data_augmenter

In [None]:
def simple_data_augmenter(image, label):
    """Randomly rotate (by a multiple of 90 degrees) and flip an image.

    A single uniform draw selects the rotation: each of 270, 180, 90 and
    "no rotation" covers one quarter of the [0, 1) range. The image is then
    passed through random vertical and horizontal flips.

    Args:
        image: image tensor to augment (get_training_dataset maps this
            function over the dataset before batching).
        label: returned unchanged.

    Returns:
        Tuple (augmented image, label).
    """
    # Thanks to the dataset.prefetch(AUTO) statement in the following function this happens essentially for free on TPU. 
    # Data pipeline code is executed on the "CPU" part of the TPU while the TPU itself is computing gradients.
    
    p_rotate = tf.random.uniform([], 0, 1.0, dtype=tf.float32) # random float in [0, 1); selects the rotation below
    
    if p_rotate > .75:
        image = tf.image.rot90(image, k=3) # rotate 270°
    elif p_rotate > .5:
        image = tf.image.rot90(image, k=2) # rotate 180°
    elif p_rotate > .25:
        image = tf.image.rot90(image, k=1) # rotate 90°
    
    image = tf.image.random_flip_up_down(image)
    image = tf.image.random_flip_left_right(image)
    
    
    return image, label

#### Dropout

Trying out another augmentation technique here called Coarse Dropout which randomly cuts out parts of the image. This is another technique to prevent overfitting.

To perform this dropout function just call it on the return statement of the transform function

In [None]:
def dropout(image, DIM=512, PROBABILITY = 0.75, CT = 5, SZ = 0.1):
    """Coarse dropout: overwrite random square patches of one image with zeros.

    Args:
        image: a single image (see the shape note below), not a batch.
        DIM: side length used for the random locations and the final reshape.
        PROBABILITY: chance that dropout is applied at all.
        CT: number of squares to zero out.
        SZ: side of each square as a fraction of DIM.

    Returns:
        The input image unchanged when dropout is skipped or CT / SZ is 0;
        otherwise the image with the squares zeroed, reshaped to [DIM, DIM, 3].
    """
    
    # input image - is one image of size [dim,dim,3] not a batch of [b,dim,dim,3]
    # output - image with CT squares of side size SZ*DIM removed
    
    # DO DROPOUT WITH PROBABILITY DEFINED ABOVE
    # P is 1 when dropout should be applied and 0 otherwise
    P = tf.cast( tf.random.uniform([],0,1)<PROBABILITY, tf.int32)
    if (P==0)|(CT==0)|(SZ==0): return image
    
    for k in range(CT):
        # CHOOSE RANDOM LOCATION
        x = tf.cast( tf.random.uniform([],0,DIM),tf.int32)
        y = tf.cast( tf.random.uniform([],0,DIM),tf.int32)
        # COMPUTE SQUARE 
        # the square is centred on (x, y) and clipped to the image borders
        WIDTH = tf.cast( SZ*DIM,tf.int32) * P
        ya = tf.math.maximum(0,y-WIDTH//2)
        yb = tf.math.minimum(DIM,y+WIDTH//2)
        xa = tf.math.maximum(0,x-WIDTH//2)
        xb = tf.math.minimum(DIM,x+WIDTH//2)
        # DROPOUT IMAGE
        # rebuild the image from slices: left strip, zero patch, right strip,
        # then the rows above and below the patch
        one = image[ya:yb,0:xa,:]
        two = tf.zeros([yb-ya,xb-xa,3]) 
        three = image[ya:yb,xb:DIM,:]
        middle = tf.concat([one,two,three],axis=1)
        image = tf.concat([image[0:ya,:,:],middle,image[yb:DIM,:,:]],axis=0)
            
    # RESHAPE HACK SO TPU COMPILER KNOWS SHAPE OF OUTPUT TENSOR 
    image = tf.reshape(image,[DIM,DIM,3])
    return image

#### Transform_one

Thanks to Chris Deotte for sharing this next data Augmentation Function. [link](https://www.kaggle.com/cdeotte/rotation-augmentation-gpu-tpu-0-96/comments)

- alter the parameters in the transform function in order to tune the data augmentation parameters

In [None]:
def get_mat(rotation, shear, height_zoom, width_zoom, height_shift, width_shift):
    """Return the 3x3 matrix used by transform_one to map pixel indices.

    The result is (rotation @ shear) @ (zoom @ shift).

    Args:
        rotation: rotation angle in degrees.
        shear: shear angle in degrees.
        height_zoom, width_zoom: zoom factors; the matrix uses their reciprocals.
        height_shift, width_shift: shifts placed in the last column.
    """
    def as_3x3(entries):
        # nine one-element tensors, row-major -> 3x3 matrix
        return tf.reshape(tf.concat(entries, axis=0), [3, 3])

    one = tf.constant([1], dtype='float32')
    zero = tf.constant([0], dtype='float32')

    # degrees -> radians
    rotation = math.pi * rotation / 180.
    shear = math.pi * shear / 180.

    cos_rot, sin_rot = tf.math.cos(rotation), tf.math.sin(rotation)
    rotation_matrix = as_3x3([cos_rot, sin_rot, zero,
                              -sin_rot, cos_rot, zero,
                              zero, zero, one])

    cos_shear, sin_shear = tf.math.cos(shear), tf.math.sin(shear)
    shear_matrix = as_3x3([one, sin_shear, zero,
                           zero, cos_shear, zero,
                           zero, zero, one])

    zoom_matrix = as_3x3([one/height_zoom, zero, zero,
                          zero, one/width_zoom, zero,
                          zero, zero, one])

    shift_matrix = as_3x3([one, zero, height_shift,
                           zero, one, width_shift,
                           zero, zero, one])

    rotate_then_shear = K.dot(rotation_matrix, shear_matrix)
    zoom_then_shift = K.dot(zoom_matrix, shift_matrix)
    return K.dot(rotate_then_shear, zoom_then_shift)

In [None]:
def transform_one(image,label):
    """Randomly rotate, zoom and shift a single image.

    Every destination pixel index is mapped through the matrix from get_mat
    and the value of the resulting source pixel is gathered from `image`.
    Shear is multiplied by 0 and both zoom factors are fixed at 1.1, so only
    rotation and shift are random here.

    Args:
        image: one image (see the shape note below), not a batch.
        label: returned unchanged.

    Returns:
        Tuple (transformed image reshaped to [DIM, DIM, 3], label).
    """
    # input image - is one image of size [dim,dim,3] not a batch of [b,dim,dim,3]
    # output - image randomly rotated, sheared, zoomed, and shifted
    DIM = IMAGE_SIZE[0]
    XDIM = DIM%2 #fix for size 331
    
    rot = 45. * tf.random.normal([1],dtype='float32')
    shr = 0. * tf.random.normal([1],dtype='float32') #need to test with and without shear
    h_zoom = 1.1 #+ tf.random.normal([1],dtype='float32')/10.
    w_zoom = 1.1 #+ tf.random.normal([1],dtype='float32')/10.
    h_shift = 20. * tf.random.normal([1],dtype='float32') 
    w_shift = 20. * tf.random.normal([1],dtype='float32') 
  
    # GET TRANSFORMATION MATRIX
    m = get_mat(rot,shr,h_zoom,w_zoom,h_shift,w_shift) 

    # LIST DESTINATION PIXEL INDICES
    # coordinates are centred on the image; z is the constant 1 of homogeneous coordinates
    x = tf.repeat( tf.range(DIM//2,-DIM//2,-1), DIM )
    y = tf.tile( tf.range(-DIM//2,DIM//2),[DIM] )
    z = tf.ones([DIM*DIM],dtype='int32')
    idx = tf.stack( [x,y,z] )
    
    # ROTATE DESTINATION PIXELS ONTO ORIGIN PIXELS
    idx2 = K.dot(m,tf.cast(idx,dtype='float32'))
    idx2 = K.cast(idx2,dtype='int32')
    # clip so that mapped indices stay inside the image
    idx2 = K.clip(idx2,-DIM//2+XDIM+1,DIM//2)
    
    # FIND ORIGIN PIXEL VALUES           
    # convert the centred coordinates back to array indices before gathering
    idx3 = tf.stack( [DIM//2-idx2[0,], DIM//2-1+idx2[1,]] )
    d = tf.gather_nd(image,tf.transpose(idx3))
    
    # NOTE: dropout() is not applied here; to enable it, wrap the returned image in dropout(...)
        
    return tf.reshape(d,[DIM,DIM,3]),label

#### transform_two (Cutmix + Mixup)

Cutmix basically adds pieces of other images to an image in a similar way to the dropout augmentation technique. 

Mix-up basically adds a different translucent image on top of an image. 

Note there is also a one-hot encode function that encodes the labels 

In [None]:
def onehot(image,label):
    """Return the image unchanged together with its label one-hot encoded over NUM_OF_CLASSES."""
    encoded_label = tf.one_hot(label, NUM_OF_CLASSES)
    return image, encoded_label

In [None]:
def cutmix(image, label, PROBABILITY = 1.0):
    """Apply CutMix to a batch: paste a random square from another image of the batch.

    For each of the AUG_BATCH positions a partner image, a location and a
    square size are drawn at random; the square is copied from the partner
    and the two labels are blended with weight a = WIDTH*WIDTH/DIM/DIM.

    Args:
        image: batch of images (see the shape note below).
        label: either class indices (rank 1, one-hot encoded here) or
            already-encoded label rows.
        PROBABILITY: chance, per image, that CutMix is applied; when it is
            not, WIDTH is 0 and the image and label are left as they were.

    Returns:
        Tuple (images reshaped to (AUG_BATCH, DIM, DIM, 3),
        labels reshaped to (AUG_BATCH, CLASSES)).
    """
    # input image - is a batch of images of size [n,dim,dim,3] not a single image of [dim,dim,3]
    # output - a batch of images with cutmix applied
    DIM = IMAGE_SIZE[0]
    CLASSES = NUM_OF_CLASSES
    
    imgs = []; labs = []
    for j in range(AUG_BATCH):
        # DO CUTMIX WITH PROBABILITY DEFINED ABOVE
        P = tf.cast( tf.random.uniform([],0,1)<=PROBABILITY, tf.int32)
        # CHOOSE RANDOM IMAGE TO CUTMIX WITH
        k = tf.cast( tf.random.uniform([],0,AUG_BATCH),tf.int32)
        # CHOOSE RANDOM LOCATION
        x = tf.cast( tf.random.uniform([],0,DIM),tf.int32)
        y = tf.cast( tf.random.uniform([],0,DIM),tf.int32)
        b = tf.random.uniform([],0,1) # this is beta dist with alpha=1.0
        # multiplying by P collapses the square to width 0 when CutMix is skipped
        WIDTH = tf.cast( DIM * tf.math.sqrt(1-b),tf.int32) * P
        ya = tf.math.maximum(0,y-WIDTH//2)
        yb = tf.math.minimum(DIM,y+WIDTH//2)
        xa = tf.math.maximum(0,x-WIDTH//2)
        xb = tf.math.minimum(DIM,x+WIDTH//2)
        # MAKE CUTMIX IMAGE
        # rows ya:yb are rebuilt as [own left strip | partner patch | own right strip]
        one = image[j,ya:yb,0:xa,:]
        two = image[k,ya:yb,xa:xb,:]
        three = image[j,ya:yb,xb:DIM,:]
        middle = tf.concat([one,two,three],axis=1)
        img = tf.concat([image[j,0:ya,:,:],middle,image[j,yb:DIM,:,:]],axis=0)
        imgs.append(img)
        # MAKE CUTMIX LABEL
        # the weight uses the unclipped WIDTH, not the clipped patch area
        a = tf.cast(WIDTH*WIDTH/DIM/DIM,tf.float32)
        if len(label.shape)==1:
            lab1 = tf.one_hot(label[j],CLASSES)
            lab2 = tf.one_hot(label[k],CLASSES)
        else:
            lab1 = label[j,]
            lab2 = label[k,]
        labs.append((1-a)*lab1 + a*lab2)
            
    # RESHAPE HACK SO TPU COMPILER KNOWS SHAPE OF OUTPUT TENSOR (maybe use Python typing instead?)
    image2 = tf.reshape(tf.stack(imgs),(AUG_BATCH,DIM,DIM,3))
    label2 = tf.reshape(tf.stack(labs),(AUG_BATCH,CLASSES))
    return image2,label2

In [None]:
def mixup(image, label, PROBABILITY = 1.0):
    """Apply MixUp to a batch: blend each image and label with a random partner.

    For each of the AUG_BATCH positions a partner index and a weight `a` are
    drawn; the output is (1-a)*own + a*partner for both image and label.

    Args:
        image: batch of images (see the shape note below).
        label: either class indices (rank 1, one-hot encoded here) or
            already-encoded label rows.
        PROBABILITY: chance, per image, that MixUp is applied; when it is
            not, `a` is 0 and the image and label are left as they were.

    Returns:
        Tuple (images reshaped to (AUG_BATCH, DIM, DIM, 3),
        labels reshaped to (AUG_BATCH, CLASSES)).
    """
    # input image - is a batch of images of size [n,dim,dim,3] not a single image of [dim,dim,3]
    # output - a batch of images with mixup applied
    DIM = IMAGE_SIZE[0]
    CLASSES = NUM_OF_CLASSES
    
    imgs = []; labs = []
    for j in range(AUG_BATCH):
        # DO MIXUP WITH PROBABILITY DEFINED ABOVE
        P = tf.cast( tf.random.uniform([],0,1)<=PROBABILITY, tf.float32)
        # CHOOSE RANDOM
        k = tf.cast( tf.random.uniform([],0,AUG_BATCH),tf.int32)
        a = tf.random.uniform([],0,1)*P # this is beta dist with alpha=1.0
        # MAKE MIXUP IMAGE
        img1 = image[j,]
        img2 = image[k,]
        imgs.append((1-a)*img1 + a*img2)
        # MAKE MIXUP LABEL
        if len(label.shape)==1:
            lab1 = tf.one_hot(label[j],CLASSES)
            lab2 = tf.one_hot(label[k],CLASSES)
        else:
            lab1 = label[j,]
            lab2 = label[k,]
        labs.append((1-a)*lab1 + a*lab2)
            
    # RESHAPE HACK SO TPU COMPILER KNOWS SHAPE OF OUTPUT TENSOR (maybe use Python typing instead?)
    image2 = tf.reshape(tf.stack(imgs),(AUG_BATCH,DIM,DIM,3))
    label2 = tf.reshape(tf.stack(labs),(AUG_BATCH,CLASSES))
    return image2,label2

In [None]:
def transform_two(image,label):
    """Pick, per image, between the CutMix and the MixUp version of a batch.

    Both cutmix and mixup are run on the whole batch; each output image then
    takes the CutMix result with probability SWITCH and the MixUp result
    otherwise.

    Args:
        image: batch of AUG_BATCH images.
        label: labels accepted by cutmix / mixup.

    Returns:
        Tuple (images reshaped to (AUG_BATCH, DIM, DIM, 3),
        labels reshaped to (AUG_BATCH, CLASSES)).
    """
    # THIS FUNCTION APPLIES BOTH CUTMIX AND MIXUP
    DIM = IMAGE_SIZE[0]
    CLASSES = NUM_OF_CLASSES
    SWITCH = 0.5
    # With CUTMIX_PROB = 0 the cutmix call effectively returns the images unchanged
    # (labels one-hot encoded), so samples routed to it receive no mixing.
    CUTMIX_PROB = 0
    MIXUP_PROB = 0.666
    # FOR SWITCH PERCENT OF TIME WE DO CUTMIX AND (1-SWITCH) WE DO MIXUP
    image2, label2 = cutmix(image, label, CUTMIX_PROB)
    image3, label3 = mixup(image, label, MIXUP_PROB)
    imgs = []; labs = []
    for j in range(AUG_BATCH):
        # P is 1.0 to take the cutmix output for this image, 0.0 to take the mixup output
        P = tf.cast( tf.random.uniform([],0,1)<=SWITCH, tf.float32)
        imgs.append(P*image2[j,]+(1-P)*image3[j,])
        labs.append(P*label2[j,]+(1-P)*label3[j,])
    # RESHAPE HACK SO TPU COMPILER KNOWS SHAPE OF OUTPUT TENSOR (maybe use Python typing instead?)
    image4 = tf.reshape(tf.stack(imgs),(AUG_BATCH,DIM,DIM,3))
    label4 = tf.reshape(tf.stack(labs),(AUG_BATCH,CLASSES))
    return image4,label4

### Choose Augmentation Function

Here I just define the final augmentation technique that I selected. This enables me to make variations of the augmentation function without having to change the order in which the functions are defined. 

### Loading Data

Make sure to have the correct augmenter selected in the next function. 

- simple_data_augmenter - quicker but more simple augmenter
- transform_one - Chris Deottes Augmenter
- transform_two - Cutmix and Mixup augmentation option

In [None]:
def get_training_dataset(dataset, do_aug=True, do_onehot=False):
    """Turn a parsed dataset into the shuffled, batched, repeating training stream.

    Args:
        dataset: dataset of (image, label) elements.
        do_aug: apply transform_two (CutMix / MixUp) to each AUG_BATCH batch.
        do_onehot: one-hot encode labels with onehot (runs before transform_two).

    Returns:
        Dataset of BATCH_SIZE batches, prefetched.
    """
    # Per-image augmentation, then repeat forever and group into augmentation batches.
    aug_batches = (
        dataset
        .map(simple_data_augmenter, num_parallel_calls=AUTOTUNE)
        .repeat()
        .batch(AUG_BATCH)
    )
    if do_onehot:
        aug_batches = aug_batches.map(onehot, num_parallel_calls=AUTOTUNE)
    if do_aug:
        # batch-level augmentation, hence applied after batching
        aug_batches = aug_batches.map(transform_two, num_parallel_calls=AUTOTUNE)

    # Back to single elements so they can be shuffled, then re-batch for training.
    return (
        aug_batches
        .unbatch()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )

def get_validation_dataset(dataset, do_onehot=True):
    """Batch, optionally one-hot encode, cache and prefetch a validation dataset.

    Args:
        dataset: dataset of (image, label) elements.
        do_onehot: one-hot encode labels so they match the augmented training labels.
    """
    batches = dataset.batch(BATCH_SIZE)
    if do_onehot:
        batches = batches.map(onehot, num_parallel_calls=AUTOTUNE)
    return batches.cache().prefetch(AUTOTUNE)

### Visualizing Transformations

Visualizing transformations function, comment out during actual model training to save RAM.

visualize_transformations_one is for all augmentations except cutmix and mixup which need to be visualized with visualize_transformations_two.

In [None]:
def visualize_transformations_one():
    """Show a grid of transform_one variations of a single training image.

    One element is taken from the training stream (built without the
    batch-level CutMix / MixUp step), repeated, and passed through
    transform_one so every panel shows a different random transformation of
    the same image.
    """
    row = 2; col = 4;
    # get_training_dataset requires the source dataset; do_aug=False keeps
    # single, un-mixed images with their original labels.
    all_elements = get_training_dataset(load_dataset(TRAINING_FILENAMES), do_aug=False).unbatch()
    one_element = tf.data.Dataset.from_tensors(next(iter(all_elements)))
    augmented_element = one_element.repeat().map(transform_one).batch(row*col)

    # Only the first batch is drawn.
    for (img,label) in augmented_element:
        plt.figure(figsize=(15,int(15*row/col)))
        for j in range(row*col):
            plt.subplot(row,col,j+1)
            plt.axis('off')
            plt.imshow(img[j,])
        plt.show()
        break

#visualize_transformations_one()

In [None]:
def visualize_transformations_two():
    """Show a grid of images from one batch after transform_two (CutMix / MixUp)."""
    col = 4
    row = min(6, AUG_BATCH//col)  # never ask for more panels than one batch holds
    n_panels = row*col

    single_images = get_training_dataset(load_dataset(TRAINING_FILENAMES),do_aug=False).unbatch()
    mixed_batches = single_images.repeat().batch(AUG_BATCH).map(transform_two)

    # Only the first batch is drawn.
    for (img,label) in mixed_batches:
        plt.figure(figsize=(15,int(15*row/col)))
        for panel in range(n_panels):
            plt.subplot(row,col,panel+1)
            plt.axis('off')
            plt.imshow(img[panel,])
        plt.show()
        break

#visualize_transformations_two()

# Building Model

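Also note that we're using categorical cross-entropy (with label smoothing) as our loss function, because the data pipeline one-hot encodes the labels. If the labels were not one-hot encoded, sparse_categorical_crossentropy would be the loss to use instead.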

--

There is a very useful repository on github where I am trying the 'noisy-student' starting weights for the EfficientNet models. Check old notebook version for direct download, but I used the same thing from a dataset so I can still load this in when internet is turned off. 

Github Repo -> [Link](https://github.com/qubvel/efficientnet)

#### Custom Loss Function? 

- Tried Symmetrical Categorical Crossentropy but CC with label smoothing gave me better results.

In [None]:
# Categorical cross-entropy on probabilities (not logits), with label smoothing of 0.1.
CatCross_loss = tf.keras.losses.CategoricalCrossentropy(
    name='categorical_crossentropy',
    label_smoothing=0.1,
    from_logits=False,
)

### Defining model

In [None]:
def create_model():
    """Build and compile the EfficientNetB3 classifier.

    Architecture: EfficientNetB3 without its top (loaded with the
    'noisy-student' weights) -> global average pooling -> dropout
    (DROPOUT_RATE) -> softmax layer with one unit per class.

    Returns:
        A compiled Sequential model using CatCross_loss and the "acc" metric.
    """
    model = models.Sequential()

    model.add(efn.EfficientNetB3(include_top = False, weights = 'noisy-student',
                              input_shape = (TARGET_SIZE, TARGET_SIZE, 3)))

    model.add(layers.GlobalAveragePooling2D())
    model.add(layers.Dropout(DROPOUT_RATE))
    # One output per class; tied to NUM_OF_CLASSES so it cannot drift from the
    # one-hot label width produced by the data pipeline.
    model.add(layers.Dense(NUM_OF_CLASSES, activation = "softmax"))

    model.compile(optimizer = 'adam',
                  loss = CatCross_loss, #use sparse_categorical_crossentropy if not one-hot encoding
                  metrics = ["acc"]) #try sparse_categorical_accuracy here
    return model

In order to ensure that our model is trained on the TPU, we build it using strategy.scope()

In [None]:
# Build the model inside the distribution strategy's scope (TPU when available).
with strategy.scope():
    model = create_model()

# Save the freshly built model before any training on this data.
model.save('./EffNet_untrained_TPU_model.h5')

### Callbacks


#### Custom Learning Rate Schedulers

1. The first option updates the Learning Rate at the end of every epoch.
2. The second custom LR scheduler updates the learning rate every step.

In [None]:
# # Learning rate schedule for TPU, GPU and CPU.
# # Using an LR ramp up because fine-tuning a pre-trained model.
# # Starting with a high LR would break the pre-trained weights.

LR_START = 0.00001  # learning rate at step 0
LR_MAX = 0.0001  # peak learning rate, reached at the end of the warm-up
LR_MIN = 0.00001  # only referenced by the commented-out exponential decay below
LR_RAMPUP_EPOCHS = 3  # length of the linear warm-up, in epochs
LR_SUSTAIN_EPOCHS = 0  # only referenced by the commented-out epoch-level schedule below
# Step counts use whole batches per epoch of the single-fold training split.
WARMUP_STEPS = LR_RAMPUP_EPOCHS * (NUM_TRAINING_IMAGES//BATCH_SIZE)
TOTAL_STEPS = EPOCHS * (NUM_TRAINING_IMAGES//BATCH_SIZE)
LR_EXP_DECAY = 0.85  # only referenced by the commented-out exponential decay below

# def lrfn(epoch):
#     if epoch < LR_RAMPUP_EPOCHS:
#         lr = (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START
#     elif epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
#         lr = LR_MAX
#     else:
#         #cosine decay
#         progress = (epoch - LR_RAMPUP_EPOCHS) / (EPOCHS - LR_RAMPUP_EPOCHS)
#         lr = LR_MAX * (0.5 * (1.0 + tf.math.cos(np.pi * ((1.0 * progress) % 1.0))))
        
#         #exponential decay
#         #lr = (LR_MAX - LR_MIN) * LR_EXP_DECAY**(epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS) + LR_MIN
#     return lr

    
# #setting verbose=True allows us to see LR in model training
# lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = True)

# #visualizing the learning rate schedule
# rng = [i for i in range(EPOCHS)]
# y = [lrfn(x) for x in rng]

# sns.set(style='whitegrid')
# plt.figure(figsize=(13, 5))
# plt.xlabel('Epoch')
# plt.ylabel('Learning Rate')
# plt.plot(rng, y)
# print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(y[0], max(y), y[-1]))

This second learning rate function updates every step, and should be a much better alternative. 

You can include print statements in the on_train_batch_begin function to ensure the learning rate is updating in the way you want it to.

For more info on custom callbacks, check out Tensorflow's guide here --> [Link](https://www.tensorflow.org/guide/keras/custom_callback)

In [None]:
def lrfn_step(step):
    """Learning rate for a given global training step.

    Linear warm-up from LR_START to LR_MAX over WARMUP_STEPS, followed by a
    cosine decay of LR_MAX over the remaining steps up to TOTAL_STEPS.
    WARMUP_STEPS and TOTAL_STEPS are read from module scope on every call.
    """
    # Warm-up: straight line from LR_START towards LR_MAX.
    if step < WARMUP_STEPS:
        slope = (LR_MAX - LR_START) / WARMUP_STEPS
        return slope * step + LR_START

    # Cosine decay: progress is the fraction of the post-warm-up steps completed.
    progress = (step - WARMUP_STEPS) / (TOTAL_STEPS - WARMUP_STEPS)
    phase = (1.0 * progress) % 1.0
    cosine = tf.math.cos(np.pi * phase)
    return LR_MAX * (0.5 * (1.0 + cosine))

class CustomCallback(keras.callbacks.Callback):
    """Callback that sets the optimizer's learning rate before every training batch.

    Args:
        schedule: function mapping a global step number to a learning rate.
    """

    def __init__(self, schedule):
        super().__init__()
        self.schedule = schedule
        self.epoch = 0  # number of epochs completed so far

    def on_train_batch_begin(self, batch, logs=None):
        # Global step = batches in completed epochs + position in the current epoch.
        # STEPS_PER_EPOCH is read from module scope at call time.
        global_step = (self.epoch*STEPS_PER_EPOCH) + batch
        new_lr = self.schedule(global_step)
        tf.keras.backend.set_value(self.model.optimizer.lr, new_lr)
        # Report the rate once per epoch, on its first batch.
        if batch == 0:
            print("--Learning Rate: {:.6f} --".format(new_lr))

    def on_epoch_end(self, epoch, logs=None):
        self.epoch = self.epoch + 1
        
    

#visualizing Learning Rate Schedule
rng = list(range(TOTAL_STEPS))
y = [lrfn_step(tf.cast(x, tf.float32)) for x in rng]

sns.set(style='whitegrid')
fig, ax = plt.subplots(figsize=(20, 6))
# Draw on the axes created above and label the figure so it stands on its own.
ax.plot(rng, y)
ax.set(title='Learning rate schedule', xlabel='Training step', ylabel='Learning rate')

print(f'{TOTAL_STEPS} total steps and {NUM_TRAINING_IMAGES//BATCH_SIZE} steps per epoch')
print(f'Learning rate schedule: {y[0]:.3g} to {max(y):.3g} to {y[-1]:.3g}')

#### Early_Stopping + ModelCheckpoint

Since the data is quite noisy, I am monitoring the validation accuracy rather than the validation loss for saving the best model weights. 

In [None]:
#note if you monitor=val_loss then mode=min, if monitor=val_sparse_categorical_accuracy then mode=max
# Write the weights to disk whenever validation accuracy reaches a new maximum.
model_save = ModelCheckpoint(
    './Effnet_TPU_Model_best_weights.h5',
    monitor = 'val_acc',
    mode = 'max',
    save_best_only = True,
    save_weights_only = True,
    verbose = 1,
)

# Stop when validation accuracy has not improved by at least 0.001 for 3 epochs;
# the weights from the final epoch are kept (restore_best_weights is off).
my_early_stopper = EarlyStopping(
    monitor = 'val_acc',
    mode = 'max',
    min_delta = 0.001,
    patience = 3,
    restore_best_weights = False,
    verbose = 1,
)

### Fitting the Model (single-fold)

In [None]:
# history = model.fit(x=get_training_dataset(load_dataset(TRAINING_FILENAMES)),
#                     epochs=EPOCHS,
#                     steps_per_epoch = STEPS_PER_EPOCH,
#                     validation_steps=VALID_STEPS,
#                     validation_data=get_validation_dataset(load_dataset(VALIDATION_FILENAMES)),
#                     callbacks = [CustomCallback(lrfn_step), model_save, my_early_stopper],
#                     verbose=1,
#                    )

In [None]:
# model.save_weights('./Effnet_TPU_Model_final_weights.h5')

### Visualizing Model History

In [None]:
# plt.figure(figsize=(13, 5))
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title("Model Loss")
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend(['Train', 'Test'])
# plt.ylim(ymax = 2, ymin = 0)
# plt.grid()
# plt.show()

In [None]:
# plt.figure(figsize=(13, 5))
# plt.plot(history.history['acc'])
# plt.plot(history.history['val_acc'])
# plt.title('Model Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend(['Train','Test'])
# plt.grid()
# plt.show()

### Fitting the models (5-fold)

Here I am trying to ensemble five models and find the optimal weight to give each model prediction. I started off by defining a new early stopping callback which restores the best model weights at the end of training.

In [None]:
#defining a second early_stopper that restores model best weights
# Early stopping for the 5-fold run: patience of 5 epochs and best weights restored.
fold_early_stopper = EarlyStopping(
    monitor = 'val_acc',
    mode = 'max',
    min_delta = 0.001,
    patience = 5,
    restore_best_weights = True,
    verbose = 1,
)

Making predictions in a separate notebook.

In [None]:
TRAIN_ROUND = 0          # index of the fold currently being trained
MODELS = []              # one trained model per fold
MODELS_HISTORY = []      # History object returned by fit() for each fold
MODELS_PREDICTIONS = []
kf = KFold(n_splits=5)
model = 0  # rebinds `model`, dropping the reference to the model built earlier

# NOTE(review): this shuffles TRAINING_FILENAMES in place, but the folds below are
# built from ALL_TRAINING_FILENAMES, so it has no effect on fold membership.
# Kept as-is so existing fold assignments stay reproducible - confirm the intent.
random.shuffle(TRAINING_FILENAMES) #seeded at top of notebook

# lrfn_step and CustomCallback are the definitions from the learning-rate schedule
# cell earlier in the notebook. They read WARMUP_STEPS, TOTAL_STEPS and
# STEPS_PER_EPOCH from module scope at call time, so rebinding those names per fold
# (below) is all that is needed; the classes/functions are not redefined here.

with strategy.scope():
    for train_index, test_index in kf.split(ALL_TRAINING_FILENAMES):

        # Folds are formed at the TFRecord-file level.
        TRAINING_FILENAMES_SPLIT = [ALL_TRAINING_FILENAMES[i] for i in train_index]
        VALIDATION_FILENAMES_SPLIT = [ALL_TRAINING_FILENAMES[i] for i in test_index]

        #counting number of imgs in each tfrecord + setting fit parameters
        NUM_VALIDATION_IMAGES_SPLIT = count_data_items(VALIDATION_FILENAMES_SPLIT)
        NUM_TRAINING_IMAGES_SPLIT = count_data_items(TRAINING_FILENAMES_SPLIT)

        # Module-level names consumed by lrfn_step / CustomCallback for this fold.
        STEPS_PER_EPOCH =  NUM_TRAINING_IMAGES_SPLIT // BATCH_SIZE
        VALID_STEPS = NUM_VALIDATION_IMAGES_SPLIT // BATCH_SIZE
        WARMUP_STEPS = LR_RAMPUP_EPOCHS * (STEPS_PER_EPOCH)
        TOTAL_STEPS = EPOCHS * (STEPS_PER_EPOCH)

        #fitting each model fold
        print("TRAINING MODEL: {}".format(TRAIN_ROUND))

        MODELS.append(create_model())

        # Keep the training history of every fold instead of discarding it.
        MODELS_HISTORY.append(
            MODELS[TRAIN_ROUND].fit(x=get_training_dataset(load_dataset(TRAINING_FILENAMES_SPLIT)),
                                    epochs = EPOCHS,
                                    steps_per_epoch = STEPS_PER_EPOCH,
                                    validation_steps = VALID_STEPS,
                                    validation_data=get_validation_dataset(load_dataset(VALIDATION_FILENAMES_SPLIT)),
                                    callbacks = [CustomCallback(lrfn_step), fold_early_stopper],
                                    verbose=1,
                                   )
        )
        # NOTE(review): depending on the Keras version, restore_best_weights may only
        # take effect when early stopping actually triggers - confirm these are the
        # best-epoch weights when a fold runs for all EPOCHS.
        MODELS[TRAIN_ROUND].save_weights('Model_{}_best_weights.h5'.format(TRAIN_ROUND))
        TRAIN_ROUND+=1
        model+=1