# Initialize Environment

In [None]:
!pip install -q efficientnet >> /dev/null
import efficientnet.tfkeras as efn

In [None]:
import pandas as pd, numpy as np
from kaggle_datasets import KaggleDatasets
import tensorflow as tf, re, math
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import StandardScaler

## Configuration
In order to be a proper cross validation with a meaningful overall CV score (aligned with LB score), **you need to choose the same** `IMG_SIZES`, and `EFF_NETS` **for each fold**. If your goal is to just run lots of experiments, then you can choose to have a different experiment in each fold. Then each fold is like a holdout validation experiment. When you find a configuration you like, you can use that configuration for all folds. 
* DEVICE - is GPU or TPU
* SEED - a different seed produces a different triple stratified kfold split.
* FOLDS - number of folds. Best set to 3, 5, or 15 but can be any number between 2 and 15
* IMG_SIZES - is a Python list of length FOLDS. These are the image sizes to use each fold
* BATCH_SIZES - is a list of length FOLDS. These are batch sizes for each fold. For maximum speed, it is best to use the largest batch size your GPU or TPU allows.
* EPOCHS - is a list of length FOLDS. These are maximum epochs. Note that each fold, the best epoch model is saved and used. So if epochs is too large, it won't matter.
* EFF_NETS - is a list of length FOLDS. These are the EfficientNets to use each fold. The number refers to the B. So a number of `0` refers to EfficientNetB0, and `1` refers to EfficientNetB1, etc.
* TTA - test time augmentation. Each test image is randomly augmented and predicted TTA times and the average prediction is used. TTA is also applied to OOF during validation.

In [None]:
# Requested accelerator: "TPU" or "GPU". The device-setup cell switches this
# to "GPU" when no TPU cluster can be resolved.
DEVICE = "TPU" #or "GPU"

# USE DIFFERENT SEED FOR DIFFERENT STRATIFIED KFOLD
# (passed as random_state to the KFold splitter over the TFRecord shards)
SEED = 42

# NUMBER OF FOLDS. USE 3, 5, OR 15 
FOLDS = 5

# DIMENSION OF THE SLICES
# One entry per fold; used as both height and width of the decoded image.
IMG_SIZES = [512]*FOLDS

# FILE TO TAKE IMAGES FROM
# Name of the Kaggle dataset whose GCS bucket holds the train*.tfrec shards.
# image_file = 'osicallscanssimple'
# image_file = 'scannormalised'
image_file = 'osic-scans-tfrecords-512'

# CUTOUT AUGMENTATION PARAMETERS
# Per fold: probability of applying cutout, number of squares, and square side
# as a fraction of the image size.
DROP_FREQ = [0.75]*FOLDS
DROP_CT = [20]*FOLDS
DROP_SIZE = [0.2]*FOLDS

# BATCH SIZE AND EPOCHS
# Batch sizes are per replica: get_dataset multiplies them by REPLICAS.
BATCH_SIZES = [32]*FOLDS #TPU
# BATCH_SIZES = [8]*FOLDS # GPU
EPOCHS = [12]*FOLDS

# WHICH EFFICIENTNET B? TO USE
# Index into the EFNS list (0 -> B0, ..., 6 -> B6), one entry per fold.
EFF_NETS = [3]*FOLDS

# TEST TIME AUGMENTATION STEPS
# Number of augmented passes over the validation files when predicting OOF.
TTA = 11

In [None]:
# Select the tf.distribute strategy: a TPUStrategy when a TPU was requested and
# can be reached and initialised, otherwise the default (CPU / single GPU) one.
# `tpu` is always defined so later cells can test it safely.
tpu = None

if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as err:
            # The previous `except _:` referenced an undefined name, so any
            # initialisation failure surfaced as an unrelated error and left
            # `strategy` undefined. Report the real cause and fall back.
            print("failed to initialize TPU")
            print(err)
            tpu = None
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


# Shorthands used by the input pipeline and the learning-rate schedule.
AUTO     = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

# Step 1: Preprocess
Preprocessing has already been done and saved to TFRecords. Here we choose which size to load. We can use either 128x128, 192x192, 256x256, 384x384, 512x512, 768x768 by changing the `IMG_SIZES` variable in the preceding code section. These TFRecords are discussed [here][1]. The advantage of using different input sizes is discussed [here][2]

[1]: https://www.kaggle.com/c/siim-isic-melanoma-classification/discussion/155579
[2]: https://www.kaggle.com/c/siim-isic-melanoma-classification/discussion/160147

In [None]:
# Resolve the dataset's GCS bucket once and reuse it for every fold, then list
# the training shards in sorted order.
GCS_PATH = [KaggleDatasets().get_gcs_path(image_file)] * FOLDS
files_train = np.sort(
    np.array(tf.io.gfile.glob(GCS_PATH[0] + '/train*.tfrec'))
)

In [None]:
# Shard names end in "-<count>.tfrec": collect the per-shard image count and the
# "/<basename>" suffix that is appended to a GCS path to address each shard.
n_im = [int(path.split('-')[-1].split('.')[0]) for path in files_train]
file_extension = ['/' + path.split('/')[-1] for path in files_train]

The fits come from [this](https://www.kaggle.com/samklein/probabilistic-fits) notebook

In [None]:
# LOAD TRAIN META DATA
# It is a small issue that one patient is dropped from this but not from the fits.
meta = pd.read_csv('../input/clean-data/train')
# The CSV may carry a saved index column; drop it only when present instead of
# hiding every possible error behind a bare `except`.
meta.drop(columns='Unnamed: 0', errors='ignore', inplace=True)
# Guard flag so the encoding cell does not transform `meta` twice.
encoded = False

# Per-patient probabilistic fits: patient names plus the sampled parameters.
fit_names = np.load('../input/probabilistic-fits-studentt/names.npy')
fit_samples = np.load('../input/probabilistic-fits-studentt/samples.npy')

In [None]:
# Encode the tabular metadata and rescale the regression targets.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# The `encoded` flag makes this cell safe to re-run: `meta` is transformed in
# place, so a second pass would corrupt it.
if not encoded:
    # Binary indicators. Open question from the author: should current smokers
    # also be flagged as 'Smoked', given that they used to smoke and still do?
    meta['Sex'] = meta['Sex'].eq('Male').astype('int')
    meta['Smoked'] = meta['SmokingStatus'].eq('Ex-smoker').astype('int')
    meta['CurrentlySmokes'] = meta['SmokingStatus'].eq('Currently smokes').astype('int')
    meta.drop(columns=['SmokingStatus'], inplace=True)

    # Standardise the numeric columns.
    num_cols_to_scale = ['FVC', 'Percent', 'Weeks', 'Age']
    num_enc = StandardScaler()
    meta[num_cols_to_scale] = num_enc.fit_transform(meta[num_cols_to_scale])

    # Rescale the targets: even rows of fit_samples are treated as slopes and
    # odd rows as intercepts. Each scaler sees (and returns) all values
    # flattened into a single column.
    slope_enc = MinMaxScaler(feature_range=(1, 10))
    scaled_slope = slope_enc.fit_transform(fit_samples[::2].reshape(-1, 1))
    intercept_enc = MinMaxScaler(feature_range=(1, 20))
    scaled_intercept = intercept_enc.fit_transform(fit_samples[1::2].reshape(-1, 1))

    encoded = True

# Look at the preprocessed meta data
meta.head()

In [None]:
# The target distributions are visibly imbalanced; some form of sample
# weighting or upsampling is probably required to address this.
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
for panel, first_row, title in zip(ax, (0, 1), ('Slope', 'Intercept')):
    # Even rows hold slopes, odd rows hold intercepts.
    panel.hist(fit_samples[first_row::2].ravel())
    panel.set_title(title)
plt.show()

In [None]:
# Doing things this way leaves me the freedom to change labels or how I preprocess without having to worry about
# regenerating tfrecords.
names = meta['Patient'].unique()
data = meta.drop(['Patient','intercept','slope'],axis=1)
target = meta[['slope','intercept']]

# The scalers hand back every sample flattened into one column. Restore one row
# per fitted patient so that row j really belongs to fit_names[j]; indexing the
# flattened column with a patient position picks an unrelated sample and makes
# nsamples collapse to 1.
# Assumes fit_samples holds one slope row and one intercept row per entry of
# fit_names - TODO confirm against the notebook that produced the fits.
slope_by_patient = np.asarray(scaled_slope).reshape(len(fit_names), -1)
intercept_by_patient = np.asarray(scaled_intercept).reshape(len(fit_names), -1)
fit_row = {str(pid): row for row, pid in enumerate(fit_names)}

# The number of samples drawn for the intercept and slope.
nsamples = intercept_by_patient.shape[1]

# acc[i] = inclusive [first, last] row positions of patient i inside `meta`;
# ordered_* hold that patient's target samples in the same patient order.
acc = []
ordered_slope = np.zeros((len(names), nsamples))
ordered_intercept = np.zeros((len(names), nsamples))
cin=0
for i,pid in enumerate(names):
    locs = np.where(meta['Patient']==pid)[0]
    # Each patient's rows must form one block after the previous patient's.
    if cin > locs[0]:
        print('The dataframe is not ordered')
    cin=locs[-1]
    acc += [[locs[0],locs[-1]]]
    row = fit_row[str(pid)]
    ordered_slope[i] = slope_by_patient[row]
    ordered_intercept[i] = intercept_by_patient[row]

#Make a lookup table for the data
with strategy.scope():
    # Patient id -> position in `names`; unknown ids map to -1.
    get_index = tf.lookup.StaticHashTable(
      tf.lookup.KeyValueTensorInitializer(names, np.arange( len(names) )), -1
    )
    meta_access = tf.constant(np.array(acc))
    meta_tensor = tf.constant(data)
    target_slope = tf.constant(ordered_slope)
    target_intercept = tf.constant(ordered_intercept)

# Step 2: Data Augmentation
This notebook uses rotation, shear, zoom, shift augmentation first shown in this notebook [here][1] and successfully used in Melanoma comp by AgentAuers [here][2]. This notebook also uses horizontal flip, hue, saturation, contrast, brightness augmentation similar to last year's winner and also similar to AgentAuers' notebook.

Additionally, we can decide to use external data by changing the variables `INC2019` and `INC2018` in the preceding code section. These variables respectively indicate whether to load last year's 2019 data and/or year 2018 + 2017 data. These datasets are discussed [here][3]

Consider experimenting with different augmentation and/or external data. The code to load TFRecords is taken from AgentAuers' notebook [here][2]. Thank you AgentAuers, this is great work.

[1]: https://www.kaggle.com/cdeotte/rotation-augmentation-gpu-tpu-0-96
[2]: https://www.kaggle.com/agentauers/incredible-tpus-finetune-effnetb0-b6-at-once
[3]: https://www.kaggle.com/c/siim-isic-melanoma-classification/discussion/164910

In [None]:
# Augmentation strengths read by transform(). Each one is combined with a
# standard-normal draw per image.
# Rotation and shear scale the draw; get_mat interprets the result as degrees.
ROT_ = 180.0
SHR_ = 2.0
# Zoom factors are drawn as 1.0 + normal / {H,W}ZOOM_, so larger values mean
# milder zoom.
HZOOM_ = 8.0
WZOOM_ = 8.0
# Shifts scale the draw and enter the shift matrix directly
# (presumably in pixel units - confirm).
HSHIFT_ = 8.0
WSHIFT_ = 8.0

In [None]:
def dropout(image, DIM=256, PROBABILITY = 0.75, CT = 8, SZ = 0.2):
    """Cutout augmentation: zero out CT random squares of one image.

    Args:
        image: a single image of shape [DIM, DIM, 3] (not a batch).
        DIM: image height and width.
        PROBABILITY: chance that cutout is applied to this image.
        CT: number of squares to remove.
        SZ: side of each square as a fraction of DIM.

    Returns:
        The image, reshaped to [DIM, DIM, 3], with the squares replaced by
        zeros; the input image when cutout is skipped.
    """
    # input image - is one image of size [dim,dim,3] not a batch of [b,dim,dim,3]
    # output - image with CT squares of side size SZ*DIM removed
    
    # DO DROPOUT WITH PROBABILITY DEFINED ABOVE
    # P is 1 when the uniform draw falls below PROBABILITY, otherwise 0.
    P = tf.cast( tf.random.uniform([],0,1)<PROBABILITY, tf.int32)
    # NOTE(review): P is a tensor, so this early return presumably relies on
    # AutoGraph when the function is traced inside a tf.data map - confirm.
    if (P==0)|(CT==0)|(SZ==0): return image
    
    for k in range(CT):
        # CHOOSE RANDOM LOCATION
        x = tf.cast( tf.random.uniform([],0,DIM),tf.int32)
        y = tf.cast( tf.random.uniform([],0,DIM),tf.int32)
        # COMPUTE SQUARE 
        # Multiplying by P gives a zero-width square when cutout is disabled.
        WIDTH = tf.cast( SZ*DIM,tf.int32) * P
        # Clamp the square to the image borders.
        ya = tf.math.maximum(0,y-WIDTH//2)
        yb = tf.math.minimum(DIM,y+WIDTH//2)
        xa = tf.math.maximum(0,x-WIDTH//2)
        xb = tf.math.minimum(DIM,x+WIDTH//2)
        # DROPOUT IMAGE
        # Rebuild the affected rows as [left strip | zeros | right strip], then
        # stack them between the untouched rows above and below.
        one = image[ya:yb,0:xa,:]
        two = tf.zeros([yb-ya,xb-xa,3]) 
        three = image[ya:yb,xb:DIM,:]
        middle = tf.concat([one,two,three],axis=1)
        image = tf.concat([image[0:ya,:,:],middle,image[yb:DIM,:,:]],axis=0)
            
    # RESHAPE HACK SO TPU COMPILER KNOWS SHAPE OF OUTPUT TENSOR 
    image = tf.reshape(image,[DIM,DIM,3])
    return image

def get_mat(rotation, shear, height_zoom, width_zoom, height_shift, width_shift):
    """Compose a 3x3 index-transformation matrix.

    `rotation` and `shear` are given in degrees. The result is
    (rotation @ shear) @ (zoom @ shift).
    """
    # Degrees -> radians.
    rotation = math.pi * rotation / 180.
    shear    = math.pi * shear    / 180.

    def as_3x3(entries):
        # Pack nine single-element tensors into a 3x3 matrix, row by row.
        return tf.reshape(tf.concat([entries],axis=0), [3,3])

    one  = tf.constant([1],dtype='float32')
    zero = tf.constant([0],dtype='float32')

    # Rotation.
    cos_r = tf.math.cos(rotation)
    sin_r = tf.math.sin(rotation)
    rotation_matrix = as_3x3([cos_r,  sin_r, zero,
                              -sin_r, cos_r, zero,
                              zero,   zero,  one])

    # Shear.
    cos_s = tf.math.cos(shear)
    sin_s = tf.math.sin(shear)
    shear_matrix = as_3x3([one,  sin_s, zero,
                           zero, cos_s, zero,
                           zero, zero,  one])

    # Zoom (inverse scale factors on the diagonal).
    zoom_matrix = as_3x3([one/height_zoom, zero,           zero,
                          zero,            one/width_zoom, zero,
                          zero,            zero,           one])

    # Shift (translation in the last column).
    shift_matrix = as_3x3([one,  zero, height_shift,
                           zero, one,  width_shift,
                           zero, zero, one])

    rotate_then_shear = K.dot(rotation_matrix, shear_matrix)
    zoom_then_shift = K.dot(zoom_matrix, shift_matrix)
    return K.dot(rotate_then_shear, zoom_then_shift)


def transform(image, DIM=256):    
    """Apply a random rotation, shear, zoom and shift to one image.

    Args:
        image: a single image of shape [DIM, DIM, 3] (not a batch).
        DIM: image height and width.

    Returns:
        A [DIM, DIM, 3] tensor gathered from `image` at the transformed pixel
        positions.
    """
    # input image - is one image of size [dim,dim,3] not a batch of [b,dim,dim,3]
    # output - image randomly rotated, sheared, zoomed, and shifted
    XDIM = DIM%2 #fix for size 331
    
    # Draw the parameters from standard normals scaled by the module-level
    # strength constants.
    rot = ROT_ * tf.random.normal([1], dtype='float32')
    shr = SHR_ * tf.random.normal([1], dtype='float32') 
    h_zoom = 1.0 + tf.random.normal([1], dtype='float32') / HZOOM_
    w_zoom = 1.0 + tf.random.normal([1], dtype='float32') / WZOOM_
    h_shift = HSHIFT_ * tf.random.normal([1], dtype='float32') 
    w_shift = WSHIFT_ * tf.random.normal([1], dtype='float32') 

    # GET TRANSFORMATION MATRIX
    m = get_mat(rot,shr,h_zoom,w_zoom,h_shift,w_shift) 

    # LIST DESTINATION PIXEL INDICES
    # Homogeneous coordinates centred on the image: one column per output pixel.
    x   = tf.repeat(tf.range(DIM//2, -DIM//2,-1), DIM)
    y   = tf.tile(tf.range(-DIM//2, DIM//2), [DIM])
    z   = tf.ones([DIM*DIM], dtype='int32')
    idx = tf.stack( [x,y,z] )
    
    # ROTATE DESTINATION PIXELS ONTO ORIGIN PIXELS
    idx2 = K.dot(m, tf.cast(idx, dtype='float32'))
    idx2 = K.cast(idx2, dtype='int32')
    # Clip so that every source index stays inside the image.
    idx2 = K.clip(idx2, -DIM//2+XDIM+1, DIM//2)
    
    # FIND ORIGIN PIXEL VALUES           
    # Convert centred coordinates back to array indices before gathering.
    idx3 = tf.stack([DIM//2-idx2[0,], DIM//2-1+idx2[1,]])
    d    = tf.gather_nd(image, tf.transpose(idx3))
        
    return tf.reshape(d,[DIM, DIM,3])

## Methods for gathering data 

In [None]:
tf.random.set_seed(5)

def prep_meta(example):
    """Pick a random metadata row and a random target sample for one scan.

    Args:
        example: parsed TFRecord features; only 'image_name' is read and it is
            used as the key into the `get_index` table.

    Returns:
        (meta_row, target): one row of `meta_tensor` belonging to the patient,
        and a stacked [slope, intercept] pair drawn from that patient's target
        samples.
    """
    query = get_index.lookup(example['image_name'])
    ind_range = tf.gather(meta_access, query)
    # meta_access stores inclusive [first, last] row positions, while integer
    # tf.random.uniform excludes maxval: add 1 so the patient's last visit can
    # be drawn as well.
    indx = tf.random.uniform([1], minval=ind_range[0], maxval=ind_range[1] + 1, dtype=tf.dtypes.int64)
    ## Uncomment the below to only take the first date during training
#     indx = ind_range[0]
    meta_data = tf.gather(meta_tensor, indx)
    # Draw one of the nsamples target samples for this patient.
    indtarget = tf.random.uniform([1], minval=0, maxval=nsamples, dtype=tf.dtypes.int64)
    ts = tf.squeeze(tf.gather(target_slope[query], indtarget))
    ti = tf.squeeze(tf.gather(target_intercept[query], indtarget))
    return tf.squeeze(meta_data), tf.stack([ts,ti],axis=-1)

def read_labeled_tfrecord(example):
    """Parse one serialized record into ((image_bytes, meta_row), target)."""
    feature_spec = {
        key: tf.io.FixedLenFeature([], tf.string)
        for key in ('image', 'image_name')
    }
    parsed = tf.io.parse_single_example(example, feature_spec)

    # Metadata row and regression target both come from the lookup tables.
    meta_row, label = prep_meta(parsed)
    return (parsed['image'], meta_row), label


# Reader for prediction passes: same record layout as the labeled reader, but
# the target is discarded.
def read_unlabeled_tfrecord(example, return_image_name):
    """Parse one serialized record into ((image_bytes, meta_row), tag).

    `tag` is the record's image name when `return_image_name` is truthy,
    otherwise the constant 0.
    """
    feature_spec = {
        key: tf.io.FixedLenFeature([], tf.string)
        for key in ('image', 'image_name')
    }
    parsed = tf.io.parse_single_example(example, feature_spec)
    meta_row, _ = prep_meta(parsed)

    if return_image_name:
        tag = parsed['image_name']
    else:
        tag = 0
    return (parsed['image'], meta_row), tag

 
def prepare_image(data, augment=True, dim=256, droprate=0, dropct=0, dropsize=0):   
    """Decode one JPEG to float32 in [0, 1] and optionally augment it.

    Args:
        data: (encoded_jpeg, meta_row) pair; the metadata passes through as is.
        augment: apply the geometric and colour augmentations when True.
        dim: image height and width.
        droprate, dropct, dropsize: cutout settings; cutout runs only when all
            three are non-zero.

    Returns:
        (image, meta_row) with image shaped [dim, dim, 3].
    """
    encoded_jpeg, meta_row = data[0], data[1]
    pixels = tf.cast(tf.image.decode_jpeg(encoded_jpeg, channels=3), tf.float32) / 255.0

    if augment:
        pixels = transform(pixels, DIM=dim)
        use_cutout = droprate != 0 and dropct != 0 and dropsize != 0
        if use_cutout:
            pixels = dropout(pixels, DIM=dim, PROBABILITY=droprate, CT=dropct, SZ=dropsize)
        pixels = tf.image.random_flip_left_right(pixels)
        # Hue jitter is intentionally left disabled:
        # pixels = tf.image.random_hue(pixels, 0.01)
        pixels = tf.image.random_saturation(pixels, 0.7, 1.3)
        pixels = tf.image.random_contrast(pixels, 0.8, 1.2)
        pixels = tf.image.random_brightness(pixels, 0.1)

    # Pin the static shape for downstream batching.
    pixels = tf.reshape(pixels, [dim, dim, 3])
    return (pixels, meta_row)

# Shard names carry their record count as "-<count>." (e.g. "train00-2071.tfrec").
# Compiled once instead of once per filename.
_TFREC_COUNT_RE = re.compile(r"-([0-9]+)\.")


def count_data_items(filenames):
    """Return the total number of records announced by the shard filenames.

    Args:
        filenames: iterable of TFRecord paths whose names contain "-<count>.".

    Returns:
        The sum of the counts as an int (0 for an empty iterable).

    Raises:
        ValueError: if a filename carries no "-<count>." marker.
    """
    total = 0
    for filename in filenames:
        found = _TFREC_COUNT_RE.search(filename)
        if found is None:
            raise ValueError(
                "cannot read the record count from filename %r" % (filename,))
        total += int(found.group(1))
    return total

In [None]:
def get_dataset(files, augment = False, shuffle = False, repeat = False, 
                batch_size=16, dim=512, labeled = True, return_image_names=False,
                droprate=0, dropct=0, dropsize=0):
    """Build the tf.data pipeline for a list of TFRecord shards.

    Stages, in order: read -> cache -> (repeat) -> (shuffle) -> parse ->
    decode/augment -> batch -> prefetch. Batches hold
    `batch_size * REPLICAS` elements of the form ((image, meta_row), y), where
    y is the target for labeled data and the image name (or 0) otherwise.
    """
    dataset = tf.data.TFRecordDataset(files, num_parallel_reads=AUTO).cache()

    if repeat:
        dataset = dataset.repeat()

    if shuffle:
        # Element order does not need to be reproducible when shuffling.
        relaxed = tf.data.Options()
        relaxed.experimental_deterministic = False
        dataset = dataset.shuffle(1024*8).with_options(relaxed)

    # Parse each serialized example into ((jpeg_bytes, meta_row), y).
    if labeled:
        parse_fn = read_labeled_tfrecord
    else:
        def parse_fn(example):
            return read_unlabeled_tfrecord(example, return_image_names)
    dataset = dataset.map(parse_fn, num_parallel_calls=AUTO)

    # Decode (and optionally augment) the image; y is passed through untouched.
    def decode_fn(img, imgname_or_label):
        prepared = prepare_image(img, augment=augment, dim=dim,
                                 droprate=droprate, dropct=dropct, dropsize=dropsize)
        return prepared, imgname_or_label
    dataset = dataset.map(decode_fn, num_parallel_calls=AUTO)

    return dataset.batch(batch_size * REPLICAS).prefetch(AUTO)

In [None]:
# fold=0
# files_debug = files_train = tf.io.gfile.glob([GCS_PATH[fold] + file_extension[i] for i in [0]])

# # ds_valid = get_dataset(files_valid,augment=False,shuffle=False,repeat=False,dim=IMG_SIZES[fold])
# ds_train = get_dataset(files_debug, augment=True, shuffle=True, repeat=True,
#                 dim=IMG_SIZES[fold], batch_size = BATCH_SIZES[fold], labeled=True,
#                    droprate=DROP_FREQ[fold], dropct=DROP_CT[fold], dropsize=DROP_SIZE[fold])
# for img, target in iter(ds_train.unbatch()):
#         print(target)

# Step 3: Build Model
This is a common model architecture. Consider experimenting with different backbones, custom heads, losses, and optimizers. Also consider inputting meta features into your CNN.

In [None]:
from tensorflow.keras import layers as L
import tensorflow_addons as tfa

In [None]:
# Backbones indexed by their "B" number (EFNS[3] is EfficientNetB3).
EFNS = [
    efn.EfficientNetB0,
    efn.EfficientNetB1,
    efn.EfficientNetB2,
    efn.EfficientNetB3,
    efn.EfficientNetB4,
    efn.EfficientNetB5,
    efn.EfficientNetB6,
]

def build_model(dim=128, ef=0):
    """Build and compile the two-input (image + metadata) regression model.

    Args:
        dim: image height and width.
        ef: index into EFNS selecting the EfficientNet backbone.

    Returns:
        A compiled Keras model taking (image, meta_row) and emitting two
        non-negative outputs, trained with mean squared error and AdamW.
    """
    hidden_units = 128

    # Image branch: backbone -> global average pooling -> dense embedding.
    image_in = L.Input(shape=(dim, dim, 3))
    backbone = EFNS[ef](input_shape=(dim, dim, 3), weights='noisy-student', include_top=False)
    features = backbone(image_in)
    features = L.GlobalAveragePooling2D()(features)
    features = L.Dense(hidden_units, activation='relu')(features)

    # Metadata branch: the raw feature row, as wide as the `data` frame.
    meta_in = L.Input(shape=(data.shape[1],))

    # Joint head over the concatenated embedding and metadata.
    head = L.concatenate((features, meta_in))
    head = L.Dropout(0.2)(head)
    head = L.Dense(hidden_units, activation='relu')(head)
    head = L.Dropout(0.1)(head)
    head = L.Dense(2, activation='relu')(head)

    model = tf.keras.Model(inputs=(image_in, meta_in), outputs=head)

    # Alternatives kept for reference:
    #   step = tf.Variable(0, trainable=False)
    #   schedule = tf.optimizers.schedules.PiecewiseConstantDecay(
    #       [2, 5, 7], [1e-2, 1e-1, 1e-2, 1e-3])
    #   wd = lambda: 1e-2 * schedule(step)
    #   opt = tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=wd)
    #   loss = tf.keras.losses.MeanAbsoluteError()
    #   loss = tf.keras.losses.MeanAbsolutePercentageError()
    model.compile(
        optimizer=tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=0.001),
        loss=tf.keras.losses.MeanSquaredError(),
    )
    return model

# Step 3: Metric scoring
We need special ways to make predictions

In [None]:
# Calculate the predictions, confidence and metric for samples.
# Should be done using tensors directly
# A second, unmodified copy of the training table: get_metric reads its
# 'Patient', 'Weeks' and 'FVC' columns, while `meta` is rescaled in place.
untrans_meta = pd.read_csv('../input/clean-data/train')

def metric(estimate,confidence,samples):
    """Mean Laplace log-likelihood style score of predictions.

    The confidence is floored at 70 and the absolute error capped at 1000
    before the per-point score -sqrt(2)*err/sigma - log(sqrt(2)*sigma) is
    averaged.
    """
    root_two = 2**(1/2)
    sigma = np.maximum(confidence, 70)
    capped_error = np.minimum(np.asarray(np.abs(estimate - samples)), 1000)
    per_point = -root_two * capped_error / sigma - np.log(root_two * sigma)
    return np.mean(per_point)

def get_metric(names,preds,confidence):
    """Score each patient's predicted FVC curve against the recorded visits.

    Args:
        names: patient ids, one per row of `preds` / `confidence`.
        preds: per-patient predicted FVC, one column per week of the grid
            built by predict_full (weeks -12 .. 133).
        confidence: per-patient confidence on the same week grid.

    Returns:
        A list with one metric value per patient.
    """
    # Column 0 of the grid is week -12, so week w lives in column w + 12.
    # Indexing with the raw week number read the wrong columns (and wrapped
    # around to the end of the grid for negative weeks).
    first_week = -12
    score = []
    for i, pid in enumerate(names):
        pdata = untrans_meta.loc[untrans_meta['Patient'] == pid]
        cols = pdata['Weeks'].values - first_week
        FVCs = pdata['FVC'].values
        score.append(metric(preds[i][cols], confidence[i][cols], FVCs))
    return score

def predict_full(ids,oof_slope,oof_intercept):
    """Turn per-image slope/intercept predictions into per-id weekly curves.

    For every unique id, all slope/intercept pairs of its rows are expanded
    into lines over weeks -12 .. 133; the mean over those lines is the
    prediction and their standard deviation is the confidence.

    Returns:
        (names, preds, confidence) where names are the sorted unique ids and
        the two arrays have shape (len(names), 146).
    """
    slope_arr = np.array(oof_slope)
    intercept_arr = np.array(oof_intercept)
    week_grid = np.arange(-12, 134)
    names = np.unique(ids)

    grid_shape = (len(names), len(week_grid))
    preds = np.zeros(grid_shape)
    confidence = np.zeros(grid_shape)

    for row, pid in enumerate(names):
        members = np.flatnonzero(ids == pid)
        slopes = slope_arr[members].ravel()
        intercepts = intercept_arr[members].ravel()
        # One line per prediction: slope * week + intercept.
        lines = np.outer(slopes, week_grid) + intercepts[:, np.newaxis]
        preds[row] = lines.mean(axis=0)
        confidence[row] = lines.std(axis=0)
    return names, preds, confidence

# Step 4: Train Schedule
This is a common train schedule for transfer learning. The learning rate starts near zero, then increases to a maximum, then decays over time. Consider changing the schedule and/or learning rates. Note how the learning rate max is larger with larger batch sizes. This is a good practice to follow.

In [None]:
def get_lr_callback(batch_size=8):
    """Return a LearningRateScheduler with linear warm-up and exponential decay.

    The peak rate scales with the global batch size (`REPLICAS * batch_size`).
    """
    lr_start, lr_min = 0.000005, 0.000001
    lr_max = 0.00000125 * REPLICAS * batch_size
    ramp_epochs, sustain_epochs, decay = 5, 0, 0.8

    def lrfn(epoch):
        # Linear ramp from lr_start up to lr_max.
        if epoch < ramp_epochs:
            return (lr_max - lr_start) / ramp_epochs * epoch + lr_start
        # Optional plateau at the peak (empty while sustain_epochs is 0).
        if epoch < ramp_epochs + sustain_epochs:
            return lr_max
        # Exponential decay towards lr_min.
        return (lr_max - lr_min) * decay**(epoch - ramp_epochs - sustain_epochs) + lr_min

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

## Train Model
Our model will be trained for the number of FOLDS and EPOCHS you chose in the configuration above. For each fold, the model with the lowest validation loss will be saved and used to predict OOF and test. Adjust the variables `VERBOSE` and `DISPLAY_PLOT` below to determine what output you want displayed. The variable `VERBOSE=1 or 2` will display the training and validation loss and auc for each epoch as text. The variable `DISPLAY_PLOT` shows this information as a plot. 

In [None]:
# USE VERBOSE=0 for silent, VERBOSE=1 for interactive, VERBOSE=2 for commit
VERBOSE = 0
# Plot the train/validation loss curves after each fold when True.
DISPLAY_PLOT = True

# Folds are split over TFRecord shards (indices into file_extension), not over
# individual images.
skf = KFold(n_splits=FOLDS,shuffle=True,random_state=SEED)
# oof_tar_slope = []; oof_tar_intercept = []; 
# Per-fold accumulators: reported validation loss, image names, fold ids and
# metric scores.
oof_val = []; oof_names = []; oof_folds = []; oof_score = []
nimages = count_data_items(file_extension)
# One row per image and one column per TTA pass; fcnt is the next free row.
oof_slope_pred = np.zeros((nimages,TTA)); oof_intercept_pred = np.zeros((nimages,TTA)); fcnt = 0
# oof_tta_save = []; test_tta_save = []

# Train one model per fold, then predict and score that fold's held-out shards.
for fold,(idxT,idxV) in enumerate(skf.split(np.arange(len(file_extension)))):

    # Only the first fold is run; remove this guard to train every fold.
    if fold > 0:
        break

    # DISPLAY FOLD INFO
    if DEVICE=='TPU':
        # Re-initialise the TPU system at the start of each fold.
        if tpu: tf.tpu.experimental.initialize_tpu_system(tpu)
    print('#'*25); print('#### FOLD',fold+1)
    print('#### Image Size %i with EfficientNet B%i and batch_size %i'%
          (IMG_SIZES[fold],EFF_NETS[fold],BATCH_SIZES[fold]*REPLICAS))

    # CREATE TRAIN AND VALIDATION SUBSETS
    files_train = tf.io.gfile.glob([GCS_PATH[fold] + file_extension[i] for i in idxT])
    np.random.shuffle(files_train); print('#'*25)
    files_valid = tf.io.gfile.glob([GCS_PATH[fold] + file_extension[i] for i in idxV])

    # BUILD MODEL
    K.clear_session()
    with strategy.scope():
        model = build_model(dim=IMG_SIZES[fold],ef=EFF_NETS[fold])

    # SAVE BEST MODEL EACH FOLD
    sv = tf.keras.callbacks.ModelCheckpoint(
        'fold-%i.h5'%fold, monitor='val_loss', verbose=0, save_best_only=True,
        save_weights_only=True, mode='min', save_freq='epoch')

    # TRAIN
    print('Training...')
    history = model.fit(
        get_dataset(files_train, augment=True, shuffle=True, repeat=True,
                dim=IMG_SIZES[fold], batch_size = BATCH_SIZES[fold],
                   droprate=DROP_FREQ[fold], dropct=DROP_CT[fold], dropsize=DROP_SIZE[fold]), 
        epochs=EPOCHS[fold], 
        callbacks = [sv,get_lr_callback(BATCH_SIZES[fold])],
#         epochs=EPOCHS[fold], callbacks = [sv],
        steps_per_epoch=count_data_items(files_train)/BATCH_SIZES[fold]//REPLICAS,
        validation_data=get_dataset(files_valid,augment=False,shuffle=False,
                repeat=False,dim=IMG_SIZES[fold]),
        verbose=VERBOSE
    )

    # Reloading the best checkpoint is currently disabled, so the predictions
    # below come from the weights of the final epoch.
#     print('Loading best model...')
#     model.load_weights('fold-%i.h5'%fold)

    # PREDICT OOF USING TTA
    print('Predicting OOF with TTA...')
    ds_valid = get_dataset(files_valid,labeled=False,return_image_names=False,augment=True,
            repeat=True,shuffle=False,dim=IMG_SIZES[fold],batch_size=BATCH_SIZES[fold]*4,
            droprate=DROP_FREQ[fold], dropct=DROP_CT[fold], dropsize=DROP_SIZE[fold])
    ct_valid = count_data_items(files_valid); STEPS = TTA * ct_valid/BATCH_SIZES[fold]/4/REPLICAS
    pred = model.predict(ds_valid,steps=STEPS,verbose=VERBOSE)[:TTA*ct_valid,]
    # The repeated, unshuffled dataset yields TTA consecutive passes over the
    # validation images; the column-major reshape puts each pass in its own
    # column. The scalers map predictions back to the original target scale.
    oof_slope_pred[fcnt:fcnt+ct_valid] = slope_enc.inverse_transform(pred[:,0].reshape(-1, 1)).reshape((ct_valid,TTA),order='F')
    oof_intercept_pred[fcnt:fcnt+ct_valid] = intercept_enc.inverse_transform(pred[:,1].reshape(-1, 1)).reshape((ct_valid,TTA),order='F')

    # GET OOF NAMES
    # (The unused rebuild of a labeled validation dataset was removed; it only
    # fed the target collection that is commented out.)
#     tslope=[];tint=[]
#     for img, target in iter(ds_valid.unbatch()):
#         tslope.append(target.numpy()[0]); tint.append(target.numpy()[1])
#     oof_tar_slope.append( slope_enc.inverse_transform(np.array(tslope).reshape(-1, 1)) ); 
#     oof_tar_intercept.append( intercept_enc.inverse_transform(np.array(tint).reshape(-1, 1)) )
    ds = get_dataset(files_valid, augment=False, repeat=False, dim=IMG_SIZES[fold],
                labeled=False, return_image_names=True)
    nms = np.array([img_name.numpy().decode("utf-8") for img, img_name in iter(ds.unbatch())])
    oof_names.append( nms )
    oof_folds.append( np.ones(nms.shape[0],dtype='int8')*fold )

    # REPORT RESULTS
    ids, full_preds, confidence = predict_full(nms,oof_slope_pred[fcnt:fcnt+ct_valid],
                                               oof_intercept_pred[fcnt:fcnt+ct_valid])
    scores = get_metric(ids, full_preds, confidence)
    oof_score.append( scores )
    # Loss is minimised, so the best epoch is the minimum of val_loss (the
    # same criterion the checkpoint uses); np.max reported the worst epoch.
    oof_val.append(np.min( history.history['val_loss'] ))
    print('#### FOLD %i OOF Loss = %.3f, Metric score of %.2f'%(fold+1,oof_val[-1],np.mean(scores)))
    fcnt+=ct_valid

    # PLOT TRAINING
    if DISPLAY_PLOT:
        plt.figure(figsize=(15,5))
        plt2 = plt.gca().twinx()
        plt2.plot(np.arange(EPOCHS[fold]),history.history['loss'],'-o',label='Train Loss',color='#2ca02c')
        plt2.plot(np.arange(EPOCHS[fold]),history.history['val_loss'],'-o',label='Val Loss',color='#d62728')
        plt.ylabel('Loss',size=14)
        plt.title('Image Size %i, EfficientNet B%i'%
                (IMG_SIZES[fold],EFF_NETS[fold]),size=18)
        plt.legend(loc=3)
        plt.show()  

In [None]:
# Flatten the per-fold score lists into one array and show their distribution.
tscores = np.concatenate(oof_score)
plt.hist(tscores)
plt.show()

## Calculate OOF AUC
The OOF (out of fold) predictions are saved to disk. If you wish to ensemble multiple models, use the OOF to determine what are the best weights to blend your models with. Choose weights that maximize OOF CV score when used to blend OOF. Then use those same weights to blend your test predictions.

In [None]:
# COMPUTE OVERALL OOF SCORE
# true_slope = np.concatenate(oof_tar_slope); true_intercept = np.concatenate(oof_tar_intercept)
names = np.concatenate(oof_names); folds = np.concatenate(oof_folds)
print('Overall OOF Score = %.3f'%np.mean(tscores))

# SAVE OOF TO DISK
oof_inf = pd.DataFrame(dict(
    image_name = names, fold=folds))

# The fold loop fills rows [0, fcnt) in the same order in which the names were
# collected. The earlier slice [fcnt:(fcnt+1)*ct_valid] started after the
# filled rows, so it exported untouched zeros that did not line up with names.
filled_slope = oof_slope_pred[:fcnt]
filled_intercept = oof_intercept_pred[:fcnt]
preds_df = pd.DataFrame(np.concatenate((filled_slope, filled_intercept), axis=1),
                        columns=['slope' + str(i) for i in range(TTA)]+['intercept' + str(i) for i in range(TTA)])

df_oof = pd.concat((oof_inf,preds_df),axis=1)
df_oof.to_csv('oof.csv',index=False)
df_oof.head()

# Step 5: Post process
I hope somebody pursues this.