## Summary


* Preprocessing: deleted 77 duplicates detected with `image_hash` library, stratified labels
* Training strategy: 5 folds CV, unweighted ensemble
* Backbone: EfficientNetB4, `noisy-student` weights, single dense layer on top
* Optimizer: Adam, learning rate of 1e-3, ReduceLROnPlateau
* Image size: 600x600
* Augmentations: heavy augmentations with `albumentations` library, pre-recorded


In [None]:
import sys
# Make the bundled `efficientnet` package (imported below as `efficientnet.tfkeras`)
# importable from the attached Kaggle dataset directory.
sys.path.append('/kaggle/input/efficientnet-keras-dataset/efficientnet_kaggle')

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm.notebook import tqdm
import efficientnet.tfkeras as efn
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
import tensorflow as tf
import pandas as pd
import numpy as np
import random
import os

In [None]:
# Run Keras layers under the 'mixed_float16' policy; get_model() keeps its final
# sigmoid activation in float32 explicitly.
tf.keras.mixed_precision.set_global_policy('mixed_float16')


class CFG:
    """Notebook-wide inference configuration.

    The first group of attributes is meant to stay fixed; ``seed`` and
    ``tta_steps`` are the knobs intended for tweaking.
    """

    # --- keep these ---
    strategy = tf.distribute.get_strategy()
    # 16 samples per replica.
    batch_size = strategy.num_replicas_in_sync * 16

    # Side length used for both image dimensions.
    img_size = 600

    # Class names, in the order of the model's output units.
    classes = np.array(
        ['complex', 'frog_eye_leaf_spot', 'powdery_mildew', 'rust', 'scab'])
    # Directory holding the test images.
    root = '../input/plant-pathology-2021-fgvc8/test_images'

    # --- tweak these ---
    seed = 42  # random seed we use for each operation
    tta_steps = 0  # number of TTA folds, run without TTA if 0

### Helper functions

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes into a float32 RGB tensor scaled by 1/255.

    The decoded tensor is reshaped to ``[CFG.img_size, CFG.img_size, 3]``,
    so the encoded image must already contain exactly that many values.
    """
    side = CFG.img_size
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    pixels = tf.reshape(pixels, [side, side, 3])
    return tf.cast(pixels, tf.float32) / 255.


def data_augment(image, label):
    """Apply random flips, a random 90-degree rotation and colour jitter.

    Args:
        image: image tensor (get_dataset() maps this function over batches).
        label: passed through unchanged.

    Returns:
        The ``(augmented_image, label)`` pair.
    """
    image = tf.image.random_flip_left_right(image, seed=CFG.seed)
    image = tf.image.random_flip_up_down(image, seed=CFG.seed)

    # Number of counter-clockwise quarter turns, drawn from {0, 1, 2, 3}.
    # `tf.image.rot90` converts `k` to int32, so draw it as int32 directly.
    k = tf.random.uniform([], minval=0, maxval=4, dtype=tf.int32, seed=CFG.seed)
    image = tf.image.rot90(image, k=k)

    image = tf.image.random_hue(image, .1, seed=CFG.seed)
    image = tf.image.random_saturation(image, .8, 1.2, seed=CFG.seed)
    image = tf.image.random_contrast(image, .8, 1.2, seed=CFG.seed)
    image = tf.image.random_brightness(image, .1, seed=CFG.seed)

    return image, label


# Schema of one serialized example: two scalar byte-string features, matching
# what _serialize_sample() writes.
feature_map = dict(
    image=tf.io.FixedLenFeature([], tf.string),
    image_name=tf.io.FixedLenFeature([], tf.string),
)


def read_tfrecord(example):
    """Parse one serialized example into an ``(image, image_name)`` pair.

    The image is decoded with decode_image(); the image name is returned as
    the raw byte-string feature and takes the place of a label.
    """
    parsed = tf.io.parse_single_example(example, feature_map)
    return decode_image(parsed['image']), parsed['image_name']


def get_dataset(filenames, ordered=True, shuffled=False, repeated=False, 
                augmented=False, cached=False, distributed=False):
    """Build the batched ``tf.data`` input pipeline over TFRecord files.

    Args:
        filenames: TFRecord file path(s) to read.
        ordered: when False, deterministic element order is switched off.
        shuffled: shuffle parsed samples with a 2048-element buffer.
        repeated: repeat the sample stream indefinitely (before batching).
        augmented: map data_augment() over each batch.
        cached: cache the batched (and possibly augmented) elements.
        distributed: wrap the dataset with ``CFG.strategy``.

    Returns:
        The dataset, yielding ``(images, image_names)`` batches of
        ``CFG.batch_size`` samples.
    """
    autotune = tf.data.experimental.AUTOTUNE
    ds = tf.data.TFRecordDataset(filenames, num_parallel_reads=autotune)

    if not ordered:
        options = tf.data.Options()
        options.experimental_deterministic = False
        ds = ds.with_options(options)

    ds = ds.map(read_tfrecord, num_parallel_calls=autotune)

    # Shuffling and repeating act on individual samples, so they precede batching.
    if shuffled:
        ds = ds.shuffle(2048, seed=CFG.seed)
    if repeated:
        ds = ds.repeat()
    ds = ds.batch(CFG.batch_size)

    # Augmentation and caching operate on whole batches.
    if augmented:
        ds = ds.map(data_augment, num_parallel_calls=autotune)
    if cached:
        ds = ds.cache()
    ds = ds.prefetch(autotune)

    if not distributed:
        return ds
    return CFG.strategy.experimental_distribute_dataset(ds)


def get_model():
    """Build the classifier: EfficientNetB4 backbone plus one dense layer.

    The backbone is created without pretrained weights (``weights=None``) and
    with average pooling. The dense layer has one unit per entry of
    ``CFG.classes`` and is followed by a float32 sigmoid activation.

    Returns:
        An uncompiled ``tf.keras`` Sequential model named 'EfficientNetB4'.
    """
    backbone = efn.EfficientNetB4(
        include_top=False,
        input_shape=(CFG.img_size, CFG.img_size, 3),
        weights=None,
        pooling='avg')

    head = tf.keras.layers.Dense(
        len(CFG.classes),
        kernel_initializer=tf.keras.initializers.RandomUniform(seed=CFG.seed),
        bias_initializer=tf.keras.initializers.Zeros(),
        name='dense_top')

    # Explicit float32 dtype for the output activation.
    to_probabilities = tf.keras.layers.Activation('sigmoid', dtype='float32')

    return tf.keras.models.Sequential(
        [backbone, head, to_probabilities], name='EfficientNetB4')

In [None]:
def _serialize_image(path):
    """Load a JPEG file, resize it to the model input size and re-encode it.

    Returns:
        The re-encoded JPEG as ``bytes`` (via ``.numpy()``).
    """
    side = CFG.img_size
    raw = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw, channels=3)
    # The resized image is cast back to uint8 before JPEG encoding.
    resized = tf.cast(tf.image.resize(decoded, [side, side]), tf.uint8)
    return tf.image.encode_jpeg(resized).numpy()


def _serialize_sample(image, name):
    """Pack encoded image bytes and the image name into a serialized tf.train.Example.

    Args:
        image: encoded image bytes.
        name: image file name as bytes.

    Returns:
        The serialized example as a byte string.
    """
    def _bytes_feature(value):
        # Wrap a single byte string as a bytes_list feature.
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    features = tf.train.Features(feature={
        'image': _bytes_feature(image),
        'image_name': _bytes_feature(name)})
    return tf.train.Example(features=features).SerializeToString()


def serialize_test():
    """Write every image under ``CFG.root`` to ``test.tfrec``.

    Each image is resized and re-encoded by _serialize_image() and stored with
    its file name. Records are written one at a time, so only a single encoded
    image is held in memory at once.

    Records follow ``os.listdir(CFG.root)`` order; the submission cell relies
    on that same order to pair predictions with file names.
    """
    names = os.listdir(CFG.root)

    with tf.io.TFRecordWriter('test.tfrec') as writer:
        for name in tqdm(names, total=len(names)):
            image = _serialize_image(os.path.join(CFG.root, name))
            writer.write(_serialize_sample(image, name.encode()))

### Test images serialization
I expect this to take over 10 minutes during inference, but it saves much more time when submitting large models.

In [None]:
# Writes 'test.tfrec' to the working directory; the prediction cell picks it up via '*.tfrec'.
serialize_test()

### Inspect test images 

In [None]:
# Preview the first three test images side by side.
paths = os.listdir(CFG.root)[:3]
figure, axes = plt.subplots(1, 3, figsize=[20, 10])

for axis, path in zip(axes, paths):
    axis.imshow(plt.imread(os.path.join(CFG.root, path)))
    axis.axis('off')

plt.show()

## Run prediction

In [None]:
# Number of test images; used to size the prediction buffer and the TTA steps.
size = len(os.listdir(CFG.root))
filenames = tf.io.gfile.glob('*.tfrec')

if CFG.tta_steps > 0:
    # The stream is repeated so that every image is seen `tta_steps` times.
    # NOTE(review): augmented=False means all passes see identical inputs, so
    # the TTA average equals a single pass — confirm whether augmented=True
    # was intended here.
    dataset = get_dataset(filenames, repeated=True, augmented=False)
else:
    dataset = get_dataset(filenames)

# Running unweighted mean over all model checkpoints.
predicts = np.zeros((size, len(CFG.classes)))
paths = tf.io.gfile.glob('../input/modelweights/*.h5')
n_models = len(paths)
print(tf.io.gfile.glob('../input/efficientnet-keras-dataset/weights/*.h5'))
for path in tqdm(paths, total=n_models):
    print(path)
    with CFG.strategy.scope():
        model = get_model()
        model.load_weights(path)

    if CFG.tta_steps > 0:
        # `steps` must be an integer: the smallest number of batches that
        # covers all size * tta_steps samples (ceil division).
        n_samples = size * CFG.tta_steps
        steps = (n_samples + CFG.batch_size - 1) // CFG.batch_size

        predict = model.predict(dataset, steps=steps)[:n_samples] / n_models
        # Row i + j * size holds image i on pass j; the Fortran-order reshape
        # groups the passes of each image along axis 1 before averaging.
        predicts += np.mean(
            predict.reshape(size, CFG.tta_steps, len(CFG.classes), order='F'),
            axis=1)
    else:
        predicts += model.predict(dataset) / n_models

print(predicts)

## Threshold configuration


In [None]:
df_true = pd.read_csv(
    '../input/pp2021-kfold-tfrecords-0/train.csv', index_col='image').drop('healthy', axis=1)
df_pred = pd.read_csv(
    '../input/predict-weight/oof_predicts.csv', index_col='image')

# Put the ground-truth rows in the same image order as the out-of-fold predictions.
df_true = df_true.reindex(df_pred.index)

# NOTE(review): assumes both frames list their columns in CFG.classes order — confirm.
y_true = df_true.values
y_pred = df_pred.values

# Evaluate the per-class F1 score for each threshold 0.01, 0.02, ..., 0.99.
thresholds = np.arange(.01, 1., .01)
scores = []

for threshold in thresholds:
    metric = tfa.metrics.F1Score(
        num_classes=len(CFG.classes), 
        average=None, 
        threshold=threshold)
    metric.update_state(y_true, y_pred)
    scores.append(metric.result().numpy())
    
df = pd.DataFrame(columns=CFG.classes, data=scores, index=pd.Index(thresholds, name='threshold'))

# Baseline row for the default 0.5 threshold. It is selected by position
# because exact float-label lookup on an arange-generated index is fragile.
baseline = df.iloc[np.abs(thresholds - .5).argmin()]

# Find the maximum score for each class and the corresponding threshold.
thresholds = [df[x].idxmax() for x in CFG.classes]
scores = [df[x].max() for x in CFG.classes]

for x, best_threshold, best_score in zip(CFG.classes, thresholds, scores):
    print(f'{x}: {baseline[x]:.4f} >>> {best_score:.4f} ({best_threshold:.2f})')

print(f'\nmean score: {baseline.mean():.4f} >>> {np.mean(scores):.4f}')

## Create submission.csv


In [None]:
print(predicts)

# Binarize with the per-class thresholds found in the threshold-configuration
# cell (one value per entry of CFG.classes, broadcast over rows). Casting the
# probabilities to bool would mark every non-zero score as positive.
predict = predicts > np.asarray(thresholds)

# Join the names of the predicted classes for each image.
labels = [' '.join(CFG.classes[row]) for row in predict]

# An image with no predicted class is reported as 'healthy'.
labels = ['healthy' if ('healthy' in x or x == '') else x for x in labels]

# NOTE(review): image names come from a fresh os.listdir() call and are
# assumed to be in the same order used when test.tfrec was written — confirm.
df = pd.DataFrame({
    'image': os.listdir(CFG.root),
    'labels': labels})

df.to_csv('submission.csv', index=False)
display(df.head())