## Hello!

This is a fairly basic implementation of a **TensorFlow** and **Keras** pipeline for this competition, without any fancy parts yet. 
The main aim here is achieving training and inference speed as high as possible. 

Thus, we divide the pipeline into 3 steps:
1. Making our own `.tfrec` files, as they are not available in the competition data (see the corresponding **[dataset](https://www.kaggle.com/nickuzmenkov/plant-pathology-2021-train-tfrecords)**), and testing that everything works as expected. Here we use a 512x512 image size, but feel free to tweak the parameters to set your desired image size and number of files
2. Training 5 folds EfficientNetB4 on TPU in **[this notebook](https://www.kaggle.com/nickuzmenkov/pp2021-tpu-tf-training)**
3. Separate **[inference notebook](https://www.kaggle.com/nickuzmenkov/pp2021-tpu-tf-inference)** where we also do image pre-serialization to a single `.tfrec` file for maximum speed

Average time taken per step:

| Step # | Accelerator | Approximate time | Comment |
| --- | --- | --- | --- |
| 1 | CPU | 36 min | for 512x512 |
| 2 | TPU | 17 min | per fold |
| 3 | GPU | 20 min | for 5 models |

### So, let's go ahead

### Imports

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
from tqdm.notebook import tqdm
import tensorflow as tf
import pandas as pd
import numpy as np
import os

### Configuration
Define the desired number of output files (`subfolds`) and image size (`img_size`) here

In [None]:
class CFG:
    """Notebook-wide settings shared by the serialization and testing cells."""

    # ---- keep these ----
    # Distribution strategy currently active in this runtime.
    strategy = tf.distribute.get_strategy()
    # Global batch size: 16 samples for every replica in sync.
    batch_size = strategy.num_replicas_in_sync * 16
    # Directory the training images are read from.
    root = '../input/plant-pathology-2021-fgvc8/train_images'

    # ---- tweak these ----
    # Random seed for shuffling the initial dataframe.
    seed = 42
    # Side length (in pixels) of the square images written to the records.
    img_size = 512
    # Number of output files.
    subfolds = 64

### Helper functions (serialization)

In [None]:
def _serialize_image(path):
    """Read the JPEG at `path`, resize it and return it re-encoded as bytes.

    The image is decoded with 3 channels, resized to
    `CFG.img_size` x `CFG.img_size` and encoded back to JPEG.
    """
    target_size = [CFG.img_size, CFG.img_size]
    pixels = tf.image.decode_jpeg(tf.io.read_file(path), channels=3)
    # Cast the resized image back to uint8 before re-encoding it.
    resized = tf.cast(tf.image.resize(pixels, target_size), tf.uint8)
    return tf.image.encode_jpeg(resized).numpy()


def _serialize_sample(image, name, labels):
    """Pack one image, its name and its label flags into a serialized Example.

    Args:
        image: encoded image bytes, stored under 'image'.
        name: image name as bytes, stored under 'name'.
        labels: indexable collection of integer flags; position `i` is
            stored under the i-th key of `label_keys` below, so callers
            must supply the flags in exactly that order.

    Returns:
        The `tf.train.Example` serialized to a byte string.
    """
    def _bytes_feature(value):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def _int64_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    label_keys = (
        'cider_apple_rust',
        'complex',
        'frog_eye_leaf_spot',
        'healthy',
        'powdery_mildew',
        'rust',
        'scab',
    )

    feature = {
        'image': _bytes_feature(image),
        'name': _bytes_feature(name),
    }
    for position, key in enumerate(label_keys):
        feature[key] = _int64_feature(labels[position])

    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example.SerializeToString()


def serialize_fold(fold, filename, transform=None):
    """Write every row of `fold` into a single TFRecord file.

    Args:
        fold: DataFrame whose index holds image file names (relative to
            `CFG.root`) and whose row values are the label flags passed
            on to `_serialize_sample`.
        filename: output path without extension; '.tfrec' is appended.
        transform: currently ignored by this function; kept so existing
            callers that pass it keep working.
    """
    # Stream each record to disk as soon as it is built instead of
    # collecting the whole fold in a list first.
    with tf.io.TFRecordWriter(filename + '.tfrec') as writer:
        for path, labels in fold.iterrows():
            image = _serialize_image(os.path.join(CFG.root, path))
            writer.write(_serialize_sample(image, path.encode(), labels))

### Initial DataFrame preprocessing

In [None]:
# Load the competition labels, indexed by the 'image' column.
df = pd.read_csv('../input/plant-pathology-2021-fgvc8/train.csv', index_col='image')

# Every row holds a space-separated label string; turn it into a list.
df['labels'] = df['labels'].map(lambda text: text.split(' '))

# Encode the label lists as one indicator column per distinct label.
binarizer = MultiLabelBinarizer()
labels = binarizer.fit_transform(df['labels'].values)

# Rebuild the frame: same index, one column per binarizer class.
df = pd.DataFrame(labels, index=df.index, columns=binarizer.classes_)

# Keep a copy of the encoded labels next to the notebook outputs.
df.to_csv('train.csv')
display(df.head())

## 1. Serialization

In [None]:
# Shuffle with a fixed seed so the assignment of rows to files is repeatable.
df = shuffle(df, random_state=CFG.seed)

folder = 'train_tfrecords'
# exist_ok lets this cell be re-run without failing on the existing folder.
os.makedirs(folder, exist_ok=True)

# Split row positions (not the DataFrame itself) into CFG.subfolds nearly
# equal chunks, then select each chunk with .iloc.
chunks = np.array_split(np.arange(len(df)), CFG.subfolds)

for i, rows in enumerate(tqdm(chunks)):
    subfold = df.iloc[rows]
    # File name is '<file index>-<sample count>'; count_data_items reads
    # the sample count back from this name.
    filename = os.path.join(folder, f'{i:02d}-{len(subfold):03d}')
    serialize_fold(subfold, filename=filename)

### Helper functions (parsing & training)

In [None]:
# Parsing schema for the records written by `_serialize_sample`:
# two scalar string features followed by one scalar int64 flag per label.
feature_map = {
    **{
        key: tf.io.FixedLenFeature([], tf.string)
        for key in ('image', 'name')
    },
    **{
        key: tf.io.FixedLenFeature([], tf.int64)
        for key in (
            'cider_apple_rust',
            'complex',
            'frog_eye_leaf_spot',
            'healthy',
            'powdery_mildew',
            'rust',
            'scab',
        )
    },
}


def count_data_items(filenames):
    """Return the total number of samples encoded in the given file names.

    Each name is expected to end with '-<count>.tfrec': the last six
    characters are dropped and the text after the final '-' is parsed as
    an integer. The per-file counts are summed with `np.sum`.
    """
    counts = []
    for fname in filenames:
        stem = fname[:-6]  # strip the 6-character '.tfrec' suffix
        counts.append(int(stem.rsplit('-', 1)[-1]))
    return np.sum(counts)


def decode_image(image_data):
    """Decode JPEG bytes into a float32 image tensor with values divided by 255.

    The decoded image is reshaped to `[CFG.img_size, CFG.img_size, 3]`.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    pixels = tf.reshape(pixels, [CFG.img_size, CFG.img_size, 3])
    return tf.cast(pixels, tf.float32) / 255.


def read_tfrecord(example):
    """Parse one serialized example into an `(image, target)` pair.

    `image` is produced by `decode_image`; `target` is a list with one
    float32 value per label, in the order of `label_keys` below.
    """
    parsed = tf.io.parse_single_example(example, feature_map)
    label_keys = (
        'cider_apple_rust',
        'complex',
        'frog_eye_leaf_spot',
        'healthy',
        'powdery_mildew',
        'rust',
        'scab',
    )
    target = [tf.cast(parsed[key], tf.float32) for key in label_keys]
    return decode_image(parsed['image']), target


def get_dataset(filenames):
    """Build the input pipeline for the given TFRecord files.

    Records are read and parsed in parallel, grouped into batches of
    `CFG.batch_size`, prefetched, and finally distributed through
    `CFG.strategy`.
    """
    autotune = tf.data.experimental.AUTOTUNE
    pipeline = (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=autotune)
        .map(read_tfrecord, num_parallel_calls=autotune)
        .batch(CFG.batch_size)
        .prefetch(autotune)
    )
    return CFG.strategy.experimental_distribute_dataset(pipeline)


def get_model():
    """Build and compile the multi-label classifier used for the smoke test.

    The network is an EfficientNetB0 backbone without its top and without
    pre-trained weights (`weights=None`), using average pooling, followed
    by a Dense layer with one unit per label (every `feature_map` entry
    except 'image' and 'name') and a float32 sigmoid activation.

    Returns:
        The compiled `tf.keras` model.
    """
    # NOTE(review): Keras EfficientNet models reportedly include their own
    # input rescaling and expect pixel values in [0, 255], while
    # `decode_image` in this file divides by 255 — confirm the intended
    # input range before training for real.
    model = tf.keras.models.Sequential([
        tf.keras.applications.EfficientNetB0(
            include_top=False,
            input_shape=(CFG.img_size, CFG.img_size, 3),
            weights=None,
            pooling='avg'),
        tf.keras.layers.Dense(len(feature_map) - 2),
        tf.keras.layers.Activation('sigmoid', dtype='float32')
    ])

    # Each output is an independent sigmoid (an image may carry several
    # labels at once), so the loss must be binary cross-entropy;
    # categorical cross-entropy assumes one distribution over exclusive classes.
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'])

    return model

## 2. Testing
Replace this part with your training pipeline if you want to train in this notebook

In [None]:
# Smoke test: fit for one epoch on a single freshly written record file.
filenames = tf.io.gfile.glob('./train_tfrecords/*.tfrec')[:1]

# Sample count comes from the file name; only whole batches are counted.
steps_per_epoch = count_data_items(filenames) // CFG.batch_size
dataset = get_dataset(filenames)

# Create the model inside the strategy scope.
with CFG.strategy.scope():
    model = get_model()
model.summary()

history = model.fit(dataset, epochs=1, steps_per_epoch=steps_per_epoch, verbose=2)