### Hello
Using TFRecords can really speed up training, but sometimes the ones provided in the original competition dataset are not enough, e.g.:
* you need to add more data, but external data is only available in .jpeg format
* you need to prepare augmented data instead of creating it on the fly
* you need to perform knowledge distillation on soft labels, etc.

A one-stop solution is the `from_tensor_slices` method, which makes it just as easy as feeding a dataframe into the network but results in a much slower runtime. E.g. training EfficientNetB4 on a dataset of 25K 512x512 images takes roughly 6-7x more time per epoch:

| | from tensor slices | from tfrecord files |
| --- | --- | --- |
| Max time per epoch | 996s | **132s** |
| Average time per epoch | 502s | **81s** | 

On the other hand, serializing this dataset to TFRecords can be done in just 30 lines of code. Taking only 10 minutes on GPU, this step can save you up to a few hours on TPU when training an ensemble or a large model.

In this short notebook I will take a dataset of soft targets to create TFRecords and then check whether this dataset is readable by feeding EfficientNetB0 with new TFRecords.

### Imports 

In [None]:
from sklearn.preprocessing import LabelBinarizer
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import random
import os

### Hardware configuration

In [None]:
# Pick a distribution strategy: try to attach to a TPU and fall back to the
# default (GPU/CPU) strategy when the TPU cannot be resolved or initialised.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPUv3-8')
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    STRATEGY = tf.distribute.experimental.TPUStrategy(tpu)
    BATCH_SIZE = 16 * STRATEGY.num_replicas_in_sync
except Exception:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit propagate instead of silently selecting the fallback path.
    print('Running on GPU/CPU')
    STRATEGY = tf.distribute.get_strategy()
    BATCH_SIZE = 8

print('Number of replicas:', STRATEGY.num_replicas_in_sync)
print('Using tensorflow %s' % tf.__version__)

In [None]:
def seed_everything(seed=0):
    """Seed the Python, NumPy and TensorFlow random generators with `seed`.

    Also exports `PYTHONHASHSEED` (set to the seed) and
    `TF_DETERMINISTIC_OPS` (set to '1') into the process environment, then
    prints the seed that was applied.
    """
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })
    print('Seeding everything with seed %.i' % seed)
    

# Sentinel passed to tf.data calls below (parallel reads, map parallelism,
# prefetch buffer) so that TensorFlow tunes those values at runtime.
AUTO = tf.data.experimental.AUTOTUNE
# NOTE(review): this rebinds BATCH_SIZE unconditionally, so the value of 8
# assigned in the GPU/CPU fallback branch of the hardware cell is overwritten
# (it becomes 16 * number of replicas on every device) - confirm this is intended.
BATCH_SIZE = 16 * STRATEGY.num_replicas_in_sync
# Side length, in pixels, that _serialize_image crops/pads every image to.
IMG_SIZE = 600

# Fix all random seeds before any data is processed.
seed_everything(42)

### Helper functions (serialization)

In [None]:
def _serialize_image(path):
    """Return the JPEG bytes of the image at `path`, cropped/padded to a square.

    The file is read and decoded as JPEG, centrally cropped or padded to
    IMG_SIZE x IMG_SIZE, re-encoded as JPEG and returned as the raw value of
    the resulting string tensor.
    """
    raw = tf.io.read_file(path)
    pixels = tf.image.resize_with_crop_or_pad(
        tf.image.decode_jpeg(raw), IMG_SIZE, IMG_SIZE)
    encoded = tf.image.encode_jpeg(pixels)
    return encoded.numpy()


def _serialize_sample(image, image_id, p0, p1, p2, p3, p4):
    """Pack one sample into a serialized `tf.train.Example`.

    `image` and `image_id` are stored as single-element bytes features and
    `p0`..`p4` as single-element float features, under keys of the same
    names. Returns the serialized protobuf string.
    """
    def _bytes_feature(value):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def _float_feature(value):
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

    feature = {
        'image': _bytes_feature(image),
        'image_id': _bytes_feature(image_id),
    }
    probabilities = (p0, p1, p2, p3, p4)
    for key, probability in zip(('p0', 'p1', 'p2', 'p3', 'p4'), probabilities):
        feature[key] = _float_feature(probability)

    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example.SerializeToString()


def serialize_fold(fold, name):
    """Write every row of `fold` to the TFRecord file `name + '.tfrec'`.

    Args:
        fold: DataFrame whose index holds the image file paths and whose rows
            unpack, in order, to `image_id, p0, p1, p2, p3, p4`.
        name: output path without the '.tfrec' extension.

    Each record is written as soon as it is serialized, so the encoded images
    of a fold are never all held in memory at once (the original collected
    them in a list and then wrote them with a side-effect list comprehension).
    """
    with tf.io.TFRecordWriter(name + '.tfrec') as writer:
        for path, (image_id, p0, p1, p2, p3, p4) in fold.iterrows():
            image = _serialize_image(path)
            writer.write(
                _serialize_sample(image, image_id.encode(), p0, p1, p2, p3, p4))

### Run converter
Here we combine soft predictions with hard labels (ground truth) and serialize images and new predictions to TFRecords.

In [None]:
# Blend model soft predictions with ground-truth hard labels (3 : 7) and
# re-index the soft-target tables by image file path, ready for serialization.
LABEL_COLUMNS = ['p0', 'p1', 'p2', 'p3', 'p4']


def _mix_labels(df_soft, df_hard, binarizer):
    """Return `(one_hot, mixed)` label arrays in the row order of `df_soft`.

    Hard labels are looked up in `df_hard` (indexed by `image_id`) using the
    `image_id` column of `df_soft`, so the two tables are paired by id rather
    than by row position; an id missing from `df_hard` raises KeyError.
    `mixed` is `3 * soft + 7 * one_hot`, with each row rescaled to sum to 1.
    """
    labels = df_hard.loc[df_soft['image_id'].values, 'label']
    one_hot = binarizer.transform(labels).astype('float32')
    mixed = 3 * df_soft[LABEL_COLUMNS].values + 7 * one_hot
    return one_hot, mixed / mixed.sum(axis=1, keepdims=True)


df_hard = pd.read_csv('../input/cassava-leaf-disease-merged/merged.csv')

df_hard_2020 = df_hard[df_hard['source'] == 2020].set_index('image_id')
df_hard_2019 = df_hard[df_hard['source'] == 2019].set_index('image_id')

df_soft_2020 = pd.read_csv('../input/cassava-leaf-disease-soft-targets-09-model/soft_targets_2020.csv')
df_soft_2019 = pd.read_csv('../input/cassava-leaf-disease-soft-targets-09-model/soft_targets_2019.csv')

# One binarizer fitted on all labels, so both years share the same
# class -> column mapping even if a class is absent from one of them.
label_binarizer = LabelBinarizer().fit(df_hard['label'])

hard_labels_2020, mixed_labels_2020 = _mix_labels(df_soft_2020, df_hard_2020, label_binarizer)
hard_labels_2019, mixed_labels_2019 = _mix_labels(df_soft_2019, df_hard_2019, label_binarizer)

root = '../input/cassava-leaf-disease-merged/train'

# serialize_fold reads the image path from the DataFrame index.
df_soft_2020.index = [os.path.join(root, x) for x in df_soft_2020['image_id']]
df_soft_2019.index = [os.path.join(root, x) for x in df_soft_2019['image_id']]

# Assign by explicit column list so the result does not depend on p0..p4
# being adjacent (and in order) in the CSV.
df_soft_2020.loc[:, LABEL_COLUMNS] = mixed_labels_2020
df_soft_2019.loc[:, LABEL_COLUMNS] = mixed_labels_2019

In [None]:
# Serialize each year's table into `n_folds` TFRecord files named
# './<year>/cldc-<fold number>-<sample count>.tfrec'; count_data_items later
# recovers the sample count from that file name.
n_folds = 50

for year, df_year in (('2020', df_soft_2020), ('2019', df_soft_2019)):
    out_dir = './' + year
    # exist_ok lets the cell be re-run without failing on an existing folder.
    os.makedirs(out_dir, exist_ok=True)

    # Split row positions instead of the DataFrame itself: same fold sizes,
    # without relying on NumPy being able to split a DataFrame.
    row_chunks = np.array_split(np.arange(len(df_year)), n_folds)
    for i, rows in tqdm(enumerate(row_chunks), total=n_folds):
        fold = df_year.iloc[rows]
        serialize_fold(fold, name='%s/cldc-%.2i-%.i' % (out_dir, i + 1, len(fold)))

### Helper functions (parsing & testing)

In [None]:
# Parsing schema for the records written by _serialize_sample: two scalar
# string features followed by five scalar float32 features.
feature_map = {
    **{key: tf.io.FixedLenFeature([], tf.string)
       for key in ('image', 'image_id')},
    **{key: tf.io.FixedLenFeature([], tf.float32)
       for key in ('p0', 'p1', 'p2', 'p3', 'p4')},
}


def count_data_items(filenames):
    """Sum the sample counts encoded in TFRecord file names.

    Each name is expected to end in '-<count>.tfrec'; the last six characters
    are dropped and the text after the final '-' is parsed as an integer.
    Returns the NumPy sum of those counts.
    """
    extension_length = len('.tfrec')
    counts = []
    for filename in filenames:
        stem = filename[:-extension_length]
        counts.append(int(stem.rsplit('-', 1)[-1]))
    return np.sum(counts)


def decode_image(image_data):
    """Decode JPEG-encoded `image_data` into a 3-channel image tensor."""
    return tf.image.decode_jpeg(image_data, channels=3)


def scale_image(image, target):
    """Cast `image` to float32 and divide by 255; `target` is passed through."""
    as_float = tf.cast(image, tf.float32)
    return as_float / 255., target


def read_tfrecord(example):
    """Parse one serialized example into an `(image, target)` pair.

    The record is parsed with `feature_map`; the image is decoded with
    `decode_image` and the target is the list `[p0, p1, p2, p3, p4]`.
    """
    parsed = tf.io.parse_single_example(example, feature_map)
    target = [parsed[key] for key in ('p0', 'p1', 'p2', 'p3', 'p4')]
    return decode_image(parsed['image']), target


def get_dataset(filenames):
    """Build the distributed input pipeline for the given TFRecord files.

    Records are read, parsed with `read_tfrecord`, scaled with `scale_image`,
    batched by BATCH_SIZE and prefetched; the result is distributed across
    replicas through STRATEGY.
    """
    pipeline = (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
        .map(read_tfrecord, num_parallel_calls=AUTO)
        .map(scale_image, num_parallel_calls=AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )
    return STRATEGY.experimental_distribute_dataset(pipeline)


def get_model():
    """Build and compile an untrained EfficientNetB0 classifier with 5 outputs.

    The backbone has no top, no pretrained weights, accepts images of any
    spatial size and ends in average pooling; a 5-unit softmax layer sits on
    top. Compiled with Adam, categorical cross-entropy and accuracy.
    """
    backbone = tf.keras.applications.EfficientNetB0(
        include_top=False,
        input_shape=(None, None, 3),
        weights=None,
        pooling='avg')
    classifier = tf.keras.layers.Dense(5, activation='softmax')

    model = tf.keras.models.Sequential([backbone, classifier])
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    return model

### Run test
Here we check that new TFRecords can be parsed and fed into our convnet

In [None]:
# Smoke test: read the first two 2020 TFRecord files returned by the glob and
# train for a single epoch to confirm the records parse and feed the network.
filenames = tf.io.gfile.glob('./2020/cldc-*.tfrec')[:2]
steps_per_epoch = count_data_items(filenames) // BATCH_SIZE
dataset = get_dataset(filenames)

# The model has to be created inside the strategy scope.
with STRATEGY.scope():
    model = get_model()
model.summary()

history = model.fit(dataset, steps_per_epoch=steps_per_epoch, epochs=1, verbose=2)

### Acknowledgements
* Many thanks to @dimitreoliveira for his **[amazing work](https://www.kaggle.com/dimitreoliveira/cassava-leaf-disease-training-with-tpu-v2-pods)** I learnt and took a lot from