Some of the basic ideas here are credited to [DimitreOliveira](https://www.kaggle.com/dimitreoliveira) and this [notebook](https://www.kaggle.com/dimitreoliveira/cassava-leaf-disease-stratified-tfrecords-256x256); please upvote that notebook as well if you like this one.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, KFold
from tqdm.autonotebook import tqdm
import random

In [None]:
# Location of the competition data, given relative to the working directory.
ROOT_DIR = '../input/cassava-leaf-disease-classification'
# Side lengths, in pixels, that images are resized to before being written out.
HEIGHT = WIDTH = 512

In [None]:
def seed_everything(seed=0):
    """Seed the Python, NumPy and TensorFlow random generators with ``seed``.

    Afterwards the ``PYTHONHASHSEED`` environment variable is set to the
    string form of ``seed`` and ``TF_DETERMINISTIC_OPS`` is set to ``'1'``.
    """
    # Same seed for every library-level generator, in the original order.
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })

# Seed handed to seed_everything() and reused as the split's random_state.
SEED = 0
# Number of stratified splits; one training TFRecord file is written per split.
N_TRAINFILES = 50
N_FOLDS = 5

seed_everything(seed=SEED)

In [None]:
# Load the training table; the trailing expression previews its first rows.
train_csv_path = os.path.join(ROOT_DIR, 'train.csv')
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Number of rows for each distinct value of the 'label' column.
label_counts = train_df['label'].value_counts()
label_counts

## Label distribution

In [None]:
# Bar chart of how many rows carry each label.
sns.countplot(data=train_df, x='label')

## Oversample

In [None]:
# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.

def _bytes_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` holding a BytesList.

    If ``value`` is an eager tensor it is first converted with ``.numpy()``,
    because BytesList won't unpack a string from an EagerTensor.
    """
    eager_tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a single float / double in a ``tf.train.Feature`` holding a FloatList."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` holding an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def serialize_example(image, image_name, target):
    """Pack one sample into a serialized ``tf.train.Example``.

    ``image`` and ``image_name`` are stored as bytes features under the keys
    ``'image'`` and ``'image_name'``; ``target`` is stored as an int64 feature
    under ``'target'``. Returns the result of ``SerializeToString()``.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(image),
        'image_name': _bytes_feature(image_name),
        'target': _int64_feature(target),
    })
    return tf.train.Example(features=features).SerializeToString()


### Write to tfrecord files

In [None]:
# Create file for training
def _encode_image(image_name):
    """Load one training image, resize it and return it as JPEG bytes.

    The image is read from ``ROOT_DIR/train_images/<image_name>``, resized to
    WIDTH x HEIGHT and re-encoded as JPEG at quality 100.

    Raises:
        OSError: if OpenCV cannot read or re-encode the image.
    """
    path = os.path.join(ROOT_DIR, 'train_images', image_name)
    image = cv2.imread(path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise OSError('Could not read image: {}'.format(path))
    # cv2.resize expects dsize as (width, height).
    image = cv2.resize(image, (WIDTH, HEIGHT))
    ok, encoded = cv2.imencode('.jpg', image, (cv2.IMWRITE_JPEG_QUALITY, 100))
    if not ok:
        raise OSError('Could not JPEG-encode image: {}'.format(path))
    # tobytes() replaces the deprecated ndarray.tostring() alias.
    return encoded.tobytes()


# Left defined (and empty) in case code outside this view refers to it;
# nothing is appended to it here.
idx_os = []

skf = StratifiedKFold(n_splits=N_TRAINFILES, shuffle=True, random_state=SEED)
folds = enumerate(skf.split(train_df, train_df['label']))
# The number of splits is known up front, so give tqdm a total to report against.
for fold, (train_idx, test_idx) in tqdm(folds, total=N_TRAINFILES):
    # split() yields positional indices, so select rows with iloc, not loc.
    fold_df = train_df.iloc[test_idx]

    # Serialized examples of the classes that also get their own file, kept so
    # each image is read and encoded only once per fold.
    # NOTE(review): label 3 gets no per-class file -- presumably because it is
    # the majority class; confirm against the label distribution.
    class_examples = {c: [] for c in [0, 1, 2, 4]}

    # One file per split, holding every sample of that split's held-out part.
    filename = 'ld_train{:02d}-{}.tfrec'.format(fold, len(test_idx))
    with tf.io.TFRecordWriter(filename) as writer:
        for image_name, label in zip(fold_df['image_id'], fold_df['label']):
            # Plain int so the value works both as a feature and as a dict key.
            target = int(label)
            example = serialize_example(
                _encode_image(image_name), image_name.encode(), target)
            writer.write(example)
            if target in class_examples:
                class_examples[target].append(example)

    # Additional per-class files for the same split, in the same sample order.
    for c, examples in class_examples.items():
        filename = 'class{}_{:02d}-{}.tfrec'.format(c, fold, len(examples))
        with tf.io.TFRecordWriter(filename) as writer:
            for example in examples:
                writer.write(example)