Forked from [RANZCR CLiP - GroupKFold with TFRecords](https://www.kaggle.com/nickuzmenkov/ranzcr-clip-groupkfold-with-tfrecords)

This version is for creating tfrecords with annotated images.

### Hello!

As mentioned numerous times on the competition forum, properly organized folds satisfy the following requirements:
* all folds must contain nearly the same number of samples
* label-wise distributions must be kept close to those of the entire dataset, as there are some extremely rare cases (e.g. `ETT - Abnormal`)
* no `PatientID` may appear in more than one fold, to prevent data leaks

I've found two solutions so far: one by @underwearfitting **[here](https://www.kaggle.com/underwearfitting/how-to-properly-split-folds)** and another by @virilo **[here](https://www.kaggle.com/virilo/ranzcr-clip-stratified-kfold-to-team-up-v3)**.

If you are a **TensorFlow** user, then once you have these splits the easiest way to start your data workflow is `tf.data.Dataset.from_tensor_slices`, which makes it just as easy as feeding a dataframe into the network but results in a much longer runtime.

On the other hand, serializing this dataset to TFRecords can be done in just 10 minutes without any acceleration, and this step can save you up to a few hours on TPU when training an ensemble or a large model.

In this short notebook I take the splits made by @virilo and re-serialize the original dataset to TFRecords of `IMG_SIZE`x`IMG_SIZE` images (900x900 with the settings below), grouped into 5 folds with each fold written as 5 shard files. You can then adjust `IMG_SIZE` and replace the placeholder at the end of the notebook with your training pipeline.

### Imports

In [None]:
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import IPython.display as display
import matplotlib.pyplot as plt
import ast
import cv2

In [None]:
# Notebook-wide configuration. Later cells read these names directly.

# Distribution strategy that is current when this cell runs.
STRATEGY = tf.distribute.get_strategy()    
BATCH_SIZE = 16  # batch size for the dataset helpers in the training pipeline
IMG_SIZE = 900  # side length in pixels of the square images written to the TFRecords
SEED = 42  # random seed shared by the notebook's randomised steps

# Line colour for each annotation label. The tuples are handed to cv2.line on
# an image converted with cv2.COLOR_GRAY2RGB and displayed with plt.imshow, so
# they are read as (R, G, B).
COLOR_MAP = {'ETT - Abnormal': (255, 0, 0),
             'ETT - Borderline': (0, 255, 0),
             'ETT - Normal': (0, 0, 255),
             'NGT - Abnormal': (255, 255, 0),
             'NGT - Borderline': (255, 0, 255),
             'NGT - Incompletely Imaged': (0, 255, 255),
             'NGT - Normal': (128, 0, 0),
             'CVC - Abnormal': (0, 128, 0),
             'CVC - Borderline': (0, 0, 128),
             'CVC - Normal': (128, 128, 0),
             'Swan Ganz Catheter Present': (128, 0, 128),
            }

# Annotation table; this notebook reads its `StudyInstanceUID`, `label` and
# `data` columns (`data` is parsed with ast.literal_eval into a point list).
train_annotations = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/train_annotations.csv')
annotation_thickness = 1  # forwarded as `thickness` to cv2.line
print('Using tensorflow %s' % tf.__version__)
# CLAHE operator applied to each grayscale image before the RGB conversion.
clahe = cv2.createCLAHE(clipLimit=8, tileGridSize=(8, 8))

In [None]:
# Preview cell: draw the annotation polylines of one sample study on top of
# its contrast-enhanced image to check colours and line thickness visually.
file_name = '1.2.826.0.1.3680043.8.498.83331936392921199432218327504041001669'
query_string = f"StudyInstanceUID == '{file_name}'"
df = train_annotations.query(query_string)

image_path = f'../input/ranzcr-clip-catheter-line-classification/train/{file_name}.jpg'
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
if image is None:
    # cv2.imread reports an unreadable/missing file by returning None.
    raise FileNotFoundError(f'Could not read image: {image_path}')

# Remove every row and column that is entirely black.
mask = image > 0
keep_rows = mask.any(1)
keep_cols = mask.any(0)
image = image[np.ix_(keep_rows, keep_cols)]

# The annotation points index pixels of the image as stored on disk, so they
# must be shifted by the number of rows/columns removed before them.
# lookup[k] is the new index of original row/column k; positions that fall in
# a removed stripe snap to the nearest kept index before them (or 0).
row_lookup = np.maximum(np.cumsum(keep_rows) - 1, 0)
col_lookup = np.maximum(np.cumsum(keep_cols) - 1, 0)

image = clahe.apply(image)
image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
for _, row in df.iterrows():
    color = COLOR_MAP[row["label"]]
    # reshape(-1, 2) keeps an empty point list iterable as zero (x, y) pairs.
    points = np.asarray(ast.literal_eval(row["data"]), dtype=int).reshape(-1, 2)
    xs = col_lookup[np.clip(points[:, 0], 0, len(col_lookup) - 1)]
    ys = row_lookup[np.clip(points[:, 1], 0, len(row_lookup) - 1)]
    shifted = list(zip(xs.tolist(), ys.tolist()))  # plain Python ints for OpenCV
    # Connect consecutive points; a single point draws nothing.
    for prev_pt, pt in zip(shifted, shifted[1:]):
        image = cv2.line(image, pt, prev_pt, color, thickness=annotation_thickness)

plt.imshow(image)
plt.grid(False)


### Serialization functions

In [None]:
def _encode_square_jpeg(pixels, side):
    """Centre-crop ``pixels`` to ``side`` x ``side``, resize to IMG_SIZE, return JPEG bytes."""
    squared = tf.image.resize_with_crop_or_pad(pixels, target_height=side, target_width=side)
    # tf.image.resize returns float32, so cast back to uint8 before encoding.
    resized = tf.cast(tf.image.resize(squared, [IMG_SIZE, IMG_SIZE]), tf.uint8)
    return tf.image.encode_jpeg(resized).numpy()


def _serialize_image(path, df):
    """Load one image and return it as two JPEG byte strings.

    The image is read as grayscale, stripped of all-black rows and columns,
    contrast-enhanced with the module-level ``clahe`` operator and converted
    to RGB. One copy then gets the annotation polylines drawn on it. Both
    copies are centre-cropped to a square whose side is the shorter image
    dimension and resized to ``IMG_SIZE`` x ``IMG_SIZE``.

    Args:
        path: Location of the image file.
        df: Annotation rows for this image; the ``label`` column selects the
            colour from ``COLOR_MAP`` and the ``data`` column holds the point
            list as a Python-literal string.

    Returns:
        Tuple ``(annotated_jpeg, plain_jpeg)`` of encoded JPEG bytes.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``path``.
    """
    image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports an unreadable/missing file by returning None.
        raise FileNotFoundError(f'Could not read image: {path}')

    # Remove every row and column that is entirely black.
    mask = image > 0
    keep_rows = mask.any(1)
    keep_cols = mask.any(0)
    image = image[np.ix_(keep_rows, keep_cols)]

    # The annotation points index pixels of the image as stored on disk, so
    # they must be shifted by the number of rows/columns removed before them.
    # lookup[k] is the new index of original row/column k; positions inside a
    # removed stripe snap to the nearest kept index before them (or 0).
    row_lookup = np.maximum(np.cumsum(keep_rows) - 1, 0)
    col_lookup = np.maximum(np.cumsum(keep_cols) - 1, 0)

    image = clahe.apply(image)
    image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    # cv2.line draws into the array it is given, so keep an untouched copy.
    image_orig = image.copy()

    for _, row in df.iterrows():
        color = COLOR_MAP[row["label"]]
        # reshape(-1, 2) keeps an empty point list iterable as zero (x, y) pairs.
        points = np.asarray(ast.literal_eval(row["data"]), dtype=int).reshape(-1, 2)
        xs = col_lookup[np.clip(points[:, 0], 0, len(col_lookup) - 1)]
        ys = row_lookup[np.clip(points[:, 1], 0, len(row_lookup) - 1)]
        shifted = list(zip(xs.tolist(), ys.tolist()))  # plain Python ints for OpenCV
        for prev_pt, pt in zip(shifted, shifted[1:]):
            image = cv2.line(image, pt, prev_pt, color, thickness=annotation_thickness)

    # The shorter side is the target, so the square step only crops; taking
    # max(image.shape) here instead would zero-pad the shorter dimension.
    target_size = min(image.shape[:2])
    return (_encode_square_jpeg(image, target_size),
            _encode_square_jpeg(image_orig, target_size))


def _serialize_sample(uid, image, image_orig, proba):
    """Pack one study into a serialized ``tf.train.Example``.

    Args:
        uid: Study identifier as bytes, stored under ``StudyInstanceUID``.
        image: Encoded image bytes stored under ``image_annotation``.
        image_orig: Encoded image bytes stored under ``image``.
        proba: Indexable of at least 14 integers: the eleven label values in
            the order listed below, then ``is_annotate``, ``width``, ``height``.

    Returns:
        The serialized example as a byte string.
    """
    def bytes_feature(payload):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[payload]))

    def int64_feature(number):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[number]))

    # Position in this tuple == index into `proba`.
    integer_keys = (
        'ETT - Abnormal',
        'ETT - Borderline',
        'ETT - Normal',
        'NGT - Abnormal',
        'NGT - Borderline',
        'NGT - Incompletely Imaged',
        'NGT - Normal',
        'CVC - Abnormal',
        'CVC - Borderline',
        'CVC - Normal',
        'Swan Ganz Catheter Present',
        'is_annotate',
        'width',
        'height',
    )

    # Note the naming: the plain picture goes under 'image', the one carrying
    # the drawn annotations under 'image_annotation'.
    feature = {
        'image': bytes_feature(image_orig),
        'image_annotation': bytes_feature(image),
        'StudyInstanceUID': bytes_feature(uid),
    }
    for position, key in enumerate(integer_keys):
        feature[key] = int64_feature(proba[position])

    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example.SerializeToString()


def serialize_fold(fold, name):
    """Write every annotated study of ``fold`` to ``<name>.tfrec``.

    Studies without rows in the module-level ``train_annotations`` table are
    skipped. The number of records written is printed at the end.

    Args:
        fold: DataFrame indexed by StudyInstanceUID. The first column of each
            row is ignored; the following columns are forwarded in order to
            ``_serialize_sample`` as ``proba`` (which reads the first 14).
        name: Output path without the ``.tfrec`` extension.
    """
    # Group the relevant annotations once instead of scanning the whole
    # annotation table again for every study in the fold.
    relevant = train_annotations[train_annotations['StudyInstanceUID'].isin(fold.index)]
    annotations_by_uid = {uid: rows for uid, rows in relevant.groupby('StudyInstanceUID')}

    n_written = 0
    # Records are streamed to disk as they are produced, so the encoded
    # images of a whole shard never have to sit in memory at the same time.
    with tf.io.TFRecordWriter(name + '.tfrec') as writer:
        for uid, (_, *proba) in fold.iterrows():
            annotations = annotations_by_uid.get(uid)
            if annotations is None:
                continue  # study has no annotations
            image_path = f'../input/ranzcr-clip-catheter-line-classification/train/{uid}.jpg'
            writer.write(_serialize_sample(
                uid.encode(),
                *_serialize_image(image_path, annotations),
                proba))
            n_written += 1
    print(n_written)

### Here we check, again:
1. Whether folds have a similar number of samples
2. Whether no patient occurs in multiple folds
3. Whether label-wise distributions across folds are close to those of the entire dataset

In [None]:
# Load the label table and the fold assignment, both keyed by StudyInstanceUID.
df = pd.read_csv('../input/ranzcr-train-fix/train.more.fix.csv', 
                 index_col='StudyInstanceUID')
df_fold = pd.read_csv('../input/ranzcr-clip-stratified-kfold-to-team-up-v3/stratified_5_folds.csv', 
                      index_col='StudyInstanceUID')
# Put the fold table in the same row order as `df` so that boolean masks built
# from df_fold['fold'] can be applied to `df` directly.
df_fold = df_fold.reindex(df.index)

# reindex fills studies that are absent from the fold file with NaN; those
# rows match no fold below and would otherwise disappear without notice.
n_unassigned = int(df_fold['fold'].isna().sum())
if n_unassigned:
    print(f'Warning: {n_unassigned} studies have no fold assignment and will be skipped')

for i in range(5):
    n_samples = int((df_fold['fold'] == i).sum())
    print(f'Fold {i}. Number of samples: {n_samples}')

In [None]:
# A patient leaks as soon as their studies land in two or more folds.
# Intersecting the PatientID sets of all five folds would only count patients
# present in every fold, so count the distinct folds per patient instead.
# `df_fold` has the same row order as `df`, hence the positional grouping key.
folds_per_patient = df_fold['fold'].groupby(df['PatientID'].to_numpy()).nunique()
n_leaking_patients = int((folds_per_patient > 1).sum())
print('Number of patients occuring in multiple folds: %.i' % n_leaking_patients)

In [None]:
# PatientID is not needed past this point; the remaining columns are the ones
# compared across folds here.
df = df.drop(columns='PatientID')


def value_counts(x):
    """Return the normalised frequency of every distinct value in ``x``."""
    return pd.Series.value_counts(x, normalize=True)


# For every column take the `0` row of the frequency table, i.e. the share of
# entries equal to 0: first over the whole table, then inside each fold.
occurence_columns = {'origin': df.apply(value_counts).loc[0]}
for fold_id in range(5):
    in_fold = df[df_fold['fold'] == fold_id]
    occurence_columns[f'fold_{fold_id}'] = in_fold.apply(value_counts).loc[0]

df_occurence = pd.DataFrame(occurence_columns)

bar = df_occurence.plot.barh(figsize=[14, 14], colormap='plasma')

### Run serialization

In [None]:
# Number of TFRecord files written per fold.
n_subfolds = 5

for i in tqdm(range(5)):
    # Shuffle with the notebook seed so that a re-run produces the same shards.
    shuffled = df[df_fold['fold'] == i].sample(frac=1, random_state=SEED)
    # Split row positions rather than the DataFrame itself: np.array_split on
    # a DataFrame goes through DataFrame.swapaxes, which recent pandas
    # versions deprecate. The chunk sizes are the same either way.
    for j, positions in enumerate(np.array_split(np.arange(len(shuffled)), n_subfolds)):
        serialize_fold(shuffled.iloc[positions], name=f"{i}_{j}")

### Data workflow functions

In [None]:
# feature_map = {
#     'image': tf.io.FixedLenFeature([], tf.string),
#     'StudyInstanceUID': tf.io.FixedLenFeature([], tf.string),  
#     'ETT - Abnormal': tf.io.FixedLenFeature([], tf.int64),
#     'ETT - Borderline': tf.io.FixedLenFeature([], tf.int64),
#     'ETT - Normal': tf.io.FixedLenFeature([], tf.int64),
#     'NGT - Abnormal': tf.io.FixedLenFeature([], tf.int64),
#     'NGT - Borderline': tf.io.FixedLenFeature([], tf.int64),
#     'NGT - Incompletely Imaged': tf.io.FixedLenFeature([], tf.int64),
#     'NGT - Normal': tf.io.FixedLenFeature([], tf.int64),
#     'CVC - Abnormal': tf.io.FixedLenFeature([], tf.int64),
#     'CVC - Borderline': tf.io.FixedLenFeature([], tf.int64),
#     'CVC - Normal': tf.io.FixedLenFeature([], tf.int64),
#     'Swan Ganz Catheter Present': tf.io.FixedLenFeature([], tf.int64)}


# def count_data_items(filenames):
#     return np.sum([int(x[:-6].split('-')[-1]) for x in filenames])


# def decode_image(image_data):
#     image = tf.image.decode_jpeg(image_data, channels=1)
#     image = tf.reshape(image, [IMG_SIZE, IMG_SIZE, 1])
#     return image


# def scale_image(image, target):
#     image = tf.cast(image, tf.float32) / 255.
#     return image, target


# def read_tfrecord(example):
#     example = tf.io.parse_single_example(example, feature_map)
#     image = decode_image(example['image'])
#     target = [
#         example['ETT - Abnormal'],
#         example['ETT - Borderline'],
#         example['ETT - Normal'],
#         example['NGT - Abnormal'],
#         example['NGT - Borderline'],
#         example['NGT - Incompletely Imaged'],
#         example['NGT - Normal'],
#         example['CVC - Abnormal'],
#         example['CVC - Borderline'],
#         example['CVC - Normal'],
#         example['Swan Ganz Catheter Present']]
#     return image, target


# def data_augment(image, target):
#     image = tf.image.random_flip_left_right(image, seed=SEED)
#     image = tf.image.random_flip_up_down(image, seed=SEED)
#     return image, target


# def get_dataset(filenames, shuffled=False, repeated=False, 
#                 cached=False, augmented=False, distributed=True):
#     auto = tf.data.experimental.AUTOTUNE
#     dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=auto)
#     dataset = dataset.map(read_tfrecord, num_parallel_calls=auto)
#     if augmented:
#         dataset = dataset.map(data_augment, num_parallel_calls=auto)
#     dataset = dataset.map(scale_image, num_parallel_calls=auto)
#     if shuffled:
#         dataset = dataset.shuffle(2048, seed=SEED)
#     if repeated:
#         dataset = dataset.repeat()
#     dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
#     if cached:
#         dataset = dataset.cache()
#     dataset = dataset.prefetch(auto)
#     if distributed:
#         dataset = STRATEGY.experimental_distribute_dataset(dataset)
#     return dataset


# def get_model():
#     model = tf.keras.models.Sequential([
#         tf.keras.applications.EfficientNetB0(
#             include_top=False,
#             input_shape=(None, None, 1),
#             weights=None,
#             pooling='avg'),
#         tf.keras.layers.Dense(11, activation='sigmoid')
#     ])
#     model.compile(
#         optimizer='adam',
#         loss='binary_crossentropy',
#         metrics=tf.keras.metrics.AUC(multi_label=True))

#     return model

### Train placeholder
Just making sure that the new TFRecords are readable.

Place your training pipeline here if you train your model in this notebook.

In [None]:
# kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)
# folders = os.listdir('./')

# for i, (train_index, val_index) in enumerate(kfold.split(folders)):
    
#     if i > 0:
#         break

#     tf.keras.backend.clear_session()
    
#     train_filenames = []
#     for j in train_index:
#         train_filenames += tf.io.gfile.glob(os.path.join('./', folders[j], '*.tfrec'))
#     np.random.shuffle(train_filenames)
    
#     val_filenames = []
#     for j in val_index:
#         val_filenames += tf.io.gfile.glob(os.path.join('./', folders[j], '*.tfrec'))
#     np.random.shuffle(val_filenames)
    
#     train_filenames = train_filenames[:1]
#     val_filenames = val_filenames[:1]
        
#     train_dataset = get_dataset(train_filenames, shuffled=True, augmented=True, repeated=True)
#     val_dataset = get_dataset(val_filenames, shuffled=False, cached=True)

#     steps_per_epoch = count_data_items(train_filenames) // BATCH_SIZE
#     validation_steps = count_data_items(val_filenames) // BATCH_SIZE
    
#     with STRATEGY.scope():
#         model = get_model()
        
#         model.compile(
#             loss='binary_crossentropy',
#             optimizer='adam',
#             metrics=[tf.keras.metrics.AUC(multi_label=True)])
    
#     history = model.fit(
#         train_dataset,
#         steps_per_epoch=steps_per_epoch,
#         epochs=1,
#         validation_data=val_dataset,
#         validation_steps=validation_steps,
#         verbose=2)