### Hello

Considering **[this topic](https://www.kaggle.com/c/ranzcr-clip-catheter-line-classification/discussion/205243)** by @hengck23, training your models with annotations can be a somewhat tricky yet powerful approach, potentially boosting your solution by 1-2%. So I've decided to do some quick EDA (which I'm honestly not so good at) and to prepare for such a training pipeline by simply drawing the colourful annotations from `train_annotations.csv` onto the training images with the help of the **OpenCV** library and saving the results in `.jpg` and `.tfrec` formats.

My code is mostly ambiguous, sorry for that. I'm sure there are much fancier implementations of the things done here.

## Contents
1. [Quick EDA](#EDA)
2. [Saving annotated jpegs](#Jpegs)
3. [Going even further: TFRecords](#Tfrecords)

### Imports

In [None]:
%matplotlib inline
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import cv2
import ast
import os

In [None]:
# Side length, in pixels, of the square images produced by this notebook
# (used both for the OpenCV resize and for the TFRecord images).
IMG_SIZE = 600

# Output folders for the annotated JPEGs and for the TFRecord shards.
# exist_ok=True keeps re-running the cell harmless, while still raising
# FileExistsError if one of the paths exists but is not a directory.
os.makedirs('./train', exist_ok=True)
os.makedirs('./train_tfrecords', exist_ok=True)

# Target label names. The order matters: it is zipped with the colour lists
# to build `cmap`, and it matches the positional layout that
# `_serialize_sample` expects for its `proba` argument.
CLASSES = [
    'ETT - Abnormal',
    'ETT - Borderline',
    'ETT - Normal',
    'NGT - Abnormal',
    'NGT - Borderline',
    'NGT - Incompletely Imaged',
    'NGT - Normal',
    'CVC - Abnormal',
    'CVC - Borderline',
    'CVC - Normal',
    'Swan Ganz Catheter Present']

# Directory holding the competition files (train.csv, train_annotations.csv
# and the train/ image folder).
root = '../input/ranzcr-clip-catheter-line-classification'

# Per-study label table keyed by StudyInstanceUID; PatientID is dropped so
# that only the label columns remain for the EDA.
main_df = pd.read_csv(os.path.join(root, 'train.csv'), index_col='StudyInstanceUID')
main_df = main_df.drop('PatientID', axis=1)

# Annotation table keyed by StudyInstanceUID; a study may own several rows,
# so this index is not unique.
anno_df = pd.read_csv(
    os.path.join(root, 'train_annotations.csv'),
    index_col='StudyInstanceUID')

<a id='EDA'></a>
## 1. Quick EDA
Here we quickly look through annotation distributions across classes.

In [None]:
# Two horizontal bar charts drawn on the same axes: the number of positive
# labels per class (red) and the number of annotation rows per class (blue).
plt.figure(figsize=(12, 6))

positives_per_class = main_df.apply(pd.Series.value_counts).loc[1]
annotations_per_class = anno_df['label'].value_counts().reindex(CLASSES)

positives_per_class.plot.barh(label='not annotated', color='red')
annotations_per_class.plot.barh(label='annotated', color='blue')

# Presumably bound to a name only so the cell does not echo the Legend repr.
legend = plt.legend()

In [None]:
# Study UIDs that own at least one annotation row, in order of first appearance.
annotated_uids = anno_df.index.drop_duplicates()

# Label rows for exactly those studies, aligned for the count comparison below.
main_df_count = main_df.reindex(annotated_uids)

print(f'Found annotations for {anno_df.index.nunique()}/{len(main_df)} samples, from which:')


def _parse(raw):
    """One-hot encode an annotation label against ``CLASSES``.

    Args:
        raw: Label string taken from the ``label`` column of the annotations.

    Returns:
        A float array of length ``len(CLASSES)`` holding 1 at the position of
        ``raw`` and 0 elsewhere. A label that is not in ``CLASSES`` yields an
        all-zero vector, so callers can always accumulate the result with
        ``+=`` instead of receiving ``None``.
    """
    parsed = np.zeros((len(CLASSES),))
    if raw in CLASSES:
        parsed[CLASSES.index(raw)] = 1
    return parsed
        

# Per-study annotation counts: one row per annotated study, one column per
# class, initialised to zero and filled in below.
annotated_uids = main_df_count.index
anno_df_count = pd.DataFrame(
    np.zeros((len(annotated_uids), len(CLASSES))),
    index=annotated_uids,
    columns=CLASSES)

for uid in annotated_uids:
    # A list indexer always yields a Series of labels, whether the study has
    # one annotation row or several, so no special-casing is needed.
    for label in anno_df.loc[[uid], 'label']:
        anno_df_count.loc[uid] += _parse(label)

# incomplete: fewer annotations than labels for some class;
# complete: counts equal the labels for every class;
# overcomplete: more annotations than labels for some class.
print(f'  - {((anno_df_count < main_df_count).sum(axis=1) != 0).sum()} are incomplete')
print(f'  - {((anno_df_count != main_df_count).sum(axis=1) == 0).sum()} are complete')
print(f'  - {((anno_df_count > main_df_count).sum(axis=1) != 0).sum()} are overcomplete')

print('Saving filtered DataFrame.')

# Re-read the labels, this time keeping every column of train.csv, and
# restrict them to the studies that have annotations.
annotated_uids = anno_df.index.drop_duplicates()
main_df = pd.read_csv(os.path.join(root, 'train.csv'), index_col='StudyInstanceUID')
main_df = main_df.reindex(annotated_uids)

# Keep only the studies whose annotation counts equal their labels in every class.
is_fully_annotated = (main_df_count == anno_df_count).all(axis=1)
main_df = main_df[is_fully_annotated]
main_df.to_csv('train_annotated.csv')

As you can see, `9095` training images have annotations, and most of them are complete, i.e. only `24` images are missing some of the annotations for the catheters inserted. Another interesting thing to point out is that at least `1349` cases have **multiple catheters of the same class**. Lastly, we notice that `24 + 7723 + 1349 = 9096`, which is one more than `9095`. It seems that one image has both missing and extra annotations.

We save only `7723` fully-annotated samples with no extra annotations for further training. However, it's up to you to decide which images should be filtered out.


### What do annotated images look like?

Here we visualize some fully annotated samples (with a minimum of 4 catheters inserted). Note that each of 11 classes has its own unique colour (colours are extracted from `matplotlib` `gist_rainbow` colormap). For now we just plot annotations over the images without affecting them.

In [None]:
# One RGB colour per class, as 0-1 floats for matplotlib; the order follows CLASSES.
colors = [
    (1.0000, 0.0000, 0.1600),
    (1.0000, 0.3678, 0.0000),
    (1.0000, 0.9189, 0.0000),
    (0.5511, 1.0000, 0.0000),
    (0.0000, 1.0000, 0.0000),
    (0.0000, 1.0000, 0.5482),
    (0.0000, 0.9239, 1.0000),
    (0.0000, 0.3698, 1.0000),
    (0.1630, 0.0000, 1.0000),
    (0.7172, 0.0000, 1.0000),
    (1.0000, 0.0000, 0.7500),
]

# Class name -> colour lookup used when plotting.
cmap = dict(zip(CLASSES, colors))

def _parse_annotation(raw):
    annotation = ast.literal_eval(raw)
    annotation = np.array(annotation, dtype=np.int32)
    return annotation

In [None]:
# Preview up to six fully annotated studies that have at least four positive
# labels, drawing each annotation as a line over the image.
# Relies on the 2-D `_parse_annotation` and the 0-1 float `cmap` defined just
# above; both names are redefined later in the notebook for OpenCV.
plt.figure(figsize=[16, 8])

# Sum only the label columns: at this point main_df also carries the
# non-numeric PatientID column, which a row-wise sum cannot add to integers.
n_positive = main_df[CLASSES].sum(axis=1)

for i, index in enumerate(main_df[n_positive >= 4].index[:6]):
    plt.subplot(2, 3, i + 1)

    img = cv2.imread(os.path.join(root, 'train', index + '.jpg'), 1)

    # A list indexer always returns a DataFrame, so a study with a single
    # annotation row needs no separate branch.
    for target, annotation in anno_df.loc[[index]].values:
        annotation = _parse_annotation(annotation)
        plt.plot(annotation[:, 0], annotation[:, 1], label=target, color=cmap[target])

    plt.imshow(img)

    plt.legend(loc='lower right')
    plt.axis('off')

<a id='Jpegs'></a>
## 2. Saving annotated jpegs

Now we draw the annotations directly onto the images with the **OpenCV** `polylines` function and save the output in `.jpg` format.
### Annotation functions

In [None]:
# One colour per class as 0-255 integer triples for OpenCV drawing; the
# order follows CLASSES.
colors = [
    (255, 0, 40),
    (255, 93, 0),
    (255, 234, 0),
    (140, 255, 0),
    (0, 255, 0),
    (0, 255, 139),
    (0, 235, 255),
    (0, 94, 255),
    (41, 0, 255),
    (182, 0, 255),
    (255, 0, 191),
]

# Class name -> colour lookup used by `annotate`.
cmap = dict(zip(CLASSES, colors))


def _parse_annotation(raw):
    annotation = ast.literal_eval(raw)
    annotation = np.array(annotation, dtype=np.int32)
    annotation = np.expand_dims(annotation, axis=0)
    return annotation


def annotate(img, target, annotation):
    """Draw one annotation polyline onto ``img`` in the colour of its class.

    Args:
        img: Image array to draw on; it is modified in place.
        target: Class name used to look up the line colour in ``cmap``.
        annotation: Raw annotation string, parsed by ``_parse_annotation``.

    Returns:
        The same ``img`` object, with an open, 10-pixel-thick polyline drawn.

    NOTE(review): images loaded with ``cv2.imread`` are in BGR channel order,
    so the ``cmap`` triples are presumably interpreted as (B, G, R) here --
    confirm whether the resulting colours are the intended ones.
    """
    polyline = _parse_annotation(annotation)
    cv2.polylines(img, polyline, isClosed=False, color=cmap[target], thickness=10)
    return img

### Run annotation

In [None]:
# Draw every annotation of each retained study onto its image, resize the
# result to IMG_SIZE x IMG_SIZE and save it under ./train.
for index in tqdm(main_df.index, total=len(main_df)):

    src_path = os.path.join(root, 'train', index + '.jpg')
    img = cv2.imread(src_path, 1)
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise FileNotFoundError(f'Could not read image: {src_path}')

    # A list indexer always returns a DataFrame, so a study with a single
    # annotation row needs no separate branch.
    for target, annotation in anno_df.loc[[index]].values:
        img = annotate(img, target, annotation)

    # Annotations are drawn at full resolution first, then scaled with the image.
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))

    dst_path = os.path.join('./train', index + '.jpg')
    # cv2.imwrite reports failure through its boolean return value.
    if not cv2.imwrite(dst_path, img):
        raise OSError(f'Could not write image: {dst_path}')

<a id='Tfrecords'></a>
## 3. Going even further: TFRecords
If you are a **TensorFlow** user, working with `.tfrec` files can significantly boost performance. Here we serialize our previous results and save the output in `.tfrec` format.
### Serialization functions

In [None]:
def _serialize_image(path):
    """Load a JPEG, resize it to IMG_SIZE x IMG_SIZE and re-encode it as JPEG.

    Args:
        path: Location of the JPEG file to read.

    Returns:
        The value of ``.numpy()`` on the re-encoded JPEG tensor (used as the
        bytes payload of the ``image`` feature).
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [IMG_SIZE, IMG_SIZE])
    # The resized tensor is cast back to uint8 before JPEG encoding.
    as_uint8 = tf.cast(resized, tf.uint8)
    encoded = tf.image.encode_jpeg(as_uint8)
    return encoded.numpy()


def _serialize_sample(uid, image, proba):
    """Pack one study into a serialized ``tf.train.Example``.

    Args:
        uid: Study UID as bytes, stored under ``StudyInstanceUID``.
        image: Encoded image bytes, stored under ``image``.
        proba: Indexable holding the 11 integer labels at positions 0-10, in
            the order of the label names listed below.

    Returns:
        The serialized ``tf.train.Example``.
    """
    def _bytes_feature(value):
        # Wrap a single bytes value as a Feature.
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def _int64_feature(value):
        # Wrap a single integer value as a Feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    label_names = (
        'ETT - Abnormal',
        'ETT - Borderline',
        'ETT - Normal',
        'NGT - Abnormal',
        'NGT - Borderline',
        'NGT - Incompletely Imaged',
        'NGT - Normal',
        'CVC - Abnormal',
        'CVC - Borderline',
        'CVC - Normal',
        'Swan Ganz Catheter Present',
    )

    feature = {
        'image': _bytes_feature(image),
        'StudyInstanceUID': _bytes_feature(uid),
    }
    for position, label_name in enumerate(label_names):
        feature[label_name] = _int64_feature(proba[position])

    sample = tf.train.Example(features=tf.train.Features(feature=feature))
    return sample.SerializeToString()


def serialize_fold(fold, name):
    """Serialize every study of ``fold`` into a single TFRecord file.

    Args:
        fold: DataFrame indexed by study UID that contains the ``CLASSES``
            label columns (extra columns are ignored). The matching images
            are read from ``./train/<uid>.jpg``.
        name: Output path without extension; ``.tfrec`` is appended.
    """
    samples = []

    for uid, row in fold.iterrows():
        # Select the labels by column name and convert them to plain ints:
        # the row may carry extra columns, and positional integer indexing
        # on a label-indexed Series is deprecated in recent pandas.
        proba = [int(row[label]) for label in CLASSES]
        samples.append(_serialize_sample(
            uid.encode(),
            _serialize_image(os.path.join('./train', uid + '.jpg')),
            proba))

    # The file is opened only after every sample has been serialized.
    with tf.io.TFRecordWriter(name + '.tfrec') as writer:
        for sample in samples:
            writer.write(sample)

### Run serialization

In [None]:
# Number of TFRecord shards to write.
n_files = 16

# Split row positions rather than the DataFrame itself: np.array_split on a
# DataFrame goes through DataFrame.swapaxes, which recent pandas deprecates.
# The resulting chunks cover the same contiguous row ranges.
shards = np.array_split(np.arange(len(main_df)), n_files)

for i, positions in tqdm(enumerate(shards), total=n_files):
    fold = main_df.iloc[positions]
    # File name pattern: <shard number>-<number of samples in the shard>.
    serialize_fold(fold, name='./train_tfrecords/%.2i-%.3i' % (i, len(fold)))

In [None]:
# Archive each output folder into a zip of the same name.
# zip flags: -r recurse into the folder, -m move the files into the archive
# (the originals are deleted afterwards), -qq suppress output.
!zip -rm -qq train train
!zip -rm -qq train_tfrecords train_tfrecords