![TF](https://www.gstatic.com/devrel-devsite/prod/ve312520032ba2ac0c4d23f7b46fc670cbbe051886a2d1f04563a5e4768ad9787/tensorflow/images/lockup.svg)

# TFRecords
This notebook creates TFRecords of the SIIM COVID-19 X-rays for image classification. Object detection is also part of the task, but a separate set of TFRecords will be created for that. Images are downscaled so that the longest edge is at most the target size while preserving aspect ratio (the default is IMAGE_SIZE = 1024; one set of TFRecords is written for each value in IMG_SIZES). Each dataset is split into K files (for K-fold cross-validation), and the data is stratified on multiple features from [this notebook](https://www.kaggle.com/mistag/k-folds-cv-stratification-on-multiple-features).

In [None]:
# need some additional libraries to process compressed dicom data
!conda install '../input/pydicom-conda-helper/libjpeg-turbo-2.1.0-h7f98852_0.tar.bz2' -c conda-forge -y
!conda install '../input/pydicom-conda-helper/libgcc-ng-9.3.0-h2828fa1_19.tar.bz2' -c conda-forge -y
!conda install '../input/pydicom-conda-helper/gdcm-2.8.9-py37h500ead1_1.tar.bz2' -c conda-forge -y
!conda install '../input/pydicom-conda-helper/conda-4.10.1-py37h89c1867_0.tar.bz2' -c conda-forge -y
!conda install '../input/pydicom-conda-helper/certifi-2020.12.5-py37h89c1867_1.tar.bz2' -c conda-forge -y
!conda install '../input/pydicom-conda-helper/openssl-1.1.1k-h7f98852_0.tar.bz2' -c conda-forge -y

In [None]:
from PIL import Image, ImageFont, ImageDraw
import pydicom
from pydicom import dcmread
from pydicom.pixel_data_handlers.util import apply_voi_lut
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import hashlib
import os
import glob
from io import BytesIO
import cv2
import contextlib2
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

Load the stratified dataset:

In [None]:
# Load the pre-stratified study table produced by the stratification notebook.
# Later cells read the `Study` and `Group` columns plus one column per class
# label from it.
# NOTE(review): read_pickle unpickles arbitrary Python objects - only point it
# at files from a trusted source.
df = pd.read_pickle('../input/k-folds-cv-stratification-on-multiple-features/dataset.pkl')
# Show a few random rows as a quick sanity check of the loaded table.
df.sample(7)

# Helper functions
The images are digitized at 12-14 bit resolution, so converting them to 8-bit JPEG loses quite a bit of information. To preserve all image information, the images could be saved as 16-bit PNG. Here we instead apply Contrast Limited Adaptive Histogram Equalization (CLAHE) before JPEG encoding. The same pre-processing step must then also be applied at inference time.

In [None]:
IMAGE_SIZE = 1024 # change this to desired value
CLIP_LIMIT = 2.
GRID_SIZE = (8,8)

def read_image(fname, target_size=IMAGE_SIZE, use_clahe=True):
    """Read a DICOM file and return it as an 8-bit grayscale PIL image.

    The pixel data is passed through the VOI LUT, min-max scaled to 0..255,
    inverted when the file is stored as MONOCHROME1, optionally equalized
    with CLAHE (CLIP_LIMIT / GRID_SIZE) and finally downscaled so that the
    longest edge does not exceed ``target_size``. Images that are already
    small enough are never upscaled.

    Args:
        fname: Path of the DICOM file to read.
        target_size: Maximum length in pixels of the longest image edge.
        use_clahe: Apply CLAHE to the 8-bit image when True.

    Returns:
        A tuple ``(img, org_size)`` where ``img`` is a mode 'L' PIL image and
        ``org_size`` is the PIL ``(width, height)`` size before downscaling.
    """
    ds = dcmread(fname)
    data = apply_voi_lut(ds.pixel_array, ds)
    # Do the scaling in float so that subtracting the minimum cannot wrap
    # around in a narrow (signed) integer dtype.
    im = np.asarray(data, dtype=np.float64)
    im = im - im.min()
    peak = im.max()
    if peak > 0:
        im = 255. * im / peak
    # A constant image has peak == 0; it stays all zeros rather than
    # becoming NaN through a division by zero.
    if ds.PhotometricInterpretation == "MONOCHROME1": # check for inverted image
        im = 255. - im
    pixels = im.astype('uint8')
    if use_clahe:
        clahe = cv2.createCLAHE(clipLimit=CLIP_LIMIT, tileGridSize=GRID_SIZE)
        pixels = clahe.apply(pixels).astype('uint8')
    img = Image.fromarray(pixels, 'L')
    org_size = img.size
    if max(img.size) > target_size:
        # Image.ANTIALIAS was an alias of LANCZOS and was removed in
        # Pillow 10; LANCZOS is the same filter under its current name.
        img.thumbnail((target_size, target_size), Image.LANCZOS)
    return img, org_size

Let's see what CLAHE does:

In [None]:
# Render one training image twice, side by side - first without and then
# with CLAHE - so the effect of the equalization can be compared visually.
fname = glob.glob('../input/siim-covid19-detection/train/278fc970196c/**/*.dcm')[0]
fig = plt.figure(figsize=(20,20))
panels = (('Original', False), ('CLAHE', True))
for slot, (title, clahe_on) in enumerate(panels, start=1):
    axes = fig.add_subplot(1, 2, slot)
    img, size = read_image(fname, use_clahe=clahe_on)
    axes.set_title(title)
    plt.imshow(img, cmap='gray')

# TFRecord format
TFRecords store serialized protocol buffer messages. A good overview of the different fields can be [found here](https://github.com/visipedia/tfrecords).

In [None]:
TPATH = '../input/siim-covid19-detection/train/'
LABELS = ['Negative for Pneumonia', 'Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']

def _bytes_feature(value):
    """Wrap a single bytes value in a tf.train.Feature."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    """Wrap a single integer value in a tf.train.Feature."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def create_tf_example(study, label_idx, longest_edge=IMAGE_SIZE, use_clahe=True):
    """Build a tf.train.Example holding one JPEG-encoded image of a study.

    The first DICOM file (in sorted path order) found under the study's
    directory in TPATH is read with ``read_image``, encoded as JPEG in
    memory and stored together with its size, file name, study id, SHA-256
    hash and class label.

    Args:
        study: Study identifier; also the name of the study directory.
        label_idx: Zero-based index into LABELS. The stored
            'image/class/label' is ``1 + label_idx``.
        longest_edge: Maximum length of the longest image edge in pixels.
        use_clahe: Forwarded to ``read_image``.

    Returns:
        The populated tf.train.Example.

    Raises:
        FileNotFoundError: If no DICOM file exists for the study.
    """
    # glob returns matches in arbitrary order; sort so that the same file is
    # picked on every run when a study contains more than one image.
    matches = sorted(glob.glob(TPATH+study+'/**/*.dcm'))
    if not matches:
        raise FileNotFoundError(
            'No DICOM file found for study {!r} under {}'.format(study, TPATH))
    fname = matches[0]
    filename = os.path.basename(fname) # exclude path
    img, _ = read_image(fname, target_size=longest_edge, use_clahe=use_clahe)
    width, height = img.size
    buf = BytesIO()
    img.save(buf, format='JPEG') # encode to jpeg in memory
    encoded_image_data = buf.getvalue()
    # A hash of the image is used in some frameworks
    key = hashlib.sha256(encoded_image_data).hexdigest()

    tf_record = tf.train.Example(features=tf.train.Features(feature={
        'image/height': _int64_feature(height),
        'image/width': _int64_feature(width),
        'image/channels': _int64_feature(1),
        'image/filename': _bytes_feature(filename.encode()),
        'image/source_id': _bytes_feature(study.encode()),
        'image/encoded': _bytes_feature(encoded_image_data),
        'image/key/sha256': _bytes_feature(key.encode()),
        'image/format': _bytes_feature(b'jpeg'),
        'image/class/label': _int64_feature(1+label_idx),
        'image/class/text': _bytes_feature(LABELS[label_idx].encode()),
    }))
    return tf_record

def open_sharded_tfrecords(exit_stack, base_path, num_shards):
    """Open one TFRecord writer per shard and register each on ``exit_stack``.

    Shard files are named ``<base_path>-<idx>-of-<num_shards>.tfrecord`` with
    both numbers zero-padded to two digits. Each writer is entered as a
    context on ``exit_stack``, so it is closed when the stack unwinds.

    Returns:
        The list of open writers, ordered by shard index.
    """
    writers = []
    for shard in range(num_shards):
        shard_path = '{}-{:02d}-of-{:02}.tfrecord'.format(base_path, shard, num_shards)
        writer = exit_stack.enter_context(tf.io.TFRecordWriter(shard_path))
        writers.append(writer)
    return writers

# Create Stratified TFRecords
Here we stratify the studies based on the Group column created in [this notebook](https://www.kaggle.com/mistag/k-folds-cv-stratification-on-multiple-features).

In [None]:
from sklearn.model_selection import StratifiedKFold

K_FOLDS = 5
IMG_SIZES = [528, 600, 1024] # change list according to needs

# The fold assignment depends only on df (no shuffling is used), so compute
# it once and reuse it for every image size.
skf = StratifiedKFold(n_splits=K_FOLDS)
folds = [test_index for _, test_index in skf.split(df, df.Group)]

for size in IMG_SIZES:
    output_filebase='./covid-{}'.format(size)
    # Number of studies per (fold, label); identical for every size.
    labcnt = np.zeros((K_FOLDS, len(LABELS)), dtype=int)
    with contextlib2.ExitStack() as tf_record_close_stack:
        output_tfrecords = open_sharded_tfrecords(tf_record_close_stack, output_filebase, K_FOLDS)
        # Each fold's test indices go into that fold's own shard file.
        for idx, test_index in enumerate(folds):
            for i in test_index:
                study = df.Study.iloc[i]
                active = [j for j, name in enumerate(LABELS) if df[name].iloc[i] == 1]
                if not active:
                    # Without this check the label of the previous study
                    # would silently be reused for this one.
                    raise ValueError('Study {!r} has no class label set'.format(study))
                # If several label columns are set, the last one wins.
                label_idx = active[-1]
                labcnt[idx, label_idx] += 1
                tf_record = create_tf_example(study, label_idx, longest_edge=size)
                output_tfrecords[idx].write(tf_record.SerializeToString())

ldf = pd.DataFrame(labcnt, columns=LABELS)
ldf.to_pickle('fold_stats.pkl') # statistics for later use

# Visualize result
Verify the result by reading and plotting a few X-rays.

In [None]:
# Read the first fold of the 1024-pixel set back in and plot the first six
# records in a 2x3 grid, titled with study id and class name.
dataset = tf.data.TFRecordDataset('./covid-1024-00-of-05.tfrecord')
fig = plt.figure(figsize=(18, 12))
idx = 1
for raw_record in dataset.take(6):
    axes = fig.add_subplot(2, 3, idx)
    # Decode the serialized protocol buffer back into an Example.
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    feature_map = example.features.feature
    classes = feature_map['image/class/text'].bytes_list.value[:]
    study = feature_map['image/source_id'].bytes_list.value[:]
    img = Image.open(BytesIO(feature_map['image/encoded'].bytes_list.value[0]))
    # Hide the tick marks; only the image itself is of interest.
    axes.set_xticks([])
    axes.set_yticks([])
    axes.set_title('\n'.join([study[0].decode(), classes[0].decode()]))
    plt.imshow(img, cmap='gray')
    idx += 1