# TFRecords
This notebook creates TFRecords from the VinBigData Chest X-rays. The TFRecords can then be used as training data with the TensorFlow Object Detection API. The labels in this dataset are very noisy, so the objects are filtered using [Weighted Boxes Fusion](https://github.com/ZFTurbo/Weighted-Boxes-Fusion). Images are downscaled so that their longest edge is at most 1024 pixels while preserving the aspect ratio (change `IMAGE_SIZE` to any desired value).

In [None]:
import sys
sys.path.insert(0, "../input/weightedboxesfusion")
from ensemble_boxes import *
import pydicom
from pydicom import dcmread
from pydicom.pixel_data_handlers.util import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import hashlib
import os
from io import BytesIO
from PIL import Image, ImageFont, ImageDraw
import cv2
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

Start by reading training metadata. 

In [None]:
# Load the training annotations; raw_df is the input for the filtering and box-fusion cells below.
raw_df = pd.read_csv('../input/vinbigdata-chest-xray-abnormalities-detection/train.csv')
raw_df.head()

# Helper functions
The images are digitized at 12-14 bit resolution, so converting them to 8-bit JPEG loses quite a bit of information. To preserve all image information, the images could be saved as 16-bit PNG instead. Here we keep JPEG but first apply Contrast Limited Adaptive Histogram Equalization (CLAHE). The same pre-processing step must then also be applied at inference time.

In [None]:
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
#for masking
from skimage.measure import label,regionprops
from sklearn.cluster import KMeans
from skimage.segmentation import clear_border

import scipy.ndimage as ndimage
from scipy.ndimage.interpolation import zoom
from skimage import measure, morphology, segmentation
from skimage.transform import resize


In [None]:
import pydicom as dcm

def get_first_of_dicom_field_as_int(x):
    """Convert a DICOM field value to ``int``.

    Multi-valued fields (``pydicom.multival.MultiValue``) are reduced to their
    first element; any other value is passed to ``int`` directly.
    """
    # isinstance (instead of an exact type comparison) also accepts subclasses of MultiValue.
    if isinstance(x, dcm.multival.MultiValue):
        return int(x[0])
    return int(x)

    
def get_windowing(data):
    """Return ``[window_center, window_width]`` of a DICOM dataset as ints.

    The values are read from tags (0028,1050) and (0028,1051).
    """
    window_center = data[('0028','1050')].value
    window_width = data[('0028','1051')].value
    return [
        get_first_of_dicom_field_as_int(window_center),
        get_first_of_dicom_field_as_int(window_width),
    ]

In [None]:
# NOTE(review): scratch expression - the value is only displayed, never assigned; looks like a leftover cell that can be removed.
np.pi/np.e

In [None]:
IMAGE_SIZE = 1024 # change this to desired value; default target_size of read_image
CLIP_LIMIT = 8 # passed as clipLimit to cv2.createCLAHE in read_image
GRID_SIZE = (16,16) # passed as tileGridSize to cv2.createCLAHE in read_image


def window_image(data):
    """Return the dataset's pixels with the upper part of the value range shifted down.

    Every pixel whose value is above half of the dynamic range,
    ``(max - min) // 2``, is lowered by 5% of that half-range; all other
    pixels are left untouched.

    Args:
        data: Object exposing a NumPy array as ``pixel_array``
            (``read_image`` passes the dataset returned by ``dcmread``).

    Returns:
        A new array with the same shape and dtype as ``data.pixel_array``.
    """
    # Work on a copy: the adjustment below is done in place, and without the
    # copy it would modify the array held by ``data`` (and compound when the
    # function is called again on the same dataset).
    pixels = data.pixel_array.copy()
    half_range = (pixels.max() - pixels.min()) // 2
    shift = int(half_range * 0.05)
    # Only values above the half-range are touched, and shift <= half_range,
    # so the subtraction cannot wrap around for unsigned integer pixels.
    pixels[pixels > half_range] -= shift
    return pixels


def read_image(fname, target_size=IMAGE_SIZE, use_clahe=True):
    """Read a DICOM file and return it as an 8-bit grayscale PIL image.

    The pixels are adjusted with ``window_image``, passed through
    ``apply_voi_lut``, rescaled to 0..255, inverted for MONOCHROME1 images,
    optionally equalized with CLAHE and finally downscaled so that the longest
    edge is at most ``target_size``.

    Args:
        fname: Path of the DICOM file.
        target_size: Maximum edge length of the returned image. Images that
            are already smaller are not enlarged.
        use_clahe: When True, apply CLAHE using CLIP_LIMIT and GRID_SIZE.

    Returns:
        ``(img, org_size)``: ``img`` is a mode ``'L'`` PIL image and
        ``org_size`` is its ``(width, height)`` before downscaling.
    """
    ds = dcmread(fname)
    data = apply_voi_lut(window_image(ds), ds)
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = 255. * data / peak
    else:
        # Constant image: avoid the 0/0 division and keep it uniformly black.
        data = np.zeros(data.shape, dtype=float)
    if ds.PhotometricInterpretation == "MONOCHROME1": # check for inverted image
        data = 255. - data
    pixels = data.astype('uint8')
    if use_clahe:
        clahe = cv2.createCLAHE(clipLimit=CLIP_LIMIT, tileGridSize=GRID_SIZE)
        pixels = clahe.apply(pixels).astype('uint8')
    img = Image.fromarray(pixels, 'L')
    org_size = img.size
    if max(img.size) > target_size:
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
        # and is also available in older Pillow releases.
        img.thumbnail((target_size, target_size), Image.LANCZOS)

    return img, org_size

Let's see what CLAHE does:

In [None]:
# Show the same X-ray without and with CLAHE, side by side.
fname = '../input/vinbigdata-chest-xray-abnormalities-detection/train/0005e8e3701dfb1dd93d53e2ff537b6e.dicom'
fig = plt.figure(figsize=(20,20))
for panel, (panel_title, clahe_on) in enumerate([('Original', False), ('CLAHE', True)], start=1):
    axes = fig.add_subplot(1, 2, panel)
    img, size = read_image(fname, use_clahe=clahe_on)
    axes.set_title(panel_title)
    plt.imshow(img, cmap='gray')

# Weighted Boxes Fusion
The first task is to run WBF on the raw data to filter out overlapping objects. Images with no findings will also be removed.

In [None]:
def plot_boxes(img, boxes, labels, thickness=5):
    """Draw every box onto ``img`` with ``cv2.rectangle`` and return ``img``.

    ``boxes[i]`` holds the two corner points as ``(x0, y0, x1, y1)`` and
    ``labels[i]`` selects the colour from LABEL_COLORS; both are cast to int.
    """
    for idx, raw_box in enumerate(boxes):
        corners = raw_box.astype(int)
        first_corner = (corners[0], corners[1])
        second_corner = (corners[2], corners[3])
        color = LABEL_COLORS[labels[idx].astype(int)]
        cv2.rectangle(img, first_corner, second_corner, color, thickness)
    return img

def plot_two(fname, idf):
    """Plot an X-ray twice: with its raw boxes (left) and with the boxes after WBF (right).

    Args:
        fname: Path of the DICOM file.
        idf: Annotation rows of this image with the columns x_min, y_min,
            x_max, y_max (pixel coordinates of the original image) and class_id.
    """
    image, size = read_image(fname)
    image = cv2.cvtColor(np.array(image), cv2.COLOR_GRAY2RGB)
    image2 = image.copy()
    # Scale with the longest edge of the image that is actually displayed
    # instead of a fixed 1024: read_image never enlarges small images, and
    # IMAGE_SIZE may be set to a value other than 1024.
    longest_edge = max(image.shape[:2])
    fig = plt.figure(figsize=(20,20))
    fig.tight_layout()
    axes = fig.add_subplot(1, 2, 1)
    plt.setp(axes, xticks=[], yticks=[])
    axes.set_title('Original')
    boxes = idf[['x_min', 'y_min', 'x_max', 'y_max']].values * longest_edge / max(size)
    labels = idf.class_id.values
    image = plot_boxes(image, boxes, labels)
    plt.imshow(image, cmap='gray')
    # wbf
    axes = fig.add_subplot(1, 2, 2)
    plt.setp(axes, xticks=[], yticks=[])
    axes.set_title('After WBF')
    # Normalize the boxes for WBF and scale the fused boxes back afterwards.
    boxes_list = (boxes / longest_edge).tolist()
    boxes1, _, labels1 = weighted_boxes_fusion([boxes_list], [np.ones(len(labels)).tolist()], [labels.tolist()], 
                                               weights=None, iou_thr=0.42, skip_box_thr=0.0001)
    boxes1 *= longest_edge
    image2 = plot_boxes(image2, boxes1, labels1)
    plt.imshow(image2, cmap='gray')

In [None]:
NUM_CLASSES = 14
# One RGB colour per class id; plot_boxes and plot_img index into this list.
LABEL_COLORS = [
    (230, 25, 75), (60, 180, 75), (255, 225, 25), (0, 130, 200), (245, 130, 48),
    (145, 30, 180), (70, 240, 240), (240, 50, 230), (210, 245, 60), (250, 190, 212),
    (0, 128, 128), (220, 190, 255), (170, 110, 40), (255, 250, 200), (128, 0, 0),
    (170, 255, 195), (128, 128, 0), (255, 215, 180), (0, 0, 128), (128, 128, 128),
    (255, 255, 255), (0, 0, 0),
]

# Drop the rows with class_id 14 (presumably the "No finding" class - TODO confirm)
# and collect the ids of the remaining images.
findings = raw_df[raw_df.class_id != 14]
xrays = findings.image_id.unique()
# Name of each class id 0..NUM_CLASSES-1, taken from the first matching row.
class_names = [
    findings[findings.class_id == class_id].class_name.iloc[0]
    for class_id in range(NUM_CLASSES)
]
class_names

In [None]:
i = 105 # position in xrays of the example image
idf = raw_df[raw_df.image_id == xrays[i]] # all annotation rows of that image
fname = '../input/vinbigdata-chest-xray-abnormalities-detection/train/'+xrays[i]+'.dicom'
plot_two(fname, idf) # raw boxes next to the boxes after WBF

## Run WBF on entire dataset

In [None]:
wbf = []

# Group the annotations once instead of scanning the whole frame for every image.
rows_by_image = raw_df.groupby('image_id', sort=False)

for image_id in xrays:
    idf = rows_by_image.get_group(image_id)
    # astype(float) yields a fresh float array, so the normalization below
    # works for integer columns too and never touches idf.
    boxes = idf[['x_min', 'y_min', 'x_max', 'y_max']].values.astype(float)
    # Normalize by the largest coordinate of this image before fusing,
    # then scale the fused boxes back to pixels.
    max_pos = np.max(boxes)
    boxes_list = (boxes / max_pos).tolist()
    labels = idf.class_id.values
    boxes1, _, labels1 = weighted_boxes_fusion([boxes_list], [np.ones(len(labels)).tolist()], [labels.tolist()], 
                                               weights=None, iou_thr=0.42, skip_box_thr=0.0001)
    boxes1 = np.floor(boxes1 * max_pos)
    for box, label in zip(boxes1, labels1):
        class_id = label.astype(int)
        wbf.append([image_id, class_names[class_id], class_id, box[0], box[1], box[2], box[3]])

wbf_df = pd.DataFrame (wbf, columns=['image_id', 'class_name', 'class_id', 'x_min', 'y_min', 'x_max', 'y_max'])
wbf_df.to_csv('wbf_objects.csv')

In [None]:
# Preview the first fused annotations.
wbf_df.head()

In [None]:
# Number of objects per class after WBF.
wbf_df.class_id.value_counts()

In [None]:
# Number of objects per class in the raw annotations, for comparison with the counts after WBF.
findings.class_id.value_counts()

# Stratified K-Folds
To make sure that each shard has about the same class distribution, we use stratified K-Folds on the dataset. Modified code from [this notebook](https://www.kaggle.com/backtracking/smart-data-split-train-eval-for-object-detection/comments).

In [None]:
from sklearn.model_selection import StratifiedKFold

NUM_SHARDS = 20

skf = StratifiedKFold(n_splits=NUM_SHARDS, shuffle=True, random_state=42)

# One row per image: number of boxes and number of distinct classes.
df_folds = wbf_df[['image_id']].copy()
df_folds['bbox_count'] = 1
df_folds = df_folds.groupby('image_id').count()
df_folds['object_count'] = wbf_df.groupby('image_id')['class_id'].nunique()

# Stratification key: "<distinct classes>_<box count bucketed in steps of 15>".
bbox_bucket = df_folds['bbox_count'].apply(lambda x: f'_{x // 15}').values.astype(str)
df_folds['stratify_group'] = np.char.add(df_folds['object_count'].values.astype(str), bbox_bucket)

# The validation part of split k becomes shard k.
df_folds['fold'] = 0
for fold_number, (train_index, val_index) in enumerate(skf.split(X=df_folds.index, y=df_folds['stratify_group'])):
    df_folds.loc[df_folds.index[val_index], 'fold'] = fold_number
df_folds = df_folds.reset_index()

Check the distribution of objects between the shards:

In [None]:
# Objects per class for every shard; one column S0 .. S<NUM_SHARDS-1> per shard.
shard_counts = {}
for i in range(NUM_SHARDS):
    df_shard = pd.merge(wbf_df, df_folds[df_folds['fold'] == i], on='image_id')
    shard_counts['S'+str(i)] = df_shard.class_name.value_counts()
# Building the table from all shards at once aligns on the union of class
# names, so a class that is missing from shard 0 still gets a row.
dfs = pd.DataFrame(shard_counts).sort_index()
dfs

# Create TFRecords
The records will be compatible with TensorFlow Object Detection API. We only add images with objects. 

In [None]:
# Directory prefix of the training DICOM files; create_tf_example appends "<image_id>.dicom".
TPATH = '../input/vinbigdata-chest-xray-abnormalities-detection/train/'

# Create example for TensorFlow Object Detection API
def _int64_feature(values):
    """Wrap an iterable of ints in an int64 ``tf.train.Feature``."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))


def _float_feature(values):
    """Wrap an iterable of floats in a float ``tf.train.Feature``."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))


def _bytes_feature(values):
    """Wrap an iterable of bytes objects in a bytes ``tf.train.Feature``."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))


def create_tf_example(imagedf, longest_edge=IMAGE_SIZE):
    """Build the ``tf.train.Example`` of one image for the TF Object Detection API.

    Args:
        imagedf: Rows of a single image with the columns image_id, class_name,
            class_id, x_min, y_min, x_max and y_max (pixel coordinates of the
            original image).
        longest_edge: Maximum edge length of the encoded image, passed to
            ``read_image`` as ``target_size``.

    Returns:
        A ``tf.train.Example`` holding the JPEG-encoded image, the box
        coordinates normalized by the original image size, and the per-object
        class features.
    """
    source_id = imagedf.image_id.iloc[0]
    fname = TPATH+source_id+'.dicom'
    filename = fname.split('/')[-1] # exclude path
    # Resize according to the caller's longest_edge rather than the global IMAGE_SIZE.
    img, org_size = read_image(fname, target_size=longest_edge, use_clahe=True)
    width, height = img.size
    buf = BytesIO()
    img.save(buf, format='JPEG') # encode to jpeg in memory
    encoded_image_data = buf.getvalue()
    image_format = b'jpeg'
    # A hash of the image is used in some frameworks
    key = hashlib.sha256(encoded_image_data).hexdigest()
    # Boxes are normalized by the ORIGINAL size, so they stay valid after resizing.
    xmins = imagedf.x_min.values/org_size[0]
    xmaxs = imagedf.x_max.values/org_size[0]
    ymins = imagedf.y_min.values/org_size[1]
    ymaxs = imagedf.y_max.values/org_size[1]
    # Class name and id of every box; ids are shifted by one because 0 is not a valid class.
    object_cnt = len(imagedf)
    classes_text = [class_name.encode() for class_name in imagedf.class_name]
    classes = [1 + int(class_id) for class_id in imagedf.class_id]
    # Open Images / Pascal VOC flags that are not used here: all zero.
    unused_flags = np.zeros(object_cnt, dtype=int)
    view_text = ['frontal'.encode()] * object_cnt

    tf_record = tf.train.Example(features=tf.train.Features(feature={
        'image/height': _int64_feature([height]),
        'image/width': _int64_feature([width]),
        'image/filename': _bytes_feature([filename.encode()]),
        'image/source_id': _bytes_feature([source_id.encode()]),
        'image/encoded': _bytes_feature([encoded_image_data]),
        'image/key/sha256': _bytes_feature([key.encode()]),
        'image/format': _bytes_feature([image_format]),
        'image/object/bbox/xmin': _float_feature(xmins),
        'image/object/bbox/xmax': _float_feature(xmaxs),
        'image/object/bbox/ymin': _float_feature(ymins),
        'image/object/bbox/ymax': _float_feature(ymaxs),
        'image/object/class/text': _bytes_feature(classes_text),
        'image/object/class/label': _int64_feature(classes),
        'image/object/depiction': _int64_feature(unused_flags),
        'image/object/group_of': _int64_feature(unused_flags),
        'image/object/occluded': _int64_feature(unused_flags),
        'image/object/truncated': _int64_feature(unused_flags),
        'image/object/difficult': _int64_feature(unused_flags),
        'image/object/view': _bytes_feature(view_text)
    }))
    return tf_record

We use sharding to create 20 TFRecords. This gives us a 5% resolution when creating a train/validation split.

In [None]:
import warnings
warnings.filterwarnings("ignore")
import contextlib2

def open_sharded_tfrecords(exit_stack, base_path, num_shards):
    """Open one ``tf.io.TFRecordWriter`` per shard and return them as a list.

    Each writer is entered on ``exit_stack``. The files are named
    ``<base_path>-<idx>-of-<num_shards>.tfrecord`` with both numbers
    zero-padded to three digits.
    """
    writers = []
    for idx in range(num_shards):
        file_name = '{}-{:03d}-of-{:03}.tfrecord'.format(base_path, idx, num_shards)
        writers.append(exit_stack.enter_context(tf.io.TFRecordWriter(file_name)))
    return writers

output_filebase='./VinBig'

# Write every fold to its own shard and count the images per shard.
img_cnt = np.zeros(NUM_SHARDS, dtype=int)
with contextlib2.ExitStack() as tf_record_close_stack:
    output_tfrecords = open_sharded_tfrecords(tf_record_close_stack, output_filebase, NUM_SHARDS)
    for shard_idx, shard_writer in enumerate(output_tfrecords):
        fold_rows = df_folds[df_folds['fold'] == shard_idx]
        df_shard = pd.merge(wbf_df, fold_rows, on='image_id')
        for image_id in df_shard.image_id.unique():
            imagedf = df_shard[df_shard.image_id == image_id]
            tf_record = create_tf_example(imagedf, longest_edge=IMAGE_SIZE)
            shard_writer.write(tf_record.SerializeToString())
            img_cnt[shard_idx] += 1
print("Converted {} images".format(np.sum(img_cnt)))
print("Images per shard: {}".format(img_cnt))

Parameters used in the TFRecord creation are saved in a .json file for use in training and inference notebooks.

In [None]:
import json

# Pre-processing parameters, stored next to the TFRecords.
dparams = {
    "IMAGE_SIZE": IMAGE_SIZE,
    "CLIP_LIMIT": CLIP_LIMIT,
    "GRID_SIZE": GRID_SIZE,
}
with open("dparams.json", "w") as json_file:
    json.dump(dparams, json_file, indent=4)

## Label data
We also need to create a label data file. Note that the TF Object Detection API expects the first class to be "1" and not "0". 

In [None]:
labels = [
    'Aortic enlargement', 'Atelectasis', 'Calcification', 'Cardiomegaly',
    'Consolidation', 'ILD', 'Infiltration', 'Lung Opacity', 'Nodule/Mass',
    'Other lesion', 'Pleural effusion', 'Pleural thickening', 'Pneumothorax',
    'Pulmonary fibrosis',
]

# One "item" entry per class; ids start at 1.
with open('./VinBig.pbtxt', 'w') as f:
    for item_id, item_name in enumerate(labels, start=1):
        f.write('item {{\n id: {}\n name:\'{}\'\n}}\n\n'.format(item_id, item_name))

# Check TFRecords
Verify the result by reading and plotting a few X-rays.

In [None]:
# Helpers for drawing an image together with its object bounding boxes.
# Use DejaVu Sans at size 40 when the font file exists, otherwise PIL's default font.
fontname = '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf'
font = ImageFont.truetype(fontname, 40) if os.path.isfile(fontname) else ImageFont.load_default()

def bbox(img, xmin, ymin, xmax, ymax, color, width, label, score):
    """Draw one bounding box, plus a caption when ``label`` is non-empty, onto ``img``.

    Args:
        img: PIL image to draw on.
        xmin, ymin, xmax, ymax: Box corners as fractions of the image width / height.
        color: Colour of the outline and of the caption background.
        width: Outline width.
        label: Caption text; an empty string draws the box only.
        score: Shown as ``" <label>: <score>%"`` when >= 0; a negative value
            shows the label alone.
    """
    draw = ImageDraw.Draw(img)
    xres, yres = img.size[0], img.size[1]
    box = np.multiply([xmin, ymin, xmax, ymax], [xres, yres, xres, yres]).astype(int).tolist()
    txt = " {}: {}%" if score >= 0. else " {}"
    txt = txt.format(label, round(score, 1))
    if hasattr(draw, 'textsize'):
        ts = draw.textsize(txt, font=font)
    else:
        # ImageDraw.textsize was removed in Pillow 10. The bounding box of the
        # text anchored at (0, 0) gives the extent to its right/bottom edge.
        _, _, text_right, text_bottom = draw.textbbox((0, 0), txt, font=font)
        ts = (text_right, text_bottom)
    draw.rectangle(box, outline=color, width=width)
    if len(label) > 0:
        if box[1] >= ts[1]+3:
            # enough room: caption goes above the box
            xsmin, ysmin = box[0], box[1]-ts[1]-3
            xsmax, ysmax = box[0]+ts[0]+2, box[1]
        else:
            # otherwise the caption goes below the box
            xsmin, ysmin = box[0], box[3]
            xsmax, ysmax = box[0]+ts[0]+2, box[3]+ts[1]+1
        draw.rectangle([xsmin, ysmin, xsmax, ysmax], fill=color)
        draw.text((xsmin, ysmin), txt, font=font, fill='white')

def plot_img(img, axes, xmin, ymin, xmax, ymax, classes, class_label, by):
    """Show ``img`` on ``axes`` with one captioned box per object.

    Args:
        img: PIL image; it is converted to RGB before drawing.
        axes: Matplotlib axes that receives the image.
        xmin, ymin, xmax, ymax: Per-object box corners as fractions of the image size.
        classes: Per-object class names as bytes.
        class_label: Per-object class labels as written by create_tf_example
            (class_id + 1).
        by: Title of the axes.
    """
    img = img.convert("RGB")
    for i in range(len(xmin)):
        # The stored labels are class_id + 1; subtract 1 so that a class gets
        # the same colour here as in plot_boxes.
        color = LABEL_COLORS[class_label[i] - 1]
        bbox(img, xmin[i], ymin[i], xmax[i], ymax[i], color, 5, classes[i].decode(), -1)
    plt.setp(axes, xticks=[], yticks=[])
    axes.set_title(by)
    # Draw on the axes that was passed in rather than on the implicit current axes.
    axes.imshow(img)

In [None]:
# Read the first six records of shard 0 back and plot them in a 3x2 grid.
fname='./VinBig-000-of-020.tfrecord' 
dataset3 = tf.data.TFRecordDataset(fname)
fig = plt.figure(figsize=(20,30))
for idx, raw_record in enumerate(dataset3.take(6), start=1):
    axes = fig.add_subplot(3, 2, idx)
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    feature = example.features.feature
    xmin = feature['image/object/bbox/xmin'].float_list.value[:]
    xmax = feature['image/object/bbox/xmax'].float_list.value[:]
    ymin = feature['image/object/bbox/ymin'].float_list.value[:]
    ymax = feature['image/object/bbox/ymax'].float_list.value[:]
    classes = feature['image/object/class/text'].bytes_list.value[:]
    class_label = feature['image/object/class/label'].int64_list.value[:]
    img_encoded = feature['image/encoded'].bytes_list.value[0]
    img = Image.open(BytesIO(img_encoded))
    plot_img(img, axes, xmin, ymin, xmax, ymax, classes, class_label, "")