In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!# Download models
!git clone --depth 1 https://github.com/tensorflow/models

!# Compile proto files 
! # sudo apt install -y protobuf-compiler # Already present
%cd models/research
!protoc object_detection/protos/*.proto --python_out=.
%cd ..
%cd ..

!# Install cocoapi
!pip install cython 
!git clone https://github.com/cocodataset/cocoapi.git
%cd cocoapi/PythonAPI
!make
%cd ..
%cd ..
!cp -r cocoapi/PythonAPI/pycocotools models/research/

!# Install object detection api
%cd models/research
!cp object_detection/packages/tf2/setup.py .
!python -m pip install .
%cd ..
%cd ..

In [None]:
!pip install ensemble-boxes
!pip install tensorflow_io

<div align='center'><font size="5" color='#353B47'>Chest-X-ray</font></div>
<div align='center'><font size="4" color="#353B47">Exploratory Data Analysis</font>
<br>
<hr>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import pydicom
from glob import glob
from tqdm.notebook import tqdm
from pydicom.pixel_data_handlers.util import apply_voi_lut
import matplotlib.pyplot as plt
from skimage import exposure
import cv2
import warnings
warnings.filterwarnings('ignore')

In [None]:
dataset_dir = '../input/vinbigdata-chest-xray-abnormalities-detection'

# **dicom2array**

In [None]:
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixels as an 8-bit grayscale array.

    Args:
        path: Path of the DICOM file to read.
        voi_lut: When True, pass the pixel data through ``apply_voi_lut``
            (the VOI LUT stored in the file, if any) before rescaling.
        fix_monochrome: When True, invert images whose
            ``PhotometricInterpretation`` is ``"MONOCHROME1"`` so that all
            returned images share the same polarity.

    Returns:
        ``np.uint8`` array holding the pixel values min-max rescaled to
        0..255. An image whose pixels are all equal is returned as zeros.
    """
    # `dcmread` is the supported reader; `read_file` is a deprecated alias.
    dicom = pydicom.dcmread(path)
    # VOI LUT (if available by DICOM device) is used to
    # transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = data / peak
    else:
        # Constant image: dividing by zero would fill the array with NaNs
        # that turn into arbitrary values when cast to uint8.
        data = np.zeros(data.shape, dtype=np.float64)
    data = (data * 255).astype(np.uint8)
    return data
        
    
def plot_img(img, size=(7, 7), is_rgb=True, title="", cmap='gray'):
    """Display a single image in a new matplotlib figure.

    Args:
        img: Image accepted by ``plt.imshow``.
        size: Figure size passed to ``plt.figure`` as ``figsize``.
        is_rgb: Accepted for interface compatibility; not used.
        title: Text shown as the figure's super-title.
        cmap: Colormap handed to ``plt.imshow``.
    """
    plt.figure(figsize=size)
    plt.imshow(img, cmap=cmap)
    plt.suptitle(title)
    plt.show()


def plot_imgs(imgs, cols=4, size=7, is_rgb=True, title="", cmap='gray', img_size=(500,500)):
    """Display a list of images on a grid of subplots.

    Args:
        imgs: Sequence of images (NumPy arrays).
        cols: Number of grid columns.
        size: Edge length, in inches, reserved for each grid cell.
        is_rgb: Accepted for interface compatibility; not used.
        title: Text shown as the figure's super-title.
        cmap: Colormap handed to ``imshow``.
        img_size: ``(width, height)`` every image is resized to with
            ``cv2.resize`` before plotting; ``None`` keeps the original size.
    """
    # Ceiling division: `len(imgs)//cols + 1` reserved an extra empty row
    # whenever the image count was an exact multiple of `cols`.
    rows = max(1, -(-len(imgs) // cols))
    fig = plt.figure(figsize=(cols*size, rows*size))
    for i, img in enumerate(imgs):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        ax = fig.add_subplot(rows, cols, i+1)
        ax.imshow(img, cmap=cmap)
    fig.suptitle(title)
    plt.show()
    
def draw_bboxes(img, boxes, thickness=10, color=(255, 0, 0), img_size=(500,500)):
    """Return a copy of `img` with rectangles drawn for every box.

    Args:
        img: Image array; a 2-D (grayscale) input is expanded to three
            identical channels before drawing.
        boxes: Iterable of boxes indexed as ``[x_min, y_min, x_max, y_max]``.
        thickness: Line thickness passed to ``cv2.rectangle``.
        color: Colour passed to ``cv2.rectangle``.
        img_size: Target size for ``cv2.resize`` applied after drawing;
            ``None`` skips the resize.

    Returns:
        The annotated (and optionally resized) copy; `img` is not modified.
    """
    canvas = img.copy()
    if canvas.ndim == 2:
        canvas = np.stack([canvas] * 3, axis=-1)

    for box in boxes:
        top_left = (int(box[0]), int(box[1]))
        bottom_right = (int(box[2]), int(box[3]))
        canvas = cv2.rectangle(canvas, top_left, bottom_right, color, thickness)

    if img_size is None:
        return canvas
    return cv2.resize(canvas, img_size)

In [None]:
# glob() returns paths in arbitrary filesystem order; sorting makes the
# "first four" sample images the same on every run.
dicom_paths = sorted(glob(f'{dataset_dir}/train/*.dicom'))
imgs = [dicom2array(path) for path in dicom_paths[:4]]
plot_imgs(imgs)

In [None]:
# Apply histogram equalisation to the sample images and plot them again.
# Note: this rebinds `imgs`, so the un-equalised arrays are no longer available.
imgs = [exposure.equalize_hist(img) for img in imgs]
plot_imgs(imgs)

# **EDA**

In [None]:
from bokeh.plotting import figure as bokeh_figure
from bokeh.io import output_notebook, show, output_file
from bokeh.models import ColumnDataSource, HoverTool, Panel
from bokeh.models.widgets import Tabs
import pandas as pd
from PIL import Image
from sklearn import preprocessing
import random
from random import randint

In [None]:
def get_bbox_area(row):
    """Return the area of the box stored in `row`.

    `row` must support lookup by the keys ``x_min``, ``x_max``, ``y_min``
    and ``y_max``; the result is width times height.
    """
    width = row['x_max'] - row['x_min']
    height = row['y_max'] - row['y_min']
    return width * height

train_df = pd.read_csv(f'{dataset_dir}/train.csv')
le = preprocessing.LabelEncoder()  # encode rad_id
train_df['rad_label'] = le.fit_transform(train_df['rad_id'])

# .copy() makes `finding_df` an independent frame, so adding `bbox_area`
# below is a plain column assignment rather than a write into a slice of
# `train_df` (SettingWithCopyWarning).
finding_df = train_df[train_df['class_name'] != 'No finding'].copy()
finding_df['bbox_area'] = finding_df.apply(get_bbox_area, axis=1)
finding_df.head()

In [None]:
# Sanity check on the filter above: `finding_df` was built by excluding
# 'No finding' rows, so the only value shown here should be False.
(finding_df['class_name'] == 'No finding').unique()

## 1. **Plot Bounding Box**

In [None]:
imgs = []
img_ids = finding_df['image_id'].values
class_ids = finding_df['class_id'].unique()

# map label_id to specify color
label2color = {class_id:[randint(0,255) for i in range(3)] for class_id in class_ids}
thickness = 3
scale = 5  # images and boxes are shrunk by this factor before drawing


for i in range(8):
    img_id = random.choice(img_ids)
    img_path = f'{dataset_dir}/train/{img_id}.dicom'
    img = dicom2array(path=img_path)
    img = cv2.resize(img, None, fx=1/scale, fy=1/scale)
    img = np.stack([img, img, img], axis=-1)

    # Select this image's annotations once and read boxes and labels from
    # the same rows.
    img_rows = finding_df.loc[finding_df['image_id'] == img_id]
    boxes = img_rows[['x_min', 'y_min', 'x_max', 'y_max']].values/scale
    # Keep the labels 1-D. The previous `[['class_id']].values.squeeze()`
    # collapsed to a 0-d array for images with a single box, which cannot
    # be iterated by zip().
    labels = img_rows['class_id'].values

    for label_id, box in zip(labels, boxes):
        color = label2color[label_id]
        img = cv2.rectangle(
            img,
            (int(box[0]), int(box[1])),
            (int(box[2]), int(box[3])),
            color, thickness
        )
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)

From the images it can be seen that boxes of the same color tend to overlap. This indicates that the same finding was annotated by multiple radiologists. There is also some overlap between boxes of different colors.

## **2. Histogram**

In [None]:
def hist_hover(dataframe, column, color=["#94c8d8", "#ea5e51"], bins=30, title="", value_range=None):
    """
    Plot an interactive Bokeh histogram of one DataFrame column.

    Args:
        dataframe: DataFrame holding the data.
        column: Name of the column to histogram; also used as the x-axis
            label and as the name of the count field in the plot source.
        color: Two colours: ``color[0]`` fills the bars, ``color[1]`` fills
            the bar under the mouse pointer.
        bins: Passed to ``np.histogram`` as ``bins``.
        title: Plot title.
        value_range: Passed to ``np.histogram`` as ``range``.

    The plot is shown inline; nothing is returned.
    """
    hist, edges = np.histogram(dataframe[column], bins=bins, range=value_range)
    hist_frame = pd.DataFrame({
        column: hist,
        "left": edges[:-1],
        "right": edges[1:]
    })
    # "%d" truncates the bin edges to integers for the hover label.
    hist_frame["interval"] = ["%d to %d" %
                              (left, right) for left, right in zip(edges[:-1], edges[1:])]
    src = ColumnDataSource(hist_frame)
    # NOTE(review): newer Bokeh releases name these arguments `height` and
    # `width` - confirm against the installed version.
    plot = bokeh_figure(
        plot_height=400, plot_width=600,
        title=title, x_axis_label=column,
        y_axis_label="Count"
    )
    plot.quad(
        bottom=0, top=column, left="left", right="right",
        source=src, fill_color=color[0], line_color="#35838d",
        fill_alpha=0.7, hover_fill_alpha=0.7,
        hover_fill_color=color[1]
    )
    # Wrap the field name in braces ("@{name}") so that column names with
    # spaces or punctuation, e.g. "Patient's Age", resolve in the tooltip.
    hover = HoverTool(
        tooltips=[("Interval", "@interval"), ("Count", f"@{{{column}}}")]
    )
    plot.add_tools(hover)
    output_notebook()
    show(plot)

In [None]:
import plotly.graph_objects as go

In [None]:
def plot_distribution_classes(x_values, y_values):
    """Draw a Plotly bar chart of observation counts per class.

    Args:
        x_values: Sequence of category labels, one per bar.
        y_values: Sequence of bar heights, also printed on the bars.

    The first bar is drawn in grey and all others in blue.
    """
    # One colour per bar, whatever the number of categories (this used to
    # be a fixed list of 15 entries).
    colors = ['rgb(26, 118, 255)'] * len(x_values)
    if colors:
        colors[0] = 'lightslategray'

    fig = go.Figure(data=[go.Bar(
        x=x_values,
        y=y_values,
        text=y_values,
        marker_color=colors
    )])

    fig.update_layout(height=600, width=900, title_text="Distribution of radiographic observations")
    fig.update_xaxes(type="category")

    fig.show()

In [None]:
train_df.head()

In [None]:
# Take labels and counts from the SAME value_counts() result. Previously the
# labels came from unique() (order of first appearance) and were zipped with
# value_counts() (descending count), which paired classes with the wrong
# counts.
counts = train_df.class_name.value_counts()
indexes = counts.index

# value_counts() is already sorted from most to least frequent.
sorted_dict = dict(zip(indexes, counts))

x = list(sorted_dict.keys())
y = list(sorted_dict.values())

plot_distribution_classes(x, y)

There is some imbalance in the data. Class_id 14 corresponds to 'No finding'.

In [None]:
def plot_distribution_radiologist_obs(x_values, y_values):
    """Draw a Plotly bar chart of observation counts per radiologist.

    Args:
        x_values: Sequence of category labels, one per bar.
        y_values: Sequence of bar heights, also printed on the bars.

    The first three bars are highlighted in crimson; the rest are grey.
    """
    # One colour per bar (this used to be a fixed list of 17 entries).
    colors = ['lightslategray'] * len(x_values)
    # Slice assignment copes with fewer than three bars.
    colors[:3] = ['crimson'] * min(3, len(colors))

    fig = go.Figure(data=[go.Bar(
        x=x_values,
        y=y_values,
        text=y_values,
        marker_color=colors
    )])

    fig.update_layout(height=600, width=900, title_text="Distribution of radiologist observations")
    fig.update_xaxes(type="category")

    fig.show()

In [None]:
# Show distribution of count of radio
# Count annotation rows per radiologist, computing the group-by only once.
rows_per_rad = train_df[['rad_id', 'image_id']].groupby(['rad_id']).agg(['count'])
indexes = rows_per_rad.index
counts = rows_per_rad.values.ravel()

# Order radiologists from most to fewest annotations.
sorted_dict = dict(sorted(zip(indexes, counts), key=lambda pair: pair[1], reverse=True))

x = list(sorted_dict)
y = list(sorted_dict.values())

plot_distribution_radiologist_obs(x, y)

Radiologists 'R9', 'R10' and 'R8' contribute significantly more observations than the other radiologists.

In [None]:
train_df.rad_id

## **3. Dicom Embedded Files**

In [None]:
import warnings
warnings.filterwarnings('ignore')

from matplotlib.patches import Rectangle
import numpy as np
import pandas as pd
import os
import re
import random
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
import plotly.express as px
from pydicom import dcmread
from tqdm import tqdm
import multiprocessing as mp

In [None]:
# Root folder of the competition data (same location as `dataset_dir` above).
PATH = "../input/vinbigdata-chest-xray-abnormalities-detection"
# Read one training DICOM file and show its header as the cell output.
ds = dcmread(os.path.join(PATH, 'train', '000434271f63a053c4128a0ba6352c7f.dicom'))
ds

In [None]:
def infer_one_observation(file_path):
    """Collect the header fields of one DICOM file into a flat dict.

    Args:
        file_path: Path of the DICOM file.

    Returns:
        dict with ``'image_id'`` (the file name up to its first ".") plus one
        entry per element of the file-meta header and of the main dataset.
        Keys are ``str(tag)`` and values are ``str(element.value)``. The
        pixel data element (7FE0,0010) is left out.
    """
    ds = dcmread(file_path)

    observation_dict = {'image_id': os.path.basename(file_path).split(sep=".")[0]}

    # Use the public keys() API instead of the private `_dict` attribute.
    # list() snapshots the tags before any element is accessed.
    for key in list(ds.file_meta.keys()):
        observation_dict[str(key)] = str(ds.file_meta[key].value)

    # Not taking into account pixel value
    for key in list(ds.keys()):
        if key != (0x7fe0, 0x0010):
            observation_dict[str(key)] = str(ds[key].value)

    return observation_dict

In [None]:
# Maps the column names produced by `infer_one_observation` (the string form
# of each DICOM tag, plus 'image_id') to human-readable attribute names.
# It is applied with `columns.map(mapper_dict)` in the extraction functions.
# NOTE(review): a column whose tag is missing from this dict presumably ends
# up with a NaN name after the map - confirm.
mapper_dict = {'image_id':'image_id',
               '(0002, 0000)':"File Meta Information Group Length",
               '(0002, 0001)':"File Meta Information Version",
               '(0002, 0002)':"Media Storage SOP Class UID",
               '(0002, 0003)':"Media Storage SOP Instance UID",
               '(0002, 0010)':"Transfer Syntax UID",
               '(0002, 0012)':"Implementation Class UID",
               '(0002, 0013)':"Implementation Version Name",
               '(0002, 0016)':"Source Application Entity Title",
               '(0010, 0040)':"Patient's Sex",
               '(0010, 1010)':"Patient's Age",
               '(0010, 1020)':"Patient's Size",
               '(0010, 1030)':"Patient's Weight",
               '(0028, 0002)':"Samples per Pixel",
               '(0028, 0004)':"Photometric Interpretation",
               '(0028, 0008)':"Number of Frames",
               '(0028, 0010)':"Rows",
               '(0028, 0011)':"Columns",
               '(0028, 0030)':"Pixel Spacing",
               '(0028, 0034)':"Pixel Aspect Ratio",
               '(0028, 0100)':"Bits Allocated",
               '(0028, 0101)':"Bits Stored",
               '(0028, 0102)':"High Bit",
               '(0028, 0103)':"Pixel Representation",
               '(0028, 0106)':"Smallest Image Pixel Value",
               '(0028, 0107)':"Largest Image Pixel Value",
               '(0028, 1050)':"Window Center",
               '(0028, 1051)':"Window Width",
               '(0028, 1052)':"Rescale Intercept",
               '(0028, 1053)':"Rescale Slope",
               '(0028, 2110)':"Lossy Image Compression",
               '(0028, 2112)':"Lossy Image Compression Ratio",
               '(0028, 2114)':"Lossy Image Compression Method"
              }

In [None]:
def extract_meta_information(folder):
    """Extract the DICOM header of every file in ``PATH/<folder>``.

    Args:
        folder: Sub-folder of ``PATH`` to scan (e.g. ``"train"``).

    Returns:
        DataFrame with one row per file and columns renamed through
        ``mapper_dict``. The same table is written to
        ``<folder>_dicom_metadata.csv`` in the working directory.
    """
    folder_path = os.path.join(PATH, folder)

    # Collect plain dicts and build the frame once. Growing a DataFrame with
    # `append` inside the loop is quadratic, and `DataFrame.append` no longer
    # exists in pandas 2.
    records = [
        infer_one_observation(os.path.join(folder_path, filename))
        for filename in tqdm(os.listdir(folder_path))
    ]
    metadata = pd.DataFrame(records)

    metadata.columns = metadata.columns.map(mapper_dict)
    metadata.to_csv(f"{folder}_dicom_metadata.csv", index=False)

    return metadata

In [None]:
train_filenames = os.listdir(os.path.join(PATH, "train"))
BATCH_SIZES = list(map(lambda x:int(x/100), [0, 4000, 8000, 12000, 15000]))

one_obs_train = infer_one_observation(os.path.join(PATH, 'train', train_filenames[0]))

def extract_train_meta(BATCH, batch_sizes=tuple(BATCH_SIZES)):
    """Extract DICOM headers for one batch of training files.

    Args:
        BATCH: Batch number; files
            ``train_filenames[batch_sizes[BATCH]:batch_sizes[BATCH+1]]``
            are processed.
        batch_sizes: Batch boundaries. The default is captured from
            ``BATCH_SIZES`` when this cell runs, so a later cell that rebinds
            the global ``BATCH_SIZES`` cannot change the training batches.

    Returns:
        DataFrame with one row per file and columns renamed through
        ``mapper_dict``; also written to
        ``batch_<BATCH>_dicom_train_metadata.csv``.
    """
    print("----- Train metadata extraction starting -----")

    records = []
    batch_files = train_filenames[batch_sizes[BATCH]:batch_sizes[BATCH+1]]
    for index_loop, filename in enumerate(batch_files):
        # Store THIS file's observation. The loop used to append
        # `one_obs_train`, so every row repeated the first training file.
        one_obs = infer_one_observation(os.path.join(PATH, 'train', filename))
        records.append(one_obs)

        if index_loop%10==0:
            print(f"{index_loop} train DICOM metadata successfully extracted")

    metadata = pd.DataFrame(records)
    # Reference columns first, then any extra tag found in the batch.
    extra_columns = [name for name in metadata.columns if name not in one_obs_train]
    metadata = metadata.reindex(columns=list(one_obs_train) + extra_columns)

    metadata.columns = metadata.columns.map(mapper_dict)
    metadata.to_csv(f"batch_{BATCH}_dicom_train_metadata.csv", index=False)

    print("----- Train metadata extraction fully completed -----")

    return metadata

In [None]:
test_filenames = os.listdir(os.path.join(PATH, "test"))
BATCH_SIZES = list(map(lambda x:int(x/10), [0, 750, 1500, 2250, 3000]))

one_obs_test = infer_one_observation(os.path.join(PATH, 'test', test_filenames[0]))

def extract_test_meta(BATCH, batch_sizes=tuple(BATCH_SIZES)):
    """Extract DICOM headers for one batch of test files.

    Args:
        BATCH: Batch number; files
            ``test_filenames[batch_sizes[BATCH]:batch_sizes[BATCH+1]]``
            are processed.
        batch_sizes: Batch boundaries. The default is captured from
            ``BATCH_SIZES`` when this cell runs, so rebinding the global
            afterwards does not affect this function.

    Returns:
        DataFrame with one row per file and columns renamed through
        ``mapper_dict``; also written to
        ``batch_<BATCH>_dicom_test_metadata.csv``.
    """
    print("----- Test metadata extraction starting -----")

    records = []
    batch_files = test_filenames[batch_sizes[BATCH]:batch_sizes[BATCH+1]]
    for index_loop, filename in enumerate(batch_files):
        # Store THIS file's observation. The loop used to append
        # `one_obs_test`, so every row repeated the first test file.
        one_obs = infer_one_observation(os.path.join(PATH, 'test', filename))
        records.append(one_obs)

        if index_loop%10==0:
            print(f"{index_loop} test DICOM metadata successfully extracted")

    metadata = pd.DataFrame(records)
    # Reference columns first, then any extra tag found in the batch.
    extra_columns = [name for name in metadata.columns if name not in one_obs_test]
    metadata = metadata.reindex(columns=list(one_obs_test) + extra_columns)

    metadata.columns = metadata.columns.map(mapper_dict)
    metadata.to_csv(f"batch_{BATCH}_dicom_test_metadata.csv", index=False)

    print("----- Test metadata extraction fully completed -----")

    return metadata

In [None]:
# The extraction functions above only process small batches to keep the execution time short. To rerun the extraction on the whole dataset, change these lines:

# BATCH_SIZES = list(map(lambda x:int(x/100), [0, 4000, 8000, 12000, 15000]))

# and

# if index_loop%10==0:

# to

# BATCH_SIZES = [0, 4000, 8000, 12000, 15000]

# and

# if index_loop%1000==0:

# for train. And for the test, change these following lines:

# BATCH_SIZES = list(map(lambda x:int(x/10), [0, 750, 1500, 2250, 3000]))

# and

# if index_loop%10==0:

# to

# BATCH_SIZES = [0, 750, 1500, 2250, 3000]

# and

# if index_loop%100==0:

In [None]:
# The context manager releases the worker processes even if one of the
# map() calls raises. Each call blocks until all four batches are done.
with mp.Pool(mp.cpu_count()) as pool:
    pool.map(extract_train_meta, range(4))
    pool.map(extract_test_meta, range(4))

## **4. Preprocessing DICOM Features**

In [None]:
import pydicom

# Folder holding the previously extracted DICOM metadata tables.
ROOT = '../input/vinbigdata-chestxray-metadata'

# Raw and "cleaned" metadata for both splits, all read from ROOT.
train = pd.read_csv(f'{ROOT}/train_dicom_metadata.csv')
train_cleaned = pd.read_csv(f'{ROOT}/train_dicom_metadata_cleaned.csv')
test = pd.read_csv(f'{ROOT}/test_dicom_metadata.csv')
test_cleaned = pd.read_csv(f'{ROOT}/test_dicom_metadata_cleaned.csv')

In [None]:
train.shape

In [None]:
# Keep some columns only
# Patient-level columns kept for the analysis below.
PATIENT_COLUMNS = ["Patient's Sex", "Patient's Age", "Patient's Size", "Patient's Weight"]

# .copy() gives independent frames, so the in-place cleaning done in later
# cells does not write into a slice of the source tables.
train_metadata_filtered = train[PATIENT_COLUMNS].copy()
test_metadata_filtered = test[PATIENT_COLUMNS].copy()
train_metadata_filtered_cleaned = train_cleaned[PATIENT_COLUMNS].copy()
test_metadata_filtered_cleaned = test_cleaned[PATIENT_COLUMNS].copy()

In [None]:
train_metadata_filtered.info()

In [None]:
train_metadata_filtered_cleaned.head()

In [None]:
# Show how Size and Weight are populated in each split, then discard both columns.
size_weight_columns = ["Patient's Size", "Patient's Weight"]

for split_frame in (train_metadata_filtered, test_metadata_filtered):
    for column_name in size_weight_columns:
        print(split_frame[column_name].value_counts())

train_metadata_filtered = train_metadata_filtered.drop(columns=size_weight_columns)
test_metadata_filtered = test_metadata_filtered.drop(columns=size_weight_columns)

In [None]:
def get_first_el(row):
    """Return the text between the first "[" and the next "," in `row`.

    Args:
        row: Value to parse, typically the string form of a list such as
            ``"[0.14, 0.14]"``. Non-string values are converted with
            ``str`` first.

    Returns:
        The extracted substring, or ``'NA'`` when the text is five
        characters or shorter or contains no "[...," pattern.
    """
    text = str(row)
    if len(text) <= 5:
        return 'NA'
    # Search the converted text: the raw `row` may not be a string, and a
    # failed search returns None, which has no .group().
    match = re.search(r"(?<=\[)(.*?)(?=\,)", text)
    return match.group() if match else 'NA'

In [None]:
train_metadata_filtered["Patient's Sex"] = train_metadata_filtered["Patient's Sex"].fillna("NA")
# Blank only the Sex cell of rows coded "O". Without the column selector the
# assignment set EVERY column of those rows to NaN, discarding their age too.
train_metadata_filtered.loc[train_metadata_filtered["Patient's Sex"]=="O", "Patient's Sex"] = np.nan

train_metadata_filtered["Patient's Age"] = train_metadata_filtered["Patient's Age"].fillna("0")
# Keep the leading run of digits of the age string ("" when there is none).
train_metadata_filtered["Patient's Age"] = train_metadata_filtered["Patient's Age"].apply(lambda x:re.search(r"\d*", str(x)).group())
# Ages without leading digits become NaN; the row's other columns are kept.
train_metadata_filtered.loc[train_metadata_filtered["Patient's Age"]== '', "Patient's Age"]= np.nan
train_metadata_filtered["Patient's Age"] = train_metadata_filtered["Patient's Age"].astype(float)

In [None]:
test_metadata_filtered["Patient's Sex"] = test_metadata_filtered["Patient's Sex"].fillna("NA")
# Blank only the Sex cell of rows coded "O". Without the column selector the
# assignment set EVERY column of those rows to NaN, discarding their age too.
test_metadata_filtered.loc[test_metadata_filtered["Patient's Sex"]=="O", "Patient's Sex"] = np.nan

test_metadata_filtered["Patient's Age"] = test_metadata_filtered["Patient's Age"].fillna("0")
# Keep the leading run of digits of the age string ("" when there is none).
test_metadata_filtered["Patient's Age"] = test_metadata_filtered["Patient's Age"].apply(lambda x:re.search(r"\d*", str(x)).group())
# Ages without leading digits become NaN; the row's other columns are kept.
test_metadata_filtered.loc[test_metadata_filtered["Patient's Age"]== '', "Patient's Age"]= np.nan
test_metadata_filtered["Patient's Age"] = test_metadata_filtered["Patient's Age"].astype(float)

In [None]:
# Persist the cleaned sex/age tables to the working directory (no index column).
train_metadata_filtered.to_csv("train_dicom_metadata_filtered.csv", index=False)
test_metadata_filtered.to_csv("test_dicom_metadata_filtered.csv", index=False)

In [None]:
# Histogram (with a marginal box plot) of ages strictly between 0 and 100.
age_values = train_metadata_filtered["Patient's Age"]
train_metadata_age = train_metadata_filtered.loc[(age_values > 0) & (age_values < 100), :]

fig = px.histogram(
    train_metadata_age,
    x="Patient's Age",
    marginal="box",
    hover_data=train_metadata_age.columns,
)
fig.update_layout(title="Age distribution (train)")
fig.show()

# Drop the temporaries so they do not linger in the kernel namespace.
del age_values, train_metadata_age

In [None]:
def plot_distribution_age_imageid(x_values, y_values):
    """Draw a Plotly bar chart of observation counts per age.

    Args:
        x_values: Sequence of category labels, one per bar.
        y_values: Sequence of bar heights, also printed on the bars.

    The first three bars are highlighted in crimson; the rest are grey.
    """
    # One colour per bar (this used to be a fixed list of 17 entries).
    colors = ['lightslategray'] * len(x_values)
    # Slice assignment copes with fewer than three bars.
    colors[:3] = ['crimson'] * min(3, len(colors))

    fig = go.Figure(data=[go.Bar(
        x=x_values,
        y=y_values,
        text=y_values,
        marker_color=colors
    )])

    fig.update_layout(height=600, width=900, title_text="Distribution of age observations")
    fig.update_xaxes(type="category")

    fig.show()

In [None]:
# Show distribution of count of image_id by age
# data = train_metadata_filtered.loc[(train_metadata_filtered["Patient's Age"] > 0) &
#                                    (train_metadata_filtered["Patient's Age"] < 100), :]

# indexes = data[["Patient's Age", 'image_id']].groupby(["Patient's Age"]).agg(['count']).index
# counts = data[["Patient's Age", 'image_id']].groupby(["Patient's Age"]).agg(['count']).values.ravel()

# sorted_dict = dict(zip(indexes, counts))
# sorted_dict = {k: v for k, v in sorted(sorted_dict.items(), key=lambda item: item[1], reverse = True)}

# x = list(sorted_dict.keys())
# y = list(sorted_dict.values())

# plot_distribution_age_imageid(x, y)

In [None]:
# Count each recorded sex once, skipping the "NA" placeholder, and reuse the
# result for both the labels and the values of the pie chart.
known_sex = train_metadata_filtered.loc[train_metadata_filtered["Patient's Sex"] != "NA", "Patient's Sex"]
sex_counts = known_sex.value_counts()

train_metadata_counts = list(sex_counts)
train_metadata_labels = list(sex_counts.index)

fig = go.Figure(data=[go.Pie(
    labels=train_metadata_labels,
    values=train_metadata_counts,
    hole=.3,
    title_text="Sex distribution (train)",
)])
fig.show()

# Remove the temporaries from the kernel namespace.
del known_sex, sex_counts, train_metadata_counts, train_metadata_labels

In [None]:
train_metadata_filtered["Patient's Age"].describe()

In [None]:
# Age distribution per sex, normalised to probabilities: rows with the "NA"
# sex placeholder or an age outside (0, 100) are left out.
has_sex = train_metadata_filtered["Patient's Sex"] != "NA"
age_above_zero = train_metadata_filtered["Patient's Age"] > 0
age_below_hundred = train_metadata_filtered["Patient's Age"] < 100
data = train_metadata_filtered.loc[has_sex & age_above_zero & age_below_hundred, :]

fig = px.histogram(
    data,
    x="Patient's Age",
    color="Patient's Sex",
    marginal="box",
    hover_data=data.columns,
    histnorm = "probability",
)
fig.update_layout(title="Age distribution by sex (train)")
fig.show()

# Remove the temporaries from the kernel namespace.
del has_sex, age_above_zero, age_below_hundred, data

In [None]:
# NOTE(review): this binds a second name to the SAME DataFrame object (no
# copy), so the `image_id` column added below also appears on
# `train_metadata_filtered`.
train_filtered_w_imageid = train_metadata_filtered
# Attach the image ids from the raw metadata table.
# NOTE(review): presumably relies on both frames sharing the same row index - confirm.
train_filtered_w_imageid['image_id'] = train.image_id
train_filtered_w_imageid

In [None]:
train_filtered_w_imageid.value_counts()

## **5. Radiocardiographic Representation**

In [None]:
def get_rectangle_parameter(dataframe, index):
    """Convert one row's box corners into anchor point and extents.

    Args:
        dataframe: Frame with ``x_min``, ``y_min``, ``x_max`` and ``y_max``
            columns.
        index: Row label looked up with ``.loc``.

    Returns:
        ``(anchor_point, height, width)``, where ``anchor_point`` is
        ``(x_min, y_min)``. Note that height comes BEFORE width.
    """
    left, top, right, bottom = (
        dataframe.loc[index, column]
        for column in ('x_min', 'y_min', 'x_max', 'y_max')
    )
    return (left, top), bottom - top, right - left

In [None]:
def select_9_from_each(dataframe):
    """Randomly pick nine rows for each class id from 0 to 13.

    Args:
        dataframe: Frame with ``class_id`` and ``image_id`` columns.

    Returns:
        Two dicts keyed by ``str(class_id)``: the first maps to the index of
        the nine sampled rows, the second to the list of their ``image_id``
        values.
    """
    index_by_class = {}
    images_by_class = {}

    for class_id in range(14):
        key = str(class_id)
        sampled_index = dataframe[dataframe.class_id == class_id].sample(9).index
        index_by_class[key] = sampled_index
        images_by_class[key] = dataframe.loc[sampled_index, 'image_id'].tolist()

    return index_by_class, images_by_class

class_id_index_examples, class_id_image_examples = select_9_from_each(train_df)

In [None]:
def display_images(class_id, graph_indexes = np.arange(9)):
    """Show the sampled example images of one class on a 3x3 grid.

    Args:
        class_id: Class whose examples are shown; looked up as
            ``str(class_id)`` in ``class_id_index_examples`` and
            ``class_id_image_examples``.
        graph_indexes: Positions (0..8) of the examples to draw. Position
            ``k`` is drawn at grid row ``k % 3``, column ``k // 3``.

    For every class other than 14 the annotated bounding box is drawn in
    red on top of the image.
    """
    # Get files
    files_index = class_id_index_examples[str(class_id)]
    files_list = class_id_image_examples[str(class_id)]

    # define subplot
    fig, axs = plt.subplots(3,3, figsize=(12,12))
    for graph_index in graph_indexes:
        ax = axs[graph_index%3, (graph_index)//3]

        full_filename = files_list[graph_index]+'.dicom'
        ds = dcmread(os.path.join(PATH,
                                  'train',
                                  full_filename))

#         ax.set_title('Label: %s \n'%class_id,
#                   fontsize=18)
        ax.imshow(ds.pixel_array, cmap=plt.get_cmap('gray'))

        if str(class_id) != '14':

            # Add rectangle. The examples were sampled from `train_df`, so
            # the box is read from that frame (the old name
            # `train_dataframe` is not defined anywhere).
            anchor_point, height, width = get_rectangle_parameter(train_df,
                                                                  files_index[graph_index])
            # Rectangle takes (xy, width, height); the two extents used to
            # be passed in swapped order.
            rect = Rectangle(anchor_point,
                             width,
                             height,
                             edgecolor='r',
                             facecolor="none")
            ax.add_patch(rect)

    # the bottom of the subplots of the figure
    plt.subplots_adjust(bottom = 0.001)
    plt.subplots_adjust(top = 0.99)

    # show the figure
    plt.show()

## **6. X-Ray Visualization**

In [None]:
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from ensemble_boxes import *
from tqdm.notebook import tqdm

import pydicom
from pydicom.tag import Tag

import tensorflow as tf
import tensorflow_io as tfio

from object_detection.protos.string_int_label_map_pb2 import StringIntLabelMap, StringIntLabelMapItem
from object_detection.dataset_tools import tf_record_creation_util
from object_detection.utils import dataset_util
import contextlib2

from google.protobuf import text_format

In [None]:
def read_dicom(path, max_dim):
    """Load a DICOM image with TensorFlow-IO and rescale it to 8 bits.

    Args:
        path: Path of the DICOM file.
        max_dim: Side of the bounding square the image is resized into; the
            aspect ratio is preserved.

    Returns:
        ``(image, h, w)``: the resized ``tf.uint8`` image, min-max scaled to
        0..255, and the height and width of the image before resizing.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tfio.image.decode_dicom_image(raw_bytes, dtype = tf.uint16)
    # Remove the leading axis of the decoded tensor.
    decoded = tf.squeeze(decoded, axis = 0)

    # Remember the original size so callers can rescale box coordinates.
    h, w, _ = decoded.shape

    resized = tf.image.resize(
        decoded,
        (max_dim, max_dim),
        preserve_aspect_ratio = True
    )

    # Min-max normalisation.
    # NOTE(review): a constant image would divide by zero here - confirm that
    # such inputs cannot occur.
    shifted = resized - tf.reduce_min(resized)
    scaled = shifted / tf.reduce_max(shifted)

    return tf.cast(scaled * 255, tf.uint8), h, w

In [None]:
# Competition data folder (same location as `PATH` / `dataset_dir` above).
path = "../input/vinbigdata-chest-xray-abnormalities-detection"
# Annotation table for this section; read afresh from train.csv.
df = pd.read_csv(os.path.join(path, "train.csv"))

In [None]:
%matplotlib inline

# Bounding size (pixels) used when resizing the demo image.
max_dim = 500
# Training file used as the running example in the following cells.
demo_image = "6d5acf3f8a973a26844d617fffe72998.dicom"
# `h` and `w` are the original height and width returned by read_dicom.
image, h, w = read_dicom(os.path.join(path, "train", demo_image), max_dim)

plt.figure(figsize = (5, 5))
plt.imshow(tf.squeeze(image), 'gray')

In [None]:
# Image Processing technique CLAHE (Contrast Limited Adaptive Histogram Equalization).
# This image pre-processing redistributes the lightness values of the image making patterns more apparent 
def CLAHE(image):
    """Apply Contrast Limited Adaptive Histogram Equalization to `image`.

    Args:
        image: Tensor exposing ``.numpy()``; the resulting array is handed
            to OpenCV's CLAHE (clip limit 2.0, 10x10 tile grid).

    Returns:
        The equalized image with a channel axis added at position 2.
    """
    equalizer = cv2.createCLAHE(clipLimit = 2., tileGridSize = (10, 10))
    equalized = equalizer.apply(image.numpy())
    return tf.expand_dims(equalized, axis = 2)

In [None]:
%matplotlib inline

# Side-by-side comparison of the demo image before and after CLAHE.
fig = plt.figure(figsize = (8, 8))

axes = fig.add_subplot(1, 2, 1)
plt.imshow(tf.squeeze(image), cmap = "gray")
axes.set_title("Original")

axes = fig.add_subplot(1, 2, 2)
# NOTE(review): this rebinds `image` to the CLAHE result, so the following
# cells use the equalized image and re-running this cell applies CLAHE again.
image = CLAHE(image)
plt.imshow(tf.squeeze(image), cmap = "gray")
axes.set_title("Post CLAHE")

The API requires class ids to run from 1 to n and outputs 0 when no class is found. Since our labels start at 0, we increment every class_id by one and build the label map from the shifted ids.

In [None]:
# Creating LabelMap
# Creating LabelMap
# Shift the ids only while they are still 0-based, so that re-running this
# cell does not increment them a second time.
if df["class_id"].min() == 0:
    df["class_id"] = df["class_id"] + 1 # Incrementing by 1
LabelMap = df.loc[df["class_name"] != "No finding", ["class_name", "class_id"]] # Removing the examples with no finding
LabelMap = LabelMap.drop_duplicates().reset_index(drop = True)
LabelMap

In [None]:
# Using 14 unique colors to annotate the abnormalities.
# One colour triple per row of LabelMap, assigned in row order.
LABEL_COLORS = [
    (230, 25, 75), (60, 180, 75), (255, 225, 25), (0, 130, 200), (245, 130, 48), (145, 30, 180), (70, 240, 240), 
    (240, 50, 230), (210, 245, 60), (250, 190, 212), (0, 128, 128), (220, 190, 255), (170, 110, 40), (255, 250, 200), 
]
# The list length has to equal the number of LabelMap rows for this assignment to work.
LabelMap["colors"] = LABEL_COLORS

In [None]:
# Save mappings as .pbtxt
def save_mapping(LabelMap):
    """Write the label map to ``LabelMap.pbtxt`` in protobuf text format.

    Args:
        LabelMap: DataFrame with ``class_id`` and ``class_name`` columns;
            every row becomes one ``StringIntLabelMapItem``.
    """
    msg = StringIntLabelMap()

    for i, row in LabelMap.iterrows():
        # int() hands protobuf a plain Python integer rather than a NumPy scalar.
        msg.item.append(StringIntLabelMapItem(id = int(row["class_id"]), name = row["class_name"]))

    text = str(text_format.MessageToBytes(msg, as_utf8 = True), 'utf-8')

    # `with` closes the file even if the write fails; the text was decoded
    # from UTF-8 above, so it is written back with the same encoding.
    with open("LabelMap.pbtxt", "w", encoding = "utf-8") as f:
        f.write(text)
    
save_mapping(LabelMap)

In [None]:
# Remove examples with no findings (won't be used for training)
# Drop rows containing missing values (the examples with no findings, which
# won't be used for training), renumber the rows and fix the column types.
df = (
    df.dropna()
    .reset_index(drop = True)
    .astype({
        "x_min": int,
        "y_min": int,
        "x_max": int,
        "y_max": int,
        "class_id": str
    })
)

In [None]:
def plot_boxes(image, data, title):    
    """Draw labelled bounding boxes on a grayscale image and show the result.

    Parameters
    ----------
    image : object exposing ``.numpy()``
        Grayscale image; it is converted to RGB before drawing.
    data : pandas.DataFrame
        One row per box with the columns ``x_min``, ``y_min``, ``x_max``, ``y_max``,
        ``class_name`` and ``colors``.
    title : str
        Title of the resulting figure.

    Returns
    -------
    None
        A new 8x8 matplotlib figure is created as a side effect.
    """
    canvas = cv2.cvtColor(image.numpy(), cv2.COLOR_GRAY2RGB)
    
    for _, box in data.iterrows():
        top_left = (box["x_min"], box["y_min"])
        bottom_right = (box["x_max"], box["y_max"])
        color = box["colors"]
    
        cv2.rectangle(canvas, pt1 = top_left, pt2 = bottom_right, color = color, thickness = 2)
    
        # The class name is written 5 pixels above the top-left corner of its box
        cv2.putText(
            canvas, 
            box["class_name"], 
            (top_left[0], top_left[1]-5), 
            cv2.FONT_HERSHEY_SIMPLEX, 
            0.5, 
            color, 
            1
        )

    plt.figure(figsize = (8, 8))
    plt.imshow(canvas) 
    plt.title(title)

In [None]:
# Selecting a particular radiologist
demo_rad = "R9"

# Preprocessing metadata to suit needs.
# demo_image[:-6] drops the last six characters of the file name (presumably the ".dicom" suffix - confirm).
# .copy() yields an independent frame, so the in-place rescaling below does not
# trigger pandas' SettingWithCopyWarning.
data = df.loc[
    (df["image_id"] == demo_image[:-6]) & (df["rad_id"] == demo_rad),
    ["class_name", "x_min", "y_min", "x_max", "y_max"]
].copy()

# Rescale the boxes to the size of the displayed image (W x H).
# NOTE(review): w and h are presumably the width/height the boxes were annotated at - confirm where they are set.
H, W, _ = image.shape
data[["x_min", "x_max"]] = (data[["x_min", "x_max"]]* W/w).astype(int)
data[["y_min", "y_max"]] = (data[["y_min", "y_max"]]* H/h).astype(int)

# Attach class_id and drawing color for every class_name
data = pd.merge(data, LabelMap)

# Plotting annotation by radiologist
plot_boxes(image, data, "Labels for " + demo_image + " by " + demo_rad)

Explore the annotations from the other radiologists for this x-ray

In [None]:
# Preprocessing metadata to suit needs: every annotation of the demo x-ray, regardless of radiologist.
# .copy() yields an independent frame, so the in-place rescaling below does not
# trigger pandas' SettingWithCopyWarning.
data = df.loc[
    (df["image_id"] == demo_image[:-6]),
    ["class_name", "x_min", "y_min", "x_max", "y_max"]
].copy()

# Rescale the boxes to the size of the displayed image (W x H).
# NOTE(review): w and h are presumably the width/height the boxes were annotated at - confirm where they are set.
H, W, _ = image.shape
data[["x_min", "x_max"]] = (data[["x_min", "x_max"]]* W/w).astype(int)
data[["y_min", "y_max"]] = (data[["y_min", "y_max"]]* H/h).astype(int)

# Attach class_id and drawing color for every class_name
data = pd.merge(data, LabelMap)

# Plotting annotation by all radiologists
plot_boxes(image, data, "Labels for " + demo_image + " by all radiologists")

## **7. Image Data Preprocessing**

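We shall use a technique called Weighted Boxes Fusion (WBF) to merge the overlapping boxes drawn by different radiologists into a single annotation per finding. Besides giving us one consolidated annotation, this reduces the size of the metadata considerably.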

In [None]:
# Preprocessing as needed for WBF.
# .copy() yields an independent frame, so the in-place normalization below does not
# trigger pandas' SettingWithCopyWarning.
data = df.loc[
    (df["image_id"] == demo_image[:-6]),
    ["class_name", "x_min", "y_min", "x_max", "y_max"]
].copy()

# Divide the coordinates by w and h to normalize them
# NOTE(review): w and h are presumably the width/height the boxes were annotated at - confirm where they are set.
data[["x_min", "x_max"]] = data[["x_min", "x_max"]]/w
data[["y_min", "y_max"]] = data[["y_min", "y_max"]]/h

# Attach class_id for every class_name
data = pd.merge(data, LabelMap)

# All annotations are passed as one "model" and every box gets the same score of 1
boxes_list = data[["x_min", "y_min", "x_max", "y_max"]].values.tolist()
scores_list = [1]*len(boxes_list)
labels_list = list(data["class_id"])

# Applying WBF
boxes, _, labels = weighted_boxes_fusion(
    boxes_list = [boxes_list],
    scores_list = [scores_list],
    labels_list = [labels_list],
    weights = None, 
    iou_thr = 0.3, 
    skip_box_thr = 0.0001
)

In [None]:
# Postprocessing after applying WBF 
H, W, _ = image.shape

# Scale the fused boxes up to the size of the displayed image
data = pd.DataFrame(boxes, columns = ["x_min", "y_min", "x_max", "y_max"])
data[["x_min", "x_max"]] = data[["x_min", "x_max"]].mul(W).astype(int)
data[["y_min", "y_max"]] = data[["y_min", "y_max"]].mul(H).astype(int)
data["class_id"] = labels.astype(int)

# Attach class_name and drawing color for every class_id
data = data.merge(LabelMap)

# Plotting the fused annotations
plot_boxes(image, data, "Labels for " + demo_image + " post WBF")

The TFRecord format is a simple format for storing a sequence of binary records. This format is efficient in terms of storage and retrieval. It is the desired input format for the API. But before creating TFRecords, we must first apply WBF to the metadata. To apply WBF we must normalize the coordinates. Reading each image to extract dimensions can be time consuming. Using PyDICOM we can obtain x-ray metadata from which dimensions can be quickly extracted.

In [None]:
# rad_id is not required for training, so the column is dropped
df = df.drop("rad_id", axis = 1)

# Obtaining set of x-rays with at least one finding
xrays = {*df["image_id"]} # Only 4394 x-rays, not 15000. Roughly 30% of the x-rays remain.

In [None]:
# Collect [image_id, height, width] for every x-ray from the DICOM header alone
dimensions = []
for xray in tqdm(xrays): # Iterating the collection directly lets tqdm show the total
    ds = pydicom.dcmread(
        os.path.join(path, "train", xray + ".dicom"), 
        stop_before_pixels = True, # Rows/Columns precede the pixel data, so it need not be parsed
        specific_tags = [
            Tag("0028", "0010"), # Tag for Rows (Height)
            Tag("0028", "0011")  # Tag for Columns (Width)
        ]
    )
    
    dimensions.append([xray, ds.Rows, ds.Columns])

In [None]:
# Turn the collected [image_id, height, width] rows into a table and join it onto the annotations
dimensions = pd.DataFrame(data = dimensions, columns = ["image_id", "height", "width"])
df = dimensions.merge(df)

In [None]:
# Normalize coordinates: x by the image width, y by the image height (row by row)
df[["x_min", "x_max"]] = df[["x_min", "x_max"]].div(df["width"], axis = 0)
df[["y_min", "y_max"]] = df[["y_min", "y_max"]].div(df["height"], axis = 0)

In [None]:
# Before applying WBF we had 36096 rows
df_list = []

# groupby hands over the rows of each x-ray in a single pass (instead of re-filtering the whole
# frame for every image), visits the x-rays in sorted image_id order, and has a length,
# so tqdm can show the total.
for xray, annotations in tqdm(df.groupby("image_id")):
    # All annotations are passed as one "model" and every box gets the same score of 1
    boxes_list = annotations[["x_min", "y_min", "x_max", "y_max"]].values.tolist()
    scores_list = [1]*len(boxes_list)
    labels_list = list(annotations["class_id"])

    # Applying WBF
    boxes, _, labels = weighted_boxes_fusion(
        boxes_list = [boxes_list],
        scores_list = [scores_list],
        labels_list = [labels_list],
        weights = None, 
        iou_thr = 0.3, 
        skip_box_thr = 0.0001
    )
    
    data = pd.DataFrame(boxes, columns = ["x_min", "y_min", "x_max", "y_max"]) 
    # Leaving the coordinates normalized since the API expects them to be so. 
    
    data["class_id"] = labels.astype(int)
    
    data["image_id"] = xray 
    
    df_list.append(data)

In [None]:
# After applying WBF we have 21836 rows
df = (
    pd.concat(df_list)
    .merge(LabelMap)               # brings back class_name (and colors) for every class_id
    .drop(columns = ["colors"])    # colors are only needed for plotting
)

Since we have more than a few thousand examples, it is beneficial to shard the dataset into multiple files:

* Parallel reading improves throughput.
* Easy shuffling improves performance.

Sharding is cool but you know what's cooler? Stratified K-Fold Sharding. Basically we break down our dataset into multiple ("K") TFRecords (each is a shard) in such a way that:

* The distribution of abnormalities remains the same in each shard.
* Each x-ray is part of exactly one shard (to avoid information leak).

We can conveniently use these shards for training, validation and testing.

In [None]:
# Stratified K-Fold Sharding

num_shards = 25

skf = StratifiedKFold(n_splits = num_shards, shuffle = True, random_state = 0)

# One row per x-ray, indexed by image_id
df_folds = df.groupby('image_id').size().to_frame('bbox_count')         # Number of bounding boxes in the image
df_folds['object_count'] = df.groupby('image_id')['class_id'].nunique() # Number of classes in the image

# Preparing stratify groups: "<number of classes>_<number of boxes // 15>"
df_folds['stratify_group'] = (
    df_folds['object_count'].astype(str) + '_' + (df_folds['bbox_count'] // 15).astype(str)
)

# Determining which fold the x-ray will fall in
df_folds['fold'] = 0
fold_column = df_folds.columns.get_loc('fold')

splits = skf.split(X = df_folds.index, y = df_folds['stratify_group'])

# The held-out part of split k becomes shard k, so every x-ray lands in exactly one shard
for fold_number, (_, held_out) in enumerate(splits):
    df_folds.iloc[held_out, fold_column] = fold_number

# Turn image_id back into a regular column
df_folds = df_folds.reset_index()

In [None]:
# Attach the per-image fold information to every annotation
df = df.merge(df_folds)

# Table of annotation counts: one row per class_name, one column per fold
temp = (
    df.groupby(["fold", "class_name"])
    .agg(count = ("class_name", "count"))
    .reset_index()
    .pivot_table(index = "class_name", columns = "fold", values = "count")
)

In [None]:
# Heatmap of the per-fold class counts (rows: classes, columns: folds)
plt.figure(figsize = (20, 10))
ax = sns.heatmap(temp, annot = True, fmt = "g", cmap = "YlGnBu")
ax.set_title("Heatmap of class distribution")

Notice how the color is similar along each row. This indicates that the class distribution is similar across all folds (shards).

Once sharding is done, it is important to create TFRecords after applying CLAHE to each x-ray. We must remember to apply the same transformations to the x-rays we intend to make predictions for.

In [None]:
def create_tf_record(img_path, max_dim, img_df):
    """Build a ``tf.train.Example`` for one x-ray and its annotations.

    The image is loaded with ``read_dicom``, passed through ``CLAHE`` and stored as a
    grayscale JPEG next to its bounding boxes and class labels.

    Parameters
    ----------
    img_path : str
        Path of the DICOM file. The full path is stored as the source id and the part
        after the last "/" as the filename.
    max_dim
        Forwarded unchanged to ``read_dicom``.
    img_df : pandas.DataFrame
        Annotations of this image with the columns ``x_min``, ``x_max``, ``y_min``,
        ``y_max``, ``class_name`` and ``class_id``.

    Returns
    -------
    tf.train.Example
        The assembled example; the caller is responsible for serializing it.
    """
    # Preprocess image 
    pixels, _, _ = read_dicom(img_path, max_dim)
    height, width, _ = pixels.shape
    enhanced = CLAHE(pixels)
    
    # Encode as JPEG (Lossy compression) and take the raw bytes
    img_bytes = tf.io.encode_jpeg(
        enhanced, 
        quality = 100, 
        format = 'grayscale'
    ).numpy()
    
    # Class names must be stored as bytes
    class_names = [name.encode() for name in img_df["class_name"]]
    
    # Creating TFRecord
    feature = {
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(img_path.split("/")[-1].encode()),
        'image/source_id': dataset_util.bytes_feature(img_path.encode()),
        'image/encoded': dataset_util.bytes_feature(img_bytes),
        'image/format': dataset_util.bytes_feature(b'jpeg'),
        'image/object/bbox/xmin': dataset_util.float_list_feature(list(img_df["x_min"])),
        'image/object/bbox/xmax': dataset_util.float_list_feature(list(img_df["x_max"])),
        'image/object/bbox/ymin': dataset_util.float_list_feature(list(img_df["y_min"])),
        'image/object/bbox/ymax': dataset_util.float_list_feature(list(img_df["y_max"])),
        'image/object/class/text': dataset_util.bytes_list_feature(class_names),
        'image/object/class/label': dataset_util.int64_list_feature(list(img_df["class_id"])),
    }
    
    return tf.train.Example(features = tf.train.Features(feature = feature))

In [None]:
# Base path handed to the sharded TFRecord writer; the directory tree is created up front
# (exist_ok makes the cell safe to re-run).
# NOTE(review): open_sharded_output_tfrecords appears to treat this as a filename prefix
# ("<annot_path>-00000-of-000NN"), in which case the shards land next to this directory
# rather than inside it - confirm.
annot_path = "workspace/annotations" 
os.makedirs(annot_path, exist_ok = True) 

In [None]:
# Number of x-rays written to each shard
img_cnt = np.zeros(num_shards, dtype = int)

# The ExitStack closes every shard writer when the block is left
with contextlib2.ExitStack() as tf_record_close_stack:
    output_tfrecords = tf_record_creation_util.open_sharded_output_tfrecords(
        tf_record_close_stack, annot_path, num_shards
    )
    
    # Fold k of the stratified split is written to shard k
    for shard in tqdm(range(num_shards)):
        df_shard = df.loc[df["fold"] == shard]
        writer = output_tfrecords[shard]
        xrays = set(df_shard["image_id"])
        
        for xray in xrays:
            # One example per x-ray, holding all of its annotations
            df_image = df_shard.loc[df_shard["image_id"] == xray]
            img_path = os.path.join(path, "train", xray + ".dicom")
            
            example = create_tf_record(img_path, max_dim, df_image)
            writer.write(example.SerializeToString())
            
            img_cnt[shard] += 1

print("Converted {} images".format(np.sum(img_cnt)))
print("Images per shard: {}".format(img_cnt))

In [None]:
# Save dataframe: writes df to data.csv in the current working directory,
# leaving out the row index.
df.to_csv("data.csv", index = False)

Credits

* https://www.kaggle.com/bhallaakshit/dicom-wrangling-and-enhancement
* https://www.kaggle.com/bryanb/vinbigdata-chest-x-ray-eda-fusing-boxes