In [None]:
import pandas as pd
import numpy as np
import os
import pydicom
from glob import glob
from tqdm.notebook import tqdm
from pydicom.pixel_data_handlers.util import apply_voi_lut
import matplotlib.pyplot as plt
from skimage import exposure
import cv2
import warnings
warnings.filterwarnings('ignore')

from bokeh.plotting import figure as bokeh_figure
from bokeh.io import output_notebook, show, output_file
from bokeh.models import ColumnDataSource, HoverTool, Panel
from bokeh.models.widgets import Tabs
from PIL import Image
from sklearn import preprocessing
import random
from random import randint

# ---- #
from sklearn.utils import shuffle
from IPython.core.display import display, HTML, Javascript
from string import Template
import json, random
import IPython.display
from plotly.offline import init_notebook_mode, iplot
from plotly import subplots
import plotly.figure_factory as ff
import plotly as py
import plotly.graph_objects as go
init_notebook_mode(connected=True)


# Colour palette (hex strings) defined once for the notebook's styling.
# The CSS in the table-of-contents cell hard-codes the same hex values as
# primary_blue3 (#3f4d63) and f5 (#235f83).
primary_blue = "#496595"
primary_blue2 = "#85a1c1"
primary_blue3 = "#3f4d63"
primary_grey = "#c6ccd8"
primary_black = "#202022"
primary_bgcolor = "#f4f0ea"

# "Coffee" palette (turquoise-gold).
f1 = "#a2885e"
f2 = "#e9cf87"
f3 = "#f1efd9"
f4 = "#8eb3aa"
f5 = "#235f83"
f6 = "#b4cde3"

In [None]:
# Styled table of contents kept as one raw HTML document string (CSS rules
# plus the nested list of section titles). The trailing HTML(...) expression
# is the cell's last value, so the notebook renders it as the cell output.
html_contents ="""
<!DOCTYPE html>
<html lang="en">
    <head>
    <style>
    .toc h2{
        color: white;
        background: #3f4d63;
        font-weight: 600;
        font-family: Helvetica;
        font-size: 23px;
        padding: 6px 12px;
        margin-bottom: 2px;
    }
    
    .toc ol li{
        list-style:none;
        line-height:normal;
        }
     
    .toc li{
        background: #235f83;
        color: white;
        font-weight: 600;
        font-family: Helvetica;
        font-size: 18px;
        margin-bottom: 2px;
        padding: 6px 12px;
    }

    .toc ol ol li{
        background: #fff;
        color: #4d4d4d;
        font-weight: 400;
        font-size: 15px;
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        margin-top: 0px;
        margin-bottom: 0px;
        padding: 3px 12px;
    } 
    
    .section_title{
        background-color: #3f4d63;
        color: white;
        font-family: Helvetica;
        font-size: 25px;
        padding: 6px 12px;
        margin-bottom: 5px;
    }
    .subsection_title{
        background: #235f83;
        color: white;
        font-family: Helvetica;
        font-size: 21px;
        padding: 6px 12px;
        margin-bottom: 0px;
    }
    .sidenote{
        font-size: 13px;
        border: 1px solid #d7d7d7;
        padding: 1px 10px 2px;
        box-shadow: 1px 1px 2px 1px rgba(0,0,0,0.3);
        margin-bottom: 3px;
    }
    </style>
    </head>
    <body>
        <div class="toc">
        
        <ol> 
        <h2> Table of Contents </h2>
        <li>1. Introduction </li> 
        <li>2. Dicom to Numpy array</li>
        <li>3. EDA csv</li>
        <ol> 
            <li>3.1 Plot Bounding Box </li>
            <li>3.2 Plot Histogram </li> 
        </ol>
        <li>4. References </li>
        </ol>
        </div>
    </body>
</html>
"""

HTML(html_contents)

# 1. Introduction

In this competition, we will classify 14 types of thoracic abnormalities from chest radiographs. We will work with 18K scans, annotated by experienced radiologists. We will train the model on 15K independently labeled images and then evaluate it on a test set of 3K images. These annotations were collected via VinBigData's web-based platform, VinLab. Details on building the dataset can be found in our recent paper “VinDr-CXR: An open dataset of chest X-rays with radiologist's annotations”.



# 2. Dicom to Numpy array

All the images are in DICOM format, so we will convert them into NumPy arrays.


In [None]:
# Root directory of the competition data. Nothing is loaded here: later cells
# read the training DICOM files and train.csv relative to this path.

dataset_dir = '../input/vinbigdata-chest-xray-abnormalities-detection'

In [None]:
def dicom2array(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file and return its pixel data as a uint8 array.

    Parameters
    ----------
    path : str or path-like
        Location of the DICOM file.
    voi_lut : bool
        When True, the raw pixel array is passed through ``apply_voi_lut``
        using the file's own metadata; otherwise the raw pixel array is used.
    fix_monochrome : bool
        When True and the file's PhotometricInterpretation is "MONOCHROME1",
        the intensities are inverted (``max - value``).

    Returns
    -------
    numpy.ndarray
        Pixel data min-max scaled to the range 0..255 and cast to ``uint8``.
        A constant image (no intensity range) is returned as all zeros.
    """
    # `dcmread` is the current reader; `read_file` is a deprecated alias.
    dicom = pydicom.dcmread(path)
    pixels = apply_voi_lut(dicom.pixel_array, dicom) if voi_lut else dicom.pixel_array

    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        pixels = np.amax(pixels) - pixels

    # Min-max normalisation: shift to zero, then scale by the remaining peak.
    pixels = pixels - np.min(pixels)
    peak = np.max(pixels)
    if peak == 0:
        # Constant image: dividing by zero would push NaN through the uint8 cast.
        return np.zeros(pixels.shape, dtype=np.uint8)

    return (pixels / peak * 255).astype(np.uint8)

def plot_img(img, size = (12,12), is_rgb = True, title = "", cmap = 'gray'):
    """Show a single image in a new matplotlib figure.

    ``size`` is the figure size, ``title`` is drawn as the figure's
    suptitle and ``cmap`` is forwarded to ``imshow``. ``is_rgb`` is accepted
    but not used by this function.
    """
    canvas = plt.figure(figsize=size)
    plt.imshow(img, cmap=cmap)
    canvas.suptitle(title)
    plt.show()
    
def plot_imgs(imgs, cols = 3, size = 10, is_rgb=True, title = "", cmap = 'gray', img_size = (300, 300)):
    """Show a list of images in a grid with ``cols`` columns.

    Parameters
    ----------
    imgs : sequence of arrays
        Images to display, in order (row-major).
    cols : int
        Number of grid columns.
    size : int or float
        Size of one grid cell; the figure is ``cols*size`` by ``rows*size``.
    is_rgb : bool
        Accepted but not used by this function.
    title : str
        Figure suptitle.
    cmap : str or None
        Colormap forwarded to ``imshow``.
    img_size : tuple or None
        When not None, every image is resized to this size with
        ``cv2.resize`` before plotting.
    """
    # Ceiling division, so a completely filled last row does not add an empty
    # extra row to the figure. Keep at least one row for an empty input.
    rows = max(1, -(-len(imgs) // cols))
    fig = plt.figure(figsize=(cols*size, rows*size))
    for position, img in enumerate(imgs, start=1):
        if img_size is not None:
            img = cv2.resize(img, img_size)

        ax = fig.add_subplot(rows, cols, position)
        ax.imshow(img, cmap=cmap)
    fig.suptitle(title)
    plt.show()
    
    

In [None]:
# Decode one training scan with the default settings (voi_lut=True,
# fix_monochrome=True). The path is built from dataset_dir so the data
# location is configured in a single place.
img = dicom2array(f'{dataset_dir}/train/0108949daa13dc94634a7d650a05c0bb.dicom')
plt.figure(figsize= (15,15))
plt.imshow(img, 'gray')

The same image without fixing the MONOCHROME1 inversion issue:

In [None]:
# Decode the same scan with fix_monochrome=False, i.e. without inverting
# MONOCHROME1 pixel data. The path is built from dataset_dir so the data
# location is configured in a single place.
img = dicom2array(f'{dataset_dir}/train/0108949daa13dc94634a7d650a05c0bb.dicom', fix_monochrome = False)
plt.figure(figsize = (15,15))
plt.imshow(img, 'gray')

In [None]:
# glob returns paths in arbitrary order; sorting makes the four scans shown
# here the same on every run.
dicom_paths = sorted(glob(f'{dataset_dir}/train/*.dicom'))
imgs = [dicom2array(path) for path in dicom_paths[:4]]
plot_imgs(imgs)

In [None]:
# Apply histogram equalization to the images loaded above and plot them
# again, for a before/after comparison with the previous grid.

imgs = list(map(exposure.equalize_hist, imgs))
plot_imgs(imgs)

# 3. EDA csv

Let's perform exploratory data analysis (EDA) on the annotation CSV to find the important features of this dataset.

In [None]:
def get_bbox_area(row):
    """Return the bounding-box area, (x_max - x_min) * (y_max - y_min).

    ``row`` is anything indexable by the keys 'x_min', 'x_max', 'y_min'
    and 'y_max'.
    """
    width = row['x_max'] - row['x_min']
    height = row['y_max'] - row['y_min']
    return width * height

# Load the annotation table and encode each radiologist id ('rad_id') as an
# integer label.
train_df = pd.read_csv(f'{dataset_dir}/train.csv')
le = preprocessing.LabelEncoder()
train_df['rad_label'] = le.fit_transform(train_df['rad_id'])

# Keep only the rows whose class is not 'No finding'. `assign` returns a new
# frame, so the area column is never written into a slice of train_df
# (chained assignment). Passing get_bbox_area the whole frame evaluates the
# area column-wise instead of row by row.
finding_df = (
    train_df[train_df['class_name'] != 'No finding']
    .assign(bbox_area=get_bbox_area)
)
finding_df.head()

# 3.1 Plot Bounding Box

In [None]:
imgs = []
img_ids = finding_df['image_id'].values
class_ids = finding_df['class_id'].unique()

# Map every class id to a random RGB colour so boxes of one class share a colour.
label2color = {class_id:[randint(0,255) for _ in range(3)] for class_id in class_ids}
thickness = 3
scale = 6  # scans are downscaled by this factor before drawing

for i in range(9):
    img_id = random.choice(img_ids)
    img_path = f'{dataset_dir}/train/{img_id}.dicom'
    img = dicom2array(path=img_path)
    img = cv2.resize(img, None, fx=1/scale, fy = 1/scale)
    # Replicate the single channel three times so coloured boxes can be drawn.
    img = np.stack([img, img, img], axis = -1)

    # Select this image's annotation rows once; box coordinates are divided
    # by the same factor as the image.
    annotations = finding_df.loc[finding_df['image_id'] == img_id]
    boxes = annotations[['x_min', 'y_min', 'x_max', 'y_max']].values/scale
    # Take the column as a 1-D array. Squeezing an (n, 1) array collapses to
    # a 0-d array when the image has a single box, which cannot be iterated.
    labels = annotations['class_id'].values

    for label_id, box in zip(labels, boxes):
        color = label2color[label_id]
        img = cv2.rectangle(img,
                            (int(box[0]), int(box[1])),
                            (int(box[2]), int(box[3])),
                            color, thickness
                           )
    img = cv2.resize(img, (600, 600))
    imgs.append(img)

plot_imgs(imgs, cmap=None)

# 3.2 Plot Histogram

In [None]:
def hist_hover(dataframe, column, color = ['#94c8d8', '#ea5e51'], bins=30, title="", value_range=None):
    """Show an interactive Bokeh histogram of ``dataframe[column]``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding the data.
    column : str
        Name of the column to histogram; also used as the x-axis label.
    color : sequence of two colour strings
        ``color[0]`` fills the bars, ``color[1]`` fills a bar while hovered.
        The default list is only read, never modified.
    bins : int or sequence
        Forwarded to ``numpy.histogram``.
    title : str
        Plot title.
    value_range : tuple or None
        Forwarded to ``numpy.histogram`` as ``range``.

    The plot is displayed inline (``output_notebook`` + ``show``); nothing is
    returned.
    """
    # NaN values break numpy's automatic range detection, so drop them first.
    values = dataframe[column].dropna()
    hist, edges = np.histogram(values, bins = bins, range=value_range)
    lefts, rights = edges[:-1], edges[1:]

    # The count field has a fixed name so the tooltip reference below is valid
    # whatever characters `column` contains.
    hist_frame = pd.DataFrame({
        "count": hist,
        "left": lefts,
        "right": rights
    })
    # Two decimals keep fractional bin edges visible; integer formatting would
    # label narrow bins as e.g. "0 to 0".
    hist_frame["interval"] = [
        f"{left:,.2f} to {right:,.2f}" for left, right in zip(lefts, rights)
    ]
    src = ColumnDataSource(hist_frame)

    # `height`/`width` replace `plot_height`/`plot_width`, which Bokeh 3 removed.
    plot = bokeh_figure(
        height=400, width=600,
        title=title, x_axis_label=column,
        y_axis_label = 'Count'
    )

    plot.quad(
        bottom = 0, top = "count", left = "left", right = "right",
        source = src, fill_color = color[0], line_color = '#35838d',
        fill_alpha=0.7, hover_fill_alpha=0.7,
        hover_fill_color = color[1]
    )

    hover = HoverTool(
        tooltips= [("Interval", "@interval"), ("Count", "@count")]
    )
    plot.add_tools(hover)
    output_notebook()
    show(plot)
    
# Histogram of class_id over the rows that carry a finding.
hist_hover(finding_df, column = 'class_id')


In [None]:
# Histogram of the encoded radiologist label over the rows that carry a finding.
hist_hover(finding_df, column = 'rad_label')

In [None]:
# Histogram of bounding-box area over the rows that carry a finding.
hist_hover(finding_df, column = 'bbox_area')

# 4. References

https://www.kaggle.com/trungthanhnguyen0502/eda-vinbigdata-chest-x-ray-abnormalities

https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way
