# VinBigData Chest X-ray Abnormalities Detection

![](https://static.theprint.in/wp-content/uploads/2020/03/qureai-696x392.jpg)

In [None]:
from datetime import datetime, timezone

# Take the timestamp explicitly in UTC so the "(GMT)" label printed below is
# correct whatever the host's local time zone is (a naive datetime.now()
# returns local time).
dt_string = datetime.now(timezone.utc).strftime("%d/%m/%Y %H:%M:%S")
print(f"Updated {dt_string} (GMT)")

## Prepare the data analysis
## Load packages

In [None]:
import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from matplotlib.patches import Rectangle
import seaborn as sns
import pydicom as dcm
%matplotlib inline 
IS_LOCAL = False
import os 
import sys
import random
import math
import numpy as np
import cv2
import matplotlib.pyplot as plt
import json
import pydicom
from imgaug import augmenters as iaa
from tqdm import tqdm
import pandas as pd 
import glob
from sklearn.model_selection import KFold
from pydicom.pixel_data_handlers.util import apply_voi_lut

In [None]:
# Root folder of the competition data (train/test DICOM folders are joined onto it below)
DATA_DIR = '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection'

# Directory to save logs and trained model
ROOT_DIR = '/kaggle/working'

In [None]:
# Fetch the Matterport Mask R-CNN sources and make the clone the working directory.
# NOTE(review): the chdir target is relative, so re-running this cell would
# clone and chdir inside the previous clone — confirm the cell is run only once.
!git clone https://www.github.com/matterport/Mask_RCNN.git
os.chdir('Mask_RCNN')
#!python setup.py -q install

In [None]:
# Import Mask RCNN
sys.path.append(os.path.join(ROOT_DIR, 'Mask_RCNN'))  # To find local version of the library
from mrcnn.config import Config
from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize
from mrcnn.model import log

In [None]:
# Folders holding the train / test DICOM files, located under DATA_DIR
train_dicom_dir = os.path.join(DATA_DIR, 'train')
test_dicom_dir = os.path.join(DATA_DIR, 'test')

In [None]:
# Download the COCO-pretrained Mask R-CNN weights into the current working
# directory and list the file to confirm the download.
!wget --quiet https://github.com/matterport/Mask_RCNN/releases/download/v2.0/mask_rcnn_coco.h5
!ls -lh mask_rcnn_coco.h5

# Relative path: it resolves against whatever the working directory is when used
COCO_WEIGHTS_PATH = "mask_rcnn_coco.h5"

In [None]:
def get_dicom_fps(dicom_dir):
    """Return the unique ``*.dicom`` file paths found directly in *dicom_dir*.

    The paths are returned sorted. Going through a bare ``set`` gives an
    order that depends on string hashing and can change between interpreter
    runs, which would make any seeded shuffle of the result (e.g. a
    train/validation split) non-reproducible.

    Args:
        dicom_dir: Directory to scan (not recursive).

    Returns:
        Sorted list of path strings of the form ``dicom_dir + '/' + name``.
    """
    dicom_fps = glob.glob(dicom_dir+'/'+'*.dicom')
    return sorted(set(dicom_fps))

def parse_dataset(dicom_dir, anns):
    """Group annotation rows by the DICOM file they belong to.

    Args:
        dicom_dir: Directory containing the ``<image_id>.dicom`` files.
        anns: DataFrame with at least an ``image_id`` column; each row is
            one annotation.

    Returns:
        Tuple ``(image_fps, image_annotations)``: the list of DICOM paths
        found in *dicom_dir*, and a dict mapping every one of those paths to
        the list of annotation rows for it (empty list when there are none).
        Rows whose image file is not present in *dicom_dir* are ignored.
    """
    image_fps = get_dicom_fps(dicom_dir)
    image_annotations = {fp: [] for fp in image_fps}
    for _, row in anns.iterrows():
        # Build the key exactly the way get_dicom_fps builds its paths, and
        # look it up in the dict (O(1)) rather than scanning the path list
        # once per annotation row.
        fp = dicom_dir+'/'+row['image_id']+'.dicom'
        bucket = image_annotations.get(fp)
        if bucket is not None:
            bucket.append(row)
    return image_fps, image_annotations

In [None]:
# The following parameters have been selected to reduce running time for demonstration purposes 
# These are not optimal 

class DetectorConfig(Config):
    """Configuration for training Chest X-ray Abnormalities dataset.
    Overrides values in the base Config class.
    """
    
    # Give the configuration a recognizable name  
    NAME = 'Abnormalities'
    
    # Train on 1 GPU and 8 images per GPU. We can put multiple images on each
    # GPU because the images are small. Batch size is 8 (GPUs * images/GPU).
    GPU_COUNT = 1
    IMAGES_PER_GPU = 8
    
    BACKBONE = 'resnet50'
    
    # background + 1 foreground class.
    # NOTE(review): DetectorDataset registers 15 class names, but its
    # load_mask only labels annotations with class_id 0 (Aortic enlargement)
    # as class 1 — confirm that a 2-class configuration is what is intended.
    NUM_CLASSES = 2
    
    # Images are resized to 256 x 256 for the network
    IMAGE_MIN_DIM = 256
    IMAGE_MAX_DIM = 256
    RPN_ANCHOR_SCALES = (16, 32, 64, 128)
    TRAIN_ROIS_PER_IMAGE = 32
    MAX_GT_INSTANCES = 4
    DETECTION_MAX_INSTANCES = 3
    DETECTION_MIN_CONFIDENCE = 0.78  ## match target distribution
    DETECTION_NMS_THRESHOLD = 0.01

    STEPS_PER_EPOCH = 200

# Instantiate the configuration and print its values
config = DetectorConfig()
config.display()

In [None]:
class DetectorDataset(utils.Dataset):
    """Dataset of chest X-ray DICOM images with bounding-box annotations.

    Every image is registered under the ``"Abnormalities"`` source together
    with its annotation rows. ``load_mask`` converts boxes into filled
    rectangular masks; only annotations with ``class_id == 0``
    ("Aortic enlargement") are drawn and labelled (as class 1). All other
    annotations keep an empty mask and label 0.
    """

    # Class names in the order of their dataset class_id (0..14)
    _CLASS_NAMES = (
        "Aortic enlargement",
        "Atelectasis",
        "Calcification",
        "Cardiomegaly",
        "Consolidation",
        "ILD",
        "Infiltration",
        "Lung Opacity",
        "Nodule/Mass",
        "Other lesion",
        "Pleural effusion",
        "Pleural thickening",
        "Pneumothorax",
        "Pulmonary fibrosis",
        "No finding",
    )

    def __init__(self, image_fps, image_annotations, orig_height, orig_width):
        """Register the classes and one entry per image path.

        Args:
            image_fps: Iterable of DICOM file paths to include.
            image_annotations: Mapping from file path to its annotation rows.
            orig_height: Mask height (pixels) stored with every image.
            orig_width: Mask width (pixels) stored with every image.
        """
        # The base initializer takes no required argument; the instance must
        # not be passed a second time.
        super().__init__()

        # Add classes
        for class_id, class_name in enumerate(self._CLASS_NAMES):
            self.add_class("Abnormalities", class_id, class_name)

        # add images
        # NOTE(review): the same orig_height/orig_width is stored for every
        # image — this assumes all DICOMs share one size; TODO confirm, since
        # load_mask sizes its masks from these values, not from the image.
        for i, fp in enumerate(image_fps):
            annotations = image_annotations[fp]
            self.add_image('Abnormalities', image_id=i, path=fp,
                           annotations=annotations, orig_height=orig_height, orig_width=orig_width)

    def image_reference(self, image_id):
        """Return the file path of the image with the given id."""
        info = self.image_info[image_id]
        return info['path']

    def load_image(self, image_id):
        """Read the DICOM pixel data and return it as a 3-channel array."""
        info = self.image_info[image_id]
        fp = info['path']
        ds = pydicom.dcmread(fp)
        image = ds.pixel_array
        # If grayscale. Convert to RGB for consistency.
        if len(image.shape) != 3 or image.shape[2] != 3:
            image = np.stack((image,) * 3, -1)
        return image

    def load_mask(self, image_id):
        """Build one filled-rectangle mask per annotation of the image.

        Returns:
            Tuple ``(mask, class_ids)``: a boolean array of shape
            ``(orig_height, orig_width, max(count, 1))`` and an int32 array
            with one label per mask channel (1 for drawn boxes, else 0).
        """
        info = self.image_info[image_id]
        annotations = info['annotations']
        height, width = info['orig_height'], info['orig_width']
        count = len(annotations)
        if count == 0:
            # No annotation: a single empty mask labelled 0
            mask = np.zeros((height, width, 1), dtype=np.uint8)
            class_ids = np.zeros((1,), dtype=np.int32)
        else:
            mask = np.zeros((height, width, count), dtype=np.uint8)
            class_ids = np.zeros((count,), dtype=np.int32)
            for i, a in enumerate(annotations):
                if a['class_id'] == 0:
                    # The annotation columns are corner coordinates
                    # (x_min, y_min, x_max, y_max), not origin + size, so the
                    # rectangle spans directly between the two corners.
                    top_left = (int(a['x_min']), int(a['y_min']))
                    bottom_right = (int(a['x_max']), int(a['y_max']))
                    # Draw on a contiguous copy of the channel, then write it back
                    mask_instance = mask[:, :, i].copy()
                    cv2.rectangle(mask_instance, top_left, bottom_right, 255, -1)
                    mask[:, :, i] = mask_instance
                    class_ids[i] = 1
        # Builtin bool: the np.bool alias no longer exists in current NumPy
        return mask.astype(bool), class_ids.astype(np.int32)

In [None]:
# Load the training annotations and drop the rows labelled 'No finding'
train = pd.read_csv('/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train.csv')
train = train[train['class_name']!='No finding']
train.head()

In [None]:
train.shape

In [None]:
def missing_data(data):
    """Summarise missing values per column.

    Returns a two-row DataFrame (rows ``Total`` and ``Percent``) with one
    column per column of *data*; each row is ordered by its own values,
    largest first, before the two are aligned side by side.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / null_mask.count() * 100).sort_values(ascending=False)
    summary = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return summary.T
missing_data(train)

In [None]:
# Bar chart of the number of annotations per class, each bar labelled with
# its share of all annotations.
f, ax = plt.subplots(1,1, figsize=(10,8))
total = float(len(train))
# Pass the column by keyword and draw on `ax` explicitly: a positional Series
# is not reliably taken as the x variable across seaborn versions, and the
# percentage labels below are attached to this very axes object.
sns.countplot(x=train['class_name'], order=train['class_name'].value_counts().index,
              palette='Set3', ax=ax)
for p in ax.patches:
    height = p.get_height()
    # Centre the label horizontally on the bar, slightly above its top
    ax.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}%'.format(100*height/total),
            ha="center") 
plt.show()

### Let's look at the classes in more detail.

In [None]:
def get_feature_distribution(data, feature):
    """Print how often each value of column *feature* occurs in *data*.

    One line is printed per distinct value, most frequent first, giving the
    absolute count and the percentage of all rows truncated (not rounded)
    to two decimals.
    """
    n_rows = len(data)
    counts = data[feature].value_counts()

    print("Feature: {}".format(feature))
    for label, count in zip(counts.index, counts.values):
        # Truncate to two decimals: scale up, drop the fraction, scale down
        percent = int((count / n_rows) * 10000) / 100
        print("{:<30s}:   {} or {}%".format(label, count, percent))

get_feature_distribution(train, 'class_name')

## Aortic enlargement bounding boxes
### For the class Aortic enlargement, we plot the density of x_max, y_max, x_min and y_min

In [None]:
# Density of each bounding-box coordinate for the 'Aortic enlargement' class,
# one subplot per coordinate.
target1 = train[train['class_name']=='Aortic enlargement']
sns.set_style('whitegrid')
# A single subplots() call creates the figure; an extra plt.figure() before it
# would only emit an additional empty figure.
fig, ax = plt.subplots(2,2,figsize=(12,12))
panels = [('x_max', "red"), ('y_max', "blue"), ('x_min', "green"), ('y_min', "magenta")]
# ax.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
for axis, (column, color) in zip(ax.flat, panels):
    sns.distplot(target1[column],kde=True,bins=50, color=color, ax=axis)
    # Set the tick label size on every subplot, not just the last-drawn one
    axis.tick_params(axis='both', which='major', labelsize=12)
plt.show()

## The centers of the rectangles as points

In [None]:
# Scatter of bounding-box centers with the boxes themselves drawn faintly
# underneath.
fig, ax = plt.subplots(1,1,figsize=(7,7))
# Copy before adding columns so we do not write into a slice of `train`;
# target1 and target_sample still refer to the same object, as before.
target_sample = target1 = target1.copy()
# The columns are corner coordinates, so the center is the midpoint of the
# min/max pair on each axis.
target_sample['xc'] = (target_sample['x_min'] + target_sample['x_max']) / 2
target_sample['yc'] = (target_sample['y_min'] + target_sample['y_max']) / 2
# Axis limits follow the data instead of a fixed 1024-pixel frame
x_limits = (0, target_sample['x_max'].max())
y_limits = (0, target_sample['y_max'].max())
plt.title("Centers of Aortic enlargement rectangles (brown) over rectangles (yellow)")
target_sample.plot.scatter(x='xc', y='yc', xlim=x_limits, ylim=y_limits, ax=ax, alpha=0.2, marker=".", color="brown")
for _, crt_sample in target_sample.iterrows():
    # Width/height are corner differences, not the max coordinates themselves
    ax.add_patch(Rectangle(xy=(crt_sample['x_min'], crt_sample['y_min']),
                width=crt_sample['x_max'] - crt_sample['x_min'],
                height=crt_sample['y_max'] - crt_sample['y_min'],
                alpha=3.5e-3, color="yellow"))
plt.show()

## Explore DICOM data
#### Let's now read the DICOM data in the train set. The image paths are as follows:

In [None]:
# Peek at the first five entries of the train folder (os.listdir order is arbitrary)
image_sample_path = os.listdir('/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train')[:5]
print(image_sample_path)

The file names are the image IDs (the `image_id` column of the annotations).

Let's check how many images are in the train and test folders.

In [None]:
# List both image folders and report how many entries each contains
image_train_path = os.listdir('/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train')
image_test_path = os.listdir('/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/test')
print("Number of images in train set:", len(image_train_path),"\nNumber of images in test set:", len(image_test_path))

### The training folder contains far fewer images (15000) than there are rows in the `train` annotation data (67914).

### It may be that the train annotations contain several entries for the same image. Let's check this.

### Check duplicates in train dataset

In [None]:
print("Unique image_id in  train: ", train['image_id'].nunique()) 

In [None]:
# Number of annotation rows per (image_id, class_name) pair
tmp = train.groupby(['image_id','class_name'])['image_id'].count()
# 'Exams' holds that per-pair row count; the group keys become regular columns
df = pd.DataFrame(data={'Exams': tmp.values}, index=tmp.index).reset_index()
# For every (Exams, class_name) combination, count how many images have it
tmp = df.groupby(['Exams','class_name']).count()
df2 = pd.DataFrame(data=tmp.values, index=tmp.index).reset_index()
df2.columns = ['Exams', 'class_name', 'Entries']
df2

In [None]:
# Grouped bars: per class, how many images ('Entries') carry a given number
# of annotation rows ('Exams') for that class
fig, ax = plt.subplots(nrows=1,figsize=(12,6))
sns.barplot(ax=ax,x = 'class_name', y='Entries', hue='Exams',data=df2, palette='Set2')
plt.title("Chest exams class and class_name")
plt.show()

In [None]:
# Read one training DICOM and display its header fields.
# dcmread is used for consistency with the rest of the notebook; read_file is
# the legacy alias of the same reader.
dicom_file_dataset = dcm.dcmread('/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train/000434271f63a053c4128a0ba6352c7f.dicom')
dicom_file_dataset

In [None]:
def show_dicom_images(data):
    """Display the DICOM images at the given paths in a 3-column grid.

    Args:
        data: Iterable of DICOM file paths. Any number of paths is accepted;
            the grid grows by rows of three (nine paths give the 3x3,
            16x18-inch layout). Nothing is drawn for an empty input.
    """
    image_paths = list(data)
    if not image_paths:
        return
    n_cols = 3
    n_rows = -(-len(image_paths) // n_cols)  # ceiling division
    # squeeze=False keeps `ax` two-dimensional even for a single row
    f, ax = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows), squeeze=False)
    # Hide the frame of every cell, including any left unused in the last row
    for axis in ax.flat:
        axis.axis('off')
    for axis, image_path in zip(ax.flat, image_paths):
        # Each file is read exactly once
        dicom_image = dcm.dcmread(image_path)
        axis.imshow(dicom_image.pixel_array, cmap=plt.cm.bone)
    plt.show()

In [None]:
# Nine hand-picked training DICOM paths (fills a 3x3 preview grid)
data = ['/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train/000434271f63a053c4128a0ba6352c7f.dicom',
       '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train/00053190460d56c53cc3e57321387478.dicom',
       '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train/0005e8e3701dfb1dd93d53e2ff537b6e.dicom',
       '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train/0006e0a85696f6bb578e84fafa9a5607.dicom',
       '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train/0007d316f756b3fa0baea2ff514ce945.dicom',
       '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train/000ae00eb3942d27e0b97903dd563a6e.dicom',
       '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train/000d68e42b71d3eac10ccc077aba07c1.dicom',
       '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train/00150343289f317a0ad5629d5b7d9ef9.dicom',
       '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train/00176f7e1b1cb835123f95960b9a9efd.dicom']

In [None]:
show_dicom_images(data)

# Plot bounding box

In [None]:
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixels as a uint8 array in [0, 255].

    Args:
        path: Path of the DICOM file.
        voi_lut: Apply the VOI LUT (if available by DICOM device) to turn
            the raw data into a "human-friendly" view.
        fix_monochrome: Invert the image when its photometric interpretation
            is ``MONOCHROME1``, where the X-ray would otherwise look inverted.

    Returns:
        ``numpy.ndarray`` of dtype ``uint8``, min-max scaled to 0..255. A
        constant image yields all zeros.
    """
    dicom = pydicom.dcmread(path)
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    # Skip the scaling for a constant image: 0/0 would produce NaNs
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return data
        
    
def plot_img(img, size=(7, 7), is_rgb=True, title="", cmap='gray'):
    """Show a single image in a new figure.

    Args:
        img: Image array handed to ``plt.imshow``.
        size: Figure size passed as ``figsize``.
        is_rgb: Accepted but not used by this function.
        title: Figure title (set with ``plt.suptitle``).
        cmap: Colormap handed to ``plt.imshow``.
    """
    plt.figure(figsize=size)
    plt.imshow(img, cmap=cmap)
    plt.suptitle(title)
    plt.show()


def plot_imgs(imgs, cols=4, size=7, is_rgb=True, title="", cmap='gray', img_size=(500,500)):
    """Show several images in a grid with *cols* columns.

    Args:
        imgs: Sequence of image arrays.
        cols: Number of grid columns.
        size: Edge length given to each grid cell when sizing the figure.
        is_rgb: Accepted but not used by this function.
        title: Figure title (set with ``plt.suptitle``).
        cmap: Colormap handed to ``plt.imshow``.
        img_size: Target size handed to ``cv2.resize`` for every image, or
            ``None`` to show the images unresized.
    """
    # Ceiling division, with at least one row: `len // cols + 1` reserves a
    # blank extra row whenever the image count is a multiple of `cols`.
    rows = max(1, -(-len(imgs) // cols))
    fig = plt.figure(figsize=(cols*size, rows*size))
    for i, img in enumerate(imgs):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        fig.add_subplot(rows, cols, i+1)
        plt.imshow(img, cmap=cmap)
    plt.suptitle(title)
    plt.show()
    
def draw_bboxes(img, boxes, thickness=10, color=(255, 0, 0), img_size=(500,500)):
    """Return a copy of *img* with the given boxes drawn on it.

    Args:
        img: Image array; a 2-D (grayscale) image is expanded to three
            identical channels first. The input is never modified.
        boxes: Iterable of boxes indexed as ``[x0, y0, x1, y1]``.
        thickness: Line thickness handed to ``cv2.rectangle``.
        color: Line color handed to ``cv2.rectangle``.
        img_size: Size the result is resized to, or ``None`` to keep it.
    """
    canvas = img.copy()
    if canvas.ndim == 2:
        canvas = np.stack([canvas] * 3, axis=-1)
    for box in boxes:
        top_left = (int(box[0]), int(box[1]))
        bottom_right = (int(box[2]), int(box[3]))
        canvas = cv2.rectangle(canvas, top_left, bottom_right, color, thickness)
    if img_size is None:
        return canvas
    return cv2.resize(canvas, img_size)

In [None]:
import random
from random import randint

# Draw eight randomly chosen training images with their bounding boxes,
# using one random color per class.
imgs = []
img_ids = train['image_id'].values
class_ids = train['class_id'].unique()

# map label_id to specify color
label2color = {class_id:[randint(0,255) for _ in range(3)] for class_id in class_ids}
thickness = 3
scale = 5  # images and boxes are shrunk by this factor before drawing


for _ in range(8):
    img_id = random.choice(img_ids)
    img_path = f'{train_dicom_dir}/{img_id}.dicom'
    img = dicom2array(path=img_path)
    img = cv2.resize(img, None, fx=1/scale, fy=1/scale)
    img = np.stack([img, img, img], axis=-1)

    # Select the image's annotation rows once and take both arrays from them
    img_rows = train.loc[train['image_id'] == img_id]
    boxes = img_rows[['x_min', 'y_min', 'x_max', 'y_max']].values/scale
    # Keep the labels 1-D: squeezing an (n, 1) array collapses a single
    # annotation to a 0-d array, which cannot be iterated by zip() below.
    labels = img_rows['class_id'].values

    for label_id, box in zip(labels, boxes):
        color = label2color[label_id]
        img = cv2.rectangle(
            img,
            (int(box[0]), int(box[1])),
            (int(box[2]), int(box[3])),
            color, thickness
        )
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)

In [None]:
# training dataset
# Re-read the full annotation file (unlike `train`, 'No finding' rows are
# kept here) and keep only the columns listed in `features`.
features = ['image_id' ,'class_id', 'rad_id', 'x_min', 'y_min', 'x_max', 'y_max']
anns = pd.read_csv('/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/train.csv')
anns = anns[features]
anns.head()

In [None]:
image_fps, image_annotations = parse_dataset(train_dicom_dir, anns=anns)

In [None]:
# dcmread is used for consistency with the rest of the notebook; read_file is
# the legacy alias of the same reader.
ds = pydicom.dcmread(image_fps[0]) # read dicom image from filepath 
image = ds.pixel_array # get image array

In [None]:
# show dicom fields 
ds

In [None]:
# Original DICOM image size: 3072 x 2540
# NOTE(review): these two values are later passed to DetectorDataset as
# (orig_height, orig_width) in this order, i.e. WIDTH_SIZE is used as the
# height — confirm which of the two is really the row count.
WIDTH_SIZE = 3072
HIGH_SIZE = 2540

In [None]:
# Shuffle the image paths with a fixed seed and hold out the first
# `val_size` of them for validation; the rest is used for training.
image_fps_list = list(image_fps)
random.seed(42)
random.shuffle(image_fps_list)
val_size = 1500
image_fps_val = image_fps_list[:val_size]
image_fps_train = image_fps_list[val_size:]

print(len(image_fps_train), len(image_fps_val))
# print(image_fps_val[:6])

In [None]:
# prepare the training dataset
# NOTE(review): the constructor signature is (..., orig_height, orig_width),
# so WIDTH_SIZE is passed as the height here — confirm this is intended.
dataset_train = DetectorDataset(image_fps_train, image_annotations, WIDTH_SIZE, HIGH_SIZE)
dataset_train.prepare()

In [None]:
# Show annotation(s) for a DICOM image 
test_fp = random.choice(image_fps_train)
image_annotations[test_fp]

In [None]:
# prepare the validation dataset
# NOTE(review): the constructor signature is (..., orig_height, orig_width),
# so WIDTH_SIZE is passed as the height here — confirm this is intended.
dataset_val = DetectorDataset(image_fps_val, image_annotations, WIDTH_SIZE, HIGH_SIZE)
dataset_val.prepare()

In [None]:
# Load and display random sample and their bounding boxes

# Keep sampling until the first mask label of the picked image is non-zero.
# NOTE(review): this never terminates if no image in dataset_train yields a
# non-zero first label — confirm that cannot happen.
class_ids = [0]
while class_ids[0] == 0:  ## look for a mask
    image_id = random.choice(dataset_train.image_ids)
    image_fp = dataset_train.image_reference(image_id)
    image = dataset_train.load_image(image_id)
    mask, class_ids = dataset_train.load_mask(image_id)

print(image.shape)

# Left panel: the image itself
plt.figure(figsize=(10, 10))
plt.subplot(1, 2, 1)
plt.imshow(image)
plt.axis('off')

# Right panel: the first image channel kept only where a mask is set.
# The element-wise product requires the mask and the image to have the same
# height and width.
plt.subplot(1, 2, 2)
masked = np.zeros(image.shape[:2])
for i in range(mask.shape[2]):
    masked += image[:, :, 0] * mask[:, :, i]
plt.imshow(masked, cmap='gray')
plt.axis('off')

print(image_fp)
print(class_ids)

In [None]:
# Image augmentation (light but constant)
# Three stages are applied in sequence; each stage picks exactly one of its
# listed augmenters.
augmentation = iaa.Sequential([
    iaa.OneOf([ ## geometric transform
        iaa.Affine(
            scale={"x": (0.98, 1.02), "y": (0.98, 1.04)},
            translate_percent={"x": (-0.02, 0.02), "y": (-0.04, 0.04)},
            rotate=(-2, 2),
            shear=(-1, 1),
        ),
        iaa.PiecewiseAffine(scale=(0.001, 0.025)),
    ]),
    iaa.OneOf([ ## brightness or contrast
        iaa.Multiply((0.9, 1.1)),
        # NOTE(review): ContrastNormalization is, I believe, deprecated in
        # newer imgaug releases — confirm against the installed version.
        iaa.ContrastNormalization((0.9, 1.1)),
    ]),
    iaa.OneOf([ ## blur or sharpen
        iaa.GaussianBlur(sigma=(0.0, 0.1)),
        iaa.Sharpen(alpha=(0.0, 0.1)),
    ]),
])

# test on the same image as above
# Draw a 5 x 2 grid of augmented versions of the image's first channel
imggrid = augmentation.draw_grid(image[:, :, 0], cols=5, rows=2)
plt.figure(figsize=(30, 12))
_ = plt.imshow(imggrid[:, :, 0], cmap='gray')

Reference: [Notebook 1](https://www.kaggle.com/gpreda/rsna-pneumonia-detection-eda)
[Notebook 2](https://www.kaggle.com/hmendonca/mask-rcnn-and-medical-transfer-learning-siim-acr)