![Chest XRAY](https://media.springernature.com/lw685/springer-static/image/art%3A10.1186%2Fs12890-020-01286-5/MediaObjects/12890_2020_1286_Fig1_HTML.png)

# Competition Aim
Localize and classify 14 types of thoracic abnormalities from chest radiographs
## This is an Object Detection Competition

In [None]:
import os
import cv2
import time
import numpy as np
import pandas as pd
import random

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff

import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
%matplotlib inline

In [None]:
# Kaggle input locations: the dataset root plus its train/test sub-folders.
ROOT_DIR = "../input/vinbigdata-chest-xray-abnormalities-detection"
TRAIN_DIR = f"{ROOT_DIR}/train"
TEST_DIR = f"{ROOT_DIR}/test"

In [None]:
# Load the training annotation table (train.csv) from the dataset root.
train_csv_path = os.path.join(ROOT_DIR, "train.csv")
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the annotation table (DataFrame.head defaults to 5 rows).
df.head()

In [None]:
# Report the (rows, columns) shape of the annotation table.
frame_shape = df.shape
print(f"Shape of the Dataframe: {frame_shape}")

In [None]:
# Count the distinct image_id values present in the training annotations.
num_train_images = df['image_id'].nunique()
print(f"Number of unique images in the training dataset: {num_train_images}")

In [None]:
# List the test directory and report how many entries it holds.
# NOTE: every directory entry is counted, so this presumes TEST_DIR contains
# only image files - verify if other files may be present.
test_images = os.listdir(TEST_DIR)
print(f"Number of unique images in the test dataset: {len(test_images)}")

In [None]:
# Count the distinct radiologist IDs that contributed annotations.
num_radiologists = df['rad_id'].nunique()
print(f"The dataset consists of observations made by {num_radiologists} radiologists")

In [None]:
# Count annotation rows per radiologist and show them as a table.
# The result columns are named explicitly because the default names produced by
# value_counts().reset_index() differ between pandas 1.x ('index', 'rad_id')
# and pandas 2.x ('rad_id', 'count'); relying on them raises KeyError on 2.x.
rad_df = (
    df['rad_id']
    .value_counts()
    .rename_axis('rad_id')
    .reset_index(name='num_observations')
)
fig = go.Figure(data=[go.Table(
    header=dict(values=['Radiologist ID', 'Number of Observations'], fill_color='yellow'),
    cells=dict(values=[rad_df['rad_id'], rad_df['num_observations']], fill_color='lavender'),
)])
fig.show()

In [None]:
# Tally how many annotation rows each image_id has.
image_id_values = df['image_id'].tolist()
cnt = Counter(image_id_values)

In [None]:
# Display the full image_id -> row-count mapping built from df['image_id'].
cnt

An image can contain more than one bounding box, so this is an Object Detection Competition

# Maximum number of Boxes in an Image

Let's get the image_id of the image with maximum bounding boxes

In [None]:
# Counter.most_common(1) returns the single (image_id, count) pair with the
# highest count, giving both the ID and its row count in one lookup.
max_boxes_image, max_boxes_count = cnt.most_common(1)[0]
print(f"Image ID of image with maximum boxes is: '{max_boxes_image}' and number of boxes is {max_boxes_count}")

## Let's Visualize this image
All the images are in **DICOM** format

In [None]:
# Code taken from https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

def read_xray(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixels as an 8-bit grayscale array.

    Args:
        path: Location of the DICOM file to read.
        voi_lut: When true, pass the raw pixels through ``apply_voi_lut``
            (the VOI LUT, if the DICOM provides one, maps raw data to a
            "human-friendly" view). When false, use the raw pixel array.
        fix_monochrome: When true, invert the image if its
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``, so that such
            X-rays do not look inverted.

    Returns:
        A ``numpy.uint8`` array min-max rescaled to the 0..255 range. An image
        whose pixels are all equal comes back as all zeros.
    """
    # dcmread is the supported reader; the read_file alias was removed in pydicom 3.0.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # Depending on this value the X-ray may look inverted - flip it back.
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Min-max normalise to [0, 1], then stretch to 8 bits.
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        # Guard: a constant image has peak == 0 and would otherwise divide by
        # zero, producing NaNs that turn into garbage when cast to uint8.
        data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

In [None]:
def show_dicom(image_id, root_dir=TRAIN_DIR):
    """Draw the DICOM image ``<root_dir>/<image_id>.dicom`` in grayscale.

    A new 12x12-inch matplotlib figure is opened for the image, so follow-up
    drawing calls on the current axes land on top of it.
    """
    dicom_path = os.path.join(root_dir, image_id + ".dicom")
    pixels = read_xray(dicom_path)
    plt.figure(figsize=(12, 12))
    plt.imshow(pixels, cmap='gray')

In [None]:
def get_all_bboxes(df, image_id):
    """Return every bounding box recorded in `df` for `image_id`.

    Args:
        df: DataFrame with ``image_id``, ``x_min``, ``y_min``, ``x_max`` and
            ``y_max`` columns.
        image_id: Identifier of the image whose boxes are wanted.

    Returns:
        A list of ``(x_min, y_min, x_max, y_max)`` tuples, one per matching
        row, in row order. The list is empty when no row matches.
    """
    # Select the four coordinate columns once and emit plain tuples; this
    # avoids building a Series per row as DataFrame.iterrows does.
    coords = df.loc[df["image_id"] == image_id, ["x_min", "y_min", "x_max", "y_max"]]
    return list(coords.itertuples(index=False, name=None))

def plot_single_image(image_id, df):
    """Show image `image_id` and outline each of its boxes found in `df`.

    Args:
        image_id: Identifier of the image to display (via ``show_dicom``).
        df: DataFrame that ``get_all_bboxes`` searches for this image's boxes.
            Only boxes present in `df` are drawn, so passing a filtered frame
            limits which boxes appear.
    """
    show_dicom(image_id)
    # Fetch the current axes once, after show_dicom has drawn the image,
    # instead of on every loop iteration.
    ax = plt.gca()
    for x_min, y_min, x_max, y_max in get_all_bboxes(df, image_id):
        outline = patches.Rectangle((x_min, y_min), x_max - x_min, y_max - y_min,
                                    edgecolor='r', linewidth=1., facecolor='none')
        ax.add_patch(outline)

In [None]:
# Keep only the rows of the most-annotated image, re-indexed from 0.
is_max_image = df['image_id'] == max_boxes_image
df_max = df.loc[is_max_image].reset_index(drop=True)

In [None]:
# Draw the most-annotated image with every box listed for it in df_max.
plot_single_image(max_boxes_image, df_max)

### There are many Bounding Boxes with very high overlap
### Let's investigate further

In [None]:
# Count how many of this image's annotation rows come from each radiologist.
df_max.rad_id.value_counts()

#### We have 3 different annotators, which may be the reason so many boxes have a very high IoU

In [None]:
# Code taken from https://www.pyimagesearch.com/2016/11/07/intersection-over-union-iou-for-object-detection/

def bb_intersection_over_union(boxA, boxB):
    """Return the intersection-over-union of two boxes.

    Each box is indexed as ``(x_min, y_min, x_max, y_max)``. Every width and
    height carries a ``+ 1`` term, following the PyImageSearch formulation this
    was taken from (presumably treating coordinates as inclusive pixel
    indices).
    """
    # Extent of the overlap along each axis, clamped to 0 for disjoint boxes.
    overlap_w = max(0, min(boxA[2], boxB[2]) - max(boxA[0], boxB[0]) + 1)
    overlap_h = max(0, min(boxA[3], boxB[3]) - max(boxA[1], boxB[1]) + 1)
    overlap_area = overlap_w * overlap_h

    # Individual box areas.
    area_a = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    area_b = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    # Union = sum of both areas minus the part counted twice.
    union_area = float(area_a + area_b - overlap_area)
    return overlap_area / union_area

In [None]:
# Compare every unordered pair of boxes on this image and print the pairs whose
# IoU is at least 0.5, together with each box's class and radiologist.
# Columns are selected by name rather than by position, so an extra column on
# df_max cannot leak into the box coordinates, and both selections are made
# once up front instead of inside the O(n^2) loop.
boxes = df_max[['x_min', 'y_min', 'x_max', 'y_max']].values.tolist()
labels = df_max[['class_name', 'class_id', 'rad_id']]
for i, boxA in enumerate(boxes):
    for j, boxB in enumerate(boxes[i + 1:], start=i + 1):
        iou = bb_intersection_over_union(boxA, boxB)
        if iou >= 0.5:
            print(iou)
            print(labels.iloc[i])
            print(labels.iloc[j])
            print("-"*25, end='\n')

## There are 18 combinations of boxes with IoU Score above 0.5
## Most of them correspond to the same class but have different annotators
## But there are 2 instances where **"ILD"** class has very high IoU with **"Nodule/Mass"** 

# Class Distribution

In [None]:
# Class names ordered so that a name's list position is its class ID
# (the last entry, position 14, is "No finding").
classes = [
    "Aortic enlargement",
    "Atelectasis",
    "Calcification",
    "Cardiomegaly",
    "Consolidation",
    "ILD",
    "Infiltration",
    "Lung Opacity",
    "Nodule/Mass",
    "Other lesion",
    "Pleural effusion",
    "Pleural thickening",
    "Pneumothorax",
    "Pulmonary fibrosis",
    "No finding",
]

In [None]:
# Table mapping each class ID to its name. IDs are derived from the length of
# `classes` (instead of a hard-coded 15) so the two columns stay in step.
class_ids = list(range(len(classes)))
fig = go.Figure(data=[go.Table(
    header=dict(values=['Class ID', 'Class Name'], fill_color='yellow'),
    cells=dict(values=[class_ids, classes], fill_color='lavender'),
)])
fig.show()

In [None]:
# Bar chart of annotation rows per class.
# Counts are re-indexed over every class ID (0 .. len(classes) - 1) so each bar
# lines up with its name in `classes`; a class with no rows gets 0 instead of
# shifting all later counts one position to the left.
temp_df = df["class_id"].value_counts().reindex(range(len(classes)), fill_value=0)
class_counts = temp_df.tolist()

trace1 = go.Bar(
    x=classes,
    y=class_counts,
    marker=dict(color='rgb(127, 16, 238)',
                line=dict(color='rgb(0,0,0)', width=1.5)),
    text=class_counts,
    textposition='outside',
)
layout = go.Layout(template="plotly_dark", title='Number of classes',
                   xaxis=dict(title='Class'), yaxis=dict(title='Count'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

# Bounding Box Area

In [None]:
# Drop the rows labelled class_id 14, keeping an independent copy of the rest.
has_finding = df["class_id"] != 14
df_findings = df.loc[has_finding].copy()

In [None]:
# Box area = width * height, computed column-wise from the box corners.
box_width = df_findings["x_max"] - df_findings["x_min"]
box_height = df_findings["y_max"] - df_findings["y_min"]
df_findings['bbox_area'] = box_width * box_height

In [None]:
# Each row of df_findings is one bounding box.
num_boxes = len(df_findings)
print(f"Total number of bounding boxes present in the dataset is {num_boxes}")

In [None]:
# Report the largest and smallest bounding-box areas.
largest_area = df_findings['bbox_area'].max()
smallest_area = df_findings['bbox_area'].min()
print(f"Maximum area of bounding box is {largest_area}")
print(f"Minimum area of bounding box is {smallest_area}")

In [None]:
# Histogram of bounding-box areas, using bins 30000 area units wide.
trace = go.Histogram(
    x=df_findings['bbox_area'],
    name="Bounding Box Area",
    xbins={'size': 30000},
    marker={'color': 'rgb(12, 50, 196)'},
)
layout = go.Layout(
    template="plotly_dark",
    title='Bounding Box Area',
    xaxis={'title': 'Area'},
    yaxis={'title': 'Count'},
)
fig = go.Figure(data=[trace], layout=layout)
fig.show()

# Visualizing Images Classwise

In [None]:
# Per-class min / max / mean / median of df_findings' numeric columns
# (class_id excluded, as before).
# Only numeric columns are aggregated: image_id and rad_id hold strings, and
# pandas 2.x raises TypeError when asked for their mean/median instead of
# silently dropping them as pandas 1.x did.
numeric_cols = [
    col for col in df_findings.select_dtypes(include='number').columns
    if col != 'class_id'
]
df_findings.groupby('class_name')[numeric_cols].agg(['min', 'max', 'mean', 'median'])

* **Nodule/Mass** has the smallest bounding boxes
* **Pneumothorax** and **ILD** have the largest bounding boxes

In [None]:
def plot_k_images(df, k=3):
    """Plot up to `k` distinct, randomly chosen images from `df` with their boxes.

    Args:
        df: DataFrame with an ``image_id`` column; it is also handed to
            ``plot_single_image``, so only boxes present in `df` are drawn.
        k: Number of images to plot. Fewer are plotted when `df` holds fewer
            than `k` distinct images; nothing is plotted for ``k <= 0`` or an
            empty frame.
    """
    # Sample from the distinct IDs without replacement: drawing from the raw
    # rows with replacement could show the same image more than once.
    unique_ids = df['image_id'].unique().tolist()
    sample_size = min(max(k, 0), len(unique_ids))
    for image_id in random.sample(unique_ids, sample_size):
        plot_single_image(image_id, df)

# Aortic Enlargement

In [None]:
# Rows annotated with class_id 0, then a random sample of those images.
df_0 = df.loc[df["class_id"] == 0]
plot_k_images(df_0)

# Atelectasis

In [None]:
# Rows annotated with class_id 1, then a random sample of those images.
df_1 = df.loc[df["class_id"] == 1]
plot_k_images(df_1)

# Calcification

In [None]:
# Rows annotated with class_id 2, then a random sample of those images.
df_2 = df.loc[df["class_id"] == 2]
plot_k_images(df_2)

# Cardiomegaly

In [None]:
# Rows annotated with class_id 3, then a random sample of those images.
df_3 = df.loc[df["class_id"] == 3]
plot_k_images(df_3)

# Consolidation

In [None]:
# Rows annotated with class_id 4, then a random sample of those images.
df_4 = df.loc[df["class_id"] == 4]
plot_k_images(df_4)

# ILD

In [None]:
# Rows annotated with class_id 5, then a random sample of those images.
df_5 = df.loc[df["class_id"] == 5]
plot_k_images(df_5)

# Infiltration

In [None]:
# Rows annotated with class_id 6, then a random sample of those images.
df_6 = df.loc[df["class_id"] == 6]
plot_k_images(df_6)

# Lung Opacity

In [None]:
# Rows annotated with class_id 7, then a random sample of those images.
df_7 = df.loc[df["class_id"] == 7]
plot_k_images(df_7)

# Nodule/Mass

In [None]:
# Rows annotated with class_id 8, then a random sample of those images.
df_8 = df.loc[df["class_id"] == 8]
plot_k_images(df_8)

# Other lesion

In [None]:
# Rows annotated with class_id 9, then a random sample of those images.
df_9 = df.loc[df["class_id"] == 9]
plot_k_images(df_9)

# Pleural Effusion

In [None]:
# Rows annotated with class_id 10, then a random sample of those images.
df_10 = df.loc[df["class_id"] == 10]
plot_k_images(df_10)

# Pleural thickening

In [None]:
# Rows annotated with class_id 11, then a random sample of those images.
df_11 = df.loc[df["class_id"] == 11]
plot_k_images(df_11)

# Pneumothorax

In [None]:
# Rows annotated with class_id 12, then a random sample of those images.
df_12 = df.loc[df["class_id"] == 12]
plot_k_images(df_12)

# Pulmonary Fibrosis

In [None]:
# Rows annotated with class_id 13, then a random sample of those images.
df_13 = df.loc[df["class_id"] == 13]
plot_k_images(df_13)

## If you like this kernel, please leave an upvote :)

## Work in Progress 