__Competition Goal__

Automatically localize and classify 14 types of thoracic abnormalities from chest radiographs

__Competition Metric__

Standard PASCAL VOC 2010 _mean Average Precision (mAP)_ at IoU > 0.4

In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from tqdm import tqdm
from glob import glob
import gc


import cv2
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from IPython.display import display

# Notebook-wide matplotlib defaults: figure size and axes-title font size.
plt.rcParams["figure.figsize"] = (12,8)
plt.rcParams['axes.titlesize'] = 16

import pydicom as dicom

# Suppress every warning for the rest of the notebook.
import warnings
warnings.filterwarnings('ignore')

# Show which datasets are mounted under the Kaggle input directory.
print(os.listdir('/kaggle/input/'))

# Record the start time; the last cell uses `start` to print the total runtime.
from time import time, strftime, gmtime
start = time()
import datetime
# Print the current date and time as a run stamp.
print(str(datetime.datetime.now()))

In [None]:
# Root directory of the competition data. The trailing slash is required:
# later cells build file paths from it by plain string concatenation.
base_dir = '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection/'
# Display the contents of the data directory.
os.listdir(base_dir)

In [None]:
# Load the training annotations table and display it.
train = pd.read_csv(f'{base_dir}train.csv')
train

In [None]:
# Load the sample submission table and display it.
sub = pd.read_csv(f'{base_dir}sample_submission.csv')
sub

In [None]:
# Row counts of both tables (rows, not distinct images).
print(f'Number of train image ids: {len(train)}')
print(f'Number of test image ids: {len(sub)}')

In [None]:
# Distinct image ids in each table.
print(f"Number of unique train image ids: {train['image_id'].nunique()}")
print(f"Number of unique test image ids: {sub['image_id'].nunique()}")

In [None]:
# Distinct values in the `rad_id` column of the training table.
print(f"Number of unique rad id in train: {train['rad_id'].nunique()}")

In [None]:
# Distinct class names, sorted; `targets` drives the visualization loops below.
print(f"Number of targets: {train['class_name'].nunique()}")
targets = np.sort(train['class_name'].unique())
print(targets)

In [None]:
# Relative frequency of each class among the annotation rows.
train.loc[:, 'class_name'].value_counts(normalize=True)

In [None]:
# Bar chart of annotation rows per class.
# The column is passed as `x=` explicitly: seaborn's plotting functions accept
# the variables as keyword arguments, and a positional Series is deprecated
# (0.11) or interpreted as `data` instead of `x` (0.12+).
ax = sns.countplot(x = train['class_name'])
plt.xticks(rotation = 70)  # class names are long; tilt them so they do not overlap
plt.tight_layout()

- Clearly imbalanced dataset!

In [None]:
# Class name -> integer id. Keys must match the `class_name` strings exactly,
# so 'No finding' carries no leading space (the other cells compare against
# the literal 'No finding').
target_map = {
    'Aortic enlargement': 0,
    'Atelectasis': 1,
    'Calcification': 2,
    'Cardiomegaly': 3,
    'Consolidation': 4,
    'ILD': 5,
    'Infiltration': 6,
    'Lung Opacity': 7,
    'Nodule/Mass': 8,
    'Other lesion': 9,
    'Pleural effusion': 10,
    'Pleural thickening': 11,
    'Pneumothorax': 12,
    'Pulmonary fibrosis': 13,
    'No finding': 14,
}

In [None]:
print('Read and display a dicom file...')

# Draw one image id at random from the annotation rows, then read and print
# the corresponding DICOM dataset.
img_id = np.random.choice(train['image_id'], 1)[0]
dicom_path = f'{base_dir}train/{img_id}.dicom'
dicom_img = dicom.dcmread(dicom_path)
print(dicom_img)

In [None]:
# Pick a random image id and list every class annotated for it.
idx = np.random.choice(train['image_id'], 1)[0]
same_image = train['image_id'] == idx
cls = train.loc[same_image, 'class_name']
print(f'Number of classes for image id: {idx} is {len(cls)}')
print(f'Number of unique classes for image id: {idx} is {len(np.unique(cls))}')
print(cls)

__Visualization__

In [None]:
def display_images(idx, lbl):
    """Show up to three training images side by side, one bounding box each.

    Parameters
    ----------
    idx : sequence of row labels of ``train``
        Rows to plot; only the first three are used. Fewer than three rows
        are accepted, in which case the unused panels are hidden.
    lbl : str
        Class name, used as the title of every panel.

    Each panel shows the DICOM pixel array in grayscale. Unless ``lbl`` is
    'No finding', the bounding box stored in that row is outlined in red.
    """
    n_panels = 3
    f, ax = plt.subplots(1, n_panels, figsize = (15, 10))
    f.subplots_adjust(hspace = .1, wspace = .1)

    rows = list(idx[:n_panels])
    # zip stops at the shorter sequence, so a class with fewer than three
    # rows no longer raises IndexError.
    for panel, row in zip(ax, rows):
        dicom_path = base_dir + 'train/' + train.loc[row, 'image_id'] + '.dicom'
        img = dicom.dcmread(dicom_path).pixel_array
        panel.imshow(img, cmap = 'gray')
        panel.set_xticklabels([])
        panel.set_yticklabels([])
        panel.set_title(f'{lbl}', fontsize = 10)
        if lbl != 'No finding':
            x_min, y_min, x_max, y_max = train.loc[
                row, ['x_min', 'y_min', 'x_max', 'y_max']
            ]
            # Rectangle takes the anchor corner plus width and height.
            box = matplotlib.patches.Rectangle((x_min, y_min),
                                               x_max - x_min,
                                               y_max - y_min,
                                               ec = 'red', fc = 'none')
            panel.add_patch(box)

    # Hide panels that received no image.
    for panel in ax[len(rows):]:
        panel.axis('off')
    plt.show()

In [None]:
# For every class, plot the first three annotation rows of that class.
for lbl in targets:
    class_rows = train[train['class_name'] == lbl]
    indices = class_rows.head(3).index.values
    display_images(indices, lbl)

__Visualize all bounding boxes, with the number of diagnoses and the number of unique diagnoses__

In [None]:
def display_all_class(idx, lbl):
    """Show up to three images with every bounding box annotated for them.

    Parameters
    ----------
    idx : sequence of str
        Image ids to plot; only the first three are used. Fewer than three
        ids are accepted, in which case the unused panels are hidden.
    lbl : str
        Class name the images were selected for; shown in each panel title.

    Each panel title also reports the number of annotation rows for the image
    ("Votes") and the number of distinct class names among them ("Unique").
    Unless ``lbl`` is 'No finding', every annotated box of the image is
    outlined in its own random colour.
    """
    n_panels = 3
    f, ax = plt.subplots(1, n_panels, figsize = (15, 10))
    f.subplots_adjust(hspace = .1, wspace = .1)

    image_ids = list(idx[:n_panels])
    # zip stops at the shorter sequence, so fewer than three ids no longer
    # raises IndexError.
    for panel, image_id in zip(ax, image_ids):
        annotations = train.loc[train['image_id'] == image_id]
        n_diag = len(annotations)
        n_unique = len(annotations['class_name'].unique())

        dicom_path = base_dir + 'train/' + image_id + '.dicom'
        img = dicom.dcmread(dicom_path).pixel_array
        panel.imshow(img, cmap = 'gray')
        panel.set_xticklabels([])
        panel.set_yticklabels([])
        panel.set_title(f'{lbl}, Votes: {n_diag}, Unique: {n_unique}', fontsize = 10)

        if lbl != 'No finding':
            # Rows without coordinates have nothing to draw.
            boxes = annotations[['x_min', 'y_min', 'x_max', 'y_max']].dropna()
            for x_min, y_min, x_max, y_max in boxes.itertuples(index = False):
                box = matplotlib.patches.Rectangle((x_min, y_min),
                                                   x_max - x_min,
                                                   y_max - y_min,
                                                   ec = np.random.random(3), fc = 'none')
                panel.add_patch(box)

    # Hide panels that received no image.
    for panel in ax[len(image_ids):]:
        panel.axis('off')
    plt.show()

In [None]:
# For every class, plot the first three distinct images that contain it,
# together with all of their annotations.
for lbl in targets:
    has_class = train['class_name'] == lbl
    ids = train.loc[has_class, 'image_id'].unique()[:3]
    display_all_class(ids, lbl)

In [None]:
# Total notebook runtime since `start`, printed as HH:MM:SS.
finish = time()
elapsed_seconds = finish - start
print(strftime("%H:%M:%S", gmtime(elapsed_seconds)))