# EDA Chest X-ray Abnormalities Detection

In [None]:
from IPython.display import HTML
# Render a centered 650x450 YouTube iframe as this cell's output.
HTML('<center><iframe width="650" height="450" src="https://www.youtube.com/embed/PRS_CXprri0" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe></center>')

### Import Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import gc
import numpy as np # linear algebra
import pandas # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
#import matplotlib.patches as ptc
import plotly.graph_objects as go
import seaborn as sns
%matplotlib inline
from pydicom import dcmread
from pydicom.pixel_data_handlers.util import apply_voi_lut
'''
from functools import partial
import multiprocessing as mpc
from joblib import Parallel, delayed
'''

import os
# Walk the competition input directory recursively and print the full path of
# every file found (one line per file).
for dirname, _, filenames in os.walk('/kaggle/input/vinbigdata-chest-xray-abnormalities-detection'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Root folder of the competition data, relative to the notebook's working directory.
PATH = "../input/vinbigdata-chest-xray-abnormalities-detection/"
# Load the training annotations table from train.csv.
train_df = pandas.read_csv(os.path.join(PATH, 'train.csv'))
# Bare last expression: the frame is shown as the cell output.
train_df

In [None]:
# Unpack the (row count, column count) pair of the annotations table and report it.
Rows, Cols = train_df.shape
print(f'There are {Rows} Rows and {Cols} columns in train.csv')

In [None]:
# Number of distinct class ids present in the annotations.
train_df["class_id"].nunique()

In [None]:
# Number of annotation rows per (class_name, class_id) pair, least frequent first,
# rendered as a styled table with a plasma background gradient.
# Only image_id is counted: the original aggregated every column and then threw
# away all but image_id, and its trailing rename(columns={0: ...}) matched no column.
(
    train_df.groupby(['class_name', 'class_id'])['image_id']
    .count()
    .to_frame('count')
    .sort_values(by='count')
    .style.background_gradient(cmap="plasma")
)

In [None]:
# Pairwise relationships between the numeric columns, coloured by class.
# sns.pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it: that would only leave an extra, empty figure.
sns.pairplot(train_df, hue='class_name')
plt.show()

In [None]:
# How many annotation rows each image has, as a one-column frame.
train_df["image_id"].value_counts().to_frame()

In [None]:
# Count distinct image ids in the annotations table and report it.
# Message fixed: the frame holds the training annotations, i.e. the train *set*.
images = train_df.image_id.nunique()
print(f"There are in total {images} unique images in the train set.")

Read DICOM image

In [None]:
# Read one training DICOM file with pydicom; the dataset object is shown as the cell output.
di = dcmread('../input/vinbigdata-chest-xray-abnormalities-detection/train/000434271f63a053c4128a0ba6352c7f.dicom')
di

View DICOM image

In [None]:
# Show the sample image's pixel data with the grayscale colormap.
plt.figure(figsize=(16, 6))
x = plt.imshow(di.pixel_array, cmap='gray')

In [None]:
# Same image, rendered with matplotlib's "bone" colormap.
plt.figure(figsize=(16, 6))
x = plt.imshow(di.pixel_array, cmap="bone")

In [None]:
# Same image, rendered with matplotlib's "gist_ncar" colormap.
plt.figure(figsize=(16, 6))
x = plt.imshow(di.pixel_array, cmap="gist_ncar")

# Class Distribution

In [None]:
# Bar chart of annotation rows per class on a wide (28x8) figure.
# The former orient="h" argument is dropped: it contradicted x= (categories on the
# x axis give vertical bars) and did not change the plot.
plt.figure(figsize=(28, 8))
sns.countplot(x="class_name", data=train_df)
plt.title("Class Distribution")
plt.show()

In [None]:
def plot_distribution_classes(x_values, y_values, title):
    """Show a Plotly bar chart of ``y_values`` against ``x_values``.

    Args:
        x_values: bar labels; the x axis is forced to categorical.
        y_values: bar heights, also printed as the text on each bar.
        title: title of the figure.

    Returns:
        None. The 700x400 figure is displayed via ``fig.show()``.
    """
    bars = go.Bar(x=x_values, y=y_values, text=y_values)
    fig = go.Figure(data=[bars])

    fig.update_layout(height=400, width=700, title_text=title)
    # Treat the labels as categories even when they look numeric.
    fig.update_xaxes(type="category")

    fig.show()

In [None]:
# Annotation rows per class name as a styled table (plasma background gradient).
# NOTE(review): the rename targets a column labelled 0, which to_frame() on this
# named Series does not appear to produce — likely a no-op; confirm.
(
    train_df["class_name"]
    .value_counts()
    .to_frame()
    .rename(columns={0: "Unique Values"})
    .style.background_gradient(cmap="plasma")
)

In [None]:
# Labels and counts must come from the SAME Series. The previous version zipped
# class_name.unique() (order of first appearance) with value_counts() (descending
# count), which attached counts to the wrong class names.
counts = train_df.class_name.value_counts()
indexes = counts.index

# value_counts() is already sorted by descending frequency, and dicts keep
# insertion order, so no extra sorting step is needed.
sorted_dict = dict(zip(indexes, counts))

x = list(sorted_dict.keys())
y = list(sorted_dict.values())

plot_distribution_classes(x, y,
                          title="Distribution of radiographic observations")

As we can see, there is a class imbalance problem. We need to augment the data to address it.

# Radiologist Distribution

In [None]:
# Annotation rows per radiologist id as a styled table (plasma background gradient).
# NOTE(review): the rename targets a column labelled 0, which to_frame() on this
# named Series does not appear to produce — likely a no-op; confirm.
(
    train_df["rad_id"]
    .value_counts()
    .to_frame()
    .rename(columns={0: "Unique Values"})
    .style.background_gradient(cmap="plasma")
)

In [None]:
# Labels and counts must come from the SAME Series. The previous version zipped
# rad_id.unique() (order of first appearance) with value_counts() (descending
# count), which attached counts to the wrong radiologist ids.
counts = train_df.rad_id.value_counts()
indexes = counts.index

# value_counts() is already sorted by descending frequency, and dicts keep
# insertion order, so no extra sorting step is needed.
sorted_dict = dict(zip(indexes, counts))

x = list(sorted_dict.keys())
y = list(sorted_dict.values())

# Title typo fixed ("Radioloiest" -> "Radiologist").
plot_distribution_classes(x, y,
                          title="Distribution of Annotations by Radiologist")

# FastAI to process DICOMs 

### DICOM metadata to DataFrame --> Pickle

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# -U upgrades to the latest release, -qq silences pip's output.
%pip install -Uqq fastai

In [None]:
from fastai.basics import *
from fastai.callback.all import *
from fastai.vision.all import *
from fastai.medical.imaging import *

import pydicom,kornia,skimage
from pydicom.dataset import Dataset as DcmDataset
from pydicom.tag import BaseTag as DcmTag
from pydicom.multival import MultiValue as DcmMultiValue
from PIL import Image

# OpenCV is optional here: when it can be imported, set its thread count to 0;
# when it cannot, carry on without it. Only ImportError is swallowed so that
# any other failure is not hidden.
try:
    import cv2
    cv2.setNumThreads(0)
except ImportError:
    pass

In [None]:
# Locate the training DICOM files and collect their metadata into one DataFrame.
# NOTE(review): Path, pd, get_dicom_files, dicom_windows and DataFrame.from_dicoms
# are not bound by this notebook's own explicit imports; presumably they come from
# the fastai star imports — verify.
path = Path('../input/vinbigdata-chest-xray-abnormalities-detection')
train_imgs = path/'train'
train_dicom = get_dicom_files(train_imgs)
# px_summ=False presumably skips per-image pixel statistics — TODO confirm against the fastai docs.
dicom_dataframe = pd.DataFrame.from_dicoms(train_dicom, window=dicom_windows.lungs, px_summ=False)

Write DICOM metadata into pkl for fast processing

In [None]:
# Persist the metadata frame in the working directory so later runs can reload it
# instead of re-reading the DICOM files; then show its (rows, columns) shape.
dicom_dataframe.to_pickle('./dicom_dataframe_pickle.pkl')
dicom_dataframe.shape

Read metadata from pickle

In [None]:
# Reload the metadata frame from the pickle written by this notebook.
# Pickle files can execute code on load — only read files you produced yourself.
dicom_dataframe = pd.read_pickle('./dicom_dataframe_pickle.pkl')
dicom_dataframe.shape # should be 15k by 29

View the DICOM metadata as a DataFrame

In [None]:
# Bare expression: the metadata frame is shown as the cell output.
dicom_dataframe

# Save DICOM to Image

## DICOM to PNG

In [None]:
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
from PIL import Image
from tqdm.auto import tqdm

def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file and return its pixels as an 8-bit array scaled to 0..255.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        path: location of the DICOM file, passed to ``pydicom.dcmread``.
        voi_lut: when True, pass the pixel array through ``apply_voi_lut``;
            when False, use the raw ``pixel_array``.
        fix_monochrome: when True, invert images whose
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``.

    Returns:
        ``numpy.ndarray`` of dtype ``uint8``, min-max scaled so the darkest pixel
        is 0 and the brightest is 255. A constant image yields all zeros.
    """
    # dcmread is the current reader entry point; read_file is a legacy alias.
    dicom = pydicom.dcmread(path)

    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to
    # "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Min-max scale to [0, 1]. A constant image has a zero range; skip the
    # division there instead of producing NaNs from 0/0.
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

In [None]:
def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Convert an array to a PIL image bounded by a ``size`` x ``size`` box.

    Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    Args:
        array: image data accepted by ``Image.fromarray``.
        size: edge length of the target square, in pixels.
        keep_ratio: when True, shrink in place with ``Image.thumbnail`` (aspect
            ratio kept); when False, resize to exactly ``size`` x ``size``.
        resample: resampling filter passed to PIL.

    Returns:
        The resulting ``PIL.Image.Image``.
    """
    picture = Image.fromarray(array)
    box = (size, size)

    if not keep_ratio:
        return picture.resize(box, resample)

    # thumbnail() modifies the image in place and returns None.
    picture.thumbnail(box, resample)
    return picture

In [None]:
# Convert every DICOM in the train and test folders to a 1024x1024 PNG under
# /kaggle/tmp/<split>/, recording each training image's id and the dimensions of
# its array before resizing.
image_id = []
dim0 = []
dim1 = []

for split in ['train', 'test']:
    load_dir = f'../input/vinbigdata-chest-xray-abnormalities-detection/{split}/'
    save_dir = f'/kaggle/tmp/{split}/'

    os.makedirs(save_dir, exist_ok=True)

    for file in tqdm(os.listdir(load_dir)):
        # Split off the extension explicitly: str.replace('dicom', 'png') would
        # also rewrite a "dicom" substring anywhere else in the file name.
        stem, _ext = os.path.splitext(file)

        # set keep_ratio=True to have original aspect ratio
        xray = read_xray(os.path.join(load_dir, file))
        im = resize(xray, size=1024)
        im.save(os.path.join(save_dir, stem + '.png'))

        if split == 'train':
            image_id.append(stem)
            dim0.append(xray.shape[0])
            dim1.append(xray.shape[1])

In [None]:
%%time
# Pack the converted PNGs into gzip-compressed archives in the working directory.
# -C switches into the folder; the trailing "." names what to archive. It was
# missing on the test line, so tar had no members to add.
!tar -zcf train.tar.gz -C "/kaggle/tmp/train/" .
!tar -zcf test.tar.gz -C "/kaggle/tmp/test/" .

In [None]:
# One row per training image: its id plus the two recorded array dimensions,
# written to train_meta.csv without the index column.
df = pd.DataFrame({'image_id': image_id, 'dim0': dim0, 'dim1': dim1})
df.to_csv('train_meta.csv', index=False)

References: 
1. https://docs.fast.ai/medical.imaging
2. https://www.kaggle.com/crained/vinbigdata-fastai-get-started
3. https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way
4. https://www.kaggle.com/c/vinbigdata-chest-xray-abnormalities-detection/discussion/207955
5. https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-png-1024x1024

Thanks a million to the community.

Upcoming:

Working on using fastai's built-in save_jpg() method to convert DICOM to JPG.

Any advice and contributions would be appreciated.
Thank you.


This notebook is a starter guide for ones who want to start with DICOM.

# Work in progress