In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import cv2
import matplotlib.pyplot as plt
import pydicom
import glob as glob
from skimage import exposure
%matplotlib inline
import seaborn as sns
import matplotlib
from pydicom.pixel_data_handlers.util import apply_voi_lut

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Understanding the Dataset

**The dataset comprises 18,000 postero-anterior (PA) CXR scans in DICOM format, which were de-identified to protect patient privacy. All images were labeled by a panel of experienced radiologists for the presence of 14 critical radiographic findings as listed below:**

1. 0 - Aortic enlargement
2. 1 - Atelectasis
3. 2 - Calcification
4. 3 - Cardiomegaly
5. 4 - Consolidation
6. 5 - ILD
7. 6 - Infiltration
8. 7 - Lung Opacity
9. 8 - Nodule/Mass
10. 9 - Other lesion
11. 10 - Pleural effusion
12. 11 - Pleural thickening
13. 12 - Pneumothorax
14. 13 - Pulmonary fibrosis

**The "No finding" observation (14) was intended to capture the absence of all findings above.**

In [None]:
# Read the training CSV into train_df, then display its (rows, columns) shape.
TRAIN_CSV_PATH = '../input/vinbigdata-chest-xray-abnormalities-detection/train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)
train_df.shape

In [None]:
# Preview the first six rows of train_df.
train_df.head(n=6)

In [None]:
# Print pandas' concise summary of train_df (DataFrame.info).
train_df.info()

# Let's check the `class_name` labels

In [None]:
# Count how many rows carry each class_name and draw the counts as
# horizontal bars on a 15x5 inch figure.
counts = train_df['class_name'].value_counts()
fig, ax = plt.subplots(figsize=(15, 5))
counts.plot.barh(ax=ax)
fig.tight_layout()
plt.show()

# Image Datasets 

In [None]:
# Dataset root plus the train/ and test/ image folders beneath it.
dataset_dir = '../input/vinbigdata-chest-xray-abnormalities-detection/'
train_imgs = '../input/vinbigdata-chest-xray-abnormalities-detection/train/'
test_imgs = '../input/vinbigdata-chest-xray-abnormalities-detection/test/'

# Report the number of directory entries found in each image folder.
n_train_entries = len(os.listdir(train_imgs))
print("Training samples : {} ".format(n_train_entries))
n_test_entries = len(os.listdir(test_imgs))
print("Test samples : {} ".format(n_test_entries))

**I will be implementing some of the processes used in [Trung Thanh Nguyen](https://www.kaggle.com/trungthanhnguyen0502/eda-vinbigdata-chest-x-ray-abnormalities/)'s notebook.**

In [None]:
from sklearn.preprocessing import LabelEncoder
from PIL import Image

# Map every distinct rad_id value to an integer code and store the codes
# in a new 'rad_label' column, then preview the first five rows.
lbl = LabelEncoder()
lbl.fit(train_df['rad_id'])
train_df['rad_label'] = lbl.transform(train_df['rad_id'])
train_df.head()

In [None]:
# Total number of missing cells over all columns of train_df.
train_df.isna().to_numpy().sum()

# Defining Bounding Box Area

In [None]:
def bbox_area(row):
    """Return the area of the box described by ``row``.

    ``row`` is indexed by the keys 'x_min', 'y_min', 'x_max' and 'y_max';
    the result is (x_max - x_min) multiplied by (y_max - y_min).
    """
    width = row['x_max'] - row['x_min']
    height = row['y_max'] - row['y_min']
    return width * height
# Keep only the rows whose class_name is an actual finding. The explicit
# .copy() makes finding_df an independent frame, so adding a column below
# does not trigger SettingWithCopyWarning or depend on whether pandas
# handed back a view of train_df.
finding_df = train_df[train_df['class_name'] != 'No finding'].copy()
# Area of every annotated box, computed row by row with bbox_area.
finding_df['bbox_area'] = finding_df.apply(bbox_area, axis=1)
finding_df.head()

In [None]:
def dicom_to_array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixels as a uint8 array in 0..255.

    Parameters
    ----------
    path : str or path-like
        Location of the DICOM file, passed to ``pydicom.dcmread``.
    voi_lut : bool, default True
        When true, the pixel data is passed through ``apply_voi_lut``
        before scaling; otherwise the raw ``pixel_array`` is used.
    fix_monochrome : bool, default True
        When true and the file's PhotometricInterpretation is
        'MONOCHROME1', intensities are inverted (max - value).

    Returns
    -------
    numpy.ndarray
        Array of dtype uint8, min-max scaled to the range 0..255. An image
        whose pixels are all equal is returned as all zeros.
    """
    # dcmread is the supported reader; read_file is a deprecated alias that
    # newer pydicom releases no longer provide.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # Work in float64 so the subtractions below cannot wrap around in a
    # narrow integer dtype.
    data = data.astype(np.float64)

    if fix_monochrome and dicom.PhotometricInterpretation == 'MONOCHROME1':
        data = np.amax(data) - data

    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing would give 0/0 (NaN), so return black.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    return (data * 255).astype(np.uint8)

def plot_imgs(imgs, cols=4, size=7, is_rgb=True, title="", cmap='gray', img_size=(500,500)):
    """Show ``imgs`` on a grid with ``cols`` columns in a single figure.

    Parameters
    ----------
    imgs : sequence of image arrays
        Images to display, one per grid cell, in order.
    cols : int, default 4
        Number of grid columns.
    size : number, default 7
        Figure width/height, in inches, allotted to each grid cell.
    is_rgb : bool, default True
        Not used by this function; kept so existing calls keep working.
    title : str, default ""
        Figure-level title.
    cmap : str or None, default 'gray'
        Colormap forwarded to ``imshow``.
    img_size : tuple or None, default (500, 500)
        Size forwarded to ``cv2.resize`` for every image; ``None`` leaves
        the images at their own size.
    """
    # Ceiling division: `len(imgs)//cols + 1` reserved a blank extra row
    # whenever the image count was an exact multiple of cols. At least one
    # row is kept so an empty list still yields a valid figure size.
    rows = max(1, (len(imgs) + cols - 1) // cols)
    fig = plt.figure(figsize=(cols*size, rows*size))
    for i, img in enumerate(imgs):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        ax = fig.add_subplot(rows, cols, i+1)
        ax.imshow(img, cmap=cmap)
    fig.suptitle(title)
    plt.show()

# Exploring arrays

In [None]:
# Decode one training scan with dicom_to_array and display the resulting array.
sample_dicom_path = '../input/vinbigdata-chest-xray-abnormalities-detection/train/00053190460d56c53cc3e57321387478.dicom'
img = dicom_to_array(sample_dicom_path)
img

# Plotting Bounding Boxes

In [None]:
import random
from random import randint

imgs = []
img_ids = finding_df['image_id'].values
class_ids = finding_df['class_id'].unique()

# Give every class id its own random colour (three 0-255 channel values).
label2color = {class_id:[randint(0,255) for i in range(3)] for class_id in class_ids}
thickness = 3   # rectangle line width passed to cv2.rectangle
scale = 5       # images and box coordinates are shrunk by this factor before drawing

# Draw the annotated boxes on eight randomly chosen images that have findings.
for i in range(8):
    img_id = random.choice(img_ids)
    # os.path.join avoids the doubled slash that f'{dataset_dir}/train/...'
    # produced, since dataset_dir already ends with '/'.
    img_path = os.path.join(dataset_dir, 'train', f'{img_id}.dicom')
    img = dicom_to_array(path=img_path)
    img = cv2.resize(img, None, fx=1/scale, fy=1/scale)
    # Replicate the single channel three times so coloured boxes can be drawn.
    img = np.stack([img, img, img], axis=-1)

    # Select this image's rows once and reuse them for boxes and labels.
    img_rows = finding_df.loc[finding_df['image_id'] == img_id]
    boxes = img_rows[['x_min', 'y_min', 'x_max', 'y_max']].values/scale
    # Take the column as a 1-D array. The earlier `[['class_id']].values.squeeze()`
    # collapsed to a 0-d array for an image with a single box, which zip()
    # cannot iterate over.
    labels = img_rows['class_id'].values

    for label_id, box in zip(labels, boxes):
        color = label2color[label_id]
        img = cv2.rectangle(
            img,
            (int(box[0]), int(box[1])),
            (int(box[2]), int(box[3])),
            color, thickness
        )
    img = cv2.resize(img, (500,500))
    imgs.append(img)

# plot_imgs already ends with plt.show(); the tight_layout/axis/show calls
# that used to follow ran after the figure had been shown, so they acted on
# a fresh, empty figure and have been dropped.
plot_imgs(imgs, cmap=None)

In [None]:
# Pairwise-relationship grid over train_df, coloured by class_name.
sns.pairplot(data=train_df, hue='class_name')
plt.show()

# Visualizing the Images through CLAHE Normalization

**This method produces sharper images and is often used in chest X-ray research. It generates a view that a radiologist would not see in their standard workplace. However, it closely resembles the "bone-enhanced" view produced for some X-rays (usually due to broken ribs).**

In [None]:
# Find the training DICOM files and decode the first four as the "before" images.
dicom_paths = glob.glob(f'{dataset_dir}/train/*.dicom')
imgs = list(map(dicom_to_array, dicom_paths[:4]))
plot_imgs(imgs)


# "After" images: run skimage's adaptive histogram equalization on each
# image and plot the same four again for a side-by-side comparison.
imgs = list(map(exposure.equalize_adapthist, imgs))
plot_imgs(imgs)
plt.show()

# Bounding Boxes with Diseases

In [None]:
def plot_example(idx_list):
    """Plot the scans for the given ``train_df`` rows side by side.

    Each row's image is read from ``dataset_dir + 'train/'`` and shown with
    the bone colormap, titled with the row's ``class_name``. Unless the
    class is 'No finding', the row's bounding box is outlined in red.

    Parameters
    ----------
    idx_list : sequence
        Index labels of rows in the module-level ``train_df``. Any length
        is accepted; an empty sequence draws nothing.
    """
    n_examples = len(idx_list)
    if n_examples == 0:
        # Nothing to draw; a zero-column subplot grid is not valid.
        return

    # One column per requested row (the previous fixed 1x3 grid failed when
    # fewer than three indices were supplied). squeeze=False keeps axs as an
    # array even when there is a single column.
    fig, axs = plt.subplots(1, n_examples, figsize=(15, 10), squeeze=False)
    fig.subplots_adjust(hspace = .1, wspace=.1)

    for ax, idx in zip(axs.ravel(), idx_list):
        # Look the row up once instead of once per field.
        row = train_df.loc[idx]
        data_file = pydicom.dcmread(dataset_dir+'train/'+row['image_id']+'.dicom')
        img = data_file.pixel_array
        ax.imshow(img, cmap=plt.cm.bone)
        ax.set_title(row['class_name'])
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        if row['class_name'] != 'No finding':
            # Rectangle takes (x, y) of one corner plus width and height.
            p = matplotlib.patches.Rectangle((row['x_min'], row['y_min']),
                                             row['x_max']-row['x_min'],
                                             row['y_max']-row['y_min'],
                                             ec='r', fc='none', lw=2.)
            ax.add_patch(p)
            
# For each class id 0..14, plot the first three rows labelled with that class.
for num in range(15):
    class_rows = train_df.loc[train_df['class_id'] == num]
    idx_list = class_rows.iloc[:3].index.values
    plot_example(idx_list)
    plt.show()

# GroupKFold

In [None]:
from sklearn.model_selection import GroupKFold, train_test_split

# Drop the class_id 14 rows and renumber the index from 0, so the positional
# indices yielded by the splitter can be used directly as .loc labels.
has_finding = train_df['class_id'] != 14
train_df = train_df.loc[has_finding].reset_index(drop=True)

# Tag each row with the fold in which it is a validation row. Rows are
# grouped by image_id, so all rows of one image share a fold; -1 marks
# "not yet assigned".
gkf = GroupKFold(n_splits=5)
train_df['fold'] = -1
image_groups = train_df['image_id'].tolist()
for fold, (train_idx, val_idx) in enumerate(gkf.split(train_df, groups=image_groups)):
    train_df.loc[val_idx, 'fold'] = fold
train_df.head()

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Hold out roughly 20% of the *images* rather than 20% of the rows.
# train_df has several rows per image_id, so a plain row-level
# train_test_split scatters boxes of the same image across both sides and
# leaks test images into training. Grouping on image_id keeps every image
# entirely on one side.
holdout_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=45)
train_pos, test_pos = next(holdout_splitter.split(train_df, groups=train_df['image_id']))
train, test = train_df.iloc[train_pos], train_df.iloc[test_pos]
print(train.shape)
print(test.shape)

# Submission File

In [None]:
# Read the competition's sample_submission.csv into subs_df.
sample_submission_path = '../input/vinbigdata-chest-xray-abnormalities-detection/sample_submission.csv'
subs_df = pd.read_csv(sample_submission_path)

In [None]:
# Write the frame loaded from sample_submission.csv, not train_df: train_df
# holds the training annotations (plus the helper 'rad_label' and 'fold'
# columns) and is not in the submission layout.
# NOTE(review): subs_df still contains the sample's placeholder predictions;
# replace them with model output before submitting.
subs_df.to_csv("submission.csv", index=False)

# WORK IN PROGRESS