In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import pydicom
import cv2
from functools import reduce
from glob import glob
from tqdm.notebook import tqdm
from pydicom.pixel_data_handlers.util import apply_voi_lut
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Root of the competition data and its train/test sub-directories.
dataset_dir = '../input/vinbigdata-chest-xray-abnormalities-detection'
train_dir = dataset_dir + '/train'
test_dir = dataset_dir + '/test'

In [None]:
# IPython shell escape: print the notebook's current working directory.
!pwd

In [None]:
# Read one example DICOM from the training directory and dump its contents.
sample_id = '000434271f63a053c4128a0ba6352c7f'
path_file = os.path.join(train_dir, f'{sample_id}.dicom')
# dcmread is the supported reader; pydicom.read_file is a deprecated alias
# that was removed in pydicom 3.0.
example = pydicom.dcmread(path_file)
print(type(example))
print(example)

In [None]:
# Display the pixel data of the example DICOM read above.
example.pixel_array

In [None]:
def dicom2arr(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixel data rescaled to 8 bits.

    Parameters
    ----------
    path : str or path-like
        Location of the DICOM file.
    voi_lut : bool, default True
        If True, pass the raw pixel array through ``apply_voi_lut`` before
        rescaling; otherwise the raw pixel array is used as-is.
    fix_monochrome : bool, default True
        If True and the file's ``PhotometricInterpretation`` is
        ``"MONOCHROME1"``, invert the intensities (``max - value``).

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with the same shape as the pixel data, min-max scaled
        to the range 0..255. A constant image yields all zeros.
    """
    # dcmread replaces the deprecated pydicom.read_file alias.
    dicom = pydicom.dcmread(path)
    data = dicom.pixel_array
    if voi_lut:
        data = apply_voi_lut(data, dicom)
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    data = data.astype(np.float64)
    data -= np.min(data)
    peak = np.max(data)
    # Skip the division for a constant image to avoid 0/0 -> NaN.
    if peak > 0:
        data /= peak
    return (data * 255).astype(np.uint8)

In [None]:
# Convert the example DICOM to an 8-bit array with dicom2arr and print it.
arr_example = dicom2arr(path_file)
print(arr_example)

In [None]:
def visualize_data(info_imgs, figsize=None, color_map=None, thickness=10, axes_shape=None):
    """Show a list of images in a grid, optionally with bounding boxes.

    Parameters
    ----------
    info_imgs : list of dict
        One dict per image. ``"img"`` (array) is required. Optional keys:
        ``"bboxes"`` (list of ``[x_min, y_min, x_max, y_max]``), ``"class_id"``
        (one id per box, used to index ``color_map``) and ``"title"``.
    figsize : tuple, optional
        Forwarded to ``plt.subplots``.
    color_map : sequence of colour tuples, optional
        Colour for each class id. When it (or ``"class_id"``) is missing, all
        boxes are drawn with ``(255, 0, 0)``.
    thickness : int, default 10
        Line thickness passed to ``cv2.rectangle``.
    axes_shape : tuple, optional
        Only ``axes_shape[1]`` (the number of columns) is used; the number of
        rows is derived from ``len(info_imgs)``. Defaults to at most 4 columns.

    Returns
    -------
    None
    """
    num_imgs = len(info_imgs)
    if num_imgs == 0:
        # Nothing to draw; plt.subplots cannot build a grid with zero rows.
        return
    n_cols = axes_shape[1] if axes_shape is not None else min(num_imgs, 4)
    n_rows = -(-num_imgs // n_cols)  # ceiling division
    # squeeze=False keeps `axes` two-dimensional even for a single row/column.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    default_color = (255, 0, 0)
    for idx, ax in enumerate(axes.ravel()):
        if idx >= num_imgs:
            # Trailing cells of an incomplete last row stay empty.
            ax.axis("off")
            continue
        info = info_imgs[idx]
        img = info["img"]
        bboxes = info.get("bboxes", None)
        title = info.get("title", None)
        if bboxes is not None:
            # cv2.rectangle draws in place; work on a copy so the caller's
            # image is left untouched.
            img = img.copy()
            class_ids = info.get("class_id", None)
            for k, bbox in enumerate(bboxes):
                if color_map is not None and class_ids is not None:
                    color = color_map[class_ids[k]]
                else:
                    color = default_color
                img = cv2.rectangle(img, tuple(bbox[:2]), tuple(bbox[2:]), color, thickness)
        ax.imshow(img, cmap="gray")
        if title is not None:
            ax.set_title(title)
    plt.show()
    return

In [None]:
# Take the first eight training DICOM files returned by glob and convert them.
dicom_paths = glob(f'{dataset_dir}/train/*.dicom')[:8]
imgs = []
for dicom_path in dicom_paths:
    imgs.append({"img": dicom2arr(dicom_path)})
visualize_data(imgs, figsize=(20, 12), axes_shape=(-1, 4))

# Show the same images after histogram equalization to obtain higher contrast.
preprocess_imgs = []
for entry in imgs:
    equalized = cv2.equalizeHist(entry["img"])
    preprocess_imgs.append({"img": equalized})
visualize_data(preprocess_imgs, figsize=(20, 12), axes_shape=(-1, 4))

In [None]:
from bokeh.plotting import figure as bokeh_figure
from bokeh.io import output_notebook, show, output_file
from bokeh.models import ColumnDataSource, HoverTool, Panel
from bokeh.models.widgets import Tabs
import pandas as pd
from PIL import Image
from sklearn import preprocessing
import random
from random import randint


# Load the training annotations CSV.
train_csv_path = '../input/train-csv-of-vinbd-chest-xray-abnormalities/train.csv'
train_df = pd.read_csv(train_csv_path)

# Encode the rad_id values as integer labels (stored in a new rad_label column).
le = preprocessing.LabelEncoder()
le.fit(train_df['rad_id'])
train_df['rad_label'] = le.transform(train_df['rad_id'])
train_df.head()

In [None]:
# Distinct radiologist ids seen by the label encoder.
rad_classes = le.classes_
print("Number of radiologists: ", len(rad_classes))
print("Radiologists: ", rad_classes)

In [None]:
# Distinct class names and how many annotation rows each one has.
diseases, counts = np.unique(train_df.class_name.values, return_counts=True)
print("Number of classes: ", len(diseases))

In [None]:
info_imgs = []
# Colours are RGB tuples: the boxes are drawn into a 3-channel array that is
# shown with matplotlib's imshow, which reads the last axis as (R, G, B).
# Entry i is the colour named color_name[i] and is used for class id i.
color_map = [(0, 0, 255), (0, 255, 0), (255, 0, 0), (255, 0, 255), (255, 255, 0), (0, 255, 255), (187, 51, 255), (255, 128, 0), (0, 0, 0), (255, 255, 255), (210, 105, 30), (245, 222, 179), (128, 128, 128), (128, 0, 21)]
color_name = ["Blue", "Green", "Red", "Pink", "Yellow", "Aqua", "Purple", "Orange", "Black", "White", "Brown", "Wheat", "Gray", "Burgundy"]
classes_name = ["Aortic enlargement", "Atelectasis", "Calcification", "Cardiomegaly", "Consolidation", "ILD", "Infiltration", "Lung Opacity", "Nodule/Mass", "Other lesion", "Pleural effusion", "Pleural thickening", "Pneumothorax", "Pulmonary fibrosis", "No finding"]
# Legend: zip stops after the 14 colours, so "No finding" (which has no box)
# gets no legend line.
for legend_color, legend_class in zip(color_name, classes_name):
    print(f"{legend_color}: ", legend_class)
# Images that have at least one annotation other than class id 14.
identifiers = pd.unique(train_df.loc[train_df["class_id"] != 14]["image_id"])
# Pick eight of them at random (unseeded, so the selection changes per run).
indexes = np.random.permutation(identifiers.shape[0])[:8]
box_columns = ["x_min", "y_min", "x_max", "y_max"]
for index in indexes:
    identifier_df = train_df.loc[train_df["image_id"] == identifiers[index], ["class_name", "x_min", "y_min", "x_max", "y_max", "class_id"]]
    img = dicom2arr(f'{train_dir}/{identifiers[index]}.dicom')
    img = cv2.equalizeHist(img)
    # Replicate the grayscale image into three channels so coloured boxes can be drawn.
    img = np.repeat(np.expand_dims(img, axis=-1), 3, axis=-1)
    class_name = identifier_df.iloc[0]["class_name"]
    bboxes = None
    classes = None
    if class_name != classes_name[-1]:
        # Plain Python ints, one [x_min, y_min, x_max, y_max] list per annotation.
        bboxes = identifier_df[box_columns].astype(int).to_numpy().tolist()
        classes = identifier_df["class_id"].astype(int).tolist()
    info_imgs.append({"img": img, "bboxes": bboxes, "title": identifiers[index], "class_id": classes})
visualize_data(info_imgs, figsize=(20, 12), color_map=color_map, thickness=10, axes_shape=(-1, 4))

In [None]:
# Class frequencies, ordered from most to least frequent.
unique_class, count_class = np.unique(train_df["class_name"].values, return_counts=True)
indexes = np.argsort(count_class)[::-1]
palette = sns.color_palette("RdGy", n_colors=20)
fig, axes = plt.subplots(1, 2, figsize=(20, 12))
# Bar heights come from count_class computed in this cell, so the cell does not
# depend on a variable left over from an earlier cell.
axes[0].bar(unique_class[indexes], count_class[indexes], width=0.5, color=palette)
# Rotate the existing tick labels; rotation must be numeric (the string '90'
# is rejected by current matplotlib).
axes[0].tick_params(axis='x', labelrotation=90)
axes[0].set_title('Bar chart for classes', fontsize=15, fontweight='bold')

axes[1].pie(count_class[indexes], labels=unique_class[indexes], autopct='%1.2f%%', colors=palette)
axes[1].set_title('Pie chart for classes', fontsize=15, fontweight='bold')
plt.show()

In [None]:
# Number of annotation rows per image_id, most frequent first.
train_df.image_id.value_counts()

In [None]:
# rad: radiologists
# Number of annotation rows contributed by each radiologist id.
unique_rad, count_rad = np.unique(train_df["rad_id"].values, return_counts=True)
rad_fig, rad_ax = plt.subplots(figsize=(12, 8))
rad_palette = sns.color_palette("RdGy", n_colors=20)
rad_ax.bar(unique_rad, count_rad, width=1, color=rad_palette)
rad_ax.set_title("Bar chart for rad")
plt.show()

In [None]:
# Count missing values in each column of train_df.
train_df.isna().sum(axis=0)

In [None]:
# Count missing values per column among rows whose class_id is not 14
# (index 14 of classes_name is "No finding" -- presumably the same id; verify).
train_df.loc[train_df["class_id"] != 14].isna().sum(axis=0)

In [None]:
# Keep only rows with an actual finding; copy so columns added to
# new_train_df later do not write into train_df.
has_finding = train_df['class_name'] != "No finding"
new_train_df = train_df.loc[has_finding].copy()
new_train_df.head()

In [None]:
# Column dtypes and non-null counts of the filtered table.
new_train_df.info()

In [None]:
# Summary statistics of the numeric columns of the filtered table.
new_train_df.describe()

In [None]:
# (rows, columns) of the filtered table.
new_train_df.shape

In [None]:
# Distribution of the left edge (x_min) of the bounding boxes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.distplot(new_train_df['x_min'], ax=ax);

In [None]:
# Distribution of the right edge (x_max) of the bounding boxes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.distplot(new_train_df['x_max'], ax=ax);

In [None]:
# Distribution of the top edge (y_min) of the bounding boxes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.distplot(new_train_df['y_min'], ax=ax);

In [None]:
# Distribution of the bottom edge (y_max) of the bounding boxes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.distplot(new_train_df['y_max'], ax=ax);

In [None]:
# Width and height statistics of the boxes are important for choosing how to
# improve the model's quality.
new_train_df["width_bbox"] = new_train_df["x_max"].sub(new_train_df["x_min"])
new_train_df["height_bbox"] = new_train_df["y_max"].sub(new_train_df["y_min"])
print("Statistic Measure for Width: ")
new_train_df["width_bbox"].describe()

In [None]:
# Summary statistics of the bounding-box heights (height_bbox column).
print("Statistic Measure for Height: ")
new_train_df["height_bbox"].describe()

In [None]:
from scipy.stats import gaussian_kde

# Box size as a fraction of the image size. Plain column division replaces the
# row-wise DataFrame.apply, which runs a Python lambda per row for the same result.
new_train_df["width_bbox_normalize"] = new_train_df["width_bbox"] / new_train_df["width"]
new_train_df["height_bbox_normalize"] = new_train_df["height_bbox"] / new_train_df["height"]
x_val = new_train_df["width_bbox_normalize"].values
y_val = new_train_df["height_bbox_normalize"].values

# Calculate the point density: fit a 2-D Gaussian KDE on all points and
# evaluate it at those same points to colour the scatter plot.
xy = np.vstack([x_val, y_val])
z = gaussian_kde(xy)(xy)

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(x_val, y_val, c=z, s=100, cmap='viridis')
ax.set_xlabel('bbox_width')
ax.set_ylabel('bbox_height')
plt.show()

In [None]:
# Bounding-box area (width times height) and its distribution.
bbox_w = new_train_df["x_max"] - new_train_df["x_min"]
bbox_h = new_train_df["y_max"] - new_train_df["y_min"]
new_train_df["area"] = bbox_w * bbox_h
fig, ax = plt.subplots(figsize=(12, 6))
sns.distplot(new_train_df['area'], ax=ax);

In [None]:
# Summary statistics of the bounding-box areas.
new_train_df['area'].describe()

In [None]:
# One area distribution per class id 0..13, laid out on a 7 x 2 grid.
grid_fig = plt.gcf()
grid_fig.set_size_inches(12, 21)
grid_fig.subplots_adjust(wspace=0.4, hspace=0.4)
n_rows, n_cols = 7, 2
for panel in range(n_rows * n_cols):
    plt.subplot(n_rows, n_cols, panel + 1)
    plt.title(classes_name[panel])
    in_class = new_train_df["class_id"] == panel
    sns.distplot(new_train_df.loc[in_class, ['area']]);
plt.show()

In [None]:
# The dataset is imbalanced
# Mode of Area from 180 to 152684