In [None]:
import os
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Reading `DICOM` image

In [None]:
# Root folder of the competition data; every other path in the notebook is built from it.
BASE_DIR = (
    "/kaggle/input/"
    "vinbigdata-chest-xray-abnormalities-detection"
)

In [None]:
from pydicom import dcmread
from pydicom.pixel_data_handlers.util import apply_voi_lut

# I copied and adapted this code from
# https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file and return its pixels as an 8-bit grayscale array.

    Parameters
    ----------
    path
        Location of the DICOM file; passed unchanged to ``dcmread``.
    voi_lut : bool, default True
        If true, the raw pixels are passed through ``apply_voi_lut`` so the
        VOI LUT stored by the DICOM device (when available) maps the raw data
        to a "human-friendly" view. If false, the raw pixel array is used.
    fix_monochrome : bool, default True
        If true and the file's ``PhotometricInterpretation`` is
        ``"MONOCHROME1"``, the intensities are inverted so the X-ray does not
        look like a negative.

    Returns
    -------
    numpy.ndarray
        Array of dtype ``uint8`` with the same shape as the pixel data,
        min-max scaled to the range 0..255. An image whose pixels are all
        equal is returned as all zeros.
    """
    dicom = dcmread(path)

    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # Work in float64 from here on: subtracting in the stored integer dtype
    # can wrap around (e.g. int16 pixels spanning more than 32767 levels).
    data = np.asarray(data, dtype=np.float64)

    # depending on this value, X-ray may look inverted - fix that:
    # getattr keeps files without the tag readable instead of raising AttributeError.
    if fix_monochrome and getattr(dicom, "PhotometricInterpretation", None) == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by zero would produce NaN and an undefined
        # uint8 cast, so return a well-defined black image instead.
        return np.zeros(data.shape, dtype=np.uint8)

    data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

In [None]:
# Decode a single training image with read_xray so it can be inspected below.
data = read_xray(f"{BASE_DIR}/train/000434271f63a053c4128a0ba6352c7f.dicom")

In [None]:
# Dimensions of the array that read_xray returned for the sample image.
data.shape

In [None]:
# Render the decoded sample image with a grayscale colour map.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(data, cmap="gray")
plt.show()

In [None]:
# Load the training annotations and drop rows that are exact duplicates.
train_df = pd.read_csv(f"{BASE_DIR}/train.csv").drop_duplicates()

In [None]:
# Preview the first rows of the de-duplicated training annotations.
train_df.head()

## Some EDA

In [None]:
# Number of rows in train_df per class label (value_counts sorts most frequent first).
count_by_class = (
    train_df
    .loc[:, "class_name"]
    .value_counts()
)

In [None]:
# Horizontal bar chart of the per-class counts computed in count_by_class.
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(
    count_by_class.index,
    count_by_class.values,
    linewidth=1,
    edgecolor="black",
)
ax.set_ylabel("Class Name")
ax.set_xlabel("Number of images")
ax.set_title("Image count by class")
plt.show()

As the chart above shows, the dataset is imbalanced across classes, which will make training models more challenging.

The dataset was manually annotated by a total of 17 experienced radiologists, so I assume the `rad_id` field holds the identifiers of these 17 radiologists.

In [None]:
# Report how many distinct radiologist identifiers occur in the annotations.
print("The number of rads: ", len(train_df["rad_id"].unique()))

We explore the number of images annotated per radiologist.

In [None]:
# Number of rows in train_df attributed to each radiologist id.
count_by_radiologist = (
    train_df
    .loc[:, "rad_id"]
    .value_counts()
)

In [None]:
# Bar chart of the per-radiologist counts held in count_by_radiologist.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(count_by_radiologist.index, count_by_radiologist.values,
       linewidth=1, edgecolor="black")
ax.set_xlabel("ID of radiologist")
ax.set_ylabel("Number of images")
ax.set_title("Image count by radiologist")
# No legend: the chart has one unlabelled series, so a legend call would draw
# nothing useful and only emit a "no artists with labels" warning.
plt.show()

In [None]:
# Per-radiologist row counts, excluding class_id 14 (the "no finding" class),
# i.e. only rows that record an abnormality.
count_by_radiologist_with_findings = (
    train_df
    .loc[train_df["class_id"].ne(14), "rad_id"]
    .value_counts()
)

In [None]:
# Bar chart of the per-radiologist counts restricted to rows with a finding.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(count_by_radiologist_with_findings.index, count_by_radiologist_with_findings.values,
       linewidth=1, edgecolor="black")
ax.set_xlabel("ID of radiologist")
ax.set_ylabel("Number of images")
ax.set_title("Image (with findings) count by radiologist")
# No legend: the chart has one unlabelled series, so a legend call would draw
# nothing useful and only emit a "no artists with labels" warning.
plt.show()

So R9, R10 and R8 are the three radiologists who annotated most of the dataset and who also recorded most of the abnormalities. I think the labels of the dataset may be biased because R8, R9 and R10 are the three dominant annotators here: they may detect some kinds of abnormalities well but miss other kinds.

In [None]:
# Class distribution of the rows annotated by radiologist "R8".
train_df.loc[train_df["rad_id"].eq("R8"), "class_name"].value_counts()

## Making a dummy submission

In [None]:
# Create the output folder for submission files; -p makes this a no-op if it already exists.
!mkdir -p submissions

In [None]:
# Baseline submission: every test image is predicted as the no-finding class.
# The string starts with class id 14 (no finding); the remaining fields are
# presumably "confidence xmin ymin xmax ymax" - confirm against the
# competition's submission format.
pred_str = "14 1 0 0 1 1"

# Derive each image id from its file name with os.path instead of slicing off
# a fixed number of characters. glob returns files in arbitrary order, so sort
# to make the submission rows reproducible between runs.
test_image_ids = sorted(
    os.path.splitext(os.path.basename(file))[0]
    for file in glob(BASE_DIR + "/test/*.dicom")
)

predictions = pd.DataFrame(
    [(image_id, pred_str) for image_id in test_image_ids],
    columns=["image_id", "PredictionString"],
)

# Make sure the output folder exists so this cell does not depend on a
# separate shell cell having been run first.
os.makedirs("submissions", exist_ok=True)
predictions.to_csv("submissions/dummy_submissions.csv", index=False)