In [None]:
import os
# Root directory of the competition data set
DATA_DIR = "/kaggle/input/vinbigdata-chest-xray-abnormalities-detection"

# Folders holding the training and testing dicom files respectively
TRAIN_DIR = os.path.join(DATA_DIR, "train")
TEST_DIR = os.path.join(DATA_DIR, "test")

# Full path of every train/test file. os.listdir returns entries in arbitrary
# order, so the names are sorted to keep both lists identical between runs.
TRAIN_DICOM_PATHS = [os.path.join(TRAIN_DIR, f_name) for f_name in sorted(os.listdir(TRAIN_DIR))]
TEST_DICOM_PATHS = [os.path.join(TEST_DIR, f_name) for f_name in sorted(os.listdir(TEST_DIR))]
print(f"\n... The number of training files is {len(TRAIN_DICOM_PATHS)} ...")
print(f"... The number of testing files is {len(TEST_DICOM_PATHS)} ...")


### Images

The raw data set is comprised of 18,000 images that were manually annotated by 17 experienced radiologists. The data set has been divided into a training set of 15,000 images and test set of 3,000 images. Each scan in training set was <u>**independently**</u> labeled by 3 different radiologists whereas each scan in test set was labeled based on <u>**consensus**</u> of 5 radiologists. So the training data we are given is not labelled in the same way as the test data.

![Data Labeling](https://bl3302files.storage.live.com/y4ms_W0BdOJoz1GO0wW8_X67xQvy8R-Zvkm5fB96xc8LS2wHsjYrj_GzUSqyHTMjgcVBl0MOvBJs07WCA2_srv5-b6gWhT4vObwGHUNQYbJqbg8dXwcg5K0N3mYl5R_69run2WnNDwIsIsHf2imnmn_FTYsefIbZoVYOqNDOFVZ1ybK7q3E8HmVPf8CN2co6cFT?width=500&height=500&cropmode=none)

<p>The original 18,000 images were in DICOM (Digital Imaging and Communications in Medicine) format and they consumed memory space of 191.82 GB. These images were preprocessed and converted into jpg format, consequently resulting in data set of 1.6 GB.

### CSV file

<p>Apart from the dicom image files, the data comes with train.csv file. There are 8 columns in CSV file and each column  contains the following information,

<br>image_id - unique image identifier
<br>class_name - the name of the class of detected object (or "No finding")
<br>class_id - the ID of the class of detected object
<br>rad_id - the ID of the radiologist that made the observation
<br>x_min - minimum X coordinate of the object's bounding box
<br>y_min - minimum Y coordinate of the object's bounding box
<br>x_max - maximum X coordinate of the object's bounding box
<br>y_max - maximum Y coordinate of the object's bounding box
    
<p>There are 67,914 rows. Each row contains information about one bounding box annotated by one radiologist in a single image.


In [None]:

import pandas as pd

# Load the annotation table from the shared data root (DATA_DIR) instead of
# repeating the absolute path, so the data location is configured in one place.
train_csv_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
train_csv_df.head()

In [None]:
# Column dtypes and non-null counts of the annotation table
train_csv_df.info()

In [None]:
# Summary statistics of the numeric columns (class_id and the bbox coordinates)
train_csv_df.describe()

### Abnormalities

<p>These are the 14 types of thoracic (chest) abnormalities that we need to classify and localize. If no abnormality is found in the image, we will classify it as "No finding" class.
 
<br>0 - Aortic enlargement
<br>1 - Atelectasis
<br>2 - Calcification
<br>3 - Cardiomegaly
<br>4 - Consolidation
<br>5 - ILD (Interstitial Lung Disease)
<br>6 - Infiltration
<br>7 - Lung Opacity
<br>8 - Nodule/Mass
<br>9 - Other lesion
<br>10 - Pleural effusion
<br>11 - Pleural thickening
<br>12 - Pneumothorax
<br>13 - Pulmonary fibrosis
<br>14 - No finding (absence of any of 14 diseases listed above)
    



In [None]:
# Distribution of the 15 classes (14 abnormalities plus "No finding") across
# the annotation rows of the training set

print(f"\n... Total number of unique classes = {train_csv_df['class_name'].nunique()} ...\n")

# Number of annotation rows per class, most frequent first
train_csv_df['class_name'].value_counts()

<p>From the plot below, it can be observed that we are dealing with a highly imbalanced data set. Almost half of the annotations in the data set belong to the "No finding" class.

In [None]:
import plotly
import plotly.express as px

# Bar chart of annotation rows per class, most frequent class first
fig = px.histogram(
    train_csv_df,
    x="class_name",
    color="class_name",
    opacity=0.7,
    labels={"class_name": "Abnormality"},
    title="<b>Annotations Per Class</b>",
)
fig.update_xaxes(categoryorder="total descending")
fig.update_layout(
    showlegend=False,
    xaxis_title="",
    yaxis_title="<b>Annotations Per Class</b>",
)
fig.show()

In [None]:
# calculating percentage distribution of each class annotations
def percent_distribution(train_csv_df):
    """Print every class with its annotation count and its share of all rows.

    Args:
        train_csv_df: DataFrame with a ``class_name`` column, one row per
            annotation.

    One line is printed per class, most frequent first. The percentage is
    truncated (not rounded) to two decimal places.
    """
    n_rows = len(train_csv_df)
    per_class = train_csv_df.class_name.value_counts()

    for label, count in zip(per_class.index, per_class.values):
        # Scale to hundredths of a percent, cut off the remainder, scale back
        percent = int((count / n_rows) * 10000) / 100
        print("{:<30s}:   {} or {}%".format(label, count, percent))

percent_distribution(train_csv_df)

### Annotations



#### Annotations per Unique Image

In [None]:
print(f"\n... Total number of unique images = {train_csv_df['image_id'].nunique()} ...\n")

# Number of annotation rows recorded for each image, largest count first
train_csv_df['image_id'].value_counts()

From the plot below, it can be observed that each training image contains
* at least 3 annotations (1 distinct object annotation by 3 radiologists)
* at most 57 annotations

<p> The distribution overall is heavily skewed. The vast majority of images only have 3 annotations (~11,000 out of 15,000 images)

In [None]:
# Annotation rows per image; the histogram shows how common each count is
annotations_per_image = train_csv_df["image_id"].value_counts()

fig = px.histogram(
    annotations_per_image,
    log_y=True,
    color_discrete_sequence=["indianred"],
    opacity=0.7,
    labels={"value": "Annotations Per Image"},
    title=(
        "<b>Distribution of Annotations per each unique CXR scan"
        "<i><sub>(Log Scale for Y-Axis)</sub></i></b>"
    ),
)
fig.update_layout(
    showlegend=False,
    xaxis_title="Annotation Count",
    yaxis_title="Unique Images Count",
)
fig.show()

Let's take a look at a sample image with only 3 annotations in the train.csv file.

In [None]:
# All annotation rows recorded for this one image
train_csv_df[train_csv_df['image_id']=='000434271f63a053c4128a0ba6352c7f']

Now let's take a look at a sample image with 57 annotations in the train.csv file.

In [None]:
# All annotation rows recorded for this one image
train_csv_df[train_csv_df['image_id']=='03e6ecfa6f6fb33dfeac6ca4f9b459c9']

#### Unique Annotations per Unique Image

The goal here is to determine the distribution of distinct diseases within the same patient. So for example, if a radiologist identifies 8 instances of the same disease in an image, we count it as one annotation.

From the plot below, it can be observed that
* Images contain no more than 10 unique abnormalities (out of a possible 14)
* The more unique abnormalities present in an image, the rarer it is. For example, out of 15,000 patients there are only 4 patients diagnosed with 10 distinct diseases
* The vast majority of images have only 1 distinct annotation (~11,000 out of 15,000 images)

In [None]:
# Number of distinct class names annotated on each image
distinct_classes_per_image = (
    train_csv_df.groupby("image_id")["class_name"].nunique(dropna=False)
)

fig = px.histogram(
    distinct_classes_per_image,
    log_y=True,
    color_discrete_sequence=["skyblue"],
    opacity=0.7,
    labels={"value": "Number of Distinct Annotations"},
    title=(
        "<b>Distribution of Distinct Diseases per patient"
        "<i><sub>(Log Scale for Y-Axis)</sub></i></b>"
    ),
)
fig.update_layout(
    showlegend=False,
    xaxis_title="Number of Distinct Annotations",
    yaxis_title="Count of Patients",
)
fig.show()

### Radiologists

The `rad_id` column indicates the ID of the radiologist that made the observation. Remember, three radiologists will annotate a given image out of a pool of seventeen possible radiologists, where the radiologist ID is encoded from R1 to R17.

#### Annotations per Radiologist

From plot below, it can be observed that
* 3 of the radiologists (R9, R10, & R8 in that order) are responsible for the vast majority of annotations (~60% of all annotations)
* Among the other 14 radiologists there is some variation around the number of annotations made, however, these 14 radiologists all made between 3121 annotations and 812 annotations



In [None]:
# Annotation rows per radiologist, busiest radiologist first
fig = px.histogram(
    train_csv_df,
    x="rad_id",
    color="rad_id",
    opacity=0.85,
    labels={"rad_id": "Radiologist ID"},
    title="<b>Distribution of Annotations Per Radiologist</b>",
)
fig.update_xaxes(categoryorder="total descending")
fig.update_layout(
    showlegend=False,
    xaxis_title="Radiologist ID",
    yaxis_title="Number of Annotations",
)
fig.show()

### Do the radiologists agree or disagree when they independently review CXRs?

We will use the "No finding" class (class 14) to check whether all 3 radiologists agree. For each image_id we count the class 14 annotations: if the radiologists agree, the count is either 3 (all of them found nothing) or 0 (none of them did).

In [None]:
# For every image, count how many of its annotation rows carry class 14 ("No finding")
no_finding_counts = train_csv_df.groupby("image_id")["class_id"].agg(
    lambda class_ids: (class_ids == 14).sum()
)
is_normal_df = no_finding_counts.reset_index().rename(
    columns={"class_id": "num_normal_annotations"}
)

In [None]:
# Preview: one row per image with its count of "No finding" annotations
is_normal_df.head()

In [None]:
# Images whose "No finding" count is neither 0 nor 3, i.e. the three
# radiologists did not all give the same healthy/abnormal verdict
is_normal_df[(is_normal_df["num_normal_annotations"]!=3) & (is_normal_df["num_normal_annotations"]!=0)]

Thus all the radiologists agree on whether a CXR is healthy or abnormal.

In [None]:
# Same per-image count, this time for class 0. The column is named after what
# it holds: the number of class-0 rows, not "normal" annotations.
abnormality_0 = (
    train_csv_df.groupby("image_id")["class_id"]
    .agg(lambda s: (s == 0).sum())
    .reset_index()
    .rename({"class_id": "num_class_0_annotations"}, axis=1)
)

# Number of images whose class-0 count is neither 0 nor 3.
# NOTE: every row is one bounding box, so one radiologist drawing several
# class-0 boxes also produces such a count; treat this as an upper bound on
# the number of images with disagreement.
len(abnormality_0[(abnormality_0["num_class_0_annotations"]!=3) & (abnormality_0["num_class_0_annotations"]!=0)])

But radiologists don't always agree on an abnormality

#### Creating dictionary mappings

Here we are creating the mappings between
* `class_id` and `class_name`
* `class_name` and `class_id`
* `class_id` and class color

In [None]:
import seaborn as sns
# One plotly colour string per class, taken from seaborn's 15-step Spectral palette
LABEL_COLORS = [px.colors.label_rgb(px.colors.convert_to_RGB_255(x)) for x in sns.color_palette("Spectral", 15)]

# Create dictionary mappings
#   int_2_str: class_id -> class_name (name read from the first row with that id)
#   str_2_int: class_name -> class_id
#   int_2_clr: class_id -> colour, with colours handed out in alphabetical
#              class-name order
int_2_str = {i:train_csv_df[train_csv_df["class_id"]==i].iloc[0]["class_name"] for i in range(15)}
str_2_int = {v:k for k,v in int_2_str.items()}
int_2_clr = {str_2_int[k]:LABEL_COLORS[i] for i,k in enumerate(sorted(str_2_int.keys()))}

print("\n... Dictionary Mapping Class Integer to Class String Representation [int_2_str]...\n")
display(int_2_str)

print("\n... Dictionary Mapping Class String to Class Integer Representation [str_2_int]...\n")
display(str_2_int)

# The label names the dictionary that is actually displayed (int_2_clr)
print("\n... Dictionary Mapping Class Integer to Color Representation [int_2_clr]...\n")
display(int_2_clr)

#### Annotations per Radiologists based on Class type 

Here we would like to identify if all 17 radiologists were able to see and annotate all 15 classes or not.

From plot below, it can be observed that

* Among the 17 radiologists, 7 of them (R1 through R7) have only ever annotated images as `No finding`
* When compared to the 3 main radiologists (R8 through R10), the annotations made by other 7 radiologists are heavily skewed towards the `No finding` label


In [None]:
import plotly.graph_objects as go

fig = go.Figure()

# One histogram trace per class id (0-14), coloured and named via the mappings
for class_id in range(15):
    rad_ids_for_class = train_csv_df.loc[train_csv_df["class_id"] == class_id, "rad_id"]
    fig.add_trace(
        go.Histogram(
            x=rad_ids_for_class,
            marker_color=int_2_clr[class_id],
            name=f"<b>{int_2_str[class_id]}</b>",
        )
    )

fig.update_xaxes(categoryorder="total descending")
fig.update_layout(
    title="<b>Distribution of Class Label Annotations by Radiologist</b>",
    barmode="stack",
    xaxis_title="Radiologist ID",
    yaxis_title="Number of Annotations",
)
fig.show()

### Bounding Box Coordinates

The **`x_min`**, **`y_min`**, **`x_max`**, and **`y_max`** columns indicate the location of the annotated object bounding box, where the top-left corner is represented by the tuple (**`x_min`**, **`y_min`**) and the bottom-right corner is represented by the tuple (**`x_max`**, **`y_max`**).

A value of **`NaN`** coincides with a label 14 (**`No finding`**) and means that there is nothing to annotate (healthy x-ray).<br>
For the purpose of examining these columns we will <b style="text-decoration: underline;">only be examining rows where the objects have been annotated with a bounding box</b><br>
(i.e. All rows with a label of **`No finding`** will be discarded)<br><br>

The important thing to focus on will be identifying for each class the approximate range of locations the annotations are found in and the intensity of the locations within the heatmap.

**From the heatmaps plotted below we can ascertain the following information**
*  Aortic Enlargement <i><sub>(CLASS-ID: 0)</sub></i>
    * Heatmap distribution is slightly oval (vertical) and is very tight and intense, located in the centre of the image (slight drift to the top-right).
*  Atelectasis <i><sub>(CLASS-ID: 1)</sub></i>
    * Heatmap distribution is lung shaped and relatively diffuse with a circular focus on the upper-left part of the left lung.
*  Calcification <i><sub>(CLASS-ID: 2)</sub></i>
    * Heatmap distribution is lung shaped and relatively diffuse with a oval (vertical) focus on the top-left edge of the right lung.
*  Cardiomegaly <i><sub>(CLASS-ID: 3)</sub></i>
    * Heatmap distribution is rectangular and is very tight and intense, located in the bottom-centre (to bottom-centre-right) of the image.
*  Consolidation <i><sub>(CLASS-ID: 4)</sub></i>
    * Heatmap distribution is lung shaped and relatively diffuse, the focus of the distribution covers the entire left lung.
*  ILD <i><sub>(CLASS-ID: 5)</sub></i>
    * Heatmap distribution is lung shaped and relatively diffuse, the focus leans a little towards the centre of the lungs.
*  Infiltration <i><sub>(CLASS-ID: 6)</sub></i>
    * Heatmap distribution is lung shaped and relatively diffuse, the focus of the distribution covers the entire left lung.
*  Lung Opacity <i><sub>(CLASS-ID: 7)</sub></i>
    * Heatmap distribution is lung shaped and relatively diffuse, the focus of the distribution covers the entire left lung.
*  Nodule/Mass <i><sub>(CLASS-ID: 8)</sub></i>
    * Heatmap distribution is lung shaped and relatively diffuse, the focus leans a little towards the centre of the lungs. <b>(NOTE: The diffusion pattern looks patchy... probably due to smaller bounding boxes)</b>
*  Other Lesion <i><sub>(CLASS-ID: 9)</sub></i>
    * Heatmap distribution is incredibly diffuse and covers most of the image, the focus is towards a vertical-strip in the centre of the image.
*  Pleural Effusion <i><sub>(CLASS-ID: 10)</sub></i>
    * Heatmap distribution is lung shaped (slightly more rectangular?) and relatively diffuse, the focus is towards the bottom of the lungs and although both lungs are covered, the left lung has a stronger focus.
*  Pleural Thickening <i><sub>(CLASS-ID: 11)</sub></i>
    * Heatmap distribution is vaguely lung shaped (patches near top and focus trails down exterior lung edge fading as it goes), the focus is towards the top of the lungs is oval (horizontal).
*  Pneumothorax <i><sub>(CLASS-ID: 12)</sub></i>
    * Heatmap distribution is lung shaped (more rectangular), the focus is on the entire left lung however the right lung has some diffuse coverage.
*  Pulmonary Fibrosis <i><sub>(CLASS-ID: 13)</sub></i>
    * Heatmap distribution is vaguely lung shaped (patches near top and focus trails down lung fading as it goes), the focus is towards the top of the lung and it is oval.

In [None]:
import numpy as np
import tqdm
from tqdm.notebook import tqdm
import pydicom


# Keep only the annotation rows that carry a bounding box, i.e. `class_id != 14`
has_bbox = train_csv_df["class_id"] != 14
bbox_df = train_csv_df.loc[has_bbox].reset_index(drop=True)

# One dicom path per distinct image that has at least one bounding box
BBOX_PATHS = [
    os.path.join(TRAIN_DIR, f"{image_id}.dicom")
    for image_id in bbox_df["image_id"].unique()
]

# Initialize our map of image_id -> (Rows, Columns) for images with bboxes
sizes_of_images_w_bboxes = {}

# ############################################################### #
# ####### HEADER-ONLY READ: PIXEL DATA IS NOT LOADED HERE ####### #
# ############################################################### #
#
# Get the image sizes so we can resize the bboxes all based on a static size
# so that we can generate a heatmap that is representative of the actual
# locations of annotations.
# Only the Rows/Columns header fields are needed, so parsing stops before the
# pixel data of each file instead of loading the whole image.
for path in tqdm(BBOX_PATHS, total=len(BBOX_PATHS)):
    dicom = pydicom.dcmread(path, stop_before_pixels=True)
    # Key by image id: the file name without its directory and extension
    image_id = os.path.splitext(os.path.basename(path))[0]
    sizes_of_images_w_bboxes[image_id] = (dicom.Rows, dicom.Columns)
# ############################################################### #

# Create new dataframe columns for the source image width and height
# (each stored size is a (Rows, Columns) tuple: index 0 is the height,
# index 1 is the width)
bbox_df["img_height"] = bbox_df["image_id"].map(lambda x: sizes_of_images_w_bboxes[x][0])
bbox_df["img_width"] = bbox_df["image_id"].map(lambda x: sizes_of_images_w_bboxes[x][1])

# Create new dataframe columns for the bboxes that is a 
# percentage of the respective source image width and height
#   -- i.e. if x_min is 100 and the image width is 1000 than frac_x_min is 0.1
#   -- i.e. if y_max is 28 and the image height is 900 than frac_y_max is 0.031
#
def create_fractional_bbox_coordinates(row):
    """Express a bounding box as fractions of its source image size.

    Args:
        row: Mapping (e.g. a DataFrame row) providing ``x_min``, ``x_max``,
            ``y_min``, ``y_max``, ``img_width`` and ``img_height``.

    Returns:
        Tuple ``(frac_x_min, frac_x_max, frac_y_min, frac_y_max)``: the x
        values divided by the image width, the y values by the image height.
    """
    width, height = row["img_width"], row["img_height"]
    return (
        row["x_min"] / width,
        row["x_max"] / width,
        row["y_min"] / height,
        row["y_max"] / height,
    )

# This will allow us to pick a heat-map size and make sure that we can use
# all of the bounding boxes and scale them appropriately
#   -- NOTE: We will most likely default the heatmap to the average
#            image shape so that there is as little distortion as possible
#
# The helper only indexes columns and divides them, so it is called once on
# the whole frame (column-wise arithmetic) rather than once per row through
# DataFrame.apply(axis=1).
(
    bbox_df["frac_x_min"],
    bbox_df["frac_x_max"],
    bbox_df["frac_y_min"],
    bbox_df["frac_y_max"],
) = create_fractional_bbox_coordinates(bbox_df)

# # Record some important values for later
# The averages are requested as np.int32 because they are used further down
# as array dimensions of the heatmap and as scaling factors for the bboxes
ave_src_img_height = np.mean([size[0] for size in sizes_of_images_w_bboxes.values()], dtype=np.int32)
ave_src_img_width  = np.mean([size[1] for size in sizes_of_images_w_bboxes.values()], dtype=np.int32)

# # Preview the dataframe
bbox_df.head()

In [None]:
import matplotlib.pyplot as plt
import matplotlib

# DEFAULT
# Heatmap canvas: average source height x width, one channel per abnormality
# class (class ids 0-13)
HEATMAP_SIZE = (ave_src_img_height, ave_src_img_width, 14)

# Initialize
heatmap = np.zeros((HEATMAP_SIZE), dtype=np.int16)
# Columns: class_id, then the fractional x_min/x_max/y_min/y_max, which are
# scaled up to heatmap pixel coordinates below
bbox_np = bbox_df[["class_id", "frac_x_min", "frac_x_max", "frac_y_min", "frac_y_max"]].to_numpy()
bbox_np[:, 1:3] *= ave_src_img_width
bbox_np[:, 3:5] *= ave_src_img_height
bbox_np = np.floor(bbox_np).astype(np.int16)

# Color map stuff: black -> class colour -> near-white, one map per palette entry
custom_cmaps = [
    matplotlib.colors.LinearSegmentedColormap.from_list(
        colors=[(0.,0.,0.), c, (0.95,0.95,0.95)], 
        name=f"custom_{i}") for i,c in enumerate(sns.color_palette("Spectral", 15))
]
# Remove No-Finding. Index 8 presumes the palette is laid out in alphabetical
# class-name order, where "No finding" is the 9th name -- TODO confirm against
# the class names if the label set changes.
custom_cmaps.pop(8)

# Add 1 to every heatmap pixel covered by each bounding box, in its class channel
for row in tqdm(bbox_np, total=bbox_np.shape[0]):
    heatmap[row[3]:row[4]+1, row[1]:row[2]+1, row[0]] += 1

fig = plt.figure(figsize=(20,25))
plt.suptitle("Heatmaps Showing Bounding Box Placement\n ", fontweight="bold", fontsize=16)
# Panel 1 is the average over all classes; panels 2-15 show class ids 0-13
for i in range(15):
    plt.subplot(4, 4, i+1)
    if i==0:
        plt.imshow(heatmap.mean(axis=-1), cmap="bone")
        plt.title("Average of All Classes", fontweight="bold")
    else:
        class_id = i-1
        plt.imshow(heatmap[:, :, class_id], cmap=custom_cmaps[class_id])
        # Show the class id (0-13), not the panel index, in the title
        plt.title(f"{int_2_str[class_id]} – ({class_id})", fontweight="bold")

    plt.axis(False)
fig.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.show()

<b style="text-decoration: underline; font-family: Verdana;">INVESTIGATE THE SIZES OF BOUNDING BOXES AND THE IMPACT OF CLASS</b>

As we wish to examine the average, as well as the upper and lower limits for various class-based bounding box statistics, we will use a box plot to investigate. To make things easier to understand let us consider the following basic buckets.

<b><u>Bounding Box Area - Median</u></b>
* Under   0.01 –– <b>Smallest</b>
* 0.01 to 0.02 –– <b>Small</b>
* 0.02 to 0.04 –– <b>Medium</b>
* 0.04 to 0.06 –– <b>Large</b>
* Above   0.06 –– <b>Largest</b>

<b><u>Bounding Box Area - Quartile Range</u></b>
* Under     0.0075 –– <b>Smallest</b>
* 0.0075 to 0.0125 –– <b>Small</b>
* 0.0125 to 0.0250 –– <b>Medium</b>
* 0.0250 to 0.0500 –– <b>Large</b>
* Above     0.0500 –– <b>Largest</b>

---

**From the boxplot plotted below we can ascertain the following information**
* Regarding Aortic Enlargement Box Plot <i><sub>(CLASS-ID: 0)</sub></i>
    * Median Value is <b>Small</b>  –––  Quartile Range is <b>Smallest</b>
* Regarding Atelectasis Box Plot <i><sub>(CLASS-ID: 1)</sub></i>
    * Median Value is <b>Medium</b>  –––  Quartile Range is <b>Large</b>
* Regarding Calcification Box Plot <i><sub>(CLASS-ID: 2)</sub></i>
    * Median Value is <b>Smallest</b>  –––  Quartile Range is <b>Medium</b>
* Regarding Cardiomegaly Box Plot <i><sub>(CLASS-ID: 3)</sub></i>
    * Median Value is <b>Large</b>  –––  Quartile Range is <b>Large</b>
* Regarding Consolidation Box Plot <i><sub>(CLASS-ID: 4)</sub></i>
    * Median Value is <b>Medium</b>  –––  Quartile Range is <b>Large</b>
* Regarding ILD Box Plot <i><sub>(CLASS-ID: 5)</sub></i>
    * Median Value is <b>Largest</b>  –––  Quartile Range is <b>Largest</b>
* Regarding Infiltration Box Plot <i><sub>(CLASS-ID: 6)</sub></i>
    * Median Value is <b>Medium</b>  –––  Quartile Range is <b>Large</b>
* Regarding Lung Opacity Box Plot <i><sub>(CLASS-ID: 7)</sub></i>
    * Median Value is <b>Medium</b>  –––  Quartile Range is <b>Large</b>
* Regarding Nodule/Mass Box Plot <i><sub>(CLASS-ID: 8)</sub></i>
    * Median Value is <b>Smallest</b>  –––  Quartile Range is <b>Smallest</b>
* Regarding Other Lesion Box Plot <i><sub>(CLASS-ID: 9)</sub></i>
    * Median Value is <b>Small</b>  –––  Quartile Range is <b>Large</b>
* Regarding Pleural Effusion Box Plot <i><sub>(CLASS-ID: 10)</sub></i>
    * Median Value is <b>Smallest</b>  –––  Quartile Range is <b>Large</b>
* Regarding Pleural Thickening Box Plot <i><sub>(CLASS-ID: 11)</sub></i>
    * Median Value is <b>Smallest</b>  –––  Quartile Range is <b>Smallest</b>
* Regarding Pneumothorax Box Plot <i><sub>(CLASS-ID: 12)</sub></i>
    * Median Value is <b>Largest</b>  –––  Quartile Range is <b>Largest</b>
* Regarding Pulmonary Fibrosis Box Plot <i><sub>(CLASS-ID: 13)</sub></i>
    * Median Value is <b>Small</b>  –––  Quartile Range is <b>Medium</b>


In [None]:
# Palette without the entry at index 8 (the "No finding" colour)
LABEL_COLORS_WOUT_NO_FINDING = LABEL_COLORS[:8] + LABEL_COLORS[9:]

# Update bbox dataframe for boxplots: fractional box area and readable class name
frac_box_width = bbox_df["frac_x_max"] - bbox_df["frac_x_min"]
frac_box_height = bbox_df["frac_y_max"] - bbox_df["frac_y_min"]
bbox_df["frac_bbox_area"] = frac_box_width * frac_box_height
bbox_df["class_id_as_str"] = bbox_df["class_id"].map(int_2_str)
display(bbox_df.head())

fig = px.box(
    bbox_df.sort_values(by="class_id_as_str"),
    x="class_id_as_str",
    y="frac_bbox_area",
    color="class_id_as_str",
    color_discrete_sequence=LABEL_COLORS_WOUT_NO_FINDING,
    notched=True,
    labels={"class_id_as_str": "Class Name", "frac_bbox_area": "BBox Area (%)"},
    title=(
        "<b>DISTRIBUTION OF BBOX AREAS AS % OF SOURCE IMAGE AREA   "
        "<i><sub>(Some Upper Outliers Excluded For Better Visualization)</sub></i></b>"
    ),
)

# The y-range is clipped so the boxes stay readable despite large outliers
fig.update_layout(
    showlegend=True,
    yaxis_range=[-0.025, 0.4],
    legend_title_text=None,
    xaxis_title="",
    yaxis_title="<b>Bounding Box Area %</b>",
)
fig.show()

### DICOM metadata

So, what information is contained in a .dicom file? We'll use an image that has some abnormalities.

In [None]:
# All annotation rows recorded for the sample image
train_csv_df[train_csv_df['image_id']=='9a5094b2563a1ef3ff50dc5c7ff71345']

In [None]:
import pydicom

# Read the sample from TRAIN_DIR (absolute) rather than through a path that
# is relative to the working directory; dcmread is pydicom's current name
# for read_file.
dicom_sample = pydicom.dcmread(os.path.join(TRAIN_DIR, '9a5094b2563a1ef3ff50dc5c7ff71345.dicom'))
dicom_sample

In [None]:
# List the attribute names available on the dataset object
dir(dicom_sample)

We might be able to gain some insight from the age, sex, rows and columns values of the DICOM metadata, but the rest do not seem to be very useful.

In [None]:
# list containing full paths to all training dicoms; it holds one entry per
# file, so preview the first few instead of dumping the whole list
TRAIN_DICOM_PATHS[:5]

In [None]:
# ############################################################### #
# ####### READS THE HEADER OF EVERY TRAINING DICOM FILE ######### #
# ############################################################### #

# converting dicom meta data to pandas dataframe

import re
import tqdm
from tqdm.notebook import tqdm
import pandas as pd
import pydicom

# Matches the leading run of capital letters; the match is non-empty only for
# names that start with an uppercase letter
prog = re.compile('^[A-Z]*')

def get_dcm_contents(file):
    """Collect the non-binary header values of one DICOM file into a dict.

    Args:
        file: Path of the DICOM file to read.

    Returns:
        Dict with a ``'file'`` entry (the path with ``'.dicom'`` removed) plus
        one entry for every dataset attribute whose name starts with a capital
        letter and whose value is not ``bytes``/``bytearray``.
    """
    # Binary values are dropped below, so the pixel data is not needed here:
    # stop parsing before it instead of loading every image.
    dcm = pydicom.dcmread(file, stop_before_pixels=True)
    properties = [string for string in dir(dcm) if prog.match(string).group(0) != '']
    dict1 = {'file': file.replace('.dicom', '')}
    for what in properties:
        value = dcm[what].value
        # Skip raw binary payloads; they are not useful as table cells
        if not isinstance(value, (bytes, bytearray)):
            dict1[what] = value
    return dict1

# One row per training file, one column per metadata field
train_dicom_files = pd.DataFrame( [ get_dcm_contents(file) for file in tqdm(TRAIN_DICOM_PATHS) ] )


In [None]:
# saving df to csv file (without the index column) so the metadata can be
# reloaded without re-reading the dicom files
train_dicom_files.to_csv('train_dicom_metadata.csv', index=False)

# preview the first rows of the metadata table
train_dicom_files.head()

In [None]:
# reading data from csv file
# NOTE: values come back with whatever dtypes pandas infers from the CSV text
train_dicom_csv = pd.read_csv('train_dicom_metadata.csv')
train_dicom_csv

In [None]:
# checking for missing values in dicom metadata
train_dicom_csv.isnull().sum()

From the average aspect ratio of 0.877 for the X-rays, we can say that the X-rays tend to be taller than wide.

In [None]:
import numpy as np

# Width-to-height ratio (Columns / Rows) of every image, then its mean
aspect_ratio = train_dicom_csv['Columns'] / train_dicom_csv['Rows']
train_dicom_csv['Aspect Ratio'] = aspect_ratio
train_dicom_csv['Aspect Ratio'].mean()


Age data does not make much sense. For example, ages 000Y, Y, and 238Y might mean either a missing age or truly an age < 1 year old. Then there is 000D (66), which might represent the age in days.

In [None]:
# to display non-truncated output
# NOTE: this changes the pandas display option for the rest of the session,
# not just for this cell
pd.options.display.max_rows = 4000

# number of rows per distinct PatientAge value
train_dicom_csv['PatientAge'].value_counts()

Out of 15,000 patients, we are certain that 3840 are Male and 3514 are Female whereas O might represent missing values.

In [None]:
# number of rows per distinct PatientSex value
train_dicom_csv['PatientSex'].value_counts()

References

https://www.kaggle.com/bjoernholzhauer/eda-dicom-reading-vinbigdata-chest-x-ray
<br>
https://www.kaggle.com/dschettler8845/visual-in-depth-eda-vinbigdata-competition-data