Yet another EDA for train CSV files...

## Preparations

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import glob
import pydicom

In [None]:
# Directory holding the competition data, relative to the working directory.
siim_covid19_dir = os.path.join('..', 'input', 'siim-covid19-detection')

In [None]:
def read_csv(file_name):
    '''Load a CSV file located under siim_covid19_dir into a DataFrame.

    file_name: name of the CSV file, relative to siim_covid19_dir.
    Returns the DataFrame produced by pd.read_csv with default options.
    '''
    return pd.read_csv(os.path.join(siim_covid19_dir, file_name))

----
## train_study_level.csv

* 6054 rows x 5 columns
* Columns: 'id' and 4 labels

In [None]:
# Load the study-level training labels; the bare name below displays them.
study_level_df = read_csv('train_study_level.csv')

study_level_df

In [None]:
# How many rows?
# The count is reused later to check whether the ids are unique.
study_level_num_rows = len(study_level_df)

study_level_num_rows

### Study Level: id

* Unique and no duplicates.

In [None]:
# Are the ids unique? Any duplicates?
# The ids are unique exactly when the number of distinct ids equals
# the number of rows.
study_level_num_unique_ids = len(pd.unique(study_level_df['id']))

print(
    "Unique and no duplicates"
    if study_level_num_unique_ids == study_level_num_rows
    else "Some duplicates")

### Study Level: Labels

* Only one of four label values is 1 in training data.
* For test data, maybe multiple 1's ([Overview -- Evaluation](https://www.kaggle.com/c/siim-covid19-detection/overview/evaluation)).

In [None]:
# What are the unique value combinations? How many of them?
study_level_label_colums = [
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]
study_level_labels_df = study_level_df[study_level_label_colums]
study_level_label_values = study_level_labels_df.values

# axis=0 makes np.unique compare whole rows, i.e. label combinations.
(study_level_unique_label_combinations,
 study_level_unique_label_counts) = np.unique(
    study_level_label_values, axis=0, return_counts=True)

print("Unique Combinations:\n", study_level_unique_label_combinations)
print("Unique Counts:\n", study_level_unique_label_counts)

----
## train_image_level.csv

* 6334 rows x 4 columns
* Columns: 'id', 'boxes', 'label', and 'StudyInstanceUID'

In [None]:
# Load the image-level training labels; the bare name below displays them.
image_level_df = read_csv('train_image_level.csv')

image_level_df

In [None]:
# How many rows?
# The count is reused later for the id-uniqueness and field-count checks.
image_level_num_rows = len(image_level_df)

image_level_num_rows

### Image Level: id

* Unique and no duplicates.

In [None]:
# Are the ids unique? Any duplicates?
# The ids are unique exactly when the number of distinct ids equals
# the number of rows.
image_level_num_unique_ids = len(pd.unique(image_level_df['id']))

print(
    "Unique and no duplicates"
    if image_level_num_unique_ids == image_level_num_rows
    else "Some duplicates")

### Image Level: Boxes

* A list of dictionaries. Each dictionary holds bbox information.
* NaN when an image has no bbox (an empty list would have been easier to handle).

In [None]:
# What does it look like?
# Print the raw 'boxes' value of the rows labelled 0 to 4.
for row_idx in range(5):
    print("{0}: {1}".format(row_idx, image_level_df.loc[row_idx, 'boxes']))

### Image Level: Label

* Format: "prediction, confidence, left, top, right, bottom", ...
* Number of fields is multiple of 6, maximum is 48.
* Maximum number of bboxes for an image is 8 (= 48 / 6).

In [None]:
# What does it look like?
# Print the raw 'label' string of the rows labelled 0 to 4.
for row_idx in range(5):
    print("{0}: {1}".format(row_idx, image_level_df.loc[row_idx, 'label']))

In [None]:
# How many rows have how many fields?
# 3013 rows have 6 fields, ..., 1 row has 48 fields.
image_level_label_field_counts = (
    image_level_df['label']
    .apply(lambda label: len(label.split()))   # whitespace-separated fields
    .value_counts()
    .sort_index()
)

print(image_level_label_field_counts)
# Every row must be counted exactly once.
assert sum(image_level_label_field_counts) == image_level_num_rows

In [None]:
def make_image_level_label_df(label):
    '''Parse a whitespace-separated label string into a DataFrame.

    The string is cut into consecutive groups of six fields; each group
    becomes one row with the columns 'prediction', 'confidence', 'left',
    'top', 'right' and 'bottom'. All columns except 'prediction' are cast
    to float32.
    '''
    column_names = [
        'prediction', 'confidence', 'left', 'top', 'right', 'bottom']
    group_size = len(column_names)

    tokens = label.split()
    rows = [
        tokens[start:start + group_size]
        for start in range(0, len(tokens), group_size)]

    # Everything after the leading 'prediction' column is numeric.
    numeric_dtypes = {name: np.float32 for name in column_names[1:]}
    return pd.DataFrame(rows, columns=column_names).astype(numeric_dtypes)

#### Image Level: Label: Prediction

* The prediction for each bbox is either 'opacity' or 'none'.
* Number of 'opacity' is 7853, and 'none' is 2040.
* For each image, prediction is either:
    * 'none', or
    * one or more 'opacity'.

In [None]:
# For each label, which predictions occur and how many times?
# One dict per image: {prediction value: number of bboxes with that value}.
image_level_label_pred_count_dict_list = [
    make_image_level_label_df(label_text)['prediction']
    .value_counts()
    .to_dict()
    for label_text in image_level_df['label']
]

# A prediction value that is absent from an image becomes NaN in the
# DataFrame; replace it with a count of 0.
image_level_label_pred_df = pd.DataFrame(
    image_level_label_pred_count_dict_list).fillna(0)

image_level_label_pred_df

In [None]:
# How many bboxes are there for each prediction value (summed over all images)?
image_level_label_pred_df.sum()

In [None]:
# How many images are there for each combination of per-image prediction counts?
image_level_label_pred_df.value_counts().sort_index()

### Image Level: StudyInstanceUID

* For each StudyInstanceUID, the number of images ranges from 1 to 9.

In [None]:
# How many images for each StudyInstanceUID?
image_level_study_id_value_counts = (
    image_level_df['StudyInstanceUID'].value_counts())

# Distribution of "images per study": how many studies have 1, 2, ... images.
image_level_study_id_value_counts.value_counts().sort_index()

----
## Image Predictions and Study Labels

For each study:

* No 'opacity' and only 'none':
    * Almost always 'Negative'

| 'opacity' | 'none' | Negative | Typical | Indeterminate | Atypical |
|:---------:|:------:|:--------:|:-------:|:-------------:|:--------:|
|     0     |  >= 1  |   1676   |    1    |       0       |    83    |

* Some 'opacity' and zero or more 'none':
    * NO 'Negative'

| 'opacity' | 'none' | Negative | Typical | Indeterminate | Atypical |
|:---------:|:------:|:--------:|:-------:|:-------------:|:--------:|
|    >= 1   |   0    |    0     |  2724   |    1007       |    386   |
|    >= 1   |  >= 1  |    0     |   130   |      42       |      5   |

In [None]:
# Append StudyInstanceUID to image level predictions.
# axis=1 puts the two pieces side by side, matched on the row index.
label_pred_study_id_df = pd.concat(
    [image_level_label_pred_df, image_level_df['StudyInstanceUID']],
    axis=1)

label_pred_study_id_df

In [None]:
def get_0_or_ge_1(value):
    '''Return "0" when value equals 0, otherwise ">= 1".'''
    if value == 0:
        return "0"
    return ">= 1"

In [None]:
# For each study, check image prediction sum is "0" or ">= 1".
def get_prediction_counts_for(study_id):
    '''Summarize the image-level predictions that belong to one study.

    study_id: study-level id; the "_study" text is removed to obtain the
        StudyInstanceUID used in label_pred_study_id_df.
    Returns a Series with the keys 'opacity' and 'none', each holding
    "0" or ">= 1" depending on the summed counts over the study's images.
    '''
    uid = study_id.replace("_study", "")
    rows_for_study = label_pred_study_id_df[
        label_pred_study_id_df['StudyInstanceUID'] == uid]
    totals = rows_for_study[['opacity', 'none']].sum()
    return pd.Series({
        'opacity': get_0_or_ge_1(totals['opacity']),
        'none': get_0_or_ge_1(totals['none']),
    })

# One row per study, in the same order (and with the same index) as
# study_level_df.
study_level_image_pred_count_df = \
    study_level_df['id'].apply(get_prediction_counts_for)

study_level_image_pred_count_df

In [None]:
# How many for image predictions and study labels?
# NOTE: assigning to `.columns` renames the columns of study_level_labels_df
# in place; after this cell runs, the frame no longer carries the original
# long label names.
study_level_labels_df.columns = [
    'Negative', 'Typical', 'Indeterminate', 'Atypical' ]
# Put the per-study prediction summary next to the study labels
# (axis=1 matches the two frames on their row index).
image_pred_count_study_labels_pd = pd.concat(
    [study_level_image_pred_count_df, study_level_labels_df],
    axis=1)

# Count how often each (opacity, none, labels...) combination occurs.
image_pred_count_study_labels_pd.value_counts().sort_index()

----
## DICOM Series Number

Check the statement about Series Number made in [this discussion](https://www.kaggle.com/c/siim-covid19-detection/discussion/243273).

In this discussion:
> **Whichever image has the lowest SeriesNumber in the study is the one that you will need to predict bounding boxes on**.

If the lowest SeriesNumber image has:
* NO opacity,
    * most of the other images have NO opacity.
    * 8 images have opacity.
* one or more opacity, ALL the other images have NO opacity.

| lowest SeriesNumber opacity | the other SeriesNumber opacity | count |
|:---------------------------:|:------------------------------:|:-----:|
|                0            |                0               |  1760 |
|                             |               >= 1             |     8 |
|               >= 1          |                0               |  4286 |
|                             |               >= 1             |     0 |


In [None]:
def get_image_path(image_level_row):
    '''Return the path of the DICOM file for one image-level row.

    The file is searched for as
    <siim_covid19_dir>/train/<StudyInstanceUID>/*/<image id>.dcm, where the
    image id is the row's 'id' with the "_image" text removed. Exactly one
    match is expected; anything else trips the assertion.
    '''
    dicom_file_name = image_level_row['id'].replace('_image', '') + ".dcm"
    matches = glob.glob(os.path.join(
        siim_covid19_dir, 'train',
        image_level_row['StudyInstanceUID'], '*', dicom_file_name))
    assert len(matches) == 1
    return matches[0]

In [None]:
def get_series_number(image_level_row):
    '''Return the DICOM SeriesNumber of an image as an int, or -1 if unset.

    The file is located with get_image_path() and read with
    stop_before_pixels=True. -1 is returned both when the SeriesNumber
    element is present without a value and when the element is absent
    from the file.
    '''
    image_path = get_image_path(image_level_row)
    # https://pydicom.github.io/pydicom/stable/reference/generated/pydicom.filereader.dcmread.html#pydicom.filereader.dcmread
    # stop_before_pixels=True: to read element information only.
    dicom = pydicom.filereader.dcmread(
        image_path, stop_before_pixels=True)
    # getattr() with a default: plain attribute access raises AttributeError
    # when the element is missing from the dataset, which would abort the
    # whole DataFrame.apply() instead of yielding the -1 placeholder.
    series_number = getattr(dicom, 'SeriesNumber', None)
    if series_number is None:
        return -1   # Some DICOM file doesn't have the number...
    return int(series_number)

In [None]:
def make_series_number_row(image_level_row):
    '''Build a row with the image id, its study id and its SeriesNumber.

    The SeriesNumber comes from get_series_number(); the other two values
    are copied from image_level_row.
    '''
    number = get_series_number(image_level_row)
    row = {
        "id": image_level_row['id'],
        'StudyInstanceUID': image_level_row['StudyInstanceUID'],
        'SeriesNumber': number,
    }
    return pd.Series(row)

In [None]:
# Per image: id, StudyInstanceUID and SeriesNumber, followed by the
# per-image prediction counts (matched on the row index).
series_number_df = pd.concat(
    [
        image_level_df.apply(make_series_number_row, axis=1),
        image_level_label_pred_df,
    ],
    axis=1)

series_number_df

In [None]:
# Show the study id and number of images for the study
# Group by the column name itself rather than by a one-element list:
# with a list grouper, recent pandas versions expect get_group() to be
# given a 1-tuple, while this notebook looks groups up by the plain
# study id string.
series_number_grp = series_number_df.groupby('StudyInstanceUID')
series_number_grp_size = series_number_grp.size()

series_number_grp_size.sort_values(ascending=False)

In [None]:
# For the study '0fd2db233deb', which has 9 images,
# the image with the lowest Series Number ('1') has an opacity bbox.
# The other images have no opacity bboxes.
# Rows are sorted so that the lowest Series Number is shown first.
series_number_grp.get_group('0fd2db233deb').sort_values('SeriesNumber')

In [None]:
# The same holds for the study 'a7335b2f9815'
# (rows are again sorted by Series Number).
series_number_grp.get_group('a7335b2f9815').sort_values('SeriesNumber')

In [None]:
def process_series_number_group(ser_num_grp_df):
    '''Summarize one study: lowest-SeriesNumber image vs. all other images.

    The rows are ordered by 'SeriesNumber'; the first row is the "lowest"
    image and the remaining rows are the "other" images. For both parts
    the 'opacity' and 'none' counts are reduced to "0" or ">= 1".
    Returns a Series with the keys 'lowest_opacity', 'lowest_none',
    'other_opacity' and 'other_none'.
    '''
    # NOTE(review): if several rows share the smallest SeriesNumber, which of
    # them ends up first depends on how sort_values orders ties - confirm
    # this is acceptable for the analysis.
    ordered = ser_num_grp_df.sort_values('SeriesNumber')
    first_row = ordered.iloc[0]
    remaining = ordered.iloc[1:]
    summary = {
        'lowest_opacity': get_0_or_ge_1(first_row['opacity']),
        'lowest_none': get_0_or_ge_1(first_row['none']),
        'other_opacity': get_0_or_ge_1(remaining['opacity'].sum()),
        'other_none': get_0_or_ge_1(remaining['none'].sum()),
    }
    return pd.Series(summary)

In [None]:
# One summary row per study, produced by process_series_number_group.
series_number_pred_count_df = (
    series_number_grp.apply(process_series_number_group))

series_number_pred_count_df

In [None]:
# How many studies are there for each lowest/other summary combination?
series_number_pred_count_df.value_counts().sort_index()

In [None]:
# Keep only the studies where some image other than the
# lowest-SeriesNumber one has at least one opacity bbox.
other_opacity_is_ge_1 = series_number_pred_count_df['other_opacity'] == ">= 1"
ser_num_pred_count_other_opacity_ge_1_df = \
    series_number_pred_count_df[other_opacity_is_ge_1]

ser_num_pred_count_other_opacity_ge_1_df

In [None]:
# Print all image rows of each study selected above.
for uid in ser_num_pred_count_other_opacity_ge_1_df.index:
    print(series_number_grp.get_group(uid))

----
## BBoxes

Majority pattern is 2 opacity bboxes for each lung.

In [None]:
def get_xray_size(image_level_row):
    '''Return (Rows, Columns) of the image's DICOM file as a tuple of ints.

    The file is located with get_image_path() and read with
    stop_before_pixels=True.
    '''
    header = pydicom.filereader.dcmread(
        get_image_path(image_level_row), stop_before_pixels=True)
    num_rows = int(header.Rows)
    num_columns = int(header.Columns)
    return num_rows, num_columns

In [None]:
def get_opacity_count(image_level_row):
    '''Return how many bboxes in the row's label are predicted "opacity".

    image_level_row: mapping-like row with a 'label' entry, a
        whitespace-separated string made of six-field groups whose first
        field is the prediction.
    Returns the number of groups whose prediction is "opacity" (an int).
    '''
    # The prediction is the first field of every six-field group, so it is
    # enough to look at every 6th token; no DataFrame has to be built and
    # cast to float just to count them.
    predictions = image_level_row['label'].split()[0::6]
    return predictions.count("opacity")

In [None]:
def make_image_level_bbox_row(image_level_row):
    '''Build a row with the opacity count, raw label and image size.

    Returns a Series with the keys 'opacity_count', 'label', 'height' and
    'width'; the size comes from get_xray_size() and the count from
    get_opacity_count().
    '''
    num_opacity = get_opacity_count(image_level_row)
    size = get_xray_size(image_level_row)
    summary = {
        'opacity_count': num_opacity,
        'label': image_level_row['label'],
        'height': size[0],
        'width': size[1],
    }
    return pd.Series(summary)

# One row per image, in the same order as image_level_df.
image_level_bbox_df = image_level_df.apply(
    make_image_level_bbox_row, axis=1)

image_level_bbox_df

In [None]:
# Among images with at least one opacity bbox:
# how many images have 1, 2, ... opacity bboxes?
opacity_mask = image_level_bbox_df['opacity_count'] > 0
opacity_count = (
    image_level_bbox_df.loc[opacity_mask, 'opacity_count']
    .value_counts()
    .sort_index()
)

opacity_count

In [None]:
# Bar chart: x = number of opacity bboxes in an image,
# y = number of images having that many opacity bboxes.
plt.bar(opacity_count.index, opacity_count.values)
plt.title('Opacity Counts for Each Image')
plt.show()

In [None]:
# Collect the corners of every opacity bbox, divided by the image size:
# horizontal values by the image width, vertical values by the height.
lefts, tops, rights, bottoms = [], [], [], []
for _, image_row in image_level_bbox_df.iterrows():
    boxes_df = make_image_level_label_df(image_row['label'])
    for _, box in boxes_df.iterrows():
        # Rows predicted 'none' are skipped.
        if box['prediction'] != 'none':
            x_min, y_min, x_max, y_max = \
                box[['left', 'top', 'right', 'bottom']]
            width = image_row['width']
            height = image_row['height']
            lefts.append(x_min / width)
            tops.append(y_min / height)
            rights.append(x_max / width)
            bottoms.append(y_max / height)

len(lefts)

In [None]:
# Scatter plot of the size-normalized bbox corners collected above:
# one series for the top-left corners, one for the bottom-right corners.
plt.figure(figsize=(12, 9))
plt.scatter(lefts, tops, alpha=0.4, label="top-left")
plt.scatter(rights, bottoms, alpha=0.4, label="bottom-right")
plt.title("Opacity BBox Positions")
# Flip the y axis so that smaller 'top' values are drawn toward the top
# of the figure.
plt.gca().invert_yaxis()
plt.legend()
plt.show()