In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load libraries
from pathlib import Path
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import ast
import copy
from collections import Counter

In [None]:
# Dataset root on Kaggle; the annotation file names below are joined onto
# HOME at the point where the CSV files are read.
HOME = Path('/kaggle/input', 'siim-covid19-detection')

# Annotation tables (file names only, relative to HOME).
STUDY_ANNOTATION, IMG_ANNOTATION = (
    Path('train_study_level.csv'),
    Path('train_image_level.csv'),
)



# /kaggle/input/siim-covid19-detection/test/2fb11712bc93/b056067b8455/a29c5a68b07b.dcm
# /kaggle/input/siim-covid19-detection/test/19c66935e737/41b28d4fa278/9850b5470fd6.dcm

We start from the general, by inspecting the <b>study-level annotations</b>.

In [None]:
# Load the study-level annotation table and preview its first rows.
study_annotation_path = HOME.joinpath(STUDY_ANNOTATION)
study_annotation = pd.read_csv(study_annotation_path)
study_annotation.head(3)

In [None]:
# Sanity check: no row of the study-level table may be an exact duplicate
# of an earlier row.
has_duplicated_rows = study_annotation.duplicated().any()
assert not has_duplicated_rows, "There are duplicated rows."

In [None]:
# Number of studies per label: sum every column after the first one.
# 'StudyInstanceUID' is a string helper column that a later cell adds to this
# frame; it is dropped here (if present) so that re-running this cell after
# that one does not try to sum strings.
label_counts = (
    study_annotation.iloc[:, 1:]
    .drop(columns='StudyInstanceUID', errors='ignore')
    .sum(axis=0)
)

fig, ax = plt.subplots(figsize=(10, 5))
# Draw explicitly on the axes created above rather than on the implicit
# "current axes".
sns.barplot(x=label_counts.values, y=label_counts.index, ax=ax)
ax.set_title('Label frequency at study level')
ax.set_xlabel('Number of studies')
plt.show()

In [None]:
# The study id is the part of `id` in front of the first underscore.
# Keep it both as a plain list and as a new 'StudyInstanceUID' column.
study_ids_study_annotation = [
    raw_id.partition('_')[0] for raw_id in study_annotation['id']
]
study_annotation['StudyInstanceUID'] = study_ids_study_annotation

In [None]:
# Transform the dataframe for further usage: collapse the one-hot label
# columns into a single 'Diagnosis' column, one row per study.
LABEL_COLUMNS = [
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]

# Label strings as used in the submission file.
SUBMISSION_LABELS = {
    'Negative for Pneumonia': 'negative 1 0 0 1 1',
    'Typical Appearance': 'typical 1 0 0 1 1',
    'Atypical Appearance': 'atypical 1 0 0 1 1',
    'Indeterminate Appearance': 'indeterminate 1 0 0 1 1',
}

# Concatenate the names of all label columns whose flag equals 1, in the
# order given by LABEL_COLUMNS.
diagnosis = pd.Series('', index=study_annotation.index, dtype=object)
for label in LABEL_COLUMNS:
    diagnosis = diagnosis.where(study_annotation[label].ne(1), diagnosis + label)

study_annotation_tr = study_annotation[['StudyInstanceUID']].copy()

# 'Diagnosis' keeps the original label text, because the cells further down
# filter on values such as "Negative for Pneumonia" and "Typical Appearance".
study_annotation_tr['Diagnosis'] = diagnosis

# The submission-style string is stored in its own column instead of
# overwriting 'Diagnosis'.
study_annotation_tr['SubmissionLabel'] = study_annotation_tr['Diagnosis'].map(SUBMISSION_LABELS)

study_annotation_tr.head(3)

And continue with the particular: <b>image level annotations</b>.

In [None]:
# Load the image-level annotation table and preview its first rows.
img_annotation_path = HOME.joinpath(IMG_ANNOTATION)
img_annotation = pd.read_csv(img_annotation_path)
img_annotation.head(3)

In [None]:
# How many images does each study have?
# One row per study; the 'id' column holds the number of image ids.
img_per_study = (
    img_annotation
    .groupby('StudyInstanceUID')['id']
    .count()
    .reset_index()
)

# How many studies have exactly n images?
studies_with_n_imgs = (
    img_per_study
    .groupby('id')['StudyInstanceUID']
    .count()
    .reset_index()
    .rename(columns={'id': 'nr_images_per_study',
                     'StudyInstanceUID': 'occurences'})
)
studies_with_n_imgs

The number of images in one study ranges between 1 and 9. The majority of patients (5822 studies) have one annotated image, but there are also some studies with more than 3 images. Those are:

In [None]:
# Studies that have more than three images.
has_over3_imgs = img_per_study['id'].gt(3)
over3_imgs = img_per_study.loc[has_over3_imgs]
over3_imgs

Let's see if there is a connection between the opacity features and the patient's (study) diagnosis. For this we merge the two dataframes and perform some operations (calculate the number of opacities in each image and the total estimated area of the opacities in every image).

In [None]:
# Attach the study-level diagnosis to every image row (join on the study id).
# The two table sizes are printed first for reference.
for annotation_table in (img_annotation, study_annotation_tr):
    print(len(annotation_table))

merged_df = img_annotation.merge(study_annotation_tr, on='StudyInstanceUID')
merged_df.head(3)

In [None]:
def _opacity_stats(raw_boxes):
    """Return (number of boxes, summed width * height) for one `boxes` cell.

    A null cell counts as no boxes and yields (0, 0). Any other cell is parsed
    with ast.literal_eval into a list of dicts that provide 'width' and
    'height'.
    """
    if pd.isnull(raw_boxes):
        return 0, 0
    parsed_boxes = ast.literal_eval(raw_boxes)
    total_area = sum(b['width'] * b['height'] for b in parsed_boxes)
    return len(parsed_boxes), total_area


# One (count, area) pair per image row, in row order.
per_image_stats = [_opacity_stats(raw_boxes) for raw_boxes in merged_df.boxes]
col_nr_opacities = [count for count, _ in per_image_stats]
col_opacity_area = [area for _, area in per_image_stats]

merged_df['opacity_area'] = col_opacity_area
merged_df['nr_opacities'] = col_nr_opacities
merged_df.head(3)

The majority of the 'Typical appearance' cases have 2 opacities. 
There is no indication of a clear difference between the 'Atypical' and 'Indeterminate' cases based on the number of opacities.
<b>One observation</b> that we will investigate further refers to the cases with <b>NO OPACITIES</b> and a diagnosis other than <b>Negative for Pneumonia</b>. Please see the <b>Observation 1</b> snippet.

In [None]:
# Number of opacities per image, split by diagnosis.
fig, ax = plt.subplots(1, 2, figsize=(15, 7))
ax_box, ax_hist = ax

# Left panel: distribution per diagnosis as a boxplot.
sns.boxplot(data=merged_df, x='nr_opacities', y='Diagnosis', ax=ax_box)

# Right panel: histogram with one bar group per diagnosis.
sns.histplot(data=merged_df, x='nr_opacities', hue='Diagnosis',
             multiple='dodge', ax=ax_hist)

plt.show()

Relationship between the type of diagnosis and the total surface of the opacities

In the case of No Pneumonia, there is no opacity identified on the lungs. 
The Typical Appearance cases differ from the other groups, exhibiting more surface with opacities.
The Atypical and Indeterminate appearance are similar.

In [None]:
# Total opacity area per image, split by diagnosis.
fig, ax = plt.subplots(1, 2, figsize=(15, 7))
ax_box, ax_hist = ax

# Left panel: distribution per diagnosis as a boxplot.
sns.boxplot(data=merged_df, x='opacity_area', y='Diagnosis', ax=ax_box)

# Right panel: histogram with one bar group per diagnosis.
sns.histplot(data=merged_df, x='opacity_area', hue='Diagnosis',
             multiple='dodge', ax=ax_hist)

plt.show()

From the boxplot below we can see that having one opacity implies less affected surface of the lungs. The majority of the cases
with one opacity are 'Indeterminate appearance'. 
One cannot draw a clear line between the cases with 2, 3 or 4 opacities, as there is no relationship between how many opacities there are
and the affected area.

In [None]:
# Opacity area against number of opacities.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
ax_scatter, ax_box = ax

# Left panel: one point per image, coloured by diagnosis.
sns.scatterplot(data=merged_df, x='opacity_area', y='nr_opacities',
                hue='Diagnosis', ax=ax_scatter)

# Right panel: area distribution for each opacity count (vertical boxes).
sns.boxplot(data=merged_df, x='nr_opacities', y='opacity_area',
            orient='v', ax=ax_box)

plt.show()

In [None]:
# Median total opacity area per diagnosis.
merged_df.groupby('Diagnosis')[['opacity_area']].median()

In [None]:
# Look at one study in both annotation tables. Only the last expression of a
# cell is rendered automatically, so both lookups are passed to display();
# otherwise the study-level rows would be computed and thrown away.
inspected_study_id = '4189ab7c6027'
display(
    study_annotation[study_annotation.StudyInstanceUID == inspected_study_id],
    img_annotation[img_annotation.StudyInstanceUID == inspected_study_id],
)

## Observation 1

Here, we look into those images which have no opacities, but have a diagnosis other than "Negative for Pneumonia". 
One natural source of such images is a study with multiple images in which only one image shows opacities. The diagnosis is made at study level, based on that single image, so all other images of the study carry the same diagnosis even if they have no opacities.

<b>Conclusion for postprocessing:</b> if at least one image of a study has opacities, the study will be diagnosed with a label other than "Negative for Pneumonia". 

Unfortunately, this still leaves 82 unresolved study ids (patients) for which:
- the study has only one image to assess,
- that image has 0 opacities,
- the study has a label different from "Negative for Pneumonia" (81 cases with "atypical" and 1 with "typical").

In [None]:
# Study ids of image rows that have no opacity boxes although the study's
# diagnosis is something other than "Negative for Pneumonia".
# The array holds one entry per matching image row.
not_negative = merged_df['Diagnosis'].ne("Negative for Pneumonia")
without_opacities = merged_df['nr_opacities'].eq(0)
st_id_pos_0opac = merged_df.loc[not_negative & without_opacities, 'StudyInstanceUID'].values

In [None]:
# Split the flagged studies into those with a single image and those with
# several images.
diag_ = []                   # diagnosis of every single-image study
stid_ = []                   # study id of every single-image study
std_id_multiple_images = []  # study ids that have more than one image

# Select the rows of all flagged studies once and group them by study,
# instead of filtering the whole frame again for every id. groupby yields the
# studies in sorted id order, so the lists are the same on every run
# (iterating over a set of strings gave an arbitrary, run-dependent order).
flagged_rows = merged_df[merged_df.StudyInstanceUID.isin(st_id_pos_0opac)]

for study_id, study_images in flagged_rows.groupby('StudyInstanceUID'):
    if len(study_images) == 1:
        diag_.append(study_images.Diagnosis.values[0])
        stid_.append(study_id)
    else:
        std_id_multiple_images.append(study_id)

In [None]:
# Report how many single-image studies remain and how their diagnoses are
# distributed.
remaining_study_ids = set(stid_)
print(f"number of unadressed patients: {len(remaining_study_ids)}")
print(Counter(diag_))

In [None]:
#stid_

## Visualize images

1. Studies with "Atypical appearance" label which have no bounding boxes localizing opacities
2. Studies with "Negative for Pneumonia"
3. Studies with "Typical Appearance"
4. Studies with "Indeterminate Appearance"

In [None]:
import os
import glob
import pydicom
import random

In [None]:
# Grid of randomly chosen single-image studies taken from `stid_`.
TRAIN = '/kaggle/input/siim-covid19-detection/train/'
COLS = 4
ROWS = 2

fig, ax = plt.subplots(ncols=COLS, nrows=ROWS, figsize=(25, 10))
plt.suptitle("Studies with 'Atypical appearance' without opacity bounding boxes")

studies = random.sample(stid_, COLS * ROWS)

# Fill the grid row by row: position k goes to row k // COLS, column k % COLS.
for position, study in enumerate(studies):
    r, c = divmod(position, COLS)

    # Read the first DICOM file found below the study folder.
    dcm_paths = glob.glob(os.path.join(TRAIN, study, "*/*.dcm"))
    dcm_file = pydicom.dcmread(dcm_paths[0])
    dcm_array = dcm_file.pixel_array

    panel = ax[r, c]
    panel.imshow(dcm_array, cmap='gray')
    panel.set_title(study)

In [None]:
# Grid of randomly chosen studies diagnosed as "Negative for Pneumonia".
is_negative = merged_df.Diagnosis == "Negative for Pneumonia"
stid_negative_for_pneumonia = list(set(merged_df[is_negative].StudyInstanceUID))

TRAIN = '/kaggle/input/siim-covid19-detection/train/'
COLS = 4
ROWS = 2

fig, ax = plt.subplots(ncols=COLS, nrows=ROWS, figsize=(25, 10))
plt.suptitle("Studies with 'Negative for Pneumonia' without opacity bounding boxes")

studies = random.sample(stid_negative_for_pneumonia, COLS * ROWS)

# Fill the grid row by row: position k goes to row k // COLS, column k % COLS.
for position, study in enumerate(studies):
    r, c = divmod(position, COLS)

    # Read the first DICOM file found below the study folder.
    dcm_paths = glob.glob(os.path.join(TRAIN, study, "*/*.dcm"))
    dcm_file = pydicom.dcmread(dcm_paths[0])
    dcm_array = dcm_file.pixel_array

    panel = ax[r, c]
    panel.imshow(dcm_array, cmap='gray')
    panel.set_title(study)

In [None]:
# Grid of randomly chosen studies diagnosed as "Typical Appearance".
# The ids get their own variable so that the list of negative studies is not
# overwritten with typical ones; unique() keeps first-appearance order.
stid_typical_appearance = (
    merged_df.loc[merged_df.Diagnosis == "Typical Appearance", 'StudyInstanceUID']
    .unique()
    .tolist()
)

TRAIN = '/kaggle/input/siim-covid19-detection/train/'
COLS = 4
ROWS = 2

fig, ax = plt.subplots(ncols=COLS, nrows=ROWS, figsize=(25, 10))
# The selection is not restricted to studies without bounding boxes, so the
# title only names the diagnosis.
plt.suptitle("Studies with 'Typical Appearance'")

# Never ask for more studies than are available (random.sample would raise).
n_shown = min(COLS * ROWS, len(stid_typical_appearance))
studies = random.sample(stid_typical_appearance, n_shown)

panels = ax.ravel()  # row-major order: left to right, then top to bottom
for panel, study in zip(panels, studies):
    # Sort the matches so the same file is picked on every run.
    dcm_paths = sorted(glob.glob(os.path.join(TRAIN, study, "*/*.dcm")))
    dcm_file = pydicom.dcmread(dcm_paths[0])
    dcm_array = dcm_file.pixel_array

    panel.imshow(dcm_array, cmap='gray')
    panel.set_title(study)

# Hide panels that stayed empty because fewer studies were available.
for panel in panels[n_shown:]:
    panel.set_visible(False)

plt.show()

In [None]:
# Grid of randomly chosen studies diagnosed as "Indeterminate Appearance".
# The ids get their own variable so that the list of negative studies is not
# overwritten with indeterminate ones; unique() keeps first-appearance order.
stid_indeterminate_appearance = (
    merged_df.loc[merged_df.Diagnosis == "Indeterminate Appearance", 'StudyInstanceUID']
    .unique()
    .tolist()
)

TRAIN = '/kaggle/input/siim-covid19-detection/train/'
COLS = 4
ROWS = 2

fig, ax = plt.subplots(ncols=COLS, nrows=ROWS, figsize=(25, 10))
# The selection is not restricted to studies without bounding boxes, so the
# title only names the diagnosis.
plt.suptitle("Studies with 'Indeterminate Appearance'")

# Never ask for more studies than are available (random.sample would raise).
n_shown = min(COLS * ROWS, len(stid_indeterminate_appearance))
studies = random.sample(stid_indeterminate_appearance, n_shown)

panels = ax.ravel()  # row-major order: left to right, then top to bottom
for panel, study in zip(panels, studies):
    # Sort the matches so the same file is picked on every run.
    dcm_paths = sorted(glob.glob(os.path.join(TRAIN, study, "*/*.dcm")))
    dcm_file = pydicom.dcmread(dcm_paths[0])
    dcm_array = dcm_file.pixel_array

    panel.imshow(dcm_array, cmap='gray')
    panel.set_title(study)

# Hide panels that stayed empty because fewer studies were available.
for panel in panels[n_shown:]:
    panel.set_visible(False)

plt.show()