In [None]:
### NOTE
# Before running this notebook, run read-dicom-info to generate train_df_dicom_data.csv. 

import numpy as np 
import pandas as pd 
import os
import seaborn as sns
sns.set(rc={"figure.figsize":(12, 8)})


## Inspect data files

In [None]:
# Every competition input lives under this single Kaggle dataset directory.
input_root = '/kaggle/input/siim-covid19-detection'

# DICOM image folders
train_folder = f'{input_root}/train'
test_folder = f'{input_root}/test'

# Label tables and the submission template
train_image_level = f'{input_root}/train_image_level.csv'
train_study_level = f'{input_root}/train_study_level.csv'
sample_submission = f'{input_root}/sample_submission.csv'

In [None]:
df_train_image = pd.read_csv(train_image_level)
df_train_study = pd.read_csv(train_study_level)

In [None]:
df_train_image.head(1)

In [None]:
df_train_study.head(1)

In [None]:
# Give the study table's 'id' column the name 'StudyInstanceUID' so that it can
# serve as the key when the two dataframes are merged.
df_train_study.rename({'id': 'StudyInstanceUID'}, axis='columns', inplace=True)
df_train_study.head(1)

In [None]:
# get rid of _image and _study
# Remove only a literal trailing suffix (the pattern is anchored with `$`)
# instead of slicing off the last six characters unconditionally: re-running
# this cell is then a no-op rather than eating six more characters of every id.
df_train_image['id'] = df_train_image['id'].str.replace(r'_image$', '', regex=True)
df_train_study['StudyInstanceUID'] = (
    df_train_study['StudyInstanceUID'].str.replace(r'_study$', '', regex=True)
)

In [None]:
# Inner join: keep the image rows whose study id also appears in the study table.
df_train_merge = df_train_image.merge(df_train_study, how='inner', on='StudyInstanceUID')

In [None]:
df_train_merge.head(5)

In [None]:
# Collapse the four study-level indicator columns into one 'class' column.
# A value of 1 in an indicator column marks the row as belonging to that class;
# columns are processed in the order listed, so a later match overwrites an
# earlier one.
class_by_indicator = {
    'Negative for Pneumonia': 'negative',
    'Typical Appearance': 'typical',
    'Indeterminate Appearance': 'indeterminate',
    'Atypical Appearance': 'atypical',
}
for indicator_column, class_name in class_by_indicator.items():
    is_flagged = df_train_merge[indicator_column] == 1
    df_train_merge.loc[is_flagged, 'class'] = class_name

# The indicator columns are redundant once 'class' exists.
df_train_merge.drop(columns=list(class_by_indicator), inplace=True)
df_train_merge.head()

In [None]:
df_train_merge.loc[4560,'label']

In [None]:
#get boxes and opacity values
def parse_label(label):
    """Split one space-separated ``label`` string into its components.

    The string is split on single spaces and read in groups of six tokens.
    From every complete group the token at offset 1 is kept as a confidence
    value and the tokens at offsets 2-5 are kept as one box; all of them stay
    strings. The very first token of the string is the opacity label.
    Presumably each group is "<name> <confidence> xmin ymin xmax ymax" --
    TODO confirm against the competition's data description.

    Args:
        label: the raw value of the 'label' column for one image.

    Returns:
        tuple: ``(boxes, confidences, label_op, num_box)``. ``num_box`` is 0
        when ``label_op`` is ``'none'`` (even though ``boxes`` still holds
        whatever groups were present in the string) and ``len(boxes)``
        otherwise.
    """
    tokens = label.split(' ')
    group_count = len(tokens) // 6
    confidences = [tokens[1 + 6 * group] for group in range(group_count)]
    boxes = [tokens[2 + 6 * group:6 + 6 * group] for group in range(group_count)]
    label_op = tokens[0]
    num_box = 0 if label_op == 'none' else len(boxes)
    return boxes, confidences, label_op, num_box


# Parse every label exactly once, then assign whole columns in one go instead
# of writing cell by cell. Column order matches the original cell:
# bbox, confidence, label_op, num_box.
parsed_labels = [parse_label(label) for label in df_train_merge['label']]
for position, column in enumerate(['bbox', 'confidence', 'label_op', 'num_box']):
    # Building a Series keeps list-valued entries as one object per row;
    # num_box ends up as an integer column because it is a count.
    df_train_merge[column] = pd.Series(
        [parsed[position] for parsed in parsed_labels],
        index=df_train_merge.index,
    )
            
    

In [None]:
df_train_merge

In [None]:
# For every image, count how many distinct confidence values its boxes carry;
# the unique counts reveal whether any image mixes different confidences.
distinct_confidence_counts = df_train_merge['confidence'].map(lambda values: len(set(values)))
distinct_confidence_counts.unique()

In the training dataset, the confidence value for opacity is always 1.

In [None]:
df_train_merge.shape

## The class distribution in the training set is not uniform.

In [None]:
# Number of rows with a non-null label in each class, drawn as a bar chart.
class_sizes = df_train_merge.groupby('class')['label'].count()
class_sizes.plot.bar(title='class size');

# Observations on boxes and opacity values
* An image may have between 0 and 8 boxes.
* In the training dataset, the confidence value for opacity is always 1.
* 'negative' is the only class in which no image has any boxes.
* In all other classes, an image may or may not have boxes.
* The number of boxes per image differs between classes (see the plot below). For example, images in class 'typical' mostly have 2 boxes.

In [None]:
# Existence of boxes in each class: count images per (class, label_op) pair,
# pivot label_op into columns, and draw one bar group per class.
images_per_class_and_label = (
    df_train_merge.groupby(['class', 'label_op'])['id'].count().unstack()
)
images_per_class_and_label.plot.bar(
    title='Images that contain any number of boxes in each class'
);

# Plot number of boxes in each class

In [None]:
# Image count for every (class, label_op, num_box) combination, flattened back
# into ordinary columns; the count lands in the 'id' column.
box_count_groups = df_train_merge.groupby(['class', 'label_op', 'num_box'])
df = box_count_groups['id'].count().reset_index()
df

In [None]:
# One bar per (class, num_box) pair; bar height is the image count held in 'id'.
ax = sns.barplot(data=df, x="class", y="id", hue="num_box")
ax.set(ylabel='Count', title='Number of boxes in each class');

# Import DICOM paths

In [None]:
# Get paths to DICOM files
#It looks like this:
#set/study/series/image.dcm
!ls /kaggle/input/siim-covid19-detection/train/00086460a852/9e8302230c91

In [None]:
def read_dcm_paths(root='/kaggle/input/siim-covid19-detection', img_format='.dcm'):
    """Collect every file ending in ``img_format`` below ``root`` into a DataFrame.

    Adapted from
    https://www.kaggle.com/farhanhaikhan/object-detection-starter-rescale-image-bbox

    The names of the three directory levels directly above each file are
    recorded, so a path shaped like ``<set>/<study>/<series>/<image>.dcm``
    yields ``id_set``, ``StudyInstanceUID`` and ``id_series`` respectively.
    A level that does not exist (file too close to the filesystem root) is
    recorded as an empty string.

    Args:
        root: directory that is walked recursively. Defaults to the
            competition input directory.
        img_format: file-name suffix a file must end with to be included.

    Returns:
        pandas.DataFrame with one row per matching file and the columns
        ``id`` (file name without ``img_format``), ``Image_Path``,
        ``Image_Name``, ``id_set``, ``id_series`` and ``StudyInstanceUID``.
        The columns are present even when no file matches.
    """
    columns = ["id", "Image_Path", "Image_Name", "id_set", "id_series", "StudyInstanceUID"]
    records = []

    for dirname, dirnames, filenames in os.walk(root):
        # Sorting the sub-directory list in place fixes the traversal order,
        # so the row order no longer depends on the filesystem.
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.endswith(img_format):
                continue
            img_path = os.path.join(dirname, filename)
            # Walk up from the file: series dir -> study dir -> set dir.
            series_dir = os.path.dirname(img_path)
            study_dir = os.path.dirname(series_dir)
            set_dir = os.path.dirname(study_dir)
            records.append({
                "id": filename[:len(filename) - len(img_format)],
                "Image_Path": img_path,
                "Image_Name": filename,
                "id_set": os.path.basename(set_dir),
                "id_series": os.path.basename(series_dir),
                "StudyInstanceUID": os.path.basename(study_dir),
            })

    return pd.DataFrame(records, columns=columns)

In [None]:
df_dcm_path = read_dcm_paths()

In [None]:
#sanity check
# List one series directory on disk, then look the same image up in the path
# table both by file name and by id.
!ls /kaggle/input/siim-covid19-detection/train/9d514ce429a7/22897cd1daa0
# NOTE(review): a notebook cell only renders its last expression, so the
# Image_Name lookup below is evaluated but its result is not displayed.
df_dcm_path[df_dcm_path['Image_Name']=='0012ff7358bc.dcm']
df_dcm_path[df_dcm_path['id']=='0012ff7358bc']


In [None]:
df_dcm_path.head()

In [None]:
df_train_merge.head(5)

In [None]:
# Left-pad every id with '0' up to 12 characters; ids that are already
# 12 characters or longer are left unchanged.
df_dcm_path['id'] = df_dcm_path['id'].map(lambda image_id: image_id.rjust(12, '0'))

## Merge dicom paths with image information

In [None]:
# Attach the DICOM file path to every training row. The left join keeps all
# training rows, including ones for which no file was found.
dcm_path_lookup = df_dcm_path.loc[:, ['id', 'Image_Path']]
df_train_merge_dcm = df_train_merge.merge(dcm_path_lookup, how='left', on='id')

In [None]:
df_train_merge_dcm.head()

In [None]:
df_train_merge.shape

In [None]:
len(df_train_merge['id'].unique())

In [None]:
len(df_dcm_path['id'].unique())

In [None]:
df_train_merge_dcm.shape

## Load DICOM fields

In [None]:
# this is precalculated
df_dicom_data = pd.read_csv('/kaggle/input/read-dicom-info/train_df_dicom_data.csv')

In [None]:
df_dicom_data.head(5)

In [None]:
df_train_merge_dcm.columns

In [None]:
df_train_merge_dcm.shape

In [None]:
# Columns taken over from the precomputed DICOM metadata table ('id' is the join key).
dicom_field_columns = [
    'id', 'Rows', 'Columns', 'PatientID', 'PatientName',
    'PhotometricInterpretation', 'SamplesPerPixel',
    'BitsAllocated', 'BitsStored', 'HighBit', 'PixelRepresentation',
    'ImagerPixelSpacing_X', 'ImagerPixelSpacing_Y', 'ImageType', 'Modality',
    'PatientSex', 'BodyPartExamined',
]
# Left join: every training row is kept even if it has no metadata entry.
df_train_dcm = df_train_merge_dcm.merge(
    df_dicom_data[dicom_field_columns], how='left', on='id'
)

In [None]:
df_train_dcm.shape

In [None]:
df_train_dcm.head(5)

In [None]:
## Save dataframes
# Write the fully merged training table to the notebook's working directory.
# to_csv is called with its defaults, so the DataFrame index is written as an
# extra leading column.
# NOTE(review): the list-valued 'bbox' and 'confidence' columns are written as
# text; presumably readers of this file parse them back into lists -- confirm.
df_train_dcm.to_csv('train_df_dcm_merged.csv')