In [None]:
! conda install -c conda-forge gdcm -y

In [None]:
import pandas as pd
import numpy as np
import re
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import cv2
import albumentations as A
from tqdm.notebook import tqdm

from fastai.vision.all import *
from fastai.medical.imaging import *

In [None]:
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt

# Default size and resolution for every figure drawn below.
# `figure(figsize=..., dpi=...)` only creates one new, empty figure; it does not
# change the size of figures created afterwards, so the defaults are set through
# rcParams instead.
plt.rcParams['figure.figsize'] = (12, 10)
plt.rcParams['figure.dpi'] = 80

In [None]:
path = Path('../input/siim-covid19-detection')

In [None]:
path.ls()

In [None]:
path_train_image = path/'train'
path_test_image = path/'test'
ss = pd.read_csv(path/'sample_submission.csv')
train_image = pd.read_csv(path/'train_image_level.csv')
train_study = pd.read_csv(path/'train_study_level.csv')

# Understanding train_study DF

In [None]:
train_study.head()

In [None]:
#number of individual studies
len(train_study)

In [None]:
#sanity check to make sure no repeats for individual studies
assert len(train_study) == train_study.id.nunique()

In [None]:
# Number of studies per class (every column after the id column),
# followed by each class's share of the total as the cell output.
class_dist = train_study.iloc[:, 1:].sum(axis=0)
class_dist / class_dist.sum()

Number of classes = 4

In [None]:
# Bar chart of study counts per class; each bar is annotated with the
# class's percentage of all studies.
n_studies_total = len(train_study)
plt.bar(class_dist.index, class_dist.values)

for class_name, study_count in zip(class_dist.index, class_dist.values):
    share = study_count / n_studies_total * 100
    plt.text(class_name, study_count + 50, f'{share:.3}%', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
# are there studies with more than one label? No
(train_study.iloc[:, 1:].sum(1) > 1).sum()

# Understanding train_image DF

In [None]:
train_image.head(5)

In [None]:
# there are more images than study. This is because one study can have more than one image.
len(train_image)

In [None]:
# checks if the number of studies in the image level is same as the train_study. Yes, it is
assert train_image.StudyInstanceUID.nunique() == len(train_study)

In [None]:
# Number of images in each study, largest studies first.
numimagesperstudy = (
    train_image
    .groupby('StudyInstanceUID')['id']
    .count()
    .sort_values(ascending=False)
)

In [None]:
numimagesperstudycounts = numimagesperstudy.value_counts(); numimagesperstudycounts

In [None]:
# Bar chart: x = number of images in a study, height = how many studies have
# that many images. Each bar is annotated with its raw count.
plt.bar(numimagesperstudycounts.index, numimagesperstudycounts.values)

for images_in_study, study_count in zip(numimagesperstudycounts.index,
                                        numimagesperstudycounts.values):
    plt.text(images_in_study, study_count + 50, f'{study_count}', fontsize=14)

plt.tight_layout()
plt.show()

Most studies have only one image. About 230 studies have more than one image. There is one study with 9 images. Let's take a look at it.

In [None]:
numimagesperstudy[numimagesperstudy >8].index[0]

In [None]:
train_image[train_image['StudyInstanceUID'] == numimagesperstudy[numimagesperstudy >8].index[0]]

This is interesting. Out of the 9 images, only one has a bbox. Other images have label none. Let's take a look at the study level label for this.

In [None]:
train_study[train_study['id'] == '0fd2db233deb_study']

At the study level, the label is 'Indeterminate Appearance'.

Now, let's take a look at the distribution of the number of bboxes per image.

In [None]:
# Count the boxes of each image by counting occurrences of the key 'width' in
# the stringified `boxes` cell (missing values stringify to 'nan' -> 0 boxes).
train_image['n_boxes'] = train_image['boxes'].astype(str).str.count('width')

In [None]:
n_boxes = train_image['n_boxes'].value_counts(); n_boxes/sum(n_boxes)*100

In [None]:
# Bar chart of how many images have 0, 1, 2, ... boxes; each bar is annotated
# with its percentage of all images.
total_images = len(train_image)
plt.bar(n_boxes.index, n_boxes.values)

for box_count, image_count in zip(n_boxes.index, n_boxes.values):
    share = image_count / total_images * 100
    plt.text(box_count, image_count + 50, f'{share:.3}%', fontsize=14)

plt.tight_layout()
plt.show()

Most images have <= 2 bboxes. 

In [None]:
# Keep only the first whitespace-separated token of each label string.
train_image['label_only'] = [
    full_label.split()[0] for full_label in train_image['label']
]

In [None]:
labels = train_image['label_only'].value_counts()

In [None]:
# Bar chart of image-level labels. `labels` holds counts per *image*, so the
# percentage annotation must be relative to the number of images, not the
# number of studies (a study can contain several images).
plt.bar(labels.index, labels.values)

for x, y in zip(labels.index, labels.values):
    plt.text(x, y+50, f'{y/len(train_image)*100:.3}%', fontsize=14)

plt.tight_layout()
plt.show()

Roughly 70% of the images have the label `opacity`.

In [None]:
# Rename the first column to `StudyInstanceUID`, the key used to merge with
# `train_image` further down.
# NOTE(review): names are assigned by position, so this assumes the CSV has
# exactly these five columns in this order - confirm against the file header.
train_study.columns = ['StudyInstanceUID', 'Negative for Pneumonia', 'Typical Appearance',
       'Indeterminate Appearance', 'Atypical Appearance']

In [None]:
# Append the `_study` suffix so the image-level ids use the same format as the
# study-level ids. The suffix is added only when it is missing, so re-running
# this cell does not produce `..._study_study` ids (which would make the merge
# below silently return no rows).
train_image['StudyInstanceUID'] = train_image['StudyInstanceUID'].apply(
    lambda x: x if x.endswith('_study') else f'{x}_study'
)

Let's merge the dataframes. 

In [None]:
# Attach the study-level labels to every image row. Several images may belong
# to one study, but each study id must be unique in `train_study`;
# `validate` makes pandas raise instead of silently duplicating image rows
# if that ever stops being true.
df = train_image.merge(train_study, on='StudyInstanceUID', validate='many_to_one')

In [None]:
df.head()

# DICOM Metadata

The train images are stored in this format
```
--root [../input/siim-covid19-detection/train]
    |--StudyID01
        |--ImageID01
        |--ImageID02
    |--StudyID02
        |--ImageID01
        |--ImageID02
```

In [None]:
dcm_fns = get_dicom_files(path_train_image)

In [None]:
samp = dcm_fns[0].dcmread()

In [None]:
samp

MONOCHROME1
Pixel data represent a single monochrome image plane. The minimum sample value is intended to be displayed as white after any VOI gray scale transformations have been performed. 

MONOCHROME2
Pixel data represent a single monochrome image plane. The minimum sample value is intended to be displayed as black after any VOI gray scale transformations have been performed.

Let's create a DataFrame of DICOM metadata. This takes a while, so I generated it once and uploaded the resulting CSV.

In [None]:
#to generate dicom metadata for train images
#df_dcm = pd.DataFrame.from_dicoms(dcm_fns, px_summ=True)
#df_dcm.to_csv('COVID_dcm_metadata.csv')

In [None]:
#to generate dicom metadata for test images
#dcm_test_fns = get_dicom_files(path_test_image)
#df_dcm_test = pd.DataFrame.from_dicoms(dcm_test_fns, px_summ=True)
#df_dcm_test.to_csv('COVID_dcm_metadata_test.csv')

In [None]:
df_dcm = pd.read_csv('../input/covid-dataframes/COVID_dcm_metadata.csv')
df_dcm_test = pd.read_csv('../input/covid-dataframes/COVID_dcm_metadata_test.csv')

In [None]:
df_dcm.columns

In [None]:
# Let's look at the modality
print(f"{df_dcm['Modality'].value_counts()}\n\n{df_dcm_test['Modality'].value_counts()}")

In [None]:
print(f"{df_dcm['PatientSex'].value_counts()}\n\n{df_dcm_test['PatientSex'].value_counts()}")

In [None]:
print(f"{df_dcm['PhotometricInterpretation'].value_counts()}\n\n{df_dcm_test['PhotometricInterpretation'].value_counts()}")

We have to consider this when we are preparing the dataset.

In [None]:
print(f"{df_dcm['PixelRepresentation'].value_counts()}\n\n{df_dcm_test['PixelRepresentation'].value_counts()}")

All dicom are `unsigned`. 

In [None]:
df_dcm['BitsAllocated'].value_counts()

Some are 16bits while others are 8bits

In [None]:
#the following codes are from https://www.kaggle.com/tanlikesmath/siim-covid-19-detection-a-simple-eda

def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixel data as an 8-bit array.

    Args:
        path: Location of the DICOM file; passed to `pydicom.dcmread`.
        voi_lut: If True, pass the raw pixels through `apply_voi_lut` (the
            VOI LUT stored by the DICOM device, when available, gives a
            "human-friendly" view). If False, use the raw pixel array.
        fix_monochrome: If True, invert images whose PhotometricInterpretation
            is "MONOCHROME1", which would otherwise look inverted.

    Returns:
        `np.uint8` array, min-max scaled to the range [0, 255]. A constant
        image (no contrast at all) is returned as all zeros.
    """
    # `dcmread` is the current name of the reader; `read_file` is a deprecated alias.
    dicom = pydicom.dcmread(path)

    data = apply_voi_lut(dicom.pixel_array, dicom) if voi_lut else dicom.pixel_array

    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Min-max scale to [0, 255].
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by the peak would be 0/0 and produce NaNs.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    return (data * 255).astype(np.uint8)
        
    
def plot_img(img, size=(7, 7), is_rgb=True, title="", cmap='gray'):
    """Show a single image in its own figure, without grid or axes.

    Args:
        img: Image array handed to `imshow`.
        size: Figure size passed as `figsize`.
        is_rgb: Accepted for compatibility; not used by the function body.
        title: Figure-level title.
        cmap: Colormap passed to `imshow`.
    """
    fig = plt.figure(figsize=size)
    ax = fig.gca()
    ax.imshow(img, cmap=cmap)
    fig.suptitle(title)
    ax.grid(False)
    ax.axis('off')
    plt.show()


def plot_imgs(imgs, cols=3, size=7, is_rgb=True, title="", cmap='gray', img_size=(500,500), label=None):
    """Show several images in a grid with `cols` columns.

    Args:
        imgs: Sequence of image arrays.
        cols: Number of grid columns.
        size: Side length of one grid cell; the figure is
            `(cols*size, rows*size)`.
        is_rgb: Accepted for compatibility; not used by the function body.
        title: Figure-level title.
        cmap: Colormap passed to `imshow`.
        img_size: If not None, every image is resized to this size with
            `cv2.resize` before plotting.
        label: Optional per-image titles. Images without a corresponding
            entry (including all of them when `label` is omitted) are drawn
            without a title.
    """
    titles = list(label) if label is not None else []
    # Ceiling division: exactly as many rows as needed (no empty trailing row
    # when the image count is a multiple of `cols`), and at least one row.
    rows = max(1, -(-len(imgs) // cols))
    fig = plt.figure(figsize=(cols*size, rows*size))
    for i, img in enumerate(imgs):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        fig.add_subplot(rows, cols, i+1)
        plt.imshow(img, cmap=cmap)
        if i < len(titles):
            plt.title(titles[i])
        plt.grid(False)
        plt.axis('off')

    plt.suptitle(title)
    plt.show()

In [None]:
plot_img(dicom2array(dcm_fns[0]))

Let's take a look at the 9 images from study - `0fd2db233deb`

In [None]:
imgs_path = get_dicom_files(path_train_image/'0fd2db233deb')
# Image id = file name without the `.dcm` extension, plus the `_image` suffix.
imgs_id = [f"{Path(img).stem}_image" for img in imgs_path]
# Look every label up by its image id so that imgs_label[i] belongs to
# imgs_path[i]. Filtering `df` with `isin` returns labels in `df` row order,
# which need not match the file order and would put titles on the wrong images.
label_by_id = df.drop_duplicates('id').set_index('id')['label']
imgs_label = [str(label_by_id.get(img_id, 'n/a')).split()[0] for img_id in imgs_id]

In [None]:
plot_imgs([dicom2array(img) for img in imgs_path], label=imgs_label)

They all look same to me :(

# Resize and create Smaller images for prototyping

The following code is from this wonderful [notebook](https://www.kaggle.com/konradb/diy-rescaled-images-with-bboxes/output).

In [None]:
df.head(2)

In [None]:
# Walk the train directory once and record, for every file found, its file
# name (`image_id`) and its full path (`image_path`).
splits = []

found_files = [
    (dirname, file)
    for split in ['train']
    for dirname, _, filenames in tqdm(os.walk(f'../input/siim-covid19-detection/{split}'))
    for file in filenames
]

path_list = [dirname + '/' + file for dirname, file in found_files]
image_list = [file for _, file in found_files]

temp_df = pd.DataFrame({'image_id': image_list, 'image_path': path_list})

In [None]:
# Resize every image to size x size; bounding boxes are passed in
# albumentations' 'pascal_voc' format and are rescaled together with the image.
size = 512
transform = A.Compose(
    [A.Resize(height=size, width=size, p=1)],
    p=1.0,
    bbox_params=A.BboxParams(
        format='pascal_voc',
        min_area=0,
        min_visibility=0,
        label_fields=['labels'],
    ),
)

In [None]:
# Derive the DICOM file name from the image id (drop `_image`, add `.dcm`)
# and use it to attach the on-disk path collected in `temp_df`.
df['image_id'] = [image_id.replace('_image', '') + '.dcm' for image_id in df['id']]
df = df.merge(temp_df, on='image_id')

In [None]:
!mkdir train512

In [None]:
OUTPUT_DIRECTORY = Path('./train512')

In [None]:
def _parse_label(label_str, width, height):
    """Split a label string into classes, confidences and clipped boxes.

    The string is a whitespace-separated sequence of 6-token groups:
    class, confidence and four box coordinates. The boxes are later passed to
    the transform as 'pascal_voc', i.e. (x_min, y_min, x_max, y_max), so the
    x values are clipped to [0, width] and the y values to [0, height].

    Returns:
        (classes, confidences, boxes): three lists of equal length; classes
        and confidences keep the original string tokens, boxes holds four
        floats per group. A trailing incomplete group is ignored.
    """
    tokens = label_str.split()
    classes, confidences, boxes = [], [], []
    for start in range(0, len(tokens) - len(tokens) % 6, 6):
        cls, conf, x_min, y_min, x_max, y_max = tokens[start:start + 6]
        classes.append(cls)
        confidences.append(conf)
        boxes.append([
            float(np.clip(float(x_min), 0, width)),
            float(np.clip(float(y_min), 0, height)),
            float(np.clip(float(x_max), 0, width)),
            float(np.clip(float(y_max), 0, height)),
        ])
    return classes, confidences, boxes


def _format_label(classes, confidences, boxes):
    """Build the output label string from parsed classes and resized boxes.

    Images labelled 'none' (or with no parsable label) get the fixed
    placeholder 'none 1 0 0 1 1'. Otherwise every box is written as
    '<class> <confidence> <4 coordinates rounded to 5 decimals> ', including
    the trailing space, to keep the output format unchanged.
    """
    if not classes or classes[0] == 'none':
        return 'none 1 0 0 1 1'
    return ''.join(
        cls + ' ' + conf + ' ' + ' '.join([str(np.round(f, 5)) for f in box]) + ' '
        for cls, conf, box in zip(classes, confidences, boxes)
    )


img_list = []
label_list = []

# loop over files
for img_path, label_str in tqdm(zip(df['image_path'], df['label']), total=len(df)):
    # get the image
    img = dicom2array(path=img_path)
    # swap only the extension, not every 'dcm' substring of the file name
    newname = Path(img_path).with_suffix('.jpg').name
    img_list.append(newname)

    # get the bounding boxes, clipped to the image area on all four sides
    height, width = img.shape[:2]
    classes, confidences, bboxes = _parse_label(label_str, width, height)

    # transform both
    result = transform(image=img, bboxes=bboxes, labels=np.ones(len(bboxes)))
    new_image = result['image']
    new_bboxes = np.array(result['bboxes']).tolist()

    # format the output
    label_list.append(_format_label(classes, confidences, new_bboxes))

    # store the new image; cv2.imwrite reports failure only via its return value
    if not cv2.imwrite(str(OUTPUT_DIRECTORY/newname), new_image):
        raise IOError(f'could not write {OUTPUT_DIRECTORY/newname}')

In [None]:
# Save the rescaled labels next to their (jpg) image names.
xmeta = pd.DataFrame({'image_id': img_list, 'label': label_list})
xmeta.to_csv('bounding_boxes512.csv', index=False)

In [None]:
# wrap it up
!zip -rm -qq rescaled_with_bb512.zip train512 bounding_boxes_512.csv