In [1]:
# Import libraries
import ast
import json
import os
import pathlib
import random
import sys
import warnings

# Must be set before TensorFlow is imported to silence its C++ logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings('ignore')

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import pydicom
import tensorflow as tf
import tensorflow_addons as tfa
from joblib import Parallel, delayed
from pydicom.pixel_data_handlers.util import apply_voi_lut

tf.get_logger().setLevel('INFO')
# Appending to sys paths
effnet_path = os.path.join(os.getcwd(), 'efficientdetv2')
sys.path.append(effnet_path)
sys.path.append(os.path.join(os.getcwd(), 'efficientdetv2', 'automl', 'efficientdet'))

In [31]:
from efficientdetv2.automl.efficientdet.dataset import create_covid_19_tfrecord as cct

In [4]:
# Root folder of the SIIM COVID-19 detection dataset on this machine
data_path = r'D:\Datasets\siim_covid19_detection'

# Image-level table (boxes / label strings) and study-level table (one-hot classes)
image_level_csv = os.path.join(data_path, 'train_image_level.csv')
study_level_csv = os.path.join(data_path, 'train_study_level.csv')
train_image_level = pd.read_csv(image_level_csv)
train_study_level = pd.read_csv(study_level_csv)

In [5]:
train_image_level.head()

Unnamed: 0,id,boxes,label,StudyInstanceUID
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2
4,001bd15d1891_image,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e


In [6]:
train_study_level.head()

Unnamed: 0,id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,00086460a852_study,0,1,0,0
1,000c9c05fd14_study,0,0,0,1
2,00292f8c37bd_study,1,0,0,0
3,005057b3f880_study,1,0,0,0
4,0051d9b12e72_study,0,0,0,1


In [7]:
train_image_level.shape, train_study_level.shape

((6334, 4), (6054, 5))

In [8]:
# Collect every file located three levels below <data_path>/train
# (the listing printed below shows a study/series/image.dcm layout).
dicom_dir = pathlib.Path(data_path, 'train')
dicom_train_paths = [dicom_file for dicom_file in dicom_dir.glob("*/*/*")]

# Keep a plain-text copy of the listing (the list's string representation)
pathlib.Path('dicom_train_paths.txt').write_text(str(dicom_train_paths))

In [9]:
print(f'Total number of image level files: {len(dicom_train_paths)}')

Total number of image level files: 6334


### Creating meta df

In [10]:
dicom_train_paths[:5]

[WindowsPath('D:/Datasets/siim_covid19_detection/train/00086460a852/9e8302230c91/65761e66de9f.dcm'),
 WindowsPath('D:/Datasets/siim_covid19_detection/train/000c9c05fd14/e555410bd2cd/51759b5579bc.dcm'),
 WindowsPath('D:/Datasets/siim_covid19_detection/train/00292f8c37bd/73120b4a13cb/f6293b1c49e2.dcm'),
 WindowsPath('D:/Datasets/siim_covid19_detection/train/005057b3f880/e34afce999c5/3019399c31f4.dcm'),
 WindowsPath('D:/Datasets/siim_covid19_detection/train/0051d9b12e72/152f6ec68d86/bb4b1da810f3.dcm')]

In [11]:
def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file and return its pixel data as a ``uint8`` array in [0, 255].

    Args:
        path: Location of the DICOM file; passed straight to ``pydicom.dcmread``.
        voi_lut: When True, run the pixel data through ``apply_voi_lut`` (the
            VOI LUT, if the device stored one, gives the "human-friendly" view).
        fix_monochrome: When True, invert images whose
            ``PhotometricInterpretation`` is ``"MONOCHROME1"`` so they do not
            look inverted.

    Returns:
        ``numpy.ndarray`` of dtype ``uint8``: the pixel data shifted to start at
        0 and scaled so its maximum maps to 255. A constant image yields all
        zeros.
    """
    # Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way
    # `dcmread` is the current pydicom entry point (`read_file` is a deprecated alias).
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        # Guarded: a constant image has peak == 0 and would otherwise produce
        # 0/0 = NaN, which has no defined uint8 value.
        data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

In [12]:
# Build one metadata row (Height, Width, Sex) per image id, read from the DICOM headers.
columns = ['Height', 'Width', 'Sex']

# Start from the -1 sentinel for every id listed in the image-level CSV.
# dtype=object lets a row hold two ints and a string without pandas having to
# coerce an integer column when the first string is written into it.
meta_df = pd.DataFrame(-1, index=train_image_level.id, columns=columns, dtype=object)

for image_path in dicom_train_paths:
    # '<hash>.dcm' -> '<hash>_image', the id format used by train_image_level
    image_id = image_path.name.split('.')[0] + '_image'
    # Only header tags are needed here, so skip decoding the (large) pixel data.
    header = pydicom.dcmread(image_path, stop_before_pixels=True)
    meta_df.loc[image_id, :] = [int(header.Rows), int(header.Columns), header.PatientSex]

# Restore integer dtype for the two numeric columns
meta_df = meta_df.astype({'Height': 'int64', 'Width': 'int64'})

In [13]:
meta_df.head()

Unnamed: 0_level_0,Height,Width,Sex
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000a312787f2_image,3488,4256,M
000c3a3f293f_image,2320,2832,M
0012ff7358bc_image,2544,3056,F
001398f4ff4f_image,3520,4280,F
001bd15d1891_image,2800,3408,M


In [14]:
# The ids in train_study_level end in '_study'; add the same suffix here so the
# two tables can be joined on it. Ids that already carry the suffix are left
# alone, so re-running this cell does not append it a second time.
study_uid = train_image_level['StudyInstanceUID']
train_image_level['StudyInstanceUID'] = study_uid.where(
    study_uid.str.endswith('_study', na=False), study_uid + '_study')

In [15]:
# The class name is the first space-separated token of the label string
# (e.g. 'opacity 1 789.2 ...' -> 'opacity', 'none 1 0 0 1 1' -> 'none').
label_tokens = train_image_level['label'].str.split(' ')
train_image_level['label_class'] = label_tokens.map(lambda tokens: tokens[0]).tolist()

In [16]:
# Join every image row with the study-level class columns, then tidy the two
# 'id' columns the merge produces: drop the study one, rename the image one.
main_df = (
    train_image_level
    .merge(train_study_level, left_on='StudyInstanceUID', right_on='id')
    .drop(columns='id_y')
    .rename(columns={'id_x': 'Image_id'})
)
main_df.head()

Unnamed: 0,Image_id,boxes,label,StudyInstanceUID,label_class,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75_study,opacity,0,1,0,0
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed_study,none,1,0,0,0
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7_study,opacity,0,1,0,0
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2_study,opacity,0,0,0,1
4,001bd15d1891_image,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e_study,opacity,0,1,0,0


In [17]:
main_df.shape

(6334, 9)

In [18]:
meta_df.shape

(6334, 3)

In [19]:
# Attach the DICOM metadata columns (Height, Width, Sex) to every image row by
# matching `Image_id` against the index of `meta_df`.
# NOTE(review): the cell reassigns `main_df` from itself, so re-running it merges
# `meta_df` into an already-merged frame — run it once per kernel session.
main_df = main_df.merge(meta_df, left_on = 'Image_id', right_on = meta_df.index, suffixes=[])

In [20]:
main_df.head()

Unnamed: 0,Image_id,boxes,label,StudyInstanceUID,label_class,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,Height,Width,Sex
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75_study,opacity,0,1,0,0,3488,4256,M
1,000c3a3f293f_image,,none 1 0 0 1 1,ff0879eb20ed_study,none,1,0,0,0,2320,2832,M
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7_study,opacity,0,1,0,0,2544,3056,F
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2_study,opacity,0,0,0,1,3520,4280,F
4,001bd15d1891_image,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e_study,opacity,0,1,0,0,2800,3408,M


In [18]:
# Turn the string form of each box list into real Python objects.
# Only strings are parsed: missing values (NaN) pass through unchanged, and so
# do already-parsed lists, which makes the cell safe to re-run.
main_df['boxes'] = main_df['boxes'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [23]:
main_df.to_csv(os.path.join(data_path, 'main.csv'), index= False )

In [20]:
# Row labels of main_df split by whether the image-level label is 'opacity'
opacity_mask = main_df.label_class == 'opacity'
ids_with_opacity = main_df.index[opacity_mask].tolist()
ids_with_none = main_df.index[~opacity_mask].tolist()

In [21]:
len(ids_with_opacity), len(ids_with_none)

(4294, 2040)

In [11]:
# Folder that receives the per-fold JSON files; created if it is not there yet
object_detection_files_path = os.path.join(data_path, 'object_detection_files')
os.makedirs(object_detection_files_path, exist_ok=True)

In [410]:
def create_image_info_dict(idx, df, file_name_col):
    """Build the ``images`` entry describing one row of ``df``.

    Args:
        idx: Row label in ``df``; also stored as the image ``id``.
        df: DataFrame with ``Height`` and ``Width`` columns plus ``file_name_col``.
        file_name_col: Column holding the file name without extension.

    Returns:
        dict with keys ``id``, ``file_name`` (column value + ``'.png'``),
        ``height`` and ``width`` (both plain ``int``).
    """
    # Look the row up by the `idx` argument, not by a loop variable that
    # happens to exist in the notebook's global namespace.
    return {
        'id': idx,
        'file_name': df.loc[idx, file_name_col] + '.png',
        'height': int(df.loc[idx, 'Height']),
        'width': int(df.loc[idx, 'Width']),
    }

In [411]:
def get_annotations(idx, df):
    """Build the annotation dicts for every box of one image.

    Args:
        idx: Row label in ``df``; stored as ``image_id`` on every annotation.
        df: DataFrame with a ``boxes`` column (list of dicts with keys
            ``x``, ``y``, ``width``, ``height``) and ``Height`` / ``Width``
            columns holding the image size.

    Returns:
        list of dicts with keys ``image_id``, ``bbox`` (``[x, y, width,
        height]``), ``area``, ``category_id`` (always 1) and ``id`` (1-based
        position of the box within this image). Rows whose ``boxes`` value is
        not a list (e.g. NaN for images without boxes) yield ``[]``.

    Note:
        ``area`` is the box area divided by the image area, rounded to two
        decimals - i.e. a fraction, not a pixel count.
        NOTE(review): confirm this is what the downstream consumer expects.
    """
    boxes = df.loc[idx, 'boxes']
    if not isinstance(boxes, (list, tuple)):
        # Images without boxes carry NaN here; nothing to annotate.
        return []

    image_area = df.loc[idx, 'Height'] * df.loc[idx, 'Width']
    annots = []
    for index, box in enumerate(boxes):
        # Read the coordinates by key instead of relying on dict ordering.
        box_as_list = [box['x'], box['y'], box['width'], box['height']]
        annots.append({
            'image_id': idx,
            'bbox': box_as_list,
            # float(): keep the value a plain Python float for json.dump
            'area': float(round((box['width'] * box['height']) / image_area, 2)),
            'category_id': 1,  # 0 is for background
            'id': index + 1,
        })
    return annots

In [19]:

# Write, for each of 5 random splits, the JSON files describing the images and
# their boxes at the original resolution.
#   validation = every image without opacity + N_VAL_OPACITY random opacity images
#   training   = the remaining opacity images
N_VAL_OPACITY = 500
SPLIT_SEED = 42  # fold k uses SPLIT_SEED + k, so a re-run reproduces the same splits
CATEGORIES = [{'id': 1, 'name': 'opacity'}]  # a list of category dicts, one per class

# Work on a copy. Extending `ids_with_none` itself would (a) make it grow by
# 500 ids on every fold and (b) make the "does this validation image have
# boxes?" check below fail for every image.
none_ids = set(ids_with_none)

for fold in range(5):
    image_info_file_path = os.path.join(object_detection_files_path, f'files_fold_{fold}')

    # Make train and valid directories inside each fold's folder
    train_path = os.path.join(image_info_file_path, 'train')
    valid_path = os.path.join(image_info_file_path, 'valid')
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(valid_path, exist_ok=True)

    # Validation indexes: sample without replacement so no image is drawn twice
    rng = np.random.default_rng(SPLIT_SEED + fold)
    val_op_idx = rng.choice(ids_with_opacity,
                            size=min(N_VAL_OPACITY, len(ids_with_opacity)),
                            replace=False)
    val_idx = none_ids | set(val_op_idx.tolist())  # set -> O(1) membership tests

    # Training indexes
    train_idx = [idx for idx in ids_with_opacity if idx not in val_idx]

    # Collect image and annotation records for both splits
    train_images_info, valid_images_info = [], []
    train_annot_info, valid_annot_info = [], []
    # NOTE: keep the loop variable named `i` unless create_image_info_dict is
    # confirmed not to read a global `i`.
    for i in list(main_df.index):
        if i in val_idx:
            valid_images_info.append(create_image_info_dict(i, main_df, 'Image_id'))
            if i not in none_ids:
                valid_annot_info.extend(get_annotations(i, main_df))
        else:
            train_images_info.append(create_image_info_dict(i, main_df, 'Image_id'))
            train_annot_info.extend(get_annotations(i, main_df))

    # get_annotations numbers boxes per image; renumber so that every
    # annotation id is unique within its file.
    for annot_list in (train_annot_info, valid_annot_info):
        for annot_id, annot in enumerate(annot_list, start=1):
            annot['id'] = annot_id

    valid_info_dict = {'images': valid_images_info}
    train_info_dict = {'images': train_images_info}
    train_annot_dict = {'images': train_images_info,
                        'annotations': train_annot_info,
                        'categories': CATEGORIES}
    valid_annot_dict = {'images': valid_images_info,
                        'annotations': valid_annot_info,
                        'categories': CATEGORIES}

    # Dump the image-only and the image+annotation dictionaries of both splits
    json_outputs = [
        (train_path, 'images_info.json', train_info_dict),
        (valid_path, 'images_info.json', valid_info_dict),
        (train_path, 'object_detection_info.json', train_annot_dict),
        (valid_path, 'object_detection_info.json', valid_annot_dict),
    ]
    for out_dir, out_name, payload in json_outputs:
        with open(os.path.join(out_dir, out_name), 'w') as json_file:
            json.dump(payload, json_file)

TypeError: create_image_info_dict() missing 2 required positional arguments: 'height' and 'width'

### Create object detection annotation files (input for the tfrecords) at 1080px resolution

In [22]:
# 1080px copy of the dataset and the folder that receives its per-fold JSON files
data_path_1080px = os.path.join(data_path, '1080px')
object_detection_files_path = os.path.join(data_path_1080px, 'object_detection_files')
os.makedirs(object_detection_files_path, exist_ok=True)

In [23]:
def create_image_info_dict(idx, df, file_name_col, height, width):
    """Build the ``images`` entry for one row of ``df`` with an explicit size.

    Args:
        idx: Row label in ``df``; also stored as the image ``id``.
        df: DataFrame containing ``file_name_col``.
        file_name_col: Column holding the file name without extension.
        height: Value stored under ``height`` (e.g. the resized image height).
        width: Value stored under ``width`` (e.g. the resized image width).

    Returns:
        dict with keys ``id``, ``file_name`` (column value + ``'.png'``),
        ``height`` and ``width``.
    """
    # Look the row up by the `idx` argument, not by a loop variable that
    # happens to exist in the notebook's global namespace.
    return {
        'id': idx,
        'file_name': df.loc[idx, file_name_col] + '.png',
        'height': height,
        'width': width,
    }

In [24]:
def get_annotations(idx, df, targ_height, targ_width):
    """Build the annotation dicts for one image, rescaled to a target size.

    Args:
        idx: Row label in ``df``; stored as ``image_id`` on every annotation.
        df: DataFrame with a ``boxes`` column (list of dicts with keys
            ``x``, ``y``, ``width``, ``height``) and ``Height`` / ``Width``
            columns holding the original image size.
        targ_height: Height of the resized image the boxes are mapped onto.
        targ_width: Width of the resized image the boxes are mapped onto.

    Returns:
        list of dicts with keys ``image_id``, ``bbox`` (``[x, y, width,
        height]`` in target-image coordinates), ``area``, ``category_id``
        (always 1) and ``id`` (1-based position of the box within this image).
        Rows whose ``boxes`` value is not a list (e.g. NaN for images without
        boxes) yield ``[]``.

    Note:
        ``area`` is the scaled box area divided by the target image area,
        rounded to two decimals - i.e. a fraction, not a pixel count.
        NOTE(review): confirm this is what the downstream consumer expects.
    """
    boxes = df.loc[idx, 'boxes']
    if not isinstance(boxes, (list, tuple)):
        # Images without boxes carry NaN here; nothing to annotate.
        return []

    orig_height = df.loc[idx, 'Height']
    orig_width = df.loc[idx, 'Width']

    annots = []
    for index, box in enumerate(boxes):
        # Read the coordinates by key instead of relying on dict ordering;
        # horizontal values scale with the width ratio, vertical ones with the
        # height ratio. float() keeps the values plain Python floats for json.dump.
        box_as_list = [
            float(box['x'] / orig_width * targ_width),
            float(box['y'] / orig_height * targ_height),
            float(box['width'] / orig_width * targ_width),
            float(box['height'] / orig_height * targ_height),
        ]
        annots.append({
            'image_id': idx,
            'bbox': box_as_list,
            'area': float(round((box_as_list[2] * box_as_list[3]) / (targ_height * targ_width), 2)),
            'category_id': 1,  # 0 is for background
            'id': index + 1,
        })
    return annots

In [25]:
# Write, for each of 5 random splits, the JSON files describing the resized
# images and their rescaled boxes.
#   validation = every image without opacity + N_VAL_OPACITY random opacity images
#   training   = the remaining opacity images
targ_height, targ_width = (1080,1080)
N_VAL_OPACITY = 500
SPLIT_SEED = 42  # fold k uses SPLIT_SEED + k, so a re-run reproduces the same splits
CATEGORIES = [{'id': 1, 'name': 'opacity'}]

# Work on a copy. Extending `ids_with_none` itself would (a) make it grow by
# 500 ids on every fold and (b) make the "does this validation image have
# boxes?" check below fail for every image.
none_ids = set(ids_with_none)

for fold in range(5):
    image_info_file_path = os.path.join(object_detection_files_path, f'files_fold_{fold}')

    # Make train and valid directories inside each fold's folder
    train_path = os.path.join(image_info_file_path, 'train')
    valid_path = os.path.join(image_info_file_path, 'valid')
    os.makedirs(train_path, exist_ok=True)
    os.makedirs(valid_path, exist_ok=True)

    # Validation indexes: sample without replacement so no image is drawn twice
    rng = np.random.default_rng(SPLIT_SEED + fold)
    val_op_idx = rng.choice(ids_with_opacity,
                            size=min(N_VAL_OPACITY, len(ids_with_opacity)),
                            replace=False)
    val_idx = none_ids | set(val_op_idx.tolist())  # set -> O(1) membership tests

    # Training indexes
    train_idx = [idx for idx in ids_with_opacity if idx not in val_idx]

    # Collect image and annotation records for both splits
    train_images_info, valid_images_info = [], []
    train_annot_info, valid_annot_info = [], []
    # NOTE: keep the loop variable named `i` unless create_image_info_dict is
    # confirmed not to read a global `i`.
    for i in list(main_df.index):
        if i in val_idx:
            valid_images_info.append(create_image_info_dict(i, main_df, 'Image_id', targ_height, targ_width))
            if i not in none_ids:
                valid_annot_info.extend(get_annotations(i, main_df, targ_height, targ_width))
        else:
            train_images_info.append(create_image_info_dict(i, main_df, 'Image_id', targ_height, targ_width))
            train_annot_info.extend(get_annotations(i, main_df, targ_height, targ_width))

    # get_annotations numbers boxes per image; renumber so that every
    # annotation id is unique within its file.
    for annot_list in (train_annot_info, valid_annot_info):
        for annot_id, annot in enumerate(annot_list, start=1):
            annot['id'] = annot_id

    valid_info_dict = {'images': valid_images_info}
    train_info_dict = {'images': train_images_info}
    train_annot_dict = {'images': train_images_info,
                        'annotations': train_annot_info,
                        'categories': CATEGORIES}
    valid_annot_dict = {'images': valid_images_info,
                        'annotations': valid_annot_info,
                        'categories': CATEGORIES}

    # Dump the image-only and the image+annotation dictionaries of both splits
    json_outputs = [
        (train_path, 'images_info.json', train_info_dict),
        (valid_path, 'images_info.json', valid_info_dict),
        (train_path, 'object_detection_info.json', train_annot_dict),
        (valid_path, 'object_detection_info.json', valid_annot_dict),
    ]
    for out_dir, out_name, payload in json_outputs:
        with open(os.path.join(out_dir, out_name), 'w') as json_file:
            json.dump(payload, json_file)

### Create tfrecords

In [1]:
# Import libraries
import os
import numpy as np
import pathlib
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
tf.get_logger().setLevel('INFO')

from joblib import Parallel, delayed
import warnings
warnings.filterwarnings('ignore')
import PIL
import cv2
import matplotlib.pyplot as plt
import ast
import random
import sys
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
# Appending to sys paths
effnet_path = os.path.join(os.getcwd(), 'efficientdetv2')
sys.path.append(effnet_path)
sys.path.append(os.path.join(os.getcwd(), 'efficientdetv2', 'automl', 'efficientdet'))
from efficientdetv2.automl.efficientdet.dataset import create_covid_19_tfrecord

In [2]:
data_path = r'D:\Datasets\siim_covid19_detection'
data_path_1080px = os.path.join(data_path, '1080px')

In [3]:
main_df = pd.read_csv(os.path.join(data_path, 'main.csv'))

In [4]:
# For Training: convert each fold's JSON files into sharded tfrecords
fold_files_root = os.path.join(data_path_1080px, 'object_detection_files')
image_dir = os.path.join(data_path_1080px, 'train', 'image')
num_shards = 32
for fold in range(5):
    fold_json_dir = os.path.join(fold_files_root, f'files_fold_{fold}', 'train')
    image_info_file = os.path.join(fold_json_dir, 'images_info.json')
    object_annotations_file = os.path.join(fold_json_dir, 'object_detection_info.json')

    output_path = os.path.join(data_path_1080px, 'tfrecords', f'fold_{fold}', 'train')
    os.makedirs(output_path, exist_ok=True)

    create_covid_19_tfrecord._create_tf_record_from_covid_19_annotations(
        image_info_file, image_dir, output_path, num_shards, object_annotations_file)

{1: {'id': 1, 'name': 'opacity'}}
{1: {'id': 1, 'name': 'opacity'}}
{1: {'id': 1, 'name': 'opacity'}}
{1: {'id': 1, 'name': 'opacity'}}
{1: {'id': 1, 'name': 'opacity'}}


In [6]:
# For validation: same conversion as for training, reading the 'valid' JSON files
split = 'valid'
fold_files_root = os.path.join(data_path_1080px, 'object_detection_files')
image_dir = os.path.join(data_path_1080px, 'train', 'image')
num_shards = 32
for fold in range(5):
    fold_json_dir = os.path.join(fold_files_root, f'files_fold_{fold}', split)
    image_info_file = os.path.join(fold_json_dir, 'images_info.json')
    object_annotations_file = os.path.join(fold_json_dir, 'object_detection_info.json')

    output_path = os.path.join(data_path_1080px, 'tfrecords', f'fold_{fold}', split)
    os.makedirs(output_path, exist_ok=True)

    create_covid_19_tfrecord._create_tf_record_from_covid_19_annotations(
        image_info_file, image_dir, output_path, num_shards, object_annotations_file)

{1: {'id': 1, 'name': 'opacity'}}
{1: {'id': 1, 'name': 'opacity'}}
{1: {'id': 1, 'name': 'opacity'}}
{1: {'id': 1, 'name': 'opacity'}}
{1: {'id': 1, 'name': 'opacity'}}


In [16]:
tf.shape(tf.expand_dims(tf.constant([4,3,5]), -1))[1]

<tf.Tensor: shape=(), dtype=int32, numpy=1>

In [12]:
tf.expand_dims(tf.constant([4,3,5]), -1).shape.as_list()

[3, 1]

In [37]:
# Scratch check: stack two all-zero rows and put them in front of a random (5, 4) tensor
ignored_value, unmatched_value = tf.zeros(4), tf.zeros(4)
input_tensor = tf.random.uniform((5,4))
placeholder_rows = tf.stack([ignored_value, unmatched_value])
tf.concat([placeholder_rows, input_tensor], axis=0)

<tf.Tensor: shape=(7, 4), dtype=float32, numpy=
array([[0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.44013572, 0.13725257, 0.4837637 , 0.27637935],
       [0.3956014 , 0.68841624, 0.8597882 , 0.2992519 ],
       [0.6738144 , 0.07710135, 0.30445337, 0.888929  ],
       [0.4457544 , 0.40169072, 0.15376341, 0.12624729],
       [0.44943154, 0.31900883, 0.34198284, 0.163414  ]], dtype=float32)>

In [35]:
tf.gather(tf.concat([tf.stack([ignored_value, unmatched_value]),
                              input_tensor], axis=0), [0,2,4,5])

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[0.        , 0.        , 0.        , 0.        ],
       [0.41042364, 0.44311905, 0.36754072, 0.8006016 ],
       [0.8837333 , 0.6250452 , 0.5993353 , 0.97363925],
       [0.8627362 , 0.7240653 , 0.47522664, 0.06449366]], dtype=float32)>

In [32]:
# Scratch check: pick, for every row, the row whose index equals that row's arg-max column.
# `tf.argmax` is used because it is available both in the TF2 namespace and in
# `tensorflow.compat.v1`, whereas `tf.arg_max` only exists in the latter.
tf.gather(input_tensor, tf.argmax(input_tensor, axis=1))

<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[0.8627362 , 0.7240653 , 0.47522664, 0.06449366],
       [0.27167726, 0.6867018 , 0.6009382 , 0.54461527],
       [0.8627362 , 0.7240653 , 0.47522664, 0.06449366],
       [0.41042364, 0.44311905, 0.36754072, 0.8006016 ],
       [0.8627362 , 0.7240653 , 0.47522664, 0.06449366]], dtype=float32)>

In [23]:
import tensorflow.compat.v1 as tf

In [39]:
tf.tile(tf.ones((1,4)), [5,1])

<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]], dtype=float32)>

In [40]:
output_path

'D:\\Datasets\\siim_covid19_detection\\1080px\\tfrecords\\fold_4\\valid'

In [50]:
len(list((tf.data.Dataset.list_files('D:\\Datasets\\siim_covid19_detection\\1080px\\tfrecords\\*\\train\\*').as_numpy_iterator())))

160