In [None]:
# Install local conda package archives shipped in the attached
# `pydicom-conda-helper` input directory (libjpeg-turbo, libgcc-ng, gdcm,
# conda, certifi, openssl). `-y` answers every confirmation prompt.
# NOTE(review): presumably these give pydicom a decoder for compressed DICOM
# pixel data (gdcm / libjpeg) — confirm against the dataset description.
!conda install '/kaggle/input/pydicom-conda-helper/libjpeg-turbo-2.1.0-h7f98852_0.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/libgcc-ng-9.3.0-h2828fa1_19.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/gdcm-2.8.9-py37h500ead1_1.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/conda-4.10.1-py37h89c1867_0.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/certifi-2020.12.5-py37h89c1867_1.tar.bz2' -c conda-forge -y
!conda install '/kaggle/input/pydicom-conda-helper/openssl-1.1.1k-h7f98852_0.tar.bz2' -c conda-forge -y

In [None]:
# Side length, in pixels, of the square images written to the TFRecords;
# used as the default `target_size` of `dicom2array_2`.
IMAGE_SIZE = 1024

In [None]:
import numpy as np 
import pandas as pd
import os
import pydicom
import glob
from tqdm.notebook import tqdm
from pydicom.pixel_data_handlers.util import apply_voi_lut
import matplotlib.pyplot as plt
from skimage import exposure
import cv2
import warnings
from fastai.vision.all import *
from fastai.medical.imaging import *
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
import multiprocessing as mp

# Notebook-wide settings.
tqdm.pandas()  # register tqdm's progress_* methods on pandas objects
warnings.filterwarnings('ignore')  # silence every warning from here on
pd.set_option('display.max_colwidth', None)  # never truncate long cell values (e.g. file paths)

In [None]:
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixels as a min-max scaled uint8 array.

    Args:
        path: Location of the DICOM file.
        voi_lut: When True, pass the raw pixels through `apply_voi_lut` using
            the file's own metadata; when False, use the raw pixel array.
        fix_monochrome: When True, invert images whose
            `PhotometricInterpretation` is "MONOCHROME1".

    Returns:
        A `np.uint8` array with the same shape as the pixel data, scaled so
        the smallest value maps to 0 and the largest to 255. A constant image
        (no dynamic range) is returned as all zeros.
    """
    # `dcmread` is the current name of the reader; `read_file` is a deprecated
    # alias, and the rest of this notebook already uses `dcmread`.
    dicom = pydicom.dcmread(path)
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    # Guard against a constant image: dividing by a zero peak would yield NaN
    # and an undefined uint8 cast.
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return data

In [None]:
# Root of the competition data, relative to the notebook's working directory.
dataset_path = Path('../input/siim-covid19-detection')
dataset_path.ls()  # displayed as the cell output: the entries directly under the root

In [None]:
# Directory searched for training DICOM files by `get_dicom_files` below.
train_data_path = dataset_path/'train'
train_data_path.ls()  # displayed as the cell output

In [None]:
# Label table with a `kfold` column; `has_bbox` is not used in this notebook,
# so it is dropped right after loading.
df_study_level = (
    pd.read_csv('../input/kagglesiimcovid/train_stratified_group_5xfolds_clean.csv')
    .drop(columns=['has_bbox'])
)
df_study_level.head()

In [None]:
def _dicom_image_id(path):
    """Return the file name of `path` up to (not including) its first '.'."""
    return str(path).split(os.path.sep)[-1].split('.')[0]


# Every DICOM file under the training directory.
all_dicoms = get_dicom_files(train_data_path)
image_ids  = df_study_level.id.unique().tolist()

# Membership is tested once per file: a set lookup is O(1), whereas testing
# against the list rescans it each time (quadratic overall).
_known_image_ids = set(image_ids)
all_dicoms = all_dicoms.filter(lambda x: _dicom_image_id(x) in _known_image_ids)

# image id -> absolute path of its DICOM file (a later duplicate id overwrites
# an earlier one).
dicom_dict = {_dicom_image_id(p): str(os.path.abspath(p)) for p in all_dicoms}

In [None]:
# Attach the DICOM path of every row; an id missing from `dicom_dict` raises
# KeyError.
df_study_level['dcm_path'] = [dicom_dict[image_id] for image_id in df_study_level['id']]
df_study_level.head()

In [None]:
def show_class_samples(frame, label_column, heading, n_samples=10, thumb_size=(128, 128)):
    """Print `heading` and plot random thumbnails of one class.

    Args:
        frame: DataFrame with a `dcm_path` column and a 0/1 `label_column`.
        label_column: Name of the column selecting the class (rows equal to 1).
        heading: Text printed above the figure.
        n_samples: Maximum number of images to draw; fewer are shown when the
            class has fewer rows (`DataFrame.sample` would otherwise raise).
        thumb_size: Size passed to `cv2.resize` for each image.
    """
    print(heading)
    rows = frame.loc[frame[label_column] == 1]
    paths = rows.sample(min(n_samples, len(rows))).dcm_path.values

    n_cols = 5
    n_rows = max(1, -(-len(paths) // n_cols))  # ceiling division; 10 images -> 2 rows
    plt.figure(figsize=(20, 8))
    for position, path in enumerate(paths):
        thumb = cv2.resize(dicom2array(path), thumb_size)
        plt.subplot(n_rows, n_cols, position + 1)
        plt.axis('off')
        plt.imshow(thumb, cmap='gray')
    plt.show()


train = df_study_level
for label_column, heading in [
    ('negative', 'Images WITH Negative for Pneumonia'),
    ('typical', 'Images WITH Typical Appearance'),
    ('indeterminate', 'Images WITH Indeterminate Appearance'),
    ('atypical', 'Images WITH Atypical Appearance'),
]:
    show_class_samples(train, label_column, heading)

In [None]:
def convert_to_feature(value, value_type=None):
    """Converts the given python object to a tf.train.Feature.

    Args:
      value: int, float, bytes or a list of them.
      value_type: optional, if specified, forces the feature to be of the given
        type. Otherwise, type is inferred automatically. Can be one of
        ['bytes', 'int64', 'float', 'bytes_list', 'int64_list', 'float_list']

    Returns:
      feature: A tf.train.Feature object.

    Raises:
      ValueError: if the type cannot be inferred (unsupported element type or
        an empty list) or `value_type` is not one of the names above.
    """

    if value_type is None:
        is_list = isinstance(value, list)

        # An empty list has no first element to infer the type from; fail with
        # the same exception type as every other inference failure instead of
        # an IndexError from `value[0]`.
        if is_list and not value:
            raise ValueError(
                'Cannot infer the feature type of an empty list; '
                'pass value_type explicitly')

        # For lists, the first element decides the type of the whole list.
        element = value[0] if is_list else value

        if isinstance(element, bytes):
            value_type = 'bytes'

        elif isinstance(element, (int, np.integer)):
            value_type = 'int64'

        elif isinstance(element, (float, np.floating)):
            value_type = 'float'

        else:
            raise ValueError('Cannot convert type {} to feature'.format(
                type(element)))

        if is_list:
            value_type = value_type + '_list'

    if value_type == 'int64':
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    elif value_type == 'int64_list':
        # Flatten so nested lists / arrays are stored as one flat sequence.
        value = np.asarray(value).astype(np.int64).reshape(-1)
        return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

    elif value_type == 'float':
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

    elif value_type == 'float_list':
        value = np.asarray(value).astype(np.float32).reshape(-1)
        return tf.train.Feature(float_list=tf.train.FloatList(value=value))

    elif value_type == 'bytes':
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    elif value_type == 'bytes_list':
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

    else:
        raise ValueError('Unknown value_type parameter - {}'.format(value_type))

In [None]:
def dicom2array_2(fname, target_size=IMAGE_SIZE, use_clahe=True, clip_limit=2.0, grid_size=(8, 8)):
    """Read a DICOM file and return a square uint8 image plus its original shape.

    Args:
        fname: Location of the DICOM file.
        target_size: Side length of the square output image.
        use_clahe: When True, apply OpenCV CLAHE before resizing.
        clip_limit: `clipLimit` passed to `cv2.createCLAHE`.
        grid_size: `tileGridSize` passed to `cv2.createCLAHE`.

    Returns:
        A tuple `(image, shape)`: `image` is the resized `np.uint8` array with
        a trailing axis of length 1 added; `shape` is the shape of the image
        before resizing.
    """
    dicom = pydicom.dcmread(fname)
    data = apply_voi_lut(dicom.pixel_array, dicom)
    im = data - np.min(data)
    peak = np.max(im)
    if peak > 0:
        im = 255.0 * im / peak
    else:
        # Constant image: there is no range to stretch, and dividing by the
        # zero peak would produce NaN. Keep it as float zeros.
        im = im.astype(np.float64)

    if dicom.PhotometricInterpretation == "MONOCHROME1":  # check for inverted image
        im = 255.0 - im

    if use_clahe:
        clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=grid_size)
        im = clahe.apply(im.astype("uint8"))
    return (
        np.expand_dims(
            cv2.resize(im, (target_size, target_size)).astype(dtype=np.uint8), axis=-1
        ),
        im.shape,
    )


def image_info_to_feature_dict(image_id, height, width, encoded_str, labels):
    """Build the feature-name -> tf.train.Feature mapping for one image.

    `labels` is indexed positionally in the order
    ["atypical", "indeterminate", "negative", "typical"]; `encoded_str` must
    expose `.numpy()` returning the encoded image bytes.
    """
    label_names = ("atypical", "indeterminate", "negative", "typical")

    features = {
        "image/encoded": convert_to_feature(encoded_str.numpy(), 'bytes'),
        "image/image_id": convert_to_feature(image_id, 'bytes'),
        "image/height": convert_to_feature(height),
        "image/width": convert_to_feature(width),
    }
    for position, name in enumerate(label_names):
        features["label/" + name] = convert_to_feature(labels[position], value_type="int64")
    return features


def serialize_sample(dcm_path, labels):
    """Turn one DICOM file and its labels into a tf.train.Example.

    The image id stored in the example is the file name up to its first '.'.
    """
    assert os.path.exists(dcm_path)

    pixels, original_shape = dicom2array_2(dcm_path)
    height, width = original_shape

    file_name = str(dcm_path).split(os.path.sep)[-1]
    image_id = file_name.split('.')[0]

    png_bytes = tf.io.encode_png(pixels)
    features = tf.train.Features(
        feature=image_info_to_feature_dict(image_id.encode(), height, width, png_bytes, labels)
    )
    return tf.train.Example(features=features)

In [None]:
def check_and_make_dir(directory):
    """Make sure `directory` exists, creating it when it is not already a directory."""
    if tf.io.gfile.isdir(directory):
        return
    tf.io.gfile.makedirs(directory)

def write_tf_record_dataset(
    output_path,
    annotation_iterator,
    process_func,
    num_shards,
    use_multiprocessing=True,
    unpack_arguments=True,
):
    """Processes annotations and writes the results round-robin into TFRecords.

    Args:
      output_path: Prefix prepended to every shard file name
        ("<output_path>%05d-of-%05d.tfrecord").
      annotation_iterator: Iterable of work items (tuples when
        `unpack_arguments` is True). A sized collection gives the progress bar
        a total; a plain iterator is accepted too.
      process_func: Callable applied to every work item; must return an object
        with `SerializeToString()` (e.g. a tf.train.Example). When
        `use_multiprocessing` is True it is sent to worker processes.
      num_shards: int, the number of shard files to write; example `i` goes to
        shard `i % num_shards`.
      use_multiprocessing:
        Whether to process items in a `multiprocessing.Pool` with one worker
        per CPU.
      unpack_arguments:
        Whether to unpack each work item as positional arguments to
        `process_func` or to pass it as a single argument.

    Returns:
      None.
    """
    # Imported here so the function does not rely on `itertools` reaching the
    # notebook namespace through a wildcard import.
    import itertools

    # Only sized collections can report a total to the progress bar.
    try:
        total = len(annotation_iterator)
    except TypeError:
        total = None

    writers = []
    pool = None
    try:
        for i in range(num_shards):
            writers.append(
                tf.io.TFRecordWriter(output_path + "%05d-of-%05d.tfrecord" % (i, num_shards))
            )

        if use_multiprocessing:
            pool = mp.Pool(processes=mp.cpu_count())
            if unpack_arguments:
                # `starmap` has no lazy variant: it blocks until every item is
                # processed and returns the full list of results.
                tf_example_iterator = pool.starmap(process_func, annotation_iterator)
            else:
                tf_example_iterator = pool.imap(process_func, annotation_iterator)
        else:
            if unpack_arguments:
                tf_example_iterator = itertools.starmap(process_func, annotation_iterator)
            else:
                tf_example_iterator = map(process_func, annotation_iterator)

        for idx, tf_example in enumerate(tqdm(tf_example_iterator, total=total)):
            writers[idx % num_shards].write(tf_example.SerializeToString())

        if pool is not None:
            pool.close()
            pool.join()
    finally:
        # Runs on success and on failure, so worker processes and file handles
        # are never leaked. `terminate` after a completed close/join is a no-op.
        if pool is not None:
            pool.terminate()
        for writer in writers:
            writer.close()

In [None]:
# Label columns, in the order their values are handed to `serialize_sample`
# as `labels` (and read back from the TFRecords further down).
CLASSES = ["atypical", "indeterminate", "negative", "typical"]
# Independent copy, so changes to `train` do not affect `df_study_level`.
train = df_study_level.copy()
train.head()

In [None]:
folds = 5
num_shards = 20

# One output directory per fold, each holding `num_shards` TFRecord files.
for fold in tqdm(range(folds)):
    df_fold = train.query(f'kfold=={fold}').reset_index(drop=True)

    df_fold_images = df_fold['dcm_path'].tolist()
    df_fold_labels = [list(row) for row in df_fold[CLASSES].values.tolist()]
    # One (dcm_path, labels) work item per row, unpacked into `serialize_sample`.
    iterator = list(zip(df_fold_images, df_fold_labels))

    fold_dir = f'fold-{fold}/'
    check_and_make_dir(fold_dir)
    write_tf_record_dataset(
        fold_dir,
        iterator,
        serialize_sample,
        num_shards,
        use_multiprocessing=True,
    )

In [None]:
# Schema used to decode the examples written above: scalar image fields first,
# then one int64 scalar per class label.
image_feature_description = {
    "image/encoded": tf.io.FixedLenFeature([], tf.string),
    "image/image_id": tf.io.FixedLenFeature([], tf.string),
    "image/height": tf.io.FixedLenFeature([], tf.int64),
    "image/width": tf.io.FixedLenFeature([], tf.int64),
}
for position in range(4):
    image_feature_description[f"label/{CLASSES[position]}"] = tf.io.FixedLenFeature([], tf.int64)


def _parse_image_function(example_proto):
    """Decode one serialized example according to `image_feature_description`."""
    return tf.io.parse_single_example(example_proto, image_feature_description)


raw_image_dataset = tf.data.TFRecordDataset(tf.io.gfile.glob('./fold-0/*.tfrecord'))
parsed_image_dataset = raw_image_dataset.map(_parse_image_function)

# Show the first ten decoded images of fold 0, titled with their label vector
# (green title when the `negative` label is set, red otherwise).
plt.figure(figsize=(20,8))
for i, features in enumerate(parsed_image_dataset.take(10)):
    image_raw = features['image/encoded'].numpy()
    img = PILImage.create(image_raw)

    # ["atypical", "indeterminate", "negative", "typical"]
    atypical, indeterminate, negative, typical = (
        features[f'label/{CLASSES[position]}'].numpy() for position in range(4)
    )
    label_str = [atypical, indeterminate, negative, typical]
    title_color = 'green' if negative else 'red'

    plt.subplot(2, 5, i + 1)
    plt.axis('off')
    plt.imshow(img, cmap='gray')
    plt.title(label_str, color=title_color)