In [None]:
import os, math, glob, re
import numpy as np
import pandas as pd
import cv2

import matplotlib.pyplot as plt
import pydicom

from kaggle_datasets import KaggleDatasets

from sklearn.model_selection import train_test_split


from random import shuffle
import tensorflow as tf

In [None]:
# Target volume geometry: in-plane edge length and number of slices kept per scan.
IMAGE_SIZE, IMAGE_DEPTH = 256, 64

# MRI sequences loaded per scan; they are stacked as channels in this order.
mri_types = ['FLAIR', 'T1w', 'T1wCE', 'T2w']
CHANNELS = len(mri_types)

# Competition input directory and the CSV files that live directly inside it.
local_directory = "../input/rsna-miccai-brain-tumor-radiogenomic-classification"
local_label_path = f"{local_directory}/train_labels.csv"
local_submission_path = f"{local_directory}/sample_submission.csv"

# Read Data

In [None]:
# test data
df_test = pd.read_csv(local_submission_path)
# IDs are read as integers; turn them into 5-character zero-padded strings
# (e.g. 10 -> "00010") because they are later used as scan folder names.
df_test['BraTS21ID'] = df_test['BraTS21ID'].astype(str).str.zfill(5)

# train data
# Scans dropped from training.
# NOTE(review): presumably known-bad cases -- confirm the reason and record it here.
EXCLUDE = [109, 123, 709]
train_df = pd.read_csv(local_label_path)
# .copy() makes the filtered frame independent of the one read from disk, so the
# column assignment below does not write into a slice (SettingWithCopyWarning).
train_df = train_df[~train_df.BraTS21ID.isin(EXCLUDE)].copy()
train_df['BraTS21ID'] = train_df['BraTS21ID'].astype(str).str.zfill(5)
train_df.head(5)

# Split Data

In [None]:
# Hold out 20% of the labelled scans for validation. The split is stratified on
# the label so both parts keep the same class balance, and seeded so that it is
# reproducible across runs.
scan_ids = train_df.BraTS21ID
labels = train_df.MGMT_value
X_train, X_val, y_train, y_val = train_test_split(
    scan_ids,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
# One shape per line: X_train, X_val, y_train, y_val.
print(*(part.shape for part in (X_train, X_val, y_train, y_val)), sep="\n")

# Load DICOM

In [None]:
# load 1 dicom img, e.g. image-1.dcm
def load_dicom_slice(path, img_size=256):
    """Read one DICOM file and return its pixel data resized to a square.

    Args:
        path: Path of the ``.dcm`` file to read.
        img_size: Edge length, in pixels, of the returned image.

    Returns:
        The file's ``pixel_array`` resized by ``cv2.resize`` to
        ``(img_size, img_size)``. No intensity rescaling is applied here.
    """
    # `dcmread` is the supported reader; `read_file` is a deprecated alias that
    # newer pydicom releases no longer provide.
    dicom = pydicom.dcmread(path)
    data = dicom.pixel_array
    data = cv2.resize(data, (img_size, img_size))

    return data

# load all dicoms in a modality folder, e.g. FLAIR/*.dcm
def load_dicom_modality(mri_type, scan_id, img_depth, img_size, split):
    """Load the central ``img_depth`` slices of one MRI sequence as a 3-D array.

    Files under ``{local_directory}/{split}/{scan_id}/{mri_type}/*.dcm`` are
    ordered naturally (so ``Image-2`` comes before ``Image-10``), a window of at
    most ``img_depth`` files centred on the middle file is read, and the result
    is zero-padded along the last axis when fewer slices are available.

    Args:
        mri_type: Name of the sequence sub-folder, e.g. ``"FLAIR"``.
        scan_id: Name of the scan folder, e.g. ``"00010"``.
        img_depth: Number of slices in the returned volume.
        img_size: In-plane edge length each slice is resized to.
        split: Dataset sub-folder to read from, e.g. ``"train"``.

    Returns:
        Array of shape ``(img_size, img_size, img_depth)``, assuming every file
        holds a single 2-D slice. An all-zero volume is returned when the
        folder contains no ``.dcm`` file.
    """
    def _natural_key(path):
        # Split into single non-digit characters and digit runs so that digit
        # runs compare numerically instead of lexicographically.
        return [int(tok) if tok.isdigit() else tok
                for tok in re.findall(r'[^0-9]|[0-9]+', path)]

    files = sorted(
        tf.io.gfile.glob(f"{local_directory}/{split}/{scan_id}/{mri_type}/*.dcm"),
        key=_natural_key)
    num_files = len(files)

    # np.stack raises on an empty list, so a missing/empty modality folder is
    # represented by the same zeros that are used for padding below.
    if num_files == 0:
        return np.zeros((img_size, img_size, img_depth))

    # Window centred on the middle file. The end is derived from the start so
    # that odd depths select `img_depth` files as well; for even depths this is
    # the same window as `middle +/- img_depth // 2`.
    start_depth = max(0, num_files // 2 - img_depth // 2)
    end_depth = min(num_files, start_depth + img_depth)

    # np.stack yields (depth, rows, cols); .T reverses the axes to
    # (cols, rows, depth), i.e. depth last with each slice transposed in-plane.
    # The orientation is left as is so that the output layout does not change.
    img3d = np.stack([load_dicom_slice(dicom_img, img_size=img_size)
                      for dicom_img in files[start_depth:end_depth]]).T

    # Pad at the end of the depth axis when the scan has too few slices.
    n_missing = img_depth - img3d.shape[-1]
    if n_missing > 0:
        n_zero = np.zeros((img_size, img_size, n_missing))
        img3d = np.concatenate((img3d, n_zero), axis=-1)

    return img3d

# load all modality for a single sample, e.g. 00010/*/*.dcm
def load_dicom_3D(scan_id, img_depth=128, img_size=256, split="train"):
    """Load every sequence in ``mri_types`` for one scan as a normalised volume.

    Args:
        scan_id: Name of the scan folder, e.g. ``"00010"``.
        img_depth: Number of slices kept per sequence.
        img_size: In-plane edge length each slice is resized to.
        split: Dataset sub-folder to read from, e.g. ``"train"``.

    Returns:
        float64 array with the sequences stacked on a new last axis, in the
        order of ``mri_types``. Values are min-max scaled to ``[0, 1]`` over
        the whole array; a constant volume is returned unscaled but still as
        float64.
    """
    # Progress indicator: scan ids are printed on one line as they are loaded.
    print(scan_id, end=" ")
    dicom_channels = [load_dicom_modality(scan_id=scan_id,
                                          img_depth=img_depth,
                                          img_size=img_size,
                                          split=split,
                                          mri_type=mtype)
                      for mtype in mri_types]

    # Cast before any arithmetic: the subtraction below could otherwise wrap
    # around on signed integer pixel data, and a constant volume (which skips
    # normalisation) would be returned in the raw integer dtype, giving a
    # different byte length than every other volume once serialised.
    img = np.stack(dicom_channels, axis=-1).astype(np.float64)

    # Normalize
    lowest, highest = img.min(), img.max()
    if lowest < highest:
        img = (img - lowest) / (highest - lowest)
    return img

# Convert to TFRecords

In [None]:
def _bytes_feature(value):
    """Wrap a string / byte value in a ``tf.train.Feature`` holding a BytesList.

    A value of the same type as ``tf.constant(0)`` is first converted with
    ``.numpy()``; anything else is stored as given.
    """
    eager_tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a single float / double in a ``tf.train.Feature`` holding a FloatList."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` holding an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

In [None]:
def serialize_example(image, label):
    """Serialise one (image, label) pair as a ``tf.train.Example`` byte string.

    The image is stored as its raw buffer (``image.tobytes()``) under
    ``'image'`` and the label as a float under ``'MGMT_value'``. Neither the
    array shape nor its dtype is written, so the reader has to know both in
    order to decode the image.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(image.tobytes()),
        'MGMT_value': _float_feature(label),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
def write_tfrecord(scan_ids, labels, out_dir, filename, split="train"):
    """Write one GZIP-compressed TFRecord file with an example per scan.

    Args:
        scan_ids: Iterable of scan folder names to load.
        labels: Iterable of labels, aligned with ``scan_ids``.
        out_dir: Directory receiving the file; created when it does not exist.
        filename: Name of the TFRecord file inside ``out_dir``.
        split: Dataset sub-folder the DICOM files are read from.
    """
    os.makedirs(out_dir, exist_ok=True)
    record_path = os.path.join(out_dir, filename)
    gzip_options = tf.io.TFRecordOptions(compression_type="GZIP")
    with tf.io.TFRecordWriter(record_path, options=gzip_options) as writer:
        for scan_id, label in zip(scan_ids, labels):
            volume = load_dicom_3D(scan_id, img_size=IMAGE_SIZE,
                                   img_depth=IMAGE_DEPTH, split=split)
            writer.write(serialize_example(volume, label))


outpath_train = "./tfrecords/train"
write_tfrecord(X_train, y_train, outpath_train, 'brain_train.tfrec')

# Validation scans were held out of the labelled training set, so their DICOM
# files are also read from the "train" folder (the helper's default split).
outpath_valid = "./tfrecords/valid"
write_tfrecord(X_val, y_val, outpath_valid, 'brain_val.tfrec')