# This notebook helps you turn the 100+ GB of raw data into a smaller dataset by converting it into TensorFlow records (TFRecords).

# Upvote if this notebook is helpful, and comment if you have any questions.

In [None]:
import os, math, glob, re
import numpy as np
import pandas as pd
import cv2

import matplotlib.pyplot as plt
import pydicom

from kaggle_datasets import KaggleDatasets

from sklearn.model_selection import train_test_split


from random import shuffle
import tensorflow as tf

# Hyperparameters for your final images

In [None]:
# Output volume geometry; treat both values as data-preparation hyperparameters.
IMAGE_SIZE = 256   # img_size handed to the DICOM loaders below
IMAGE_DEPTH = 128  # img_depth (slices per MRI type) handed to the loaders below

# One channel per MRI type.
mri_types = ['FLAIR', 'T1w', 'T1wCE', 'T2w']
CHANNELS = len(mri_types)

# Competition input directory and the two CSV files read from it.
local_directory = "../input/rsna-miccai-brain-tumor-radiogenomic-classification"
local_label_path = f"{local_directory}/train_labels.csv"
local_submission_path = f"{local_directory}/sample_submission.csv"

# Read Data

In [None]:
# test data: IDs are read from the sample submission and zero-padded to
# five characters (e.g. 2 -> '00002')
df_test = pd.read_csv(local_submission_path)
df_test['BraTS21ID'] = [format(x, '05d') for x in df_test.BraTS21ID]

# train data
train_df = pd.read_csv(local_label_path)
# cases dropped from training (the reason is not recorded in this notebook)
EXCLUDE = [109, 123, 709]
# .copy() detaches the filtered frame from the original so the column
# assignment below cannot trigger pandas' SettingWithCopyWarning
train_df = train_df[~train_df.BraTS21ID.isin(EXCLUDE)].copy()
train_df['BraTS21ID'] = [format(x, '05d') for x in train_df.BraTS21ID]
train_df.head(5)

In [None]:
# number of rows currently in train_df
len(train_df)

# Split Data

In [None]:
# Stratified 80/20 split: patient IDs are the "features", MGMT_value the target.
# A fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_df.BraTS21ID,
    train_df.MGMT_value,
    stratify=train_df.MGMT_value,
    random_state=42,
    test_size=0.2,
)
# One shape per line, in the order X_train, X_val, y_train, y_val.
print(*(part.shape for part in (X_train, X_val, y_train, y_val)), sep="\n")

# Preparing the data
There are few things to note about the data.

* Data has been preprocessed via this notebook
* For one patient, e.g. '00002', a TFRecord will look like this: IMAGE_SIZE x IMAGE_SIZE x IMAGE_DEPTH x 4
* IMAGE_SIZE x IMAGE_SIZE (256 x 256) is the size of each black-and-white image after resizing (the original was 512x512, I guess)
* IMAGE_DEPTH (128) means there are a total of IMAGE_DEPTH such images of IMAGE_SIZE x IMAGE_SIZE size per MRI type. There are multiple images in each folder, e.g. the FLAIR folder has 148 images for one patient; I pick the middle IMAGE_DEPTH of them and pad with all-zero images when a folder has fewer.
* 4 is the number of MRI image types (FLAIR, T1w, T1wCE, T2w); each type contributes one IMAGE_SIZE x IMAGE_SIZE x IMAGE_DEPTH stack of images.
* In a nutshell, each TFRecord has IMAGE_DEPTH (128) images of 256x256 size for each MRI scan/image type

In [None]:
# load 1 dicom img, e.g. image-1.dcm
def read_one_dicom_image(path, img_size=256):
    """Read one DICOM file and return its pixel data resized to a square.

    Args:
        path: Path of the ``.dcm`` file to read.
        img_size: Target width and height, in pixels, passed to ``cv2.resize``.

    Returns:
        The file's ``pixel_array`` after ``cv2.resize`` to
        ``(img_size, img_size)``.
    """
    # dcmread is pydicom's supported reader; read_file was a deprecated alias
    # of it and no longer exists in pydicom 3.0.
    dicom = pydicom.dcmread(path)
    return cv2.resize(dicom.pixel_array, (img_size, img_size))


In [None]:
# Smoke test: read one FLAIR image of training patient 00000 and show the
# shape returned after resizing to IMAGE_SIZE
p = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/FLAIR/Image-103.dcm"
img = read_one_dicom_image(p, IMAGE_SIZE)
img.shape

In [None]:

# load all dicoms in a modality folder, e.g. FLAIR/*.dcm
def load_dicom_modality(mri_type, scan_id, img_depth, img_size, split):
    """Load the middle ``img_depth`` images of one MRI type into one array.

    Args:
        mri_type: Name of the MRI-type sub-folder, e.g. ``"FLAIR"``.
        scan_id: Patient folder name, e.g. ``"00002"``.
        img_depth: Number of images along the last axis of the result.
        img_size: Width and height each image is resized to.
        split: Top-level folder to read from (``"train"`` or ``"test"``).

    Returns:
        Array of shape ``(img_size, img_size, img_depth)`` for 2-D images.
        When the folder holds fewer than ``img_depth`` files, the missing
        trailing images are filled with zeros.

    Raises:
        ValueError: If no ``.dcm`` file matches the folder pattern.
    """
    files_path = f"{local_directory}/{split}/{scan_id}/{mri_type}/*.dcm"
    # Natural sort: digit runs compare as integers, so Image-2 precedes Image-10.
    files = sorted(tf.io.gfile.glob(files_path),
               key=lambda var:[int(x) if x.isdigit() else x for x in re.findall(r'[^0-9]|[0-9]+', var)])
    if not files:
        # np.stack would raise an opaque "need at least one array" error here.
        raise ValueError(f"No DICOM files found for pattern: {files_path}")

    num_files = len(files)

    # Take a window of img_depth files centred on the middle of the series.
    # Deriving the end from the start (rather than from the middle) keeps the
    # window img_depth long for odd depths as well; even depths are unchanged.
    start_depth = max(0, num_files // 2 - img_depth // 2)
    end_depth = min(num_files, start_depth + img_depth)

    img3d = np.stack(
        [read_one_dicom_image(dicom_img, img_size=img_size)
         for dicom_img in files[start_depth:end_depth]],
        axis=-1,
    )

    # Not enough images for the requested depth: append all the zero images
    # in a single concatenate so every patient ends up with the same shape.
    missing = img_depth - img3d.shape[-1]
    if missing > 0:
        padding = np.zeros(img3d.shape[:-1] + (missing,), dtype=img3d.dtype)
        img3d = np.concatenate((img3d, padding), axis=-1)

    return img3d

# load all modality for a single sample, e.g. 00010/*/*.dcm
def load_dicom_3D(scan_id, img_depth=128, img_size=256, split="train"):
    """Assemble one patient's volume from every MRI type in ``mri_types``.

    Each MRI type is loaded with ``load_dicom_modality`` and the results are
    stacked on a new last axis, i.e. rows x cols x images x MRI types. The
    stacked array is then min-max scaled as a whole unless all of its values
    are equal.

    Args:
        scan_id: Patient folder name, e.g. ``"00002"``.
        img_depth: Number of images kept per MRI type.
        img_size: Width and height each image is resized to.
        split: Top-level folder to read from (``"train"`` or ``"test"``).

    Returns:
        The stacked array, scaled to the range 0..1 when it is not constant.
    """
    per_type_volumes = []
    for mtype in mri_types:
        volume = load_dicom_modality(
            mri_type=mtype,
            scan_id=scan_id,
            img_depth=img_depth,
            img_size=img_size,
            split=split,
        )
        per_type_volumes.append(volume)

    # one entry per MRI type on the new trailing axis
    img = np.stack(per_type_volumes, axis=-1)

    # A constant volume cannot be rescaled (the divisor would be zero), so it
    # is returned exactly as stacked.
    if not np.min(img) < np.max(img):
        return img

    shifted = img - np.min(img)
    return shifted / np.max(shifted)

In [None]:
# Try the full loader on training patient 00002 with a reduced depth of
# 64 images per MRI type, then show the resulting shape
img = load_dicom_3D("00002", img_size=IMAGE_SIZE, img_depth=64, split="train")
img.shape

In [None]:
# total number of elements next to the shape of the sample volume
img.size, img.shape

# Convert to TFRecords

In [None]:
def _bytes_feature(value):
    """Wrap a string / bytes value in a ``tf.train.Feature`` (bytes_list).

    A value of the same type as ``tf.constant(0)`` is first converted with
    ``.numpy()`` before being placed in the list.
    """
    constant_tensor_type = type(tf.constant(0))
    if isinstance(value, constant_tensor_type):
        value = value.numpy()
    wrapped = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
    """Wrap a single float / double in a ``tf.train.Feature`` (float_list)."""
    wrapped = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` (int64_list)."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)

In [None]:
def serialize_example(image, label):
    """Serialize one (image, label) pair as a ``tf.train.Example`` string.

    Args:
        image: Array whose raw bytes (``image.tobytes()``) are stored under
            the ``'image'`` key. Shape and dtype are not recorded, so the
            reader has to know them.
        label: Value stored as a float under the ``'MGMT_value'`` key.

    Returns:
        The serialized ``tf.train.Example``.
    """
    image_feature = _bytes_feature(image.tobytes())
    label_feature = _float_feature(label)
    features = tf.train.Features(
        feature={'image': image_feature, 'MGMT_value': label_feature}
    )
    return tf.train.Example(features=features).SerializeToString()

In [None]:
def save_all_images_as_tensor_records(dest_path, writer_options, folder_name, X, y, n_sample=32):
    """Write up to ``n_sample`` patients into a single TFRecord file.

    Args:
        dest_path: Path of the TFRecord file to create.
        writer_options: ``tf.io.TFRecordOptions`` passed to the writer.
        folder_name: Input sub-folder holding the scans (``"train"`` or
            ``"test"``).
        X: Iterable of patient ids, e.g. ``"00002"``.
        y: Iterable of labels, paired with ``X`` by position.
        n_sample: Maximum number of records to write.
    """
    with tf.io.TFRecordWriter(dest_path, options=writer_options) as writer:

        # patient id and label e.g. (00002, 1)
        for cnt, (patient_id, label) in enumerate(zip(X, y), start=1):
            # Check the limit before loading: checking after the write would
            # emit one record more than n_sample.
            if cnt > n_sample:
                break

            img3d_for_one_patient = load_dicom_3D(patient_id,
                                                  img_size=IMAGE_SIZE,
                                                  img_depth=IMAGE_DEPTH,
                                                  split=folder_name) # to look in test or train folder

            writer.write(serialize_example(img3d_for_one_patient, label))
            print(f"{cnt}: Completed for patient id : {patient_id}")

        print("Finished")

# Creating folders and uploading input images as Tensorflow Records

In [None]:
!rm -r tfrecords

# creating train data folder
! mkdir -p ./tfrecords/train/
                
# creating validation data folder
! mkdir -p ./tfrecords/valid/

# creating real test data folder
! mkdir -p ./tfrecords/test/

In [None]:
# writer options passed to every TFRecord export below: GZIP compression
my_writer_options = tf.io.TFRecordOptions(compression_type="GZIP")

In [None]:
# Export the training split to ./tfrecords/train/brain_train.tfrec
outpath_train = "./tfrecords/train"
dest_path = os.sep.join((str(outpath_train), "brain_train.tfrec"))

# Set n_sample to a small value (e.g. 10) to produce a quick dataset for
# testing your model; a large value such as 32000 exports every patient.
save_all_images_as_tensor_records(
    dest_path=dest_path,
    writer_options=my_writer_options,
    folder_name="train",
    X=X_train,
    y=y_train,
    n_sample=32000,
)
                


In [None]:

# Export the validation split to ./tfrecords/valid/brain_valid.tfrec
outpath_valid = "./tfrecords/valid"
dest_path = os.sep.join((str(outpath_valid), "brain_valid.tfrec"))

# The validation patients were split off the training labels, so their scans
# are read from the "train" input folder.
save_all_images_as_tensor_records(
    dest_path=dest_path,
    writer_options=my_writer_options,
    folder_name="train",
    X=X_val,
    y=y_val,
    n_sample=32000,
)


In [None]:
# Test-set patient IDs (and the MGMT_value column) come from the sample
# submission file; IDs are zero-padded to five characters.
gt_df = pd.read_csv(local_submission_path)
gt_df['BraTS21ID'] = [f"{case_id:05d}" for case_id in gt_df["BraTS21ID"]]

# gt_df

In [None]:
# Export the real test set to ./tfrecords/test/brain_test.tfrec
# (a dedicated name, so outpath_valid keeps pointing at the validation folder)
outpath_test = "./tfrecords/test"
dest_path = str(outpath_test) + os.sep + "brain_test.tfrec"

# creating tensor records for the test data
# NOTE(review): MGMT_value is taken from the sample submission file, so it is
# presumably a placeholder rather than a real label - confirm before use.
save_all_images_as_tensor_records(dest_path,
                                  my_writer_options,
                                  "test",
                                  gt_df["BraTS21ID"].values,
                                  gt_df["MGMT_value"].values,
                                  n_sample=32000)