# Preprocessing of training studies

This notebook preprocesses all training studies by cropping the brain from MRI images and storing the remaining slices in tensors.

In [None]:
import os
import sys 
import json
import glob
import random
import re
import collections
import time

import numpy as np
import pandas as pd
import pydicom
import cv2
import matplotlib.pyplot as plt
import seaborn as sns

from skimage.color import rgb2gray
from skimage import data
from skimage.filters import gaussian
from skimage.segmentation import active_contour
from skimage import measure

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from tqdm import tqdm

In [None]:
pip install --upgrade ../input/mclahenumpy/mclahe-numpy

In [None]:
from mclahe import mclahe

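Each MRI series is stored as a tensor of shape `(1, SIZE, SIZE, min(NUM_IMAGES, len(slices)))`, where `len(slices)` is the number of slices left after blank leading and trailing slices have been removed. The leading axis of length 1 is added just before the tensor is returned.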

The `FIRST_HALF` parameter selects which half of the dataset is processed: `True` processes the first half of the training studies, `False` the second half. Splitting the work into two runs works around the limit on output data size (20 GB).

In [None]:
# MRI sequence types; each one is loaded and saved as its own .npy file per study.
mri_types = ['FLAIR', 'T1w', 'T1wCE', 'T2w']
# Default edge length, in pixels, that every cropped slice is resized to.
SIZE = 160
# Default upper bound on the number of slices kept per series.
NUM_IMAGES = 160
# True: process the first half of the training table; False: process the second half.
FIRST_HALF = True

In [None]:
# Directory the preprocessed .npy files are written to before being zipped.
output_dir = '/kaggle/tmp/dataset'
# Root of the input data: train_labels.csv and the per-study DICOM folders are read from here.
data_directory = '/kaggle/input/rsna-miccai-brain-tumor-radiogenomic-classification'

In [None]:
# Create the output folder (and any missing parents) in pure Python; this is the
# equivalent of `mkdir -p` and is safe to re-run because of exist_ok=True.
os.makedirs(f'{output_dir}/train', exist_ok=True)

## Functions to load images

In [None]:
def load_file(path):
    """Read one DICOM file and return its pixel data divided by 255.

    Args:
        path: Path to a DICOM (``.dcm``) file.

    Returns:
        The dataset's ``pixel_array`` divided by 255, as a floating-point array.
        NOTE(review): this is a fixed rescale, not a normalisation to [0, 1];
        nothing here clips the stored values to 255 — confirm the intended range.
    """
    # `dcmread` is the supported pydicom reader; `read_file` is a deprecated
    # alias of it that is no longer available in recent pydicom releases.
    return pydicom.dcmread(path).pixel_array / 255.


def _clamp_crop_axis(lower, upper, size):
    """Validate one axis of a crop box against an axis of length `size`.

    Returns `(start, stop)`; a bound that is unusable falls back to the full extent.
    """
    # A negative lower bound, or one at/after the last index, is replaced by 0.
    start = 0 if lower < 0 or lower >= size - 1 else lower
    # An upper bound that leaves an empty span, or a span longer than the axis,
    # is replaced by the axis length.
    stop = size if upper - start <= 0 or upper - start > size else upper
    return start, stop


def crop_dicom_image(data, crop_coordinates, img_size=SIZE):
    """Crop a 2-D slice to a bounding box and resize it to a square.

    Args:
        data: 2-D image array.
        crop_coordinates: Sequence ``(dim0_min, dim0_max, dim1_min, dim1_max)``.
            Bounds that are out of range for `data` are replaced by the full
            extent of the corresponding axis.
        img_size: Edge length of the square output.

    Returns:
        The cropped slice resized with ``cv2.resize`` to ``(img_size, img_size)``.
    """
    # Each axis is validated against its OWN length. The previous code checked the
    # dim-1 upper bound against dim0_min and data.shape[0], which produced wrong
    # crops for non-square slices.
    dim0_min, dim0_max = _clamp_crop_axis(crop_coordinates[0], crop_coordinates[1], data.shape[0])
    dim1_min, dim1_max = _clamp_crop_axis(crop_coordinates[2], crop_coordinates[3], data.shape[1])
    data = data[dim0_min:dim0_max, dim1_min:dim1_max]
    data = cv2.resize(data, (img_size, img_size))

    return data


def natural_sort(l):
    """Return the strings of `l` sorted in natural (human) order.

    Runs of ASCII digits are compared as integers and all other text is
    compared case-insensitively, so ``"img2"`` sorts before ``"img10"``.

    Args:
        l: Iterable of strings.

    Returns:
        A new sorted list; the input is not modified.
    """
    def _chunk_key(chunk):
        # Digit runs compare numerically; everything else as lower-cased text.
        if chunk.isdigit():
            return int(chunk)
        return chunk.lower()

    def _natural_key(item):
        # The capturing group keeps the digit runs in the split result.
        return [_chunk_key(chunk) for chunk in re.split('([0-9]+)', item)]

    return sorted(l, key=_natural_key)


def normalise(f):
    """Run `mclahe` on `f` with its default settings and return the result.

    Args:
        f: Array to equalise (called in this notebook with the stacked volume).

    Returns:
        Whatever ``mclahe(f)`` returns.
    """
    return mclahe(f)


def find_biggest_snake(data_list, equal_to_prev, patience):
    """Find one bounding box that covers the contours of a stack of slices.

    The search starts at the middle slice, first walks towards index 0 and then
    continues from the slice after the middle towards the end, widening the box
    found so far.

    Args:
        data_list: Sequence of 2-D slices.
        equal_to_prev: Per-slice flags; True when a slice repeats the one before it.
        patience: Number of slices that may fail to widen the box before a walk stops.

    Returns:
        The box ``(dim0_min, dim0_max, dim1_min, dim1_max)`` produced by the
        forward walk.
    """
    middle = len(data_list) // 2
    reference = data_list[middle]
    rows, cols = reference.shape[0], reference.shape[1]

    # Ellipse inscribed in the middle slice. It is handed to the walker, which
    # only passes it along and does not read it.
    angles = np.linspace(0, 2 * np.pi, max(rows, cols))
    initial_snake = np.array([
        rows / 2 + rows / 2 * np.sin(angles),
        cols / 2 + cols / 2 * np.cos(angles),
    ]).T

    backward_box = find_biggest_snake_recursively(
        initial_snake, data_list, equal_to_prev, middle, None, -1, patience=patience
    )
    return find_biggest_snake_recursively(
        initial_snake, data_list, equal_to_prev, middle + 1, backward_box, 1, patience=patience
    )
    
    
def find_biggest_snake_recursively(initial_snake, data_list, equal_to_prev, current_index, current_coordinates, index_diff, patience):
    """Walk through the slices in one direction, widening a bounding box.

    Args:
        initial_snake: Passed through to the recursive call unchanged; not read here.
        data_list: Sequence of 2-D slices.
        equal_to_prev: Per-slice flags; True when a slice repeats the one before it.
        current_index: Index of the slice to examine in this step.
        current_coordinates: Box found so far as
            ``(dim0_min, dim0_max, dim1_min, dim1_max)``, or None on the first step.
        index_diff: Step direction, +1 (forward) or -1 (backward).
        patience: Remaining number of slices that may fail to widen the box.

    Returns:
        `current_coordinates` unchanged when the index is out of range or patience
        is 0; otherwise the result of continuing the walk with the updated box
        (a list of four values).
    """
    slice_count = len(data_list)

    # Stop when the walk leaves the stack or patience has run out.
    if not 0 <= current_index < slice_count or patience == 0:
        return current_coordinates

    # A slice that repeats the one just visited cannot change the box, so it is
    # accepted without recomputing contours (and without costing patience).
    is_duplicate = False
    if current_coordinates is not None:
        if index_diff == 1:
            is_duplicate = bool(equal_to_prev[current_index])
        elif index_diff == -1 and current_index < slice_count - 1:
            is_duplicate = bool(equal_to_prev[current_index + 1])

    if is_duplicate:
        new_coordinates = list(current_coordinates)
        widened = True
    else:
        # Bounding box of all iso-contours at level 0.5 in this slice.
        lows = [np.inf, np.inf]
        highs = [0, 0]
        for contour in measure.find_contours(data_list[current_index], 0.5):
            contour_low = np.min(contour, axis=0)
            contour_high = np.max(contour, axis=0)
            for axis in (0, 1):
                lows[axis] = min(lows[axis], int(contour_low[axis]))
                highs[axis] = max(highs[axis], int(contour_high[axis]))
        measured = (lows[0], highs[0], lows[1], highs[1])

        # Even slots are minima (smaller widens), odd slots are maxima (larger widens).
        new_coordinates = []
        widened = False
        for slot, value in enumerate(measured):
            if current_coordinates is None:
                takes_new = True
            elif slot % 2 == 0:
                takes_new = value < current_coordinates[slot]
            else:
                takes_new = value > current_coordinates[slot]

            if takes_new:
                new_coordinates.append(value)
                widened = True
            else:
                new_coordinates.append(current_coordinates[slot])

    remaining_patience = patience if widened else patience - 1

    return find_biggest_snake_recursively(
        initial_snake,
        data_list,
        equal_to_prev,
        current_index + index_diff,
        new_coordinates,
        index_diff,
        remaining_patience,
    )


def load_dicom_images_3d_prime(scan_id, num_imgs=NUM_IMAGES, img_size=SIZE, mri_type="FLAIR", split="train"):
    """Load one MRI series, crop it to a common bounding box and normalise it.

    Steps:
      1. List the series' ``.dcm`` files in natural order.
      2. Drop blank slices from the start and the end of the series.
      3. If more than `num_imgs` slices remain, pick `num_imgs` evenly spaced ones.
      4. Compute a single bounding box over the selected slices, then crop and
         resize every slice to ``(img_size, img_size)``.
      5. Stack, transpose, normalise and add a leading axis.

    Args:
        scan_id: Study folder name (zero-padded id string).
        num_imgs: Maximum number of slices to keep.
        img_size: Edge length each cropped slice is resized to.
        mri_type: Sub-folder of the study holding the series.
        split: Dataset split folder under ``data_directory``.

    Returns:
        The normalised volume with a leading axis of length 1. Before
        normalisation the stack has shape ``(img_size, img_size, n_slices)``.
        NOTE(review): assumes ``mclahe`` preserves the input shape — confirm.

    Raises:
        IndexError: If the series has no ``.dcm`` files or every slice is blank.
    """
    files = natural_sort(glob.glob(f"{data_directory}/{split}/{scan_id}/{mri_type}/*.dcm"))

    # Trim blank slices from the front (index 0) and then from the back (index -1).
    for index in [0, -1]:
        while files:
            data = load_file(files[index])
            # A slice is blank when fewer than 0.3% of its pixels differ from its minimum.
            if np.count_nonzero(data - np.min(data)) / (data.shape[0] * data.shape[1]) < 0.003:
                del files[index]
            else:
                break

    if not files:
        # Same exception type the unguarded indexing used to raise, with a usable message.
        raise IndexError(
            f"no non-blank DICOM slices found for scan {scan_id!r} (mri_type={mri_type!r}, split={split!r})"
        )

    if len(files) > num_imgs:
        # Evenly spaced subsample; the min() keeps rounding from stepping past the end.
        every_nth = len(files) / num_imgs
        indexes = [min(int(round(i * every_nth)), len(files) - 1) for i in range(num_imgs)]
    else:
        every_nth = 1
        indexes = list(range(len(files)))

    files_to_load = [files[i] for i in indexes]

    # Load each selected slice once; a file selected twice in a row reuses the
    # previous array and is flagged so later steps can skip it.
    images = []
    equal_to_prev = []
    prev_file = ''
    for f in files_to_load:
        if f == prev_file:
            images.append(images[-1])
            equal_to_prev.append(True)
        else:
            images.append(load_file(f))
            equal_to_prev.append(False)

        prev_file = f

    patience = int(every_nth * 2) + 1

    crop_coordinates = find_biggest_snake(images, equal_to_prev, patience)

    for i in range(len(images)):
        if equal_to_prev[i]:
            # Repeated slice: reuse the already cropped previous entry.
            images[i] = images[i - 1]
        else:
            # Forward img_size; previously the argument was accepted but ignored
            # and the module-level default was always used.
            images[i] = crop_dicom_image(images[i], crop_coordinates, img_size=img_size)

    img3d = np.stack(images).T

    img3d = normalise(img3d)

    return np.expand_dims(img3d, 0)

In [None]:
# Optional smoke test: uncomment to preprocess a single series.
#load_dicom_images_3d_prime("00113", mri_type="FLAIR")

In [None]:
# Study ids removed from the label table before preprocessing.
# NOTE(review): the reason for excluding these three ids is not recorded here — confirm and document.
samples_to_exclude = [109, 123, 709]

# Read the label table, then drop the excluded studies; shapes are printed
# before and after so the number of removed rows is visible.
df = pd.read_csv(f"{data_directory}/train_labels.csv")
print("original shape", df.shape)

df = df.loc[~df["BraTS21ID"].isin(samples_to_exclude)]
print("new shape", df.shape)

display(df)

In [None]:
split = 'train'

# Select this run's half of the studies under a new name. Overwriting `df` here
# would halve the table again every time the cell is re-run.
half = df.shape[0] // 2
df_half = df.iloc[:half] if FIRST_HALF else df.iloc[half:]

for brats_id in tqdm(df_half['BraTS21ID'], total=df_half.shape[0]):
    # Study folders are named with the id zero-padded to five digits.
    scan_id = str(brats_id).zfill(5)
    out_dir = f'{output_dir}/{split}/{scan_id}'
    # exist_ok lets the cell be re-run (e.g. after an interruption) instead of
    # raising FileExistsError on the first study that was already written.
    os.makedirs(out_dir, exist_ok=True)
    for mri_type in mri_types:
        # One .npy file per MRI type; read from the same split the output is written under.
        np.save(f'{out_dir}/{mri_type}.npy', load_dicom_images_3d_prime(scan_id, mri_type=mri_type, split=split))

In [None]:
# Recursively archive everything under output_dir into a single zip file in /kaggle/working.
!zip -r /kaggle/working/rsna-miccai-brain-tumor.zip {output_dir}