In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import cv2
import glob
import matplotlib.pyplot as plt
import numpy as np # linear algebra
from operator import itemgetter
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Root directory of the competition data. Despite the name it is not train-only:
# load_FULL_brain looks for '<train_path>/<split>/<scan_id>/<modality>/*.dcm' under it.
train_path = '/kaggle/input/rsna-miccai-brain-tumor-radiogenomic-classification'

This is a simple notebook showing how to load a full brain volume in PyTorch.  This means reading in all of the images of a particular modality for a particular patient, putting them in order, and producing a single 3D tensor to be fed into a PyTorch model.  For this to work, all images of a particular modality for a particular patient must have the same shape.  This is the case for this dataset, but you may want to verify it for yourself.

Several of these helper functions came from or were inspired by this notebook: https://www.kaggle.com/furcifer/no-baseline-pytorch-cnn-for-mri?scriptVersionId=68186710


In [None]:
def _dicom2array(path, voi_lut=True, fix_monochrome=True, resize=False):
    dicom = pydicom.read_file(path)
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    #Normalize the data: subtract off the minimum, divide by the maximum, convert to 256 uint8
    data = data - np.min(data)
    data = data/np.max(data)
    data = (data * 255).astype(np.uint8)
    
    #Resize images to target value
    if resize:
        data = cv2.resize(data, (256, 256))
    return data

If one looks at the raw images, one sees lots of blank space around the actual brain, which is not useful for classification.  The following function finds the edges of the brain and eliminates the surrounding blank space.  Note: this circumscribes the brain rather than eliminating *all* the blank space; there is still blank space in the corners because PyTorch needs a cuboid input.

In [None]:
def _circumscriber(img: np.array) -> np.array:
    #First is vertical, second is horizontal, third is slices
    vmin = 0
    vlimit = img.shape[0]
    hmin = 0
    hlimit = img.shape[1]
    
    for i in range(vlimit):
        if np.max(img[i, :, :]) == 0:
            vmin += 1
        else:
            break
    vmax = vmin + 1
    for i in range(vmin+1, vlimit):
        if np.max(img[i, :, :]) > 0:
            vmax += 1
        else:
            break
    
    for j in range(hlimit):
        if np.max(img[:, j, :]) == 0:
            hmin += 1
        else:
            break
    hmax = hmin + 1
    for j in range(hmin+1, hlimit):
        if np.max(img[:, j, :]) > 0:
            hmax += 1
        else:
            break
    return img[vmin:vmax, hmin:hmax, :]

In [None]:
def load_FULL_brain(scan_id, split = 'train', modality='FLAIR'):
    """
    Load every slice of one modality for one patient as a single 3D uint8 array.

    Slices are read from '<train_path>/<split>/<scan_id>/<modality>/*.dcm',
    ordered by DICOM tag (0020,1041), converted with _dicom2array, stripped of
    all-zero slices, stacked, and cropped with _circumscriber.

    Args:
        scan_id: Name of the patient directory, e.g. '00000'.
        split: 'train' or 'test'. Any other value prints a notice and falls
            back to 'train'.
        modality: 'FLAIR', 'T1w', 'T1wCE' or 'T2w'. Any other value prints a
            notice and falls back to 'FLAIR'.

    Returns:
        3D np.ndarray. The stacked (slice, axis 0, axis 1) array is reversed
        with ``.T``, so the result is indexed (axis 1 of a slice, axis 0 of a
        slice, slice number).

    Raises:
        FileNotFoundError: no .dcm file exists for the requested scan.
        ValueError: every slice of the scan is blank.
    """
    if split not in ('train', 'test'):
        print('Please request a valid split: train or test.  Defaulting to train.')
        split = "train"

    if modality not in ('FLAIR', 'T1w', 'T1wCE', 'T2w'):
        print('Please select an appropriate modality: FLAIR, T1w, T1wCE, or T2w')
        print('Defaulting to FLAIR')
        modality = 'FLAIR'

    slice_dir = f'{train_path}/{split}/{scan_id}/{modality}'
    paths = sorted(glob.glob(f'{slice_dir}/*.dcm'))
    if not paths:
        # Without this the failure shows up later as an opaque IndexError.
        raise FileNotFoundError(f'No DICOM files found in {slice_dir}')

    # Only the header is needed to order the slices, so skip the pixel data on
    # this pass; _dicom2array reads the full file afterwards.
    located = [
        (pydicom.dcmread(p, stop_before_pixels=True)[('0020', '1041')].value, p)
        for p in paths
    ]
    # Stable sort on the location only: ties keep the filename order from above.
    located.sort(key=itemgetter(0))

    slices = (_dicom2array(p) for _, p in located)
    kept = [s for s in slices if np.max(s) > 0]
    if not kept:
        raise ValueError(f'Every slice in {slice_dir} is blank')

    good_images = np.array(kept).T
    return _circumscriber(good_images)

In [None]:
def plot_imgs(imgs, cols=4, size=7, is_rgb=True, title="", cmap='gray', img_size=(512,512)):
    """Show every slice of a (height, width, n_slices) stack in a grid.

    Args:
        imgs: Array whose last axis indexes the slices to draw. A single 2D
            image is accepted and drawn as one panel.
        cols: Number of grid columns.
        size: Edge length, in inches, of each grid cell.
        is_rgb: Unused; kept so existing calls keep working.
        title: Figure title.
        cmap: Matplotlib colormap passed to ``imshow``.
        img_size: Unused; kept so existing calls keep working.
    """
    if imgs.ndim == 2:
        imgs = imgs[:, :, None]

    # The grid is sized from the number of slices (last axis); len(imgs) would
    # be the image height and produce a figure with far too many rows.
    n_slices = imgs.shape[2]
    rows = max(1, -(-n_slices // cols))  # ceiling division

    fig = plt.figure(figsize=(cols*size, rows*size))
    for i in range(n_slices):
        ax = fig.add_subplot(rows, cols, i+1)
        ax.imshow(imgs[:, :, i], cmap=cmap)
    fig.suptitle(title)
    plt.show()

In [None]:
# Load patient 00000's FLAIR series from the train split as one cropped 3D array
# (slices along the last axis).
boxtest = load_FULL_brain('00000', split='train', modality='FLAIR')

Let's have a look at four slices from the middle of patient 00000's FLAIR brain image.

In [None]:
# Pick the four slices around the midpoint of the volume instead of fixed
# indices, so the cell still shows "the middle" whatever the slice count is.
mid_slice = boxtest.shape[2] // 2
first_slice = max(mid_slice - 2, 0)
plot_imgs(boxtest[:, :, first_slice:first_slice + 4])

In [None]:
class BrainLoader(Dataset):
    """Dataset yielding one full 3D brain volume (and its MGMT label) per patient.

    Patient ids are the sorted entry names under '<path>/<split>/'. For
    split='train' the first ``1 - val_split`` fraction of those ids is kept;
    for split='val' the remaining ids are kept. Both are taken from the
    'train' directory, since there is no separate 'val' directory to list.
    Any other split value (e.g. 'test') keeps every id found.

    NOTE: volumes are loaded through load_FULL_brain, which reads from the
    module-level ``train_path`` rather than from ``path``; ``path`` is used
    here for the label file and for listing patient ids.

    Args:
        label_file: CSV file name, relative to ``path``, with the columns
            'BraTS21ID' and 'MGMT_value'.
        path: Root directory holding the label file and the split directories.
        split: 'train', 'val' or 'test'.
        modality: Modality name forwarded to load_FULL_brain.
        val_split: Fraction of the train ids reserved for validation.
    """

    def __init__(self, label_file, path, split, modality, val_split=0.25):
        train_data = pd.read_csv(os.path.join(path, label_file))
        self.path = path
        # Ids are zero-padded to five characters to match the directory names.
        self.labels = {
            str(brats_id).zfill(5): mgmt
            for brats_id, mgmt in zip(train_data['BraTS21ID'], train_data['MGMT_value'])
        }

        self.split = split
        self.modality = modality
        # Validation patients live in the 'train' directory on disk.
        self.disk_split = 'train' if split == 'val' else split
        # Built once here instead of on every __getitem__ call.
        self.transform = transforms.Compose([transforms.ToTensor()])

        patient_dirs = sorted(glob.glob(f'{path}/{self.disk_split}/*'))
        self.ids = [os.path.basename(d) for d in patient_dirs]
        stop = int(len(self.ids) * (1 - val_split))
        if split == 'train':
            self.ids = self.ids[:stop]
        elif split == 'val':
            self.ids = self.ids[stop:]

    def __len__(self):
        """Number of patients in this split."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``(volume, label)`` for the patient at ``idx``.

        For split='test' only the volume is returned. The volume is the float32
        tensor produced by the transform; the label is a long scalar tensor.
        """
        p_id = self.ids[idx]
        volume = load_FULL_brain(p_id, split=self.disk_split, modality=self.modality)
        # The transform already returns a tensor; as_tensor fixes the dtype
        # without re-wrapping it via torch.tensor (extra copy + UserWarning).
        volume = torch.as_tensor(self.transform(volume), dtype=torch.float32)

        if self.split == 'test':
            return volume
        label = torch.tensor(self.labels[p_id], dtype=torch.long)
        return volume, label

Finally, let's test out BrainLoader.

In [None]:
# Smoke test: build the T1w training dataset, wrap it in a shuffling loader,
# and report shape and value statistics for the first batch only.
train_bs = 1
train_dataset = BrainLoader(
    'train_labels.csv',
    train_path,
    split='train',
    modality='T1w',
)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=train_bs)

for img, label in train_loader:
    # One value per line, in the order: image shape, min, mean, max, label shape.
    print(
        'Iteration',
        img.shape,
        img.min(),
        img.mean(),
        img.max(),
        label.shape,
        sep='\n',
    )
    break

This should give anyone intending to use 3D CNNs in PyTorch a head-start.