# Import packages

In [None]:
# !pip install imutils
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import pydicom as dicom
import matplotlib.pylab as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from skimage import measure
# ipywidgets for some interactive plots
from ipywidgets.widgets import * 
import ipywidgets as widgets
from PIL import Image
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.vgg16 import VGG16, preprocess_input
from keras import layers
from keras.models import Model, Sequential
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping
import plotly.express as px
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten
from keras.utils import np_utils
from sklearn.metrics import accuracy_score
import gc

# Load data

In [None]:
sample_submission_file = pd.read_csv('../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv')
train_data_labels = pd.read_csv('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv')

# EDA

In [None]:
print('Shape of train data:', train_data_labels.shape)
train_data_labels.head()

In [None]:
print('Number of image folders:', len(os.listdir('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train')))

In [None]:
set1 = set(os.listdir('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train'))
set2 = set(train_data_labels['BraTS21ID'].unique())
print('Intersection between IDs in train data labels file and train data folders:', len(set1.intersection(set2)))

In [None]:
# Convert folder IDs to numbers
set1 = set([int(x) for x in list(set1)])
print('Intersection between IDs in train data labels file and train data folders:', len(set1.intersection(set2)))

In [None]:
print('Shape of test data:', sample_submission_file.shape)
sample_submission_file.head()

In [None]:
# FLAIR
# specify your image path
# using images of a patient with brain tumor
image_path = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/FLAIR/Image-101.dcm'
ds = dicom.dcmread(image_path)

plt.imshow(ds.pixel_array)

In [None]:
# T1w
image_path = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/T1w/Image-11.dcm'
ds = dicom.dcmread(image_path)

plt.imshow(ds.pixel_array)

In [None]:
# T1wCE
image_path = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/T1wCE/Image-101.dcm'
ds = dicom.dcmread(image_path)

plt.imshow(ds.pixel_array)

In [None]:
# T2w
image_path = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/T2w/Image-101.dcm'
ds = dicom.dcmread(image_path)

img = ds.pixel_array

plt.imshow(img)

In [None]:
ds.pixel_array

In [None]:
ds

In [None]:
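# The unit of measurement in CT scans is the Hounsfield Unit (HU), a measure of radiodensity that CT scanners are carefully calibrated to report.
# NOTE: the FLAIR/T1w/T1wCE/T2w series loaded in this notebook are MRI sequences, so the "HU" values computed below are simply rescaled pixel intensities.
# Load the scans in the given folder path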
def load_scan(path):
    """Load every DICOM file in ``path`` and return the slices ordered along z.

    Each file is read with ``dicom.dcmread`` and the datasets are sorted by the
    third component of ``ImagePositionPatient``. The distance between the first
    two sorted slices is then stored on every slice as ``SliceThickness``.

    Args:
        path: Directory that contains the DICOM files of a single series.

    Returns:
        list: The datasets sorted by ``ImagePositionPatient[2]``. When the
        folder holds fewer than two slices no spacing can be derived, so the
        datasets are returned with their ``SliceThickness`` left untouched.
    """
    slices = [dicom.dcmread(os.path.join(path, name)) for name in os.listdir(path)]
    # Sort by patient's position while the scan was taken
    slices.sort(key=lambda s: float(s.ImagePositionPatient[2]))

    # The spacing below needs two neighbouring slices.
    if len(slices) < 2:
        return slices

    try:
        slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except (AttributeError, IndexError, TypeError):
        # Fall back to SliceLocation when the position tag cannot be used.
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)

    for s in slices:
        s.SliceThickness = slice_thickness
    return slices

In [None]:
def get_pixels_hu(slices):
    """Stack the slices into one int16 volume and apply each slice's rescale.

    For every slice the pixel data is multiplied by ``RescaleSlope`` (only when
    the slope differs from 1) and ``RescaleIntercept`` is added afterwards.

    Args:
        slices: Sequence of objects exposing ``pixel_array``,
            ``RescaleSlope`` and ``RescaleIntercept``.

    Returns:
        numpy.ndarray: int16 array with one entry per slice along axis 0.
    """
    # Convert to int16 (the source is sometimes uint16); the original author
    # notes the values should always be low enough (<32k) for this to be safe.
    volume = np.stack([s.pixel_array for s in slices]).astype(np.int16)

    # Pixels stored as -2000 are treated as outside-of-scan and set to 0.
    volume[volume == -2000] = 0

    # Apply the per-slice linear rescale (slope first, then intercept).
    for idx, current in enumerate(slices):
        slope = current.RescaleSlope
        intercept = current.RescaleIntercept

        if slope != 1:
            scaled = slope * volume[idx].astype(np.float64)
            # Writing into the int16 volume casts the float result back to int16.
            volume[idx] = scaled

        volume[idx] += np.int16(intercept)

    return np.array(volume, dtype=np.int16)

In [None]:
path = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/FLAIR/'

# Read and convert the series once. Previously this happened inside the
# callback, so every slider movement re-read all files from disk.
_animation_pixels = get_pixels_hu(load_scan(path))

# slide through dicom images using a slide bar
plt.figure(1)
def dicom_animation(x):
    """Plot the intensity histogram of the series and show slice ``x``.

    Args:
        x: Index of the slice to display (driven by the slider).

    Returns:
        The same index ``x``.
    """
    plt.hist(_animation_pixels.flatten(), bins=80, color='c')
    plt.xlabel("Hounsfield Units (HU)")
    plt.ylabel("Frequency")
    plt.show()

    # Show the selected slice
    # cmap=plt.cm.gray
    plt.imshow(_animation_pixels[x], cmap=plt.cm.bone)
    plt.show()
    return x
# One slider position per loaded slice.
interact(dicom_animation, x=(0, len(_animation_pixels) - 1))

In [None]:
first_patient = load_scan(path)
first_patient_pixels = get_pixels_hu(first_patient)

def sample_stack(stack, rows=6, cols=6, start_with=10, show_every=3):
    """Show a ``rows`` x ``cols`` grid of evenly spaced slices from ``stack``.

    Args:
        stack: Indexable collection of 2-D images (supports ``len``).
        rows: Number of subplot rows.
        cols: Number of subplot columns.
        start_with: Index of the first slice shown.
        show_every: Step between consecutive slices shown.

    Grid cells whose slice index falls beyond the end of ``stack`` are left
    blank instead of raising ``IndexError``.
    """
    # squeeze=False keeps ``ax`` two-dimensional even for a single row/column.
    fig, ax = plt.subplots(rows, cols, figsize=[12, 12], squeeze=False)
    for i in range(rows * cols):
        ind = start_with + i * show_every
        # Row-major placement: the column count (not the row count) decides
        # where a new row starts, so non-square grids are filled correctly.
        cell = ax[i // cols, i % cols]
        cell.axis('off')
        if ind >= len(stack):
            continue
        cell.set_title('slice %d' % ind)
        cell.imshow(stack[ind], cmap='gray')
    plt.show()

sample_stack(first_patient_pixels)

In [None]:
def resize_scan(scan, new_shape):
    """Resample one slice to ``new_shape`` with a Lanczos filter.

    Args:
        scan: 2-D array holding a single slice.
        new_shape: Target size, forwarded unchanged to ``Image.resize``.

    Returns:
        numpy.ndarray: The resampled slice as int16.
    """
    resampled = Image.fromarray(scan).resize(new_shape, resample=Image.LANCZOS)
    # convert back to 16 bit integers
    return np.array(resampled, dtype=np.int16)

In [None]:
def crop_scan(scan, crop_size=512):
    """Cut a centred ``crop_size`` x ``crop_size`` window out of one slice.

    Args:
        scan: 2-D array holding a single slice, indexed as (row, column).
        crop_size: Side length of the square window in pixels.

    Returns:
        numpy.ndarray: The cropped slice as int16.
    """
    # NumPy shapes are (height, width) while the PIL crop box is
    # (left, upper, right, lower) in x/y order, so the horizontal bounds must
    # come from the width and the vertical bounds from the height.
    height, width = scan.shape[0], scan.shape[1]
    left = (width - crop_size) // 2
    top = (height - crop_size) // 2

    img = Image.fromarray(scan)
    img = img.crop((left, top, left + crop_size, top + crop_size))
    # convert back to 16 bit integers
    cropped_scan = np.array(img, dtype=np.int16)
    return cropped_scan

In [None]:
def crop_and_resize(scan, new_shape, crop_size=512):
    """Centre-crop one slice to a square window, then resample it.

    Args:
        scan: 2-D array holding a single slice, indexed as (row, column).
        new_shape: Target size, forwarded unchanged to ``Image.resize``.
        crop_size: Side length of the square crop window in pixels.

    Returns:
        numpy.ndarray: The cropped and resampled slice as int16.
    """
    # NumPy shapes are (height, width) while the PIL crop box is
    # (left, upper, right, lower) in x/y order, so the horizontal bounds must
    # come from the width and the vertical bounds from the height.
    height, width = scan.shape[0], scan.shape[1]
    left = (width - crop_size) // 2
    top = (height - crop_size) // 2

    img = Image.fromarray(scan)
    img = img.crop((left, top, left + crop_size, top + crop_size))
    img = img.resize(new_shape, resample=Image.LANCZOS)

    cropped_resized_scan = np.array(img, dtype=np.int16)
    return cropped_resized_scan

In [None]:
first_patient = load_scan(path)
first_patient_pixels = get_pixels_hu(first_patient)
processed_img = crop_and_resize(first_patient_pixels[242,:,:], new_shape = [512,512])
plt.imshow(processed_img, cmap=plt.cm.bone)
plt.show()

# Create data for modelling

In [None]:
train_data = train_data_labels.copy()

In [None]:
# Placeholder columns for the number of images per modality.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for modality in ['FLAIR', 'T1w', 'T1wCE', 'T2w']:
    train_data[f'# {modality} images'] = np.nan

In [None]:
# Using a subset of the complete train data
num_patients = 30
train_data = train_data.iloc[0:num_patients]

In [None]:
# Removing images with issues
invalid_ids = [109, 123, 709]
print('Train data shape before:', train_data.shape)
train_data = train_data[~train_data['BraTS21ID'].isin(invalid_ids)].reset_index(drop = True)
print('Train data shape after:', train_data.shape)

In [None]:
# Create master data containing FLAIR, T1w, T1wCE, T2w for every patient

flair_dict = {}
T1w_dict = {}
T1wCE_dict = {}
T2w_dict = {}

# Each modality folder name is routed to the dictionary that collects it.
scans_by_type = {'FLAIR': flair_dict, 'T1w': T1w_dict, 'T1wCE': T1wCE_dict, 'T2w': T2w_dict}
train_root = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/'

for i in tqdm(range(0, len(train_data))):
    pat_id = train_data['BraTS21ID'].iloc[i]
    pat_id_for_folder = str(pat_id).rjust(5, "0")

    for image_type, type_dict in scans_by_type.items():
        image_path = train_root + pat_id_for_folder + f'/{image_type}/'
        image_list = os.listdir(image_path)
        # Assign through .loc on the frame itself: the chained
        # `train_data[col].iloc[i] = ...` form may write to a temporary copy.
        train_data.loc[train_data.index[i], f'# {image_type} images'] = len(image_list)

        slices = load_scan(image_path)
        # load_scan returns the slices sorted by position, which is not the
        # os.listdir order, so take each file name from the slice itself.
        slice_names = [os.path.basename(str(s.filename)) for s in slices]
        all_scans_of_patient = get_pixels_hu(slices)

        temp_dict = {}
        for slice_name, scan in zip(slice_names, all_scans_of_patient):
            # Resample into the same shape. np.resize would only repeat or
            # truncate the flattened pixels, scrambling any non-512x512 slice.
            temp_dict[slice_name] = resize_scan(scan, (512, 512))

        type_dict[pat_id] = temp_dict

In [None]:
train_data.head()

In [None]:
grp_data = train_data.groupby(['MGMT_value']).count().reset_index().iloc[:,0:2].rename(columns = {'BraTS21ID':'# Patients'})
fig = px.bar(grp_data, x='MGMT_value', y='# Patients', color = 'MGMT_value')
fig.show()

To convert all slices to a single entry, I am averaging all the scans under the 4 types for each patient. Let's see the created image after averaging.

In [None]:
flair_avg_dict = {}
T1w_avg_dict = {}
T1wCE_avg_dict = {}
T2w_avg_dict = {}

# Pair every per-slice dictionary with the dictionary receiving its mean image.
modality_pairs = [
    (flair_dict, flair_avg_dict),
    (T1w_dict, T1w_avg_dict),
    (T1wCE_dict, T1wCE_avg_dict),
    (T2w_dict, T2w_avg_dict),
]

for i in tqdm(range(0, len(train_data))):
    # Look the patient ID up once instead of once per dictionary access.
    pat_id = train_data['BraTS21ID'].iloc[i]
    for scans_by_patient, avg_by_patient in modality_pairs:
        # Mean over all slices of this patient and modality (axis 0 = slice axis).
        avg_by_patient[pat_id] = np.array(list(scans_by_patient[pat_id].values())).mean(axis = 0)

In [None]:
del flair_dict
del T1w_dict
del T1wCE_dict
del T2w_dict
del all_scans_of_patient

In [None]:
rows = 4
cols = 5
stack = list(flair_avg_dict.values())
stack_ids = list(flair_avg_dict.keys())
pat_with_tumor = train_data[train_data['MGMT_value']==1]['BraTS21ID'].tolist()
# NOTE: the grid is created with `cols` subplot rows and `rows` subplot
# columns (5 x 4); the names are swapped relative to the layout.
fig,ax = plt.subplots(cols,rows,figsize=[12,12], squeeze=False)
# ax.flat walks the grid row by row, matching the former [i // rows, i % rows] indexing.
for i, cell in enumerate(ax.flat):
    cell.axis('off')
    if i >= len(stack):
        # Fewer patients than grid cells: leave the remaining cells blank
        # instead of failing with an IndexError.
        continue
    # NOTE(review): the titles are driven by MGMT_value == 1; confirm that
    # "With tumor"/"No tumor" is the intended reading of that label.
    if stack_ids[i] in pat_with_tumor:
        cell.set_title('With tumor', color = 'red')
    else:
        cell.set_title('No tumor', color = 'green')
    cell.imshow(stack[i],cmap='gray')
plt.show()

Collapsing all slices of a scan into a single averaged image makes it hard to distinguish the tumor.

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(train_data['BraTS21ID'], train_data['MGMT_value'], test_size=0.3, random_state=42, stratify = train_data['MGMT_value'])

In [None]:
train_data['is_train_flag'] = np.where(train_data['BraTS21ID'].isin(X_train), 1, 0)

In [None]:
grp_data = train_data.groupby(['is_train_flag']).agg({'BraTS21ID':'count', 'MGMT_value':'sum'}).reset_index().rename(columns = {'BraTS21ID':'# Patients', 'MGMT_value':'# Patients with brain tumor'})
grp_data['% Patients with tumor'] = grp_data['# Patients with brain tumor']/grp_data['# Patients'] * 100
grp_data

In [None]:
def _stack_split(patient_ids):
    """Build the image array and label vector for one data split.

    The averaged images of the selected patients are stacked modality after
    modality (FLAIR, T1w, T1wCE, T2w), and the labels are repeated four times
    in the same patient order so that images and labels stay aligned.

    Args:
        patient_ids: Iterable of BraTS21ID values belonging to the split.

    Returns:
        tuple: ``(images, labels)`` as NumPy arrays.
    """
    wanted = set(patient_ids)  # built once; O(1) membership tests
    # One explicit patient order shared by every modality and by the labels.
    ordered_ids = [pat_id for pat_id in flair_avg_dict if pat_id in wanted]
    images = np.concatenate([
        np.array([avg_dict[pat_id] for pat_id in ordered_ids])
        for avg_dict in (flair_avg_dict, T1w_avg_dict, T1wCE_avg_dict, T2w_avg_dict)
    ])
    labels_by_id = train_data.set_index('BraTS21ID')['MGMT_value']
    labels = np.tile(labels_by_id.loc[ordered_ids].to_numpy(), 4)
    return images, labels


X_train_for_modelling, y_train_for_modelling = _stack_split(X_train)
X_valid_for_modelling, y_valid_for_modelling = _stack_split(X_valid)

print('X train shape:', X_train_for_modelling.shape)
print('y train shape:', y_train_for_modelling.shape)
print('X validation shape:', X_valid_for_modelling.shape)
print('y validation shape:', y_valid_for_modelling.shape)

In [None]:
# Add a trailing single-channel axis to the 512x512 images and cast them to
# float32 in one step per split.
n_train_samples = X_train_for_modelling.shape[0]
n_valid_samples = X_valid_for_modelling.shape[0]
X_train_for_modelling = X_train_for_modelling.reshape(n_train_samples, 512, 512, 1).astype('float32')
X_valid_for_modelling = X_valid_for_modelling.reshape(n_valid_samples, 512, 512, 1).astype('float32')

# A single output unit is used, so the labels are left as they are; the
# one-hot encoding with keras' numpy-related utilities stays disabled below.
n_classes = 1
# print("Shape before one-hot encoding: ", y_train_for_modelling.shape)
# y_train_for_modelling = np_utils.to_categorical(y_train_for_modelling, n_classes)
# y_test_for_modelling = np_utils.to_categorical(y_test_for_modelling, n_classes)
# print("Shape after one-hot encoding: ", y_train_for_modelling.shape)

print('X train shape:', X_train_for_modelling.shape)
print('y train shape:', y_train_for_modelling.shape)
print('X validation shape:', X_valid_for_modelling.shape)
print('y validation shape:', y_valid_for_modelling.shape)

In [None]:
gc.collect()

# Modelling

In [None]:
# Transfer learning code to be updated
# import keras
# base_model = keras.applications.Xception(
#     weights='imagenet',  # Load weights pre-trained on ImageNet.
#     input_shape=(512, 512, 1),
#     include_top=False)  # Do not include the ImageNet classifier at the top.

In [None]:
# base_model.trainable = False

In [None]:
# inputs = keras.Input(shape=(512, 512, 1))
# # We make sure that the base_model is running in inference mode here,
# # by passing `training=False`. This is important for fine-tuning, as you will
# # learn in a few paragraphs.
# x = base_model(inputs, training=False)
# # Convert features of shape `base_model.output_shape[1:]` to vectors
# x = keras.layers.GlobalAveragePooling2D()(x)
# # A Dense classifier with a single unit (binary classification)
# outputs = keras.layers.Dense(1)(x)
# model = keras.Model(inputs, outputs)

In [None]:
# Linear stack of layers, declared in one list instead of repeated .add() calls:
# convolution -> max pooling -> flatten -> hidden dense layer -> output layer.
model = Sequential([
    Conv2D(25, kernel_size=(3,3), strides=(1,1), activation='relu', input_shape=(512, 512, 1)),
    MaxPool2D(pool_size=(50,50)),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(n_classes, activation='sigmoid'),
])

model.summary()

In [None]:
# The model's output layer already applies a sigmoid, so the loss must treat
# the predictions as probabilities (from_logits=False). Using from_logits=True
# here would apply the sigmoid a second time inside the loss.
model.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=[keras.metrics.BinaryAccuracy()])
model.fit(X_train_for_modelling, y_train_for_modelling, epochs=20, validation_data=(X_valid_for_modelling, y_valid_for_modelling))