In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
"""
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
"""

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import pydicom
import glob
from datetime import datetime

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers

from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.metrics import log_loss

import cv2

In [None]:
# Location of the competition data and the case IDs that are dropped from training
root_dir = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/'
to_exclude = [109, 123, 709]

# Load the training labels, then keep only the rows whose ID is not excluded
data = pd.read_csv(root_dir + 'train_labels.csv')
data = data.loc[~data['BraTS21ID'].isin(to_exclude)]

In [None]:
# Size of the training set and how many rows carry each MGMT label
num_samples = len(data)
num_positives = (data['MGMT_value'] == 1).sum()
num_negatives = (data['MGMT_value'] == 0).sum()

# Histogram of the label column
data.hist(column="MGMT_value")

In [None]:
# Report the counts computed in the previous cell
print(f"Number of samples: {num_samples}")
print(f"Number of positive labels: {num_positives}")
print(f"Number of negative labels: {num_negatives}")

# Baseline AUC (always predict the most common class)

In [None]:
# Score a constant prediction of 1 for every sample against the true labels
y_true = data['MGMT_value'].values
y_pred = np.ones(num_samples, dtype=int)
baseline_auc = roc_auc_score(y_true, y_pred)

In [None]:
# Show the baseline AUC as this cell's output
baseline_auc

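An AUC of 0.5 makes sense here: a constant prediction cannot rank any positive sample above any negative one, so it scores at chance level. (The two classes are also roughly balanced.)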

In [None]:
def full_ids(data):
    '''
    Returns the given ID as a string left-padded with zeros to five characters.

    data: the ID to format (anything accepted by str(), e.g. an int)

    IDs whose string form already has five or more characters are returned
    unchanged (previously this case raised UnboundLocalError because the
    prefix was only defined when padding was needed).
    '''
    return str(data).rjust(5, '0')

In [None]:
data['BraTS21ID_full'] = data['BraTS21ID'].apply(full_ids)

# Add all the paths to the df for easy access:
# one column per MRI type, pointing at <root_dir>train/<padded id>/<folder>/
for column, folder in (('flair', 'FLAIR'), ('t1w', 'T1w'), ('t1wce', 'T1wCE'), ('t2w', 'T2w')):
    data[column] = root_dir + 'train/' + data['BraTS21ID_full'] + '/' + folder + '/'
data

In [None]:
# Build the same frame for the test cases, starting from the sample submission
test_data = pd.read_csv(root_dir + 'sample_submission.csv')
test_data['BraTS21ID_full'] = test_data['BraTS21ID'].apply(full_ids)

# One column per MRI type, pointing at <root_dir>test/<padded id>/<folder>/
for column, folder in (('flair', 'FLAIR'), ('t1w', 'T1w'), ('t1wce', 'T1wCE'), ('t2w', 'T2w')):
    test_data[column] = root_dir + 'test/' + test_data['BraTS21ID_full'] + '/' + folder + '/'
test_data

In [None]:
def get_image(data):
    '''
    Returns the image data as a numpy array of dtype uint8.

    data: object exposing a `pixel_array` attribute (this notebook passes
          the result of pydicom.dcmread)

    Slices with a non-zero maximum are rescaled so that their brightest pixel
    maps to 255. Slices whose maximum pixel value is 0 are treated as blank
    and returned as all zeros, in uint8 like every other slice (previously
    they were returned in the raw pixel dtype).
    '''
    pixels = data.pixel_array  # read once instead of on every use
    peak = np.max(pixels)

    if peak == 0:
        # Nothing to rescale, and dividing by zero must be avoided
        return np.zeros(pixels.shape, dtype=np.uint8)

    return (pixels / peak * 255).astype(np.uint8)

In [None]:
# Path to one DICOM slice, read and displayed in the next cell
test_mri_image_data = '/kaggle/input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/T1w/Image-25.dcm'

In [None]:
# Read the sample DICOM file, convert it with get_image and show it in grayscale
img_data = pydicom.dcmread(test_mri_image_data)
img = get_image(img_data)
plt.imshow(img, cmap='gray')

In [None]:
def sorted_image_dirs(path: str, sort=True):
    '''
    Lists everything matching `path*`, optionally ordered by image number.

    The image number is read from the last path component: the text after its
    final '-' and before the following '.', converted to int.

    path: prefix handed to glob with '*' appended
    sort: when False the glob result is returned in the order glob produced it
    '''
    def image_number(file_path):
        file_name = file_path.rsplit('/', 1)[-1]
        numbered_part = file_name.rsplit('-', 1)[-1]
        return int(numbered_part.partition('.')[0])

    matches = glob.glob(path + '*')
    return sorted(matches, key=image_number) if sort else matches


def get_all_images(path: str, sort=True):
    '''
    Returns a list with the (non blank) images found under a given path.

    Every file returned by sorted_image_dirs(path, sort) is read with pydicom
    and converted with get_image; images whose maximum value is 0 are dropped.
    '''
    slices = (get_image(pydicom.dcmread(file_path))
              for file_path in sorted_image_dirs(path, sort))

    # Exclude the blank images
    return [img for img in slices if np.max(img) != 0]

def show_animation(images: list):
    '''
    Displays an animation from the list of images.

    set: matplotlib.rcParams['animation.html'] = 'jshtml'

    images: non-empty list of 2-D image arrays; the first one initialises the
            plot and each frame i then shows images[i]

    Returns the matplotlib FuncAnimation object (one frame per image, 20 ms
    between frames).
    '''
    # `import matplotlib` / `import matplotlib.pyplot` do not guarantee that the
    # animation submodule is loaded, so import it explicitly here.
    from matplotlib import animation

    fig = plt.figure(figsize=(6, 6))
    plt.axis('off')
    im = plt.imshow(images[0], cmap='gray')

    def animate_func(i):
        # Swap the displayed pixels for frame i
        im.set_array(images[i])
        return [im]

    return animation.FuncAnimation(fig, animate_func, frames=len(images), interval=20)

In [None]:
"""
patient = 10

flair_images = get_all_images(data['flair'][patient])
print('No of images:', len(flair_images))
print('MGMT: ', data['MGMT_value'][patient])

fig = plt.figure(figsize=(30,30))

c = 1
for image in flair_images:
    ax = fig.add_subplot(len(flair_images)//10+1, 10, c)
    ax.imshow(image, cmap='gray')
    c+=1
    
    plt.axis('off')
    
fig.tight_layout()
"""

In [None]:
# Integers in 0..584 that do not appear in data's index.
# NOTE(review): 585 is presumably the row count of train_labels.csv before the
# excluded cases were dropped — confirm.
excluded_indexes = np.setdiff1d(list(range(585)), list(data.index))

In [None]:
"""
pd.set_option('mode.chained_assignment', None)
flattened_image_df = data.copy(deep=True)
shape_df = data.copy(deep=True)
MRI_types = ['flair', 't1w', 't1wce', 't2w']
start = datetime.now() 
for img_type in MRI_types:
    for patient_id in np.setdiff1d(list(range(585)), excluded_indexes):
        images = get_all_images(data[img_type][patient_id], sort=False)
        shape_df[img_type][patient_id] = images[0].shape
        
        images = np.array(images)
        flattened_image = np.mean(images, axis=0)
        flattened_image_df[img_type][patient_id] = flattened_image
        
        if patient_id % 40 == 0:
            print(str(img_type) + " " + str(patient_id))
             
end = datetime.now()
duration = end - start
seconds_elapsed = duration.total_seconds()
print("Time elapsed: " + str(seconds_elapsed))

shape_df.to_pickle("/kaggle/working/input_image_shapes.p")
flattened_image_df.to_pickle("/kaggle/working/input_flattened_images.p")
"""

In [None]:
"""
pd.set_option('mode.chained_assignment', None)
flattened_image_df = test_data.copy(deep=True)
MRI_types = ['flair', 't1w', 't1wce', 't2w']
start = datetime.now() 
for img_type in MRI_types:
    for patient_id in list(test_data.index):
        images = get_all_images(test_data[img_type][patient_id], sort=False)
        images = np.array(images)
        flattened_image = np.mean(images, axis=0)
        flattened_image_df[img_type][patient_id] = flattened_image
        
        if patient_id % 10 == 0:
            print(str(img_type) + " " + str(patient_id))
             
end = datetime.now()
duration = end - start
seconds_elapsed = duration.total_seconds()
print("Time elapsed: " + str(seconds_elapsed))

#flattened_image_df.to_pickle("/kaggle/working/input_flattened_images.p")
"""

In [None]:
#shape_df = pd.read_pickle('/kaggle/input/brain-tumour-image-data-zip/input_image_shapes.p')
#flattened_image_df = pd.read_pickle('/kaggle/input/brain-tumour-image-data-zip/input_flattened_images.p')

# First Model

In [None]:
#from tensorflow.keras.applications.vgg16 import VGG16
#from tensorflow.keras.applications.inception_v3 import InceptionV3
#from tensorflow.keras.applications import ResNet50

In [None]:
# Report whether TensorFlow can see a GPU. The two messages are mutually
# exclusive (previously "Found GPU at:" was printed even when none was found).
device_name = tf.test.gpu_device_name()
if "GPU" in device_name:
    print('Found GPU at: {}'.format(device_name))
else:
    print("GPU device not found")

In [None]:
# Image size used by compute_embeddings when it allocates the model input array
width = 256
height = 256

In [None]:
pd.set_option('mode.chained_assignment', None)
def compute_embeddings(mri_type, batch_size):
    '''
    Replaces every image in flattened_image_df[mri_type] with its embedding.

    Each image is copied into the three channels of the model input, passed
    through ResNet_model, and the flattened output for row i is written back
    to row i of flattened_image_df (modified in place).

    mri_type:   one of 'flair', 't1w', 't1wce', 't2w'
    batch_size: number of images sent to the model; must equal the number of
                rows in flattened_image_df

    Relies on the notebook-level names flattened_image_df, ResNet_model,
    width and height. Returns None.
    '''
    assert mri_type in ['flair', 't1w', 't1wce', 't2w']
    # Output row i is written back to frame row i, so the batch has to cover the
    # frame exactly. Checking up front avoids leaving the frame half-updated.
    assert batch_size == len(flattened_image_df)

    model_input = np.zeros((batch_size, width, height, 3))
    raw_data = flattened_image_df[mri_type].values
    for index in range(batch_size):
        # Broadcast the single-channel image across the three channels
        model_input[index] = np.asarray(raw_data[index])[:, :, np.newaxis]

    image_embedding = ResNet_model.predict(model_input)
    image_embedding = image_embedding.reshape([batch_size, -1])

    # Write with .at rather than chained indexing (df[col][row] = ...), which is
    # not guaranteed to modify the original frame.
    for patient_id, embedding in zip(flattened_image_df.index, image_embedding):
        flattened_image_df.at[patient_id, mri_type] = embedding

In [None]:
"""
#ResNet_model = ResNet50(input_shape=(width, height, 3), include_top=False, weights="imagenet")
#ResNet_model = keras.models.load_model("/kaggle/input/resnet50-weights/ResNet50.h5")
flattened_image_df['flair'] = flattened_image_df['flair'].apply(lambda x: cv2.resize(x, (width, height)))
flattened_image_df['t1w'] = flattened_image_df['t1w'].apply(lambda x: cv2.resize(x, (width, height)))
flattened_image_df['t1wce'] = flattened_image_df['t1wce'].apply(lambda x: cv2.resize(x, (width, height)))
flattened_image_df['t2w'] = flattened_image_df['t2w'].apply(lambda x: cv2.resize(x, (width, height)))

for mri_type in ['flair', 't1w', 't1wce', 't2w']:
    compute_embeddings(mri_type, flattened_image_df.shape[0])
y = flattened_image_df['MGMT_value'].values.reshape((-1,1))
X = np.zeros((y.shape[0], len(flattened_image_df['flair'][0]) * 4))

for index in range(y.shape[0]):
    data = flattened_image_df.iloc[index]
    features = np.concatenate((data['flair'], data['t1w'], data['t1wce'], data['t2w']))
    X[index, :] = features
    
X_test = X
#np.save("/kaggle/working/y.npy", y)
#np.save("/kaggle/working/X_test.npy", X)
"""

In [None]:
# Load the cached labels and feature matrices from the attached dataset
features_dir = "/kaggle/input/d/nicholasjohnson2020/brain-tumour-features-resnet50-256/256_ResNet50_features/"
y = np.load(features_dir + "y.npy")
X = np.load(features_dir + "X.npy")
X_test = np.load(features_dir + "X_test.npy")

In [None]:
# Reduce the features to 100 principal components. The projection is fitted on
# the training features only and then applied unchanged to the test features.
# random_state is fixed (same seed as the training cell) so the result is
# reproducible when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=100, random_state=1010)
X_trim = pca.fit_transform(X)
X_test_trim = pca.transform(X_test)

In [None]:
# Shapes before and after PCA, for the training and the test features
for feature_matrix in (X, X_trim, X_test, X_test_trim):
    print(feature_matrix.shape)

In [None]:
def plot_hist(hist, last=None):
    '''
    Plots the training and validation loss stored in a training history.

    hist: object whose `history` dict holds the "loss" and "val_loss" lists
          (this notebook passes the value returned by model.fit)
    last: how many of the most recent epochs to plot; None plots all of them
    '''
    if last is None:
        last = len(hist.history["loss"])
    plt.plot(hist.history["loss"][-last:])
    plt.plot(hist.history["val_loss"][-last:])
    # The curves are losses, so label them as such (they used to say "accuracy")
    plt.title("model loss")
    plt.ylabel("loss")
    plt.xlabel("epoch")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()

In [None]:
# Keep a second reference to the feature matrix currently bound to X, so it
# stays reachable if X is rebound later
saved_X = X

In [None]:
def l3_res_model(input_shape, no_classes, lr):
    '''
    Builds and compiles a small fully connected Keras model.

    Three Dense(128, sigmoid) -> BatchNormalization -> Dropout(0.2) stages are
    chained; the outputs of all three stages are added together and fed to a
    sigmoid Dense layer with `no_classes` units.

    input_shape: shape of one input sample
    no_classes:  number of output units
    lr:          learning rate for the Adam optimizer

    Returns the model compiled with binary cross-entropy as loss and metric.
    '''
    inputs = tf.keras.Input(shape=input_shape)

    stage_outputs = []
    current = inputs
    for _ in range(3):
        current = layers.Dense(128, activation='sigmoid')(current)
        current = layers.BatchNormalization()(current)
        current = layers.Dropout(0.2)(current)
        stage_outputs.append(current)

    # Sum the three stage outputs before the final classification layer
    summed = tf.keras.layers.add(stage_outputs)
    outputs = layers.Dense(no_classes, activation='sigmoid')(summed)

    model = tf.keras.Model(inputs, outputs)
    optimizer = tf.keras.optimizers.Adam(learning_rate = lr)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['binary_crossentropy'])
    return model

In [None]:
# 10-fold cross-validation of the network on the PCA-reduced features.
# X_trim is used directly and the held-out fold is named *_val, so the
# notebook-level X, X_test and y_test are no longer overwritten by this cell
# (overwriting them made re-running the PCA cell fail or give wrong results).
input_dim = X_trim.shape[1]

losses_NN = []
auc_NN = []
kf = KFold(n_splits=10)
tf.random.set_seed(1010)
np.random.seed(1010)

for train_index, val_index in kf.split(X_trim):
    X_train, X_val = X_trim[train_index], X_trim[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # A fresh model per fold, trained with the held-out fold as validation data
    nnclf = l3_res_model((input_dim,), 1, 0.00001)
    hist = nnclf.fit(X_train, y_train, batch_size=64, epochs=50, validation_data=(X_val, y_val), verbose=0)
    plot_hist(hist, last=20)

    preds = nnclf.predict(X_val) # list of preds per class

    loss = log_loss(np.ravel(y_val), np.ravel(preds))
    auc = roc_auc_score(y_val[:, 0], preds[:, 0])
    print('Loss: '+str(loss))
    print('AUC: '+str(auc))
    losses_NN.append(loss)
    auc_NN.append(auc)

print('Average Loss: '+str(np.average(losses_NN)))
print('Average AUC: '+str(np.average(auc_NN)))

In [None]:
# Fit the final model on every training sample, using the PCA-reduced features.
# X_trim is passed directly instead of rebinding X to it, so X keeps its
# original binding and the PCA cell stays safe to re-run.
input_dim = X_trim.shape[1]
week1_model = l3_res_model((input_dim,), 1, 0.00001)
week1_model.fit(X_trim, y, batch_size=16, epochs=50, verbose=0)

In [None]:
# Predict on the PCA-reduced test features with the model fitted in the previous cell
preds = week1_model.predict(X_test_trim)

In [None]:
# Fill the sample submission with the predictions and write it out.
submission_file = pd.read_csv("/kaggle/input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv")
# IDs are written as zero-padded five-character strings
submission_file["BraTS21ID"] = submission_file['BraTS21ID'].apply(full_ids)
# Flatten the predictions to 1-D so the column assignment does not depend on
# how pandas handles a 2-D value
submission_file["MGMT_value"] = np.ravel(preds)
submission_file.to_csv("submission.csv", index=False)

# Miscellaneous Prototyping

In [None]:
"""
MRI_types = ['flair', 't1w', 't1wce', 't2w']
for mri_type in MRI_types:
    model_input = np.zeros((num_patients, img_width, img_height, 3))
        for patient_id in range(num_patients):
            image_array = get_all_images(data[image_type][patient_id], sort=False)
            image_array = np.array(image_array)
            print("Patient " + str(patient_id) + " data shape: " + str(image_array.shape))
            flattened_image = np.mean(image_array, axis=0)
            
            model_input[patient_id, :, :, 0] = flattened_image
            model_input[patient_id, :, :, 1] = flattened_image
            model_input[patient_id, :, :, 2] = flattened_image
            
        image_embedding = embedding_model.predict(model_input)
        image_embedding = image_embedding.reshape([num_patients, -1])
        
        for patient_id in range(num_patients):
            output_df[image_type][patient_id] = image_embedding[patient_id, :]
"""

In [None]:
"""
flair_count = shape_df['flair'].value_counts()
t1w_count = shape_df['t1w'].value_counts()
t1wce_count = shape_df['t1wce'].value_counts()
t2w_count = shape_df['t2w'].value_counts()

flair_shapes = list(flair_count.index)
t1w_shapes = list(t1w_count.index)
t1wce_shapes = list(t1wce_count.index)
t2w_shapes = list(t2w_count.index)
unique_shapes = list(set(flair_shapes + t1w_shapes + t1wce_shapes + t2w_shapes))
shape_counts = {}
for shape in unique_shapes:
    shape_counts[shape] = 0
for shape in flair_shapes:
    shape_counts[shape] += flair_count[shape]
for shape in t1w_shapes:
    shape_counts[shape] += t1w_count[shape]
for shape in t1wce_shapes:
    shape_counts[shape] += t1wce_count[shape]
for shape in t2w_shapes:
    shape_counts[shape] += t2w_count[shape]
"""

In [None]:
"""
def compute_embeddings(mri_types, batch_size, output_df, img_width, img_height):

    model_input = np.zeros((batch_size, img_width, img_height, 3))
    temp_legend = {}
    entry_index = 0
    for mri_type in mri_types:
        for patient_id in np.setdiff1d(list(range(585)), excluded_indexes):
            if shape_df[mri_type][patient_id] == (img_width, img_height):
                temp_legend[entry_index] = (mri_type, patient_id)
                model_input[entry_index, :, :, 0] = flattened_image_df[mri_type][patient_id]
                model_input[entry_index, :, :, 1] = flattened_image_df[mri_type][patient_id]
                model_input[entry_index, :, :, 2] = flattened_image_df[mri_type][patient_id]
                entry_index += 1
    assert entry_index == batch_size            

    ResNet_model = ResNet50(input_shape=(img_width, img_height,3), include_top=False, weights="imagenet")
    image_embedding = ResNet_model.predict(model_input)
    image_embedding = image_embedding.reshape([batch_size, -1])
    for entry_index in range(batch_size):
        (mri_type, patient_id) = temp_legend[entry_index]
        output_df[mri_type][patient_id] = image_embedding[entry_index, :]

    return

start = datetime.now()      
MRI_types = ['flair', 't1w', 't1wce', 't2w']
pd.set_option('mode.chained_assignment', None)
#embedding_df = flattened_image_df.copy(deep=True)
embedding_df = flattened_image_df

for (img_width, img_height) in unique_shapes:
    if (img_width, img_height) == (512, 512):
        continue
    compute_embeddings(MRI_types, shape_counts[(img_width, img_height)], embedding_df,
                       img_width, img_height)
    print("Finished " + str((img_width, img_height)))


    
compute_embeddings(['flair', 't1w'], flair_count[(512, 512)] + t1w_count[(512, 512)],
                   embedding_df, 512, 512)
compute_embeddings(["t1wce", 't2w'], t1wce_count[(512, 512)] + t2w_count[(512, 512)],
                   embedding_df, 512, 512)

end = datetime.now()
duration = end - start
seconds_elapsed = duration.total_seconds()
print("Time elapsed: " + str(seconds_elapsed))

embedding_df.to_pickle("/kaggle/working/embeddeding_512.p")
"""

In [None]:
"""
def transform_patient_data_v1(embedding_model, num_patients=num_samples,
                              img_width=512, img_height=512):
    
    pd.set_option('mode.chained_assignment', None)
    output_df = data.copy(deep=True)
    
    MRI_types = ['flair', 't1w', 't1wce', 't2w']
    for image_type in MRI_types:
        model_input = np.zeros((num_patients, img_width, img_height, 3))
        for patient_id in range(num_patients):
            image_array = get_all_images(data[image_type][patient_id], sort=False)
            image_array = np.array(image_array)
            print("Patient " + str(patient_id) + " data shape: " + str(image_array.shape))
            flattened_image = np.mean(image_array, axis=0)
            
            model_input[patient_id, :, :, 0] = flattened_image
            model_input[patient_id, :, :, 1] = flattened_image
            model_input[patient_id, :, :, 2] = flattened_image
            
        image_embedding = embedding_model.predict(model_input)
        image_embedding = image_embedding.reshape([num_patients, -1])
        
        for patient_id in range(num_patients):
            output_df[image_type][patient_id] = image_embedding[patient_id, :]
            
    return output_df
    
def transform_patient_data_v2(patient_id, embedding_model):
    
    MRI_types = ['flair', 't1w', 't1wce', 't2w']
    output_dict = {}
    
    for image_type in MRI_types:
        image_array = get_all_images(data[image_type][patient_id])
        model_input = np.zeros((len(image_array), image_array[0].shape[0],
                                image_array[0].shape[1], 3))
        for i in range(model_input.shape[0]):
            model_input[i, :, :, 0] = image_array[i]
            model_input[i, :, :, 1] = image_array[i]
            model_input[i, :, :, 2] = image_array[i]
        
        image_embedding = embedding_model.predict(model_input)
        image_embedding = image_embedding.reshape([image_embedding.shape[0], -1])
        image_embedding = np.mean(image_embedding, axis=0)
        
        output_dict[image_type] = image_embedding

    return output_dict
"""

In [None]:
#scan_width = 512
#scan_height = 512

In [None]:
#VGG_model = VGG16(input_shape = (scan_width, scan_height, 3), include_top = False, weights = 'imagenet')
#Inception_model = InceptionV3(input_shape = (scan_width, scan_height, 3), include_top = False, weights = 'imagenet')
#ResNet_model = ResNet50(input_shape=(scan_width, scan_height,3), include_top=False, weights="imagenet")

In [None]:
#start = datetime.now()      
#VGG_embeddings = transform_patient_data_v2(0, VGG_model)
#end = datetime.now()
#duration = end - start
#seconds_elapsed = duration.total_seconds()
#print("Time elapsed: " + str(seconds_elapsed))