In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt

import random
from random import sample
import sklearn.model_selection as skl
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras_preprocessing.image.dataframe_iterator import DataFrameIterator
from tensorflow.keras.layers import GlobalAveragePooling2D, BatchNormalization, Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau

import pydicom
import cv2

In [None]:
# Root folder of the competition data. The trailing slash matters: every path
# in this notebook is built by plain string concatenation onto this value.
root_dir = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/'

# Training labels table; later cells read its BraTS21ID and MGMT_value columns.
df = pd.read_csv(f'{root_dir}train_labels.csv')

In [None]:
# Add the full paths for each id for different types of sequences to the csv 
def full_ids(data):
    """Return ``data`` as a string left-padded with ``'0'`` to five characters.

    Args:
        data: Patient identifier; anything accepted by ``str()`` (the notebook
            passes the integer values of the ``BraTS21ID`` column).

    Returns:
        str: ``str(data)`` padded on the left with zeros up to a width of five.
        Values that are already five or more characters long are returned
        unchanged, so no id is ever truncated.
    """
    # rjust is a no-op for long inputs, so ids with five or more digits no
    # longer hit an unassigned-prefix error.
    return str(data).rjust(5, '0')
        

# Zero-padded id string; it is used as the per-patient folder name in the paths below.
df['BraTS21ID_full'] = df['BraTS21ID'].apply(full_ids)

# One column per MRI sequence holding that patient's training image directory
# (column name -> sequence sub-folder name).
for column, folder in (('flair', 'FLAIR'), ('t1w', 'T1w'), ('t1wce', 'T1wCE'), ('t2w', 'T2w')):
    df[column] = df['BraTS21ID_full'].apply(
        lambda file_id, folder=folder: f'{root_dir}train/{file_id}/{folder}/'
    )

In [None]:
# The sample submission doubles as the list of test patients.
df_test = pd.read_csv(f'{root_dir}sample_submission.csv')

# Zero-padded id string; it is used as the per-patient folder name in the paths below.
df_test['BraTS21ID_full'] = df_test['BraTS21ID'].apply(full_ids)

# One column per MRI sequence holding that patient's test image directory
# (column name -> sequence sub-folder name).
for column, folder in (('flair', 'FLAIR'), ('t1w', 'T1w'), ('t1wce', 'T1wCE'), ('t2w', 'T2w')):
    df_test[column] = df_test['BraTS21ID_full'].apply(
        lambda file_id, folder=folder: f'{root_dir}test/{file_id}/{folder}/'
    )

# load the images

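The following training cases contain blank images for one sequence and are skipped when the training dataframe for that sequence is built:

- 00109 (FLAIR images are blank)
- 00123 (T1w images are blank)
- 00709 (FLAIR images are blank)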

In [None]:
# (zero-padded patient id, MRI type) pairs that are excluded from training
# because their images are blank.
_BLANK_TRAIN_SERIES = frozenset({('00109', 'flair'), ('00123', 't1w'), ('00709', 'flair')})


def _middle_slice_paths(img_dir):
    """Return the paths of the slices around the middle of one image series.

    File names in ``img_dir`` are expected to look like ``Image-<number>.dcm``.
    The numbers are sorted and a window of ``max(10% of the middle index, 1)``
    positions on each side of the middle is selected.

    Args:
        img_dir: Directory path ending with a path separator (paths are built
            by string concatenation).

    Returns:
        list[str]: Paths of the selected slices in ascending slice-number
        order; an empty list when the directory contains no files.

    Raises:
        ValueError: If a file name does not reduce to an integer once
            ``'Image-'`` and ``'.dcm'`` are stripped.
    """
    img_nums = sorted(int(name.replace('Image-', '').replace('.dcm', ''))
                      for name in os.listdir(img_dir))
    if not img_nums:
        return []

    mid_point = len(img_nums) // 2
    half_window = max(int(mid_point * 0.1), 1)
    # Clamp the window to valid positions: for series with fewer than three
    # slices the unclamped bounds fall outside the list (or wrap around via a
    # negative index).
    start_point = max(mid_point - half_window, 0)
    end_point = min(mid_point + half_window, len(img_nums) - 1)
    return [f'{img_dir}Image-{num}.dcm' for num in img_nums[start_point:end_point + 1]]


def _per_image_dataframe(patients, mri_type, skipped=frozenset()):
    """Expand a per-patient table into one row per selected middle slice.

    Args:
        patients: DataFrame with ``BraTS21ID``, ``MGMT_value`` and a
            ``mri_type`` column holding each patient's image directory
            (plus ``BraTS21ID_full`` when ``skipped`` is non-empty).
        mri_type: Name of the directory column to read.
        skipped: ``(BraTS21ID_full, mri_type)`` pairs to leave out.

    Returns:
        pandas.DataFrame: Columns ``patient_ids``, ``labels`` (the raw
        ``MGMT_value``) and ``file_paths``.
    """
    patient_ids, labels, file_paths = [], [], []
    for _, row in patients.iterrows():
        if skipped and (row['BraTS21ID_full'], mri_type) in skipped:
            continue
        img_paths = _middle_slice_paths(row[mri_type])
        file_paths.extend(img_paths)
        labels.extend([row['MGMT_value']] * len(img_paths))
        patient_ids.extend([row['BraTS21ID']] * len(img_paths))

    return pd.DataFrame({'patient_ids': patient_ids,
                         'labels': labels,
                         'file_paths': file_paths})


def get_train_val_dataframe(mri_type):
    """Build the per-image training table for one MRI sequence type.

    Reads the module-level ``df`` and skips the known blank series.

    Args:
        mri_type: One of the directory columns of ``df``
            (``'flair'``, ``'t1w'``, ``'t1wce'``, ``'t2w'``).

    Returns:
        pandas.DataFrame: Columns ``patient_ids``, ``labels`` and
        ``file_paths``; labels are the strings ``'1'`` / ``'0'``.
    """
    train_val_df = _per_image_dataframe(df, mri_type, skipped=_BLANK_TRAIN_SERIES)
    # Labels are stored as strings; the batch iterator casts them back to integers.
    train_val_df['labels'] = train_val_df['labels'].map({1: '1', 0: '0'})
    return train_val_df


def get_test_dataframe(mri_type):
    """Build the per-image test table for one MRI sequence type.

    Reads the module-level ``df_test``.

    Args:
        mri_type: One of the directory columns of ``df_test``.

    Returns:
        pandas.DataFrame: Columns ``patient_ids``, ``labels`` and
        ``file_paths``. ``labels`` holds placeholder values only (all ``'1'``
        except a final ``'0'``), not real targets.
    """
    test_df = _per_image_dataframe(df_test, mri_type)
    n_rows = len(test_df)
    # Workaround for the test data generator: fill the label column with two
    # distinct dummy classes. An empty table gets an empty label column
    # instead of a length-mismatched one.
    test_df['labels'] = (['1'] * (n_rows - 1) + ['0']) if n_rows else []
    return test_df

In [None]:
class DCMDataFrameIterator(DataFrameIterator):
    """``DataFrameIterator`` variant that loads DICOM (``.dcm``) files.

    Each batch is built by reading the DICOM files listed in the dataframe's
    ``x_col`` column, min-max normalising them to ``uint8`` and resizing them
    to ``target_size``; labels come from the ``y_col`` column cast to ``uint8``.

    ``dataframe``, ``x_col``, ``y_col``, ``color_mode`` and ``target_size``
    must be passed as keyword arguments because they are read back from
    ``kwargs``.
    """

    def __init__(self, *arg, **kwargs):
        # One-element tuple -- note the trailing comma; ``('dcm')`` would be a
        # plain string. Set before the parent constructor runs; presumably the
        # parent consults it when validating file extensions (confirm against
        # keras_preprocessing).
        self.white_list_formats = ('dcm',)
        super(DCMDataFrameIterator, self).__init__(*arg, **kwargs)
        self.dataframe = kwargs['dataframe']
        # NOTE(review): x / y are taken from the dataframe exactly as passed in.
        # If the parent drops rows while validating, positional indices may no
        # longer line up -- confirm.
        self.x = self.dataframe[kwargs['x_col']]
        self.y = self.dataframe[kwargs['y_col']]
        self.color_mode = kwargs['color_mode']
        self.target_size = kwargs['target_size']

    def _get_batches_of_transformed_samples(self, indices_array):
        """Load, label and augment the samples at the given positions.

        Args:
            indices_array: Positional indices into the dataframe rows.

        Returns:
            tuple: ``(batch_x, batch_y)`` -- the stacked image arrays and the
            labels as ``uint8``.
        """
        # get batch of images
        batch_x = np.array([self.read_dcm_as_array(dcm_path, self.target_size, color_mode=self.color_mode)
                            for dcm_path in self.x.iloc[indices_array]])

        batch_y = np.array(self.y.iloc[indices_array].astype(np.uint8))  # astype because y was passed as str

        # transform images
        if self.image_data_generator is not None:
            for i, x in enumerate(batch_x):
                transform_params = self.image_data_generator.get_random_transform(x.shape)
                # Written back into the existing batch array, so the result is
                # cast to that array's dtype.
                batch_x[i] = self.image_data_generator.apply_transform(x, transform_params)
                # Labels could be transformed here as well, e.g. masks in
                # semantic segmentation must follow the image transformation.

        return batch_x, batch_y

    @staticmethod
    def read_dcm_as_array(dcm_path, target_size=(224, 224), color_mode='rgb'):
        """Read one DICOM file as a normalised, resized ``uint8`` image.

        Args:
            dcm_path: Path of the DICOM file.
            target_size: ``(height, width)`` of the returned image.
            color_mode: ``'rgb'`` stacks the image with two all-zero channels;
                any other value returns the 2-D image unchanged.

        Returns:
            numpy.ndarray: ``uint8`` array of shape ``(height, width, 3)`` for
            ``'rgb'``, otherwise ``(height, width)``.
        """
        # Work in float so that ``image - min`` cannot overflow an integer dtype.
        image_array = pydicom.dcmread(dcm_path).pixel_array.astype(np.float64)
        pixels = image_array - np.min(image_array)
        peak = np.max(pixels)
        if peak > 0:
            pixels = pixels / peak
        # A constant (blank) slice has peak == 0: it stays all-zero instead of
        # turning into NaN through 0/0.
        image_manual_norm = (pixels * 255).astype(np.uint8)
        # cv2.resize takes dsize as (width, height), the reverse of the
        # (height, width) convention of target_size; this returns a 2d array.
        image_array = cv2.resize(image_manual_norm, (target_size[1], target_size[0]),
                                 interpolation=cv2.INTER_NEAREST)
        if color_mode == 'rgb':
            image_array = np.dstack((image_array, np.zeros_like(image_array), np.zeros_like(image_array)))
        return image_array

In [None]:
# Settings passed to both the training and the test generator.
TARGET_SIZE = (224, 224)  # size every slice is resized to
COLOR_MODE = 'rgb'
CLASS_MODE = 'binary'
BATCH_SIZE = 512

# Seed passed to the training generator only.
SEED = 369

In [None]:
def get_data_generators(train_df, test_df):
    """Create the DICOM batch iterators for training and for prediction.

    Args:
        train_df: Per-image table with ``file_paths`` and ``labels`` columns.
        test_df: Per-image table with the same two columns.

    Returns:
        tuple: ``(train_generator, test_generator)``. The training iterator is
        seeded with ``SEED`` and applies random zoom, shift and brightness
        augmentation; the test iterator applies no augmentation and is created
        with ``shuffle=False``. No rescaling or flipping is configured for
        either of them.
    """
    # Options common to both iterators.
    shared_options = dict(
        x_col='file_paths',
        y_col='labels',
        batch_size=BATCH_SIZE,
        class_mode=CLASS_MODE,
        color_mode=COLOR_MODE,
        target_size=TARGET_SIZE,
    )

    train_augmenter = ImageDataGenerator(
        zoom_range=0.2,
        fill_mode='nearest',
        height_shift_range=0.1,
        width_shift_range=0.1,
        brightness_range=[0.8, 1.2],
    )
    test_augmenter = ImageDataGenerator()

    train_generator = DCMDataFrameIterator(
        dataframe=train_df,
        image_data_generator=train_augmenter,
        seed=SEED,
        **shared_options,
    )
    test_generator = DCMDataFrameIterator(
        dataframe=test_df,
        image_data_generator=test_augmenter,
        shuffle=False,
        **shared_options,
    )

    return train_generator, test_generator

# Build model and train

In [None]:
def build_model(weights_path):
    """Build and compile a binary classifier on top of a frozen EfficientNetB3.

    Args:
        weights_path: Passed to ``EfficientNetB3`` as ``weights``.

    Returns:
        The compiled model: backbone without its top, followed by global
        average pooling, batch normalisation, dropout (rate 0.1), a 32-unit
        ReLU layer and a single sigmoid output. It is compiled with Adam
        (learning rate 1e-3), binary cross-entropy loss and the
        ``binary_accuracy`` metric.
    """
    backbone = EfficientNetB3(include_top=False, weights=weights_path)
    # Keep the pretrained weights fixed; only the new head is trainable.
    backbone.trainable = False

    # Classification head.
    pooled = GlobalAveragePooling2D(name="avg_pool")(backbone.output)
    normalized = BatchNormalization()(pooled)
    regularized = Dropout(0.1, name="top_dropout")(normalized)
    hidden = Dense(32, activation="relu")(regularized)
    outputs = Dense(1, activation="sigmoid", name="pred")(hidden)

    classifier = Model(backbone.inputs, outputs, name="EfficientNet")
    classifier.compile(
        optimizer=Adam(learning_rate=1e-3),
        loss="binary_crossentropy",
        metrics=["binary_accuracy"],
    )
    return classifier

In [None]:
def train_model(model_name, train_generator, epochs,
                weights_path="../input/efficentnet-b0b5-tensorflow-24-notop/efficientnet-b3_tf24_imagenet_1000_notop.h5"):
    """Build a fresh model and train it on ``train_generator``.

    Args:
        model_name: Label printed before training starts (the notebook passes
            the MRI sequence type).
        train_generator: Batch iterator yielding ``(images, labels)``; it must
            support ``len()``, which is used as the number of steps per epoch.
        epochs: Number of training epochs.
        weights_path: Pretrained backbone weights handed to ``build_model``.
            Defaults to the EfficientNetB3 no-top weights file used by this
            notebook.

    Returns:
        The trained model.
    """
    print('training', model_name)

    model = build_model(weights_path)

    # Model.fit accepts generators directly; fit_generator is deprecated in
    # TensorFlow 2.x (there it only forwards to fit) and has been removed from
    # newer Keras releases.
    model.fit(
        train_generator,
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        workers=2,
    )

    return model

In [None]:
# Train one model per MRI sequence type and collect its per-patient test
# predictions; the predictions are ensembled afterwards.
all_test_preds = []

for mt in ['flair', 't1w', 't1wce', 't2w']:
    train_df = get_train_val_dataframe(mt)
    test_df = get_test_dataframe(mt)
    train_g, test_g = get_data_generators(train_df, test_df)
    best_model = train_model(mt, train_g, epochs=5)
    # The model has a single output unit, so predict yields one value per
    # image; flatten it to 1-D before storing it as a DataFrame column.
    test_pred = best_model.predict(test_g, steps=len(test_g)).ravel()
    test_df['pred_y'] = test_pred
    # Aggregate the image-level predictions of each patient: keep whichever
    # extreme (max or min) lies further from the overall mean prediction,
    # i.e. the most confident one.
    mean_pred = test_pred.mean()
    extremes = test_df.groupby('patient_ids')['pred_y'].agg(['min', 'max'])
    use_max = (extremes['max'] - mean_pred) > (mean_pred - extremes['min'])
    test_pred_agg = extremes['max'].where(use_max, extremes['min'])
    # groupby returns patients in sorted order; realign to the row order of
    # df_test (read from sample_submission.csv) because the predictions are
    # later written into the submission by position.
    all_test_preds.append(test_pred_agg.reindex(df_test['BraTS21ID']).values)

# Submission

In [None]:
# Stack the per-sequence prediction vectors into one array
# (first axis: MRI sequence type).
all_test_preds = np.asarray(all_test_preds)

In [None]:
# Histogram of the ensembled predictions (mean over the MRI sequence types).
plt.hist(all_test_preds.mean(axis=0))

In [None]:
# Fill the sample submission with the ensemble mean (averaged over the MRI
# sequence types) and write it to submission.csv.
subm = pd.read_csv(f'{root_dir}sample_submission.csv').assign(
    MGMT_value=all_test_preds.mean(axis=0)
)
subm.to_csv("submission.csv", index=False)

In [None]:
# Show the submission table as the cell output for a final visual check.
subm