In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
import matplotlib.pyplot as plt
import os
import random
from sklearn import model_selection as sk_model_selection

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import activations
from tensorflow.keras import Model, Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from keras_preprocessing.image.dataframe_iterator import DataFrameIterator
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from operator import itemgetter
from scipy import ndimage
import matplotlib.image as mpimg
import matplotlib.pyplot as plt


In [None]:
class Config:
    """Notebook-wide constants plus a helper that seeds the random generators.

    The class is used purely as a namespace; it is never instantiated.
    """

    # Dataset locations (relative Kaggle-style input paths) and TensorBoard log folder.
    INPUT_PATH_DCM = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/'
    INPUT_PATH_PNG = '../input/rsna-miccai-png/'
    TENSORBOARD_LOG_DIR = '../working/log_tensorboard/'
    SEED = 42
    # These should be removed from the dataset (same IDs as strings and as integers).
    EXCLUDED_STR = ['00109', '00123', '00709']
    EXCLUDED_INT = [109, 123, 709]

    # Defining target size of image
    IMG_SIZE = 224
    NUM_SLICES_3D = 64
    # Lower bound on the slice window: at least MIN_SLICES // 2 slices are taken
    # on each side of the middle slice.
    MIN_SLICES = 12

    BATCH_SIZE = 64

    # Settings handed to the slice iterators.
    CLASS_MODE = 'binary'
    COLOR_MODE = 'rgb'
    TARGET_SIZE = (256, 256)

    @staticmethod
    def set_seed(seed_val):
        """Seed TensorFlow, `random` and NumPy with ``seed_val``.

        Args:
            seed_val: Integer seed applied to every generator.
        """
        tf.random.set_seed(seed_val)
        random.seed(seed_val)
        # PYTHONHASHSEED is only read when an interpreter starts, so this affects
        # Python processes launched afterwards, not the running kernel.
        os.environ['PYTHONHASHSEED'] = str(seed_val)
        np.random.seed(seed_val)
        

In [None]:
# Seed TensorFlow, `random` and NumPy once, before any split or model is created.
Config.set_seed(Config.SEED)


# Getting to know the data

In [None]:
# List the contents of the competition input directory.
!ls ../input/rsna-miccai-brain-tumor-radiogenomic-classification/

In [None]:
# Load the training labels. The patient ID is read as text (the excluded IDs in
# Config are zero-padded strings) and the MGMT label as an integer.
df = pd.read_csv(
    f'{Config.INPUT_PATH_DCM}train_labels.csv',
    dtype={'BraTS21ID': str, 'MGMT_value': int},
)

df.head()

In [None]:
# Drop the patients listed in Config.EXCLUDED_STR.
# The explicit copy makes `df` an independent frame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
df = df[~df['BraTS21ID'].isin(Config.EXCLUDED_STR)].copy()

# For quick tests, use only 1/3 of the dataset
# df = df[:int(len(df)/3)]


print(df.shape)

In [None]:
# adapted from: https://www.kaggle.com/leandronidas/tf-efficientnet-transfer-learning-strat-split/edit

# One column per MRI sequence holding the folder with that patient's PNG slices.
df['flair'] = Config.INPUT_PATH_PNG + 'train/' + df['BraTS21ID'] + '/FLAIR/'
df['t1w'] = Config.INPUT_PATH_PNG + 'train/' + df['BraTS21ID'] + '/T1w/'
df['t1wce'] = Config.INPUT_PATH_PNG + 'train/' + df['BraTS21ID'] + '/T1wCE/'
df['t2w'] = Config.INPUT_PATH_PNG + 'train/' + df['BraTS21ID'] + '/T2w/'

# Integer form of the patient ID.
df['brats21idInt'] = df['BraTS21ID'].astype(int)


In [None]:
# Load the sample submission and add the patient ID as a zero-padded,
# five-character string.
sample_y = pd.read_csv('../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv')
pred = sample_y  # same object as sample_y, not a copy
pred['BraTS21ID5'] = pred['BraTS21ID'].map('{:05d}'.format)
pred.head(5)


In [None]:
# Stratified split into train, validation and test sets: 15% of the patients are
# held out for testing, then 20% of the remainder is held out for validation.
# Both splits are stratified on the MGMT label and share the same seed.
df_trainval, df_test = sk_model_selection.train_test_split(
    df, test_size=0.15, stratify=df["MGMT_value"], random_state=Config.SEED
)
df_train, df_val = sk_model_selection.train_test_split(
    df_trainval, test_size=0.2, stratify=df_trainval["MGMT_value"], random_state=Config.SEED
)

print(df_train.shape, df_val.shape, df_test.shape, sep='\n')

Checking that sample 109 is present in neither the training nor the validation set

In [None]:
    
def _slice_number(file_name):
    """Return the integer N of an ``Image-N.png`` file name, or None for any other name."""
    prefix, suffix = 'Image-', '.png'
    if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
        return None
    digits = file_name[len(prefix):-len(suffix)]
    return int(digits) if digits.isdecimal() else None


def get_iterating_dataframe(df, mri_type, min_slices=None, window_fraction=0.1):
    """Expand a per-patient table into one row per selected PNG slice.

    For every patient, the ``Image-N.png`` files found in the folder stored in
    column ``mri_type`` are ordered by N and a window centred on the middle
    slice is kept. The window reaches
    ``max(int(mid * window_fraction), min_slices // 2)`` slices to each side of
    the middle index and is clamped to the slices that actually exist.

    Args:
        df: Table with the columns ``mri_type`` (folder path ending in a path
            separator), ``'MGMT_value'`` and ``'brats21idInt'``.
        mri_type: Name of the column holding the slice folder.
        min_slices: Minimum window size control; defaults to ``Config.MIN_SLICES``.
        window_fraction: Fraction of the middle index used as half window width.

    Returns:
        DataFrame with columns ``patient_ids``, ``labels`` and ``file_paths``,
        one row per selected slice, patients in the order of ``df``.

    Raises:
        ValueError: If a patient's folder contains no ``Image-N.png`` file.
    """
    if min_slices is None:
        min_slices = Config.MIN_SLICES

    all_img_files = []
    all_img_labels = []
    all_img_patient_ids = []
    for _, row in df.iterrows():
        img_dir = row[mri_type]

        # (slice number, file name) pairs sorted numerically; other files are ignored.
        numbered = sorted(
            (number, name)
            for number, name in ((_slice_number(f), f) for f in os.listdir(img_dir))
            if number is not None
        )
        total = len(numbered)
        if total == 0:
            # Failing loudly keeps every patient represented downstream.
            raise ValueError(f'No Image-<n>.png slices found in {img_dir}')

        mid_point = total // 2
        half_window = max(int(mid_point * window_fraction), min_slices // 2)
        # Clamp so short series neither wrap around (negative start) nor run
        # past the last slice.
        start_point = max(mid_point - half_window, 0)
        end_point = min(mid_point + half_window, total - 1)

        img_paths = [img_dir + name for _, name in numbered[start_point:end_point + 1]]

        all_img_files.extend(img_paths)
        all_img_labels.extend([row['MGMT_value']] * len(img_paths))
        all_img_patient_ids.extend([row['brats21idInt']] * len(img_paths))

    return pd.DataFrame({'patient_ids': all_img_patient_ids,
                         'labels': all_img_labels,
                         'file_paths': all_img_files})

In [None]:
class PNGDataFrameIterator(DataFrameIterator):
    """DataFrameIterator that loads grayscale PNG slices through OpenCV.

    Batches are built by `read_png_as_array` instead of the parent's loader and
    then passed through the attached ImageDataGenerator (random transform
    followed by standardisation).

    The constructor must receive ``dataframe``, ``x_col``, ``y_col``,
    ``color_mode`` and ``target_size`` as keyword arguments.
    """

    def __init__(self, *arg, **kwargs):
        # One-element tuple: only PNG files are accepted by the parent's
        # file-name validation.
        self.white_list_formats = ('png',)
        super(PNGDataFrameIterator, self).__init__(*arg, **kwargs)
        self.dataframe = kwargs['dataframe']
        # NOTE(review): x/y come from the frame as passed in; if the parent
        # drops rows during file-name validation they would no longer line up
        # with its indices -- confirm all paths are valid.
        self.x = self.dataframe[kwargs['x_col']]
        self.y = self.dataframe[kwargs['y_col']]
        self.color_mode = kwargs['color_mode']
        self.target_size = kwargs['target_size']

    def _get_batches_of_transformed_samples(self, indices_array):
        """Load, augment and standardise the samples at ``indices_array``.

        Returns:
            Tuple ``(batch_x, batch_y)``: float32 images and uint8 labels.
        """
        # get batch of images; float32 so transformed / standardised values are
        # not truncated or wrapped by an integer array
        batch_x = np.array(
            [self.read_png_as_array(path, self.target_size, color_mode=self.color_mode)
             for path in self.x.iloc[indices_array]],
            dtype=np.float32,
        )

        batch_y = np.array(self.y.iloc[indices_array].astype(np.uint8))  # astype because y was passed as str

        # transform images
        if self.image_data_generator is not None:
            for i in range(len(batch_x)):
                # Work on a copy: the generator's steps may modify their input in place.
                sample = batch_x[i].copy()
                transform_params = self.image_data_generator.get_random_transform(sample.shape)
                sample = self.image_data_generator.apply_transform(sample, transform_params)
                # standardize() is what applies the generator's
                # preprocessing_function and rescale settings.
                batch_x[i] = self.image_data_generator.standardize(sample)

        return batch_x, batch_y

    @staticmethod
    def read_png_as_array(path, target_size=(Config.IMG_SIZE, Config.IMG_SIZE),
                          color_mode='rgb'):
        """Read a PNG as grayscale, stretch it to 0-255 and resize it.

        Args:
            path: Path of the image file.
            target_size: Size passed to ``cv2.resize`` as ``dsize``.
                NOTE(review): OpenCV interprets dsize as (width, height);
                irrelevant for square targets.
            color_mode: ``'rgb'`` stacks the gray image into three identical
                channels; any other value returns the 2-D gray image.

        Returns:
            uint8 array of shape ``(H, W, 3)`` for ``'rgb'``, else ``(H, W)``.

        Raises:
            OSError: If OpenCV cannot read ``path``.
        """
        im_gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if im_gray is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f'Could not read image: {path}')

        pixels = im_gray - np.min(im_gray)
        peak = np.max(pixels)
        if peak > 0:
            pixels = pixels / peak
        else:
            # Constant image: avoid 0/0 and keep it all black.
            pixels = np.zeros(pixels.shape, dtype=np.float64)
        image_manual_norm = (pixels * 255).astype(np.uint8)
        image_array = cv2.resize(image_manual_norm, target_size, interpolation=cv2.INTER_CUBIC)

        if color_mode == 'rgb':
            image_array = np.dstack((image_array, image_array, image_array))
        return image_array

        
    

In [None]:
# exam_list = ['flair','t1w','t1wce','t2w']
exam_list = ['flair', 't1wce']

dftrainIter_dict = {}
dfvalIter_dict = {}
dftestIter_dict = {}
train_gen_dict = {}
val_gen_dict = {}
test_gen_dict = {}
model_check_dict = {}
model_dict = {}


def _slice_frame_with_str_labels(patients, exam):
    """Build the per-slice table for ``exam`` and add the labels as strings.

    The iterator is given the string column and casts it back to integers
    when it assembles a batch.
    """
    frame = get_iterating_dataframe(patients, exam)
    frame['labels_str'] = frame['labels'].astype(str)
    return frame


def _make_slice_iterator(frame, image_gen):
    """Wrap a per-slice table in a PNGDataFrameIterator that preserves row order."""
    return PNGDataFrameIterator(dataframe=frame,
                                x_col='file_paths',
                                y_col='labels_str',
                                image_data_generator=image_gen,
                                seed=Config.SEED,
                                batch_size=Config.BATCH_SIZE,
                                class_mode=Config.CLASS_MODE,
                                color_mode=Config.COLOR_MODE,
                                target_size=Config.TARGET_SIZE,
                                # The iterators shuffle by default. Predictions
                                # made from them are written back to the frame
                                # row by row, so the order must be kept.
                                shuffle=False,
                                )


# `rescale` is deliberately not set: resnet50.preprocess_input expects pixel
# values in [0, 255], and ImageDataGenerator applies `rescale` after the
# preprocessing function, so combining both would shrink the centred values.
datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.resnet50.preprocess_input,
    zoom_range=0.2,
    rotation_range=45,
    fill_mode='nearest',
    height_shift_range=0.15,
    width_shift_range=0.15,
    horizontal_flip=True,
    vertical_flip=True,
    brightness_range=[0.8, 1.2],
)
# Validation / test data: preprocessing only, no augmentation.
test_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.resnet50.preprocess_input,
)

for exam in exam_list:
    dftrainIter = _slice_frame_with_str_labels(df_train, exam)
    dfvalIter = _slice_frame_with_str_labels(df_val, exam)
    dftestIter = _slice_frame_with_str_labels(df_test, exam)

    train_generator = _make_slice_iterator(dftrainIter, datagen)
    val_generator = _make_slice_iterator(dfvalIter, test_datagen)
    test_generator = _make_slice_iterator(dftestIter, test_datagen)

    dftrainIter_dict[exam] = dftrainIter
    dfvalIter_dict[exam] = dfvalIter
    dftestIter_dict[exam] = dftestIter
    train_gen_dict[exam] = train_generator
    val_gen_dict[exam] = val_generator
    test_gen_dict[exam] = test_generator


In [None]:
# ImageNet-pretrained ResNet50 without its classification head. The input shape
# follows Config.TARGET_SIZE, the size the slice iterators are configured to
# emit, so the model and its inputs agree.
model_deep = tf.keras.applications.ResNet50(weights='imagenet',
                                            input_shape=(*Config.TARGET_SIZE, 3),
                                            include_top=False)

# Freeze every layer of the backbone.
for layer in model_deep.layers:
    layer.trainable = False

# Debug - show which layers are frozen
# for i, layer in enumerate(model_deep.layers):
#     print(i, layer.name, "-", layer.trainable)


# Global average pooling turns the final feature map into one vector per image.
x = layers.GlobalAveragePooling2D()(model_deep.output)
model = Model(inputs=model_deep.input, outputs=x)

model.summary()

## Reading the Feature Vectors

In [None]:
# Run the slices of every exam through `model`, one pass per split, and keep
# the resulting arrays keyed by exam name.
feats_train_dict, feats_val_dict, feats_test_dict = {}, {}, {}

for exam in exam_list:
    feats_train = feats_train_dict[exam] = model.predict(train_gen_dict[exam])
    feats_val = feats_val_dict[exam] = model.predict(val_gen_dict[exam])
    feats_test = feats_test_dict[exam] = model.predict(test_gen_dict[exam])

    print('done!')

In [None]:
def _mean_featvecs_per_patient(slice_frame, slice_feats):
    """Attach per-slice feature vectors to ``slice_frame`` and average them per patient.

    Args:
        slice_frame: Per-slice table with a ``patient_ids`` column; it gains a
            ``featvecs`` column in place.
        slice_feats: Array whose i-th row is the feature vector of row i of
            ``slice_frame``.

    Returns:
        DataFrame indexed by ``patient_ids`` (sorted by groupby) with a single
        ``featvecs`` column holding each patient's mean vector.

    Raises:
        ValueError: If the number of feature vectors differs from the number of rows.
    """
    if len(slice_feats) != len(slice_frame):
        raise ValueError(
            f'Got {len(slice_feats)} feature vectors for {len(slice_frame)} slices'
        )
    slice_frame['featvecs'] = list(slice_feats)
    return slice_frame[['patient_ids', 'featvecs']].groupby('patient_ids').mean()


feats_mean_train = {}
feats_mean_val = {}
feats_mean_test = {}

for exam in exam_list:
    feats_mean_train[exam] = _mean_featvecs_per_patient(dftrainIter_dict[exam], feats_train_dict[exam])
    feats_mean_val[exam] = _mean_featvecs_per_patient(dfvalIter_dict[exam], feats_val_dict[exam])
    feats_mean_test[exam] = _mean_featvecs_per_patient(dftestIter_dict[exam], feats_test_dict[exam])

In [None]:
# np.stack turns each column of per-patient mean vectors into a single array
# with the patients along axis 0; one array per exam and split.
featvecs_dict_train, featvecs_dict_val, featvecs_dict_test = {}, {}, {}

for exam in exam_list:
    featvecs_train = np.stack(feats_mean_train[exam]['featvecs'].to_numpy())
    featvecs_val = np.stack(feats_mean_val[exam]['featvecs'].to_numpy())
    featvecs_test = np.stack(feats_mean_test[exam]['featvecs'].to_numpy())

    featvecs_dict_train[exam] = featvecs_train
    featvecs_dict_val[exam] = featvecs_val
    featvecs_dict_test[exam] = featvecs_test

In [None]:
# Join the per-exam feature arrays side by side (dictionary insertion order),
# giving one combined feature array per split.
X_train, X_val, X_test = (
    np.hstack(list(per_exam.values()))
    for per_exam in (featvecs_dict_train, featvecs_dict_val, featvecs_dict_test)
)

In [None]:
def _labels_in_feature_order(patients, feature_frame):
    """Return the MGMT labels ordered like the rows of ``feature_frame``.

    Args:
        patients: Per-patient table with ``brats21idInt`` and ``MGMT_value``.
        feature_frame: Per-patient feature table indexed by patient ID.

    Returns:
        NumPy array of labels, one per row of ``feature_frame``.

    Raises:
        KeyError: If a patient in ``feature_frame`` is missing from ``patients``.
    """
    labels_by_id = patients.set_index('brats21idInt')['MGMT_value']
    return labels_by_id.loc[feature_frame.index].to_numpy()


# The feature rows follow the patient index of the per-patient mean tables, so
# the labels are looked up by that same index instead of relying on two
# separate sorts happening to agree.
y_train = _labels_in_feature_order(df_train, feats_mean_train[exam_list[0]])

y_val = _labels_in_feature_order(df_val, feats_mean_val[exam_list[0]])

y_test = _labels_in_feature_order(df_test, feats_mean_test[exam_list[0]])

In [None]:
from sklearn.utils import shuffle

# Shuffle features and labels together, seeded from the notebook-wide
# Config.SEED rather than a separate hard-coded value.
xshuffled, yshuffled = shuffle(X_train, y_train, random_state=Config.SEED)


In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM on the combined per-patient features.
clf = SVC(C=5,
          kernel='rbf',
          gamma='scale', class_weight='balanced',
          verbose=True,
          probability=True,
          # probability=True fits the probability model with an internal,
          # randomised cross-validation; fixing the seed makes it repeatable.
          random_state=Config.SEED,
)
clf.fit(xshuffled, yshuffled)

In [None]:
# Predicted probability of the second class (column 1 of predict_proba,
# i.e. clf.classes_[1]) for every split.
y_pred_train = clf.predict_proba(X_train)[:, 1]
y_pred_val = clf.predict_proba(X_val)[:, 1]
y_pred_test = clf.predict_proba(X_test)[:, 1]

In [None]:
def ROC_AUC(probs, labels, model_name='CNN'):
    """Print the ROC AUC of ``probs`` and plot its ROC curve beside a no-skill baseline.

    Args:
        probs: Predicted scores for the positive class, one per sample.
        labels: True binary labels, same length as ``probs``.
        model_name: Name used in the printed line and the plot legend.

    Returns:
        None. The AUC values are printed and the figure is shown.
    """
    # A constant score for every sample acts as the "no skill" reference.
    ns_probs = [0] * len(labels)

    ns_auc = roc_auc_score(labels, ns_probs)
    lr_auc = roc_auc_score(labels, probs)
    print('No skill: ROC AUC=%.3f' % (ns_auc))
    print('%s: ROC AUC=%.3f' % (model_name, lr_auc))

    lr_fpr, lr_tpr, _ = roc_curve(labels, probs)
    ns_fpr, ns_tpr, _ = roc_curve(labels, ns_probs)

    plt.plot(lr_fpr, lr_tpr, marker='.', label=model_name)
    plt.plot(ns_fpr, ns_tpr, marker='.', label='No Skill')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Without a legend the curve labels above are never displayed.
    plt.legend()
    plt.show()

In [None]:
# ROC curve and AUC on the training predictions, then on the validation predictions.
ROC_AUC(y_pred_train, y_train)
ROC_AUC(y_pred_val, y_val)

In [None]:
# ROC curve and AUC on the held-out test predictions.
ROC_AUC(y_pred_test, y_test)