## EDA + Data Preprocessing - from Pratyush's notebook

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import os, glob, random, cv2, glob, pydicom

In [None]:
# Load the training labels table (columns used below: BraTS21ID, MGMT_value).
df = pd.read_csv('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv')

In [None]:
# (rows, columns) of the labels table.
df.shape

In [None]:
# Preview the first rows of the labels table.
df.head()

In [None]:
# Number of samples per MGMT_value class.
df.MGMT_value.value_counts()

In [None]:
# Root folder of the training samples. The tuple below compares the number of
# entries in that folder with the number of distinct BraTS21ID values in df.
train_sample_path = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train'
len(os.listdir(train_sample_path)), df.BraTS21ID.nunique()

In [None]:
def read_dicom_xray(path):
    """Read one DICOM file and return its pixels rescaled to 8 bits.

    The pixel array is shifted so that its minimum becomes 0, scaled so that
    its maximum becomes 255 and then cast to ``np.uint8``.

    Args:
        path: Path of the DICOM file to read.

    Returns:
        ``np.uint8`` array with the same shape as the file's pixel array.
        An image without any dynamic range (all pixels equal) yields an
        all-zero array.
    """
    # dcmread is the supported reader; the old read_file alias no longer
    # exists in pydicom 3.
    data = pydicom.dcmread(path).pixel_array
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by the maximum would be 0/0 and the
        # resulting NaNs have no defined uint8 value.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    return (data * 255).astype(np.uint8)

In [None]:
# following function took from: https://www.kaggle.com/ihelon/brain-tumor-eda-with-animations-and-modeling?scriptVersionId=68202876&cellId=11
def visualize_sample(
    brats21id,
    mgmt_value,
    slice_i,
    types=("FLAIR", "T1w", "T1wCE", "T2w")
):
    """Plot one slice of every requested MRI series of a patient side by side.

    Args:
        brats21id: Patient identifier; it is zero-padded to 5 characters to
            form the patient folder name under ``train_sample_path``.
        mgmt_value: Label shown in the figure title.
        slice_i: Relative position of the slice inside each series, expected
            in ``[0, 1]`` (0.5 is the middle). Positions that would fall past
            the last slice are clamped to the last slice.
        types: Names of the series sub-folders to show, one panel each.
    """
    plt.figure(figsize=(16, 5))
    patient_path = os.path.join(
        train_sample_path,
        str(brats21id).zfill(5),
    )
    n_panels = len(types)
    for i, t in enumerate(types, 1):
        # Sort by the number after the last "-" in the file name; the 4
        # characters stripped first are presumably the ".dcm" extension.
        t_paths = sorted(
            glob.glob(os.path.join(patient_path, t, "*")),
            key=lambda x: int(x[:-4].split("-")[-1]),
        )
        # One column per requested series (not a fixed 4).
        plt.subplot(1, n_panels, i)
        if t_paths:
            # int(len * 1.0) == len would index past the end, so clamp.
            slice_idx = min(int(len(t_paths) * slice_i), len(t_paths) - 1)
            data = read_dicom_xray(t_paths[slice_idx])
            plt.imshow(data, cmap="gray")
        # A series without files keeps an empty, titled panel instead of
        # aborting the whole figure.
        plt.title(f"{t}", fontsize=16)
        plt.axis("off")

    plt.suptitle(f"MGMT_value: {mgmt_value}", fontsize=16)
    plt.show()
    
    
# Show two randomly chosen patients at the middle slice of each series.
# Values are read column-first: a row-wise ``df.iloc[i]`` upcasts the whole row
# to float as soon as df holds any float column (such as the ``fold`` column
# added later in this notebook), and a float ID zero-pads to e.g. "001.0",
# which is not a valid patient folder.
for i in random.sample(range(len(df)), 2):
    visualize_sample(
        df["BraTS21ID"].iloc[i],
        df["MGMT_value"].iloc[i],
        slice_i=0.5,
    )

In [None]:
# Data Preprocessing DICOM into 2D Slices as Generator
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.layers.experimental.preprocessing import RandomFlip,RandomRotation,RandomTranslation

# Keras data augmentation applied to whole batches: a small random rotation
# followed by a random shift along the width only (height_factor is 0).
augmentation_layers = tf.keras.Sequential(
    layers=[
        RandomRotation(factor=0.01),
        RandomTranslation(width_factor=0.1, height_factor=0.0),
    ],
    name='keras_augment_layers',
)

# More manual data augmentation
def preprocessing_image(img, augment=True):
    """Scale an image stack to floats and optionally augment every slice.

    Args:
        img: Image tensor whose last axis holds ``input_depth`` slices
            (presumably 8-bit values, since it is divided by 255 -- confirm
            against the loader).
        augment: When true, each slice is augmented independently (random
            flip, saturation, contrast, brightness and a 20x20 cutout).
            Intended for the training set only.

    Returns:
        ``tf.float32`` tensor reshaped to
        ``[input_height, input_width, input_depth]``.
    """
    out_shape = [input_height, input_width, input_depth]
    scaled = tf.cast(img, tf.float32) / 255.0

    if not augment:
        return tf.reshape(scaled, out_shape)

    def _augment_slice(single):
        # Replicate the slice to 3 channels for the image ops, then keep only
        # the first channel again.
        rgb = tf.repeat(single, repeats=3, axis=-1)
        rgb = tf.image.random_flip_left_right(rgb)
        rgb = tf.image.random_saturation(rgb, 0.9, 1.3)
        rgb = tf.image.random_contrast(rgb, 0.8, 1.2)
        rgb = tf.image.random_brightness(rgb, 0.2)
        first_channel = tf.split(rgb, 3, axis=-1)[0]
        # random_cutout is fed a batch of one, hence the extra leading axis;
        # the final reshape removes it again.
        return tfa.image.random_cutout(
            tf.expand_dims(first_channel, 0),
            mask_size=(20, 20),
            constant_values=0,
        )

    # todo: integrate better technique
    slices = tf.split(scaled, input_depth, axis=-1)
    augmented = tf.concat([_augment_slice(s) for s in slices], axis=-1)
    return tf.reshape(augmented, out_shape)

def get_data_generator(data, is_train=False, shuffle=True, augment=False, repeat=True, batch_size=32):
    """Turn a dataset of ``(image, label)`` pairs into a batched input pipeline.

    Args:
        data: ``tf.data`` dataset yielding ``(image, label)`` elements.
        is_train: When true, an incomplete final batch is dropped.
        shuffle: When true, elements are shuffled with a buffer of
            ``batch_size * 10``.
        augment: When true, both the per-slice augmentation of
            ``preprocessing_image`` and the batch-level ``augmentation_layers``
            are applied.
        repeat: When true, the dataset is repeated indefinitely.
        batch_size: Number of elements per batch.

    Returns:
        The batched and prefetched dataset.
    """
    if repeat:
        data = data.repeat()

    if shuffle:
        data = data.shuffle(batch_size * 10)

    data = data.map(lambda x, y: (preprocessing_image(x, augment), y), num_parallel_calls=AUTO)
    data = data.batch(batch_size, drop_remainder=is_train)

    # Gate the Keras augmentation on `augment` (it used to follow `shuffle`,
    # so a shuffled but un-augmented pipeline was still augmented and an
    # augmented but unshuffled one was not). training=True makes the random
    # layers active outside of model.fit as well.
    if augment:
        data = data.map(
            lambda x, y: (augmentation_layers(x, training=True), y),
            num_parallel_calls=AUTO,
        )

    data = data.prefetch(AUTO)
    return data

In [None]:
# data loader 
class BrainTumorGenerator(tf.keras.utils.Sequence):
    """Per-patient loader: one cropped, resized slice from each MRI series.

    Item ``index`` is built from the patient at position ``index`` of ``data``.
    For each of the FLAIR, T1w, T1wCE and T2w folders one slice close to the
    middle of the series is read, cropped to its non-black bounding box,
    resized and stacked as one channel.

    Relies on the module-level ``input_height`` / ``input_width`` settings.

    Args:
        dicom_path: Root folder holding one zero-padded sub-folder per patient.
        data: Table providing a ``BraTS21ID`` and a ``MGMT_value`` column.
        is_train: When true, items are ``(image, label)`` pairs; otherwise
            only the image is returned.
    """

    def __init__(self, dicom_path, data, is_train=True):
        super().__init__()
        self.is_train = is_train  # to control training/validation/inference part
        self.data = data
        self.dicom_path = dicom_path
        self.label = self.data['MGMT_value']

    def __len__(self):
        """Return the number of patients in the table."""
        return self.data['BraTS21ID'].shape[0]

    def __getitem__(self, index):
        """Build the multi-channel image (and label) of the patient at ``index``.

        Returns:
            ``np.array(channel).T`` -- the per-series images stacked and then
            transposed so that the series axis comes last -- plus the label
            when ``is_train`` is true.
        """
        # Positional lookup, consistent with the label lookup at the end of
        # this method; label-based `[...][index]` only worked on frames whose
        # index happens to be 0..n-1.
        patient_id = str(self.data['BraTS21ID'].iloc[index]).zfill(5)
        patient_ids = f"{self.dicom_path}/{patient_id}/"

        # pick 15 slices
        K = 15
        # pixels above this value count as non-black when cropping
        threshold = 0

        channel = []
        for t in ("FLAIR", "T1w", "T1wCE", "T2w"):
            # Sort by the number after the last "-" in the file name; the 4
            # characters stripped first are presumably the ".dcm" extension.
            t_paths = sorted(
                glob.glob(os.path.join(patient_ids, t, "*")),
                key=lambda x: int(x[:-4].split("-")[-1]),
            )

            # computing strt, and end index
            middle = len(t_paths) // 2
            strt_idx = middle - (K // 2)
            end_idx = middle + (K // 2)
            # A negative start would wrap around to the end of a short
            # series, so clamp it to the first slice.
            first = max(strt_idx + 3, 0)
            # slicing extracting elements with 3 intervals
            r = t_paths[first: end_idx + 3: 3]

            # removing black borders
            # and add multi-modal features maps / channel depth
            for i in r:
                image = self.read_dicom_xray(i)

                # Max over axis 0 gives one value per column, max over axis 1
                # one value per row.
                nonblack_cols = np.where(np.max(image, 0) > threshold)[0]
                if nonblack_cols.size:
                    nonblack_rows = np.where(np.max(image, 1) > threshold)[0]
                    image = image[nonblack_rows[0]: nonblack_rows[-1] + 1,
                                  nonblack_cols[0]: nonblack_cols[-1] + 1]
                else:
                    # Entirely black slice: keep a single pixel to resize.
                    image = image[:1, :1]

                # NOTE(review): cv2 takes dsize as (width, height); together
                # with the final transpose this yields
                # [input_height, input_width, channels], with every slice
                # transposed. Left as is to keep the data layout.
                channel.append(cv2.resize(image, (input_height, input_width)))
                break  # remove it for r-times frames for each series

        stacked = np.array(channel).T
        if self.is_train:
            return stacked, self.label.iloc[index]
        return stacked

    def read_dicom_xray(self, path):
        """Read a DICOM file and return its pixels rescaled to 0..255 as uint8.

        An image whose pixels are all equal yields an all-zero array.
        """
        # dcmread is the supported reader; the old read_file alias no longer
        # exists in pydicom 3.
        data = pydicom.dcmread(path).pixel_array
        data = data - np.min(data)
        peak = np.max(data)
        if peak == 0:
            # Constant image: avoid 0/0 and the undefined NaN -> uint8 cast.
            return np.zeros(data.shape, dtype=np.uint8)
        data = data / peak
        return (data * 255).astype(np.uint8)

In [None]:
from sklearn.model_selection import StratifiedKFold
# 5-fold split with a fixed seed, stratified on MGMT_value.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Tag every row with the number of the fold in which it is a validation sample.
# NOTE(review): val_index holds positions but is used as .loc labels, which
# presumably relies on df having the default 0..n-1 index -- confirm.
for index, (train_index, val_index) in enumerate(skf.split(X=df.index, 
                                                           y=df.MGMT_value)):
    df.loc[val_index, 'fold'] = index
    
# Class counts per fold.
print(df.groupby(['fold', df.MGMT_value]).size())

In [None]:
def fold_generator(fold):
    """Build the (train, validation) loaders for one cross-validation fold.

    Args:
        fold: Value of ``df.fold`` to hold out for validation; every other
            row goes to the training loader.

    Returns:
        Tuple ``(train_loader, val_loader)`` of ``BrainTumorGenerator``
        objects, each built on a re-indexed slice of ``df``.
    """
    # for way one - data generator
    held_out = df.fold == fold
    val_labels = df[held_out].reset_index(drop=True)
    train_labels = df[~held_out].reset_index(drop=True)

    train_loader = BrainTumorGenerator(train_sample_path, train_labels)
    val_loader = BrainTumorGenerator(train_sample_path, val_labels)
    return train_loader, val_loader

# first fold: rows with fold == 0 form the validation loader, all others train
train_gen, val_gen = fold_generator(0)

In [None]:
# params
AUTO = tf.data.AUTOTUNE
input_height, input_width, input_depth = 312, 312, 4
batch_size = 3
fold = 0

# Wrap the training Sequence in the tf.data API: every element is an
# (image, label) pair, both delivered as float32.
train_data = tf.data.Dataset.from_generator(
    lambda: (tuple(sample) for sample in train_gen),
    output_types=(tf.float32, tf.float32),
    output_shapes=(
        tf.TensorShape([input_height, input_width, input_depth]),
        tf.TensorShape([]),
    ),
)

# generate train sets
train_generator = get_data_generator(
    train_data,
    is_train=True,
    repeat=False,
    shuffle=True,
    augment=True,
    batch_size=batch_size,
)

In [None]:
# train generator visualization 
# Pull one batch and show every channel of its first sample, titled with the
# label of that sample.
x, y = next(iter(train_generator))
print(x.shape, y.shape)  
plt.figure(figsize=(35, 15))
for i in range(input_depth):
    plt.subplot(1, input_depth, i + 1)
    plt.imshow(x[0 ,:, :, i], cmap="gray")
    plt.axis("off")
    plt.title(y[0].numpy())

In [None]:
# wrapping sequence generator to tf.data API
# Every element is an (image, label) pair, both delivered as float32.
val_data = tf.data.Dataset.from_generator(
    lambda: map(tuple, val_gen),
    (tf.float32, tf.float32),
    (
        tf.TensorShape([input_height, input_width, input_depth]),
        tf.TensorShape([]),
    ),
)

# generate validation sets
# is_train=False keeps the last, possibly incomplete batch: with
# is_train=True the pipeline drops the remainder, so up to batch_size - 1
# validation samples were never evaluated.
valid_generator = get_data_generator(val_data, is_train=False,
                                     shuffle=False, repeat=False, augment=False,
                                     batch_size=batch_size)

# visualization: every channel of the first sample of one validation batch,
# titled with the label of that sample
x, y = next(iter(valid_generator))
print(x.shape, y.shape)
plt.figure(figsize=(35, 15))
for i in range(input_depth):
    plt.subplot(1, input_depth, i + 1)
    plt.imshow(x[0, :, :, i], cmap="gray")
    plt.axis("off")
    plt.title(y[0].numpy())

In [None]:
# Pull one batch from the training pipeline and print its shapes.
x,y = next(iter(train_generator))
print(x.shape, y.shape)

In [None]:
def final_generator(data_gen):
    """Cycle over ``data_gen`` and attach the targets of both model outputs.

    Every ``(x, y)`` item of ``data_gen`` is re-emitted as
    ``(x, {'clf': y, 'recon': x})``: the label is the target of the ``clf``
    output and the input itself is the target of the ``recon`` output.

    Args:
        data_gen: Iterable of ``(x, y)`` pairs; it is iterated again from the
            start every time it is exhausted.

    Yields:
        ``(x, {'clf': y, 'recon': x})`` tuples, endlessly while ``data_gen``
        keeps producing items. The generator ends if a complete pass over
        ``data_gen`` produces nothing (empty input or a one-shot iterator
        that is used up), instead of spinning forever without yielding.
    """
    while True:
        produced = False
        for x, y in data_gen:
            produced = True
            yield x, {'clf': y, 'recon': x}
        if not produced:
            return

In [None]:
# Wrap both pipelines so that they yield (x, {'clf': ..., 'recon': ...}).
gen_tr = final_generator(train_generator)
gen_va = final_generator(valid_generator)

# Pull one training batch and print the shapes of the input and both targets.
x,y = next(gen_tr)

print(x.shape, y['clf'].shape, y['recon'].shape)

## 2D U-net:

In [None]:
# dual loss: 
#from tensorflow.keras import Input, Model 
#from tensorflow.keras.layers import Conv3D, GlobalAveragePooling2D, Dense
from tensorflow.keras.applications import *

import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from tensorflow import losses, optimizers
from tensorflow.keras import Input, Model, models, layers, callbacks, utils, metrics

# --- Keyword arguments shared by every Conv2D / Conv2DTranspose below
kwargs = dict(kernel_size=(3, 3), padding='same')


# --- Elementary building blocks: each call creates a new layer and applies it
def conv(x, filters, strides):
    """Apply a fresh Conv2D layer to ``x``."""
    return layers.Conv2D(filters=filters, strides=strides, **kwargs)(x)


def norm(x):
    """Apply a fresh BatchNormalization layer to ``x``."""
    return layers.BatchNormalization()(x)


def relu(x):
    """Apply a fresh ReLU layer to ``x``."""
    return layers.ReLU()(x)


def tran(x, filters, strides):
    """Apply a fresh Conv2DTranspose layer to ``x``."""
    return layers.Conv2DTranspose(filters=filters, strides=strides, **kwargs)(x)


# --- Composite blocks: (transposed) convolution -> batch norm -> ReLU
def conv1(filters, x):
    """Stride-1 convolution block."""
    return relu(norm(conv(x, filters, strides=1)))


def conv2(filters, x):
    """Stride-2 convolution block."""
    return relu(norm(conv(x, filters, strides=2)))


def tran2(filters, x):
    """Stride-2 transposed-convolution block."""
    return relu(norm(tran(x, filters, strides=2)))


In [None]:
# define inputs:
# symbolic Keras input of shape (input_height, input_width, input_depth)
input_dim = (input_height, input_width, input_depth)
input_tensor = Input(input_dim, name='input2d')

In [None]:
# Display the shape of the symbolic input.
input_tensor.shape

In [None]:
# --- Define contracting layers (trailing comments: spatial size per stage
# for the 312x312 input)
l1 = conv1(8, input_tensor)    # 312
l2 = conv1(16, conv2(16, l1))  # 156
l3 = conv1(32, conv2(32, l2))  # 78
l4 = conv1(48, conv2(48, l3))  # 39

# --- Define expanding layers: three stride-2 transposed convolutions mirror
# the three stride-2 convolutions above
l7  = tran2(32, l4)
l8  = tran2(16, conv1(32, l7))
l9  = tran2(8,  conv1(16, l8))
l10 = conv1(8,  l9)

# --- Define classification head on top of the deepest feature map
h0 = layers.Flatten()(l4)
h1 = layers.Dense(32, activation='relu')(h0)

# --- Define all model outputs
logits = {}
# 'clf' is a single sigmoid unit, i.e. it emits a probability.
logits['clf'] = layers.Dense(1, activation='sigmoid', name='clf')(h1)
# 'recon' is a linear convolution with one filter per input channel, so its
# depth follows input_depth instead of a hard-coded 4.
final = layers.Conv2D(filters=input_depth, name='recon', **kwargs)(l10)
logits['recon'] = final

print(f"\nClassification output: {logits['clf'].shape}")
print(f"Reconstruction output: {logits['recon'].shape}")

In [None]:
# --- Create model
tf.keras.backend.clear_session()
model = Model(inputs=input_tensor, outputs=logits)
#model.summary()

# --- Compile model
# The obsolete `experimental_run_tf_function` argument is no longer passed.
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-3),
    loss={
        # The 'clf' output layer already applies a sigmoid, so its loss must
        # treat the predictions as probabilities, not logits.
        'clf': losses.BinaryCrossentropy(from_logits=False),
        # The 'recon' output layer has no activation, so it emits logits.
        'recon': losses.BinaryCrossentropy(from_logits=True)},
    metrics={
        # Explicit name so that the logged key does not depend on how many
        # AUC metrics were created earlier in the session.
        'clf': metrics.AUC(name='auc')},
)

# define callbacks.
# With several named outputs Keras prefixes the metric with the output name,
# so the validation AUC is logged as 'val_clf_auc' rather than 'val_auc'.
checkpoint_cb = callbacks.ModelCheckpoint(
    "model.h5", monitor='val_clf_auc',
    mode='max', save_best_only=True
)

In [None]:
# --- Define callbacks
def lrdecay(epoch):
    """Return the learning rate to use for ``epoch``.

    The base rate of 2e-4 is kept up to and including epoch 8 and is
    multiplied by 1e-4 after epoch 8, by 2e-5 after epoch 16 and by 1e-5
    after epoch 32.
    """
    base_lr = 2e-4
    # (epoch that must be exceeded, multiplier), latest stage first so that
    # the first match wins.
    stages = ((32, 1e-5), (16, 2e-5), (8, 1e-4))
    for boundary, factor in stages:
        if epoch > boundary:
            return base_lr * factor
    return base_lr

# Wrap the schedule in a Keras callback. This rebinds the name `lrdecay` from
# the schedule function to the callback object.
lrdecay = callbacks.LearningRateScheduler(lrdecay)


# Stop when the monitored 'loss' value has not decreased for 8 epochs.
estop = callbacks.EarlyStopping(monitor='loss', patience=8, mode='min')
# note: wanted to train to clearer convergence but kept running out of ram, hence the harsher estop penalty

# Writes TensorBoard logs to ./logs when passed to a fit call.
tensorboard_callback = callbacks.TensorBoard('./logs')

In [None]:
# fitting the model 
# gen_tr / gen_va come from final_generator, which loops over its input with
# `while True`, so the number of steps per epoch is given explicitly.

epochs = 50
model.fit(
    gen_tr, 
    steps_per_epoch = 250,
    epochs=epochs,
    validation_data=gen_va,
    validation_steps = 250,
    callbacks = [lrdecay, estop])#,
    #callbacks=[checkpoint_cb])

# 2D CNN - from Pratyush's notebook

In [None]:
from tensorflow.keras import Input, Model 
from tensorflow.keras.layers import Conv2D, GlobalAveragePooling2D, Dense
from tensorflow.keras.applications import *

# Model input with input_depth channels.
input_dim = (input_height, input_width, input_depth)
input_tensor = Input(input_dim, name='input2d')
# NOTE: despite the variable name, the backbone is a DenseNet121 without
# pretrained weights (weights=None) and without its classification top.
efnet = DenseNet121(weights=None, 
                       include_top = False, 
                       input_shape=(input_height, input_width, 3))
# 3x3 convolution that maps the input to the 3 channels the backbone's
# input_shape declares.
mapping3feat = Conv2D(3, (3, 3), padding='same', use_bias=False)(input_tensor)

# Backbone features -> global average pooling -> single sigmoid unit.
output = efnet(mapping3feat)
output = GlobalAveragePooling2D()(output)
output = Dense(1, activation='sigmoid')(output)

print(output.shape)

tf.keras.backend.clear_session()
model = Model(input_tensor, output)
model.summary()

In [None]:
from tensorflow import keras 
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow_addons.optimizers import RectifiedAdam, Lookahead

# compiling
model.compile(
    # The model ends in a sigmoid unit, hence from_logits=False.
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=Adam(learning_rate=1e-3),
    # The AUC metric is named explicitly: an auto-generated name can become
    # 'auc_1', 'auc_2', ... when this cell is re-run, and the checkpoint
    # below would then never find 'val_auc'.
    metrics=[tf.keras.metrics.AUC(name='auc'),
             tf.keras.metrics.BinaryAccuracy(name='bacc')],
)

# define callbacks.
# Keep only the weights with the best validation AUC seen so far.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    "model.h5", monitor='val_auc',
    mode='max', save_best_only=True
)


# fitting the model
epochs = 5
model.fit(
    train_generator,
    epochs=epochs,
    validation_data=valid_generator,
    callbacks=[checkpoint_cb], verbose=2
)