# Genetic Biomarker prediction with CRNN - train
Much of this work is inspired by notebooks shared by fellow Kagglers. Thank you all for teaching me so much!

### Credits
- https://keras.io/examples/vision/video_classification/
- https://www.kaggle.com/ayuraj/train-brain-tumor-as-video-classification-w-b
- https://www.kaggle.com/sergeybulanov/tf-simple-prediction-with-vgg16#2.-DataLoader
- https://www.kaggle.com/michaelfumery/brain-tumor-transfert-learning-flair-kfold

In [None]:
import os
import re 
import glob
import numpy as np
import pandas as pd
import cv2
import seaborn as sns
from pathlib import Path
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
import random as rn
import matplotlib.pyplot as plt
import imageio
import pydicom

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# Deep learning packages
import tensorflow as tf
from tensorflow.keras.callbacks import *
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow import keras

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras import backend as K, optimizers, regularizers

import wandb
from wandb.keras import WandbCallback

# Seed the Python, NumPy and TensorFlow random number generators with the same
# fixed value so that runs are more repeatable.
rn.seed(30)
np.random.seed(30)
tf.compat.v1.random.set_random_seed(30)
# Print the installed Weights & Biases version for reference.
print('W&B version: ', wandb.__version__)
from pydicom.pixel_data_handlers.util import apply_voi_lut

Minimum number of images per MRI type:
- T2w = 15
- FLAIR = 14
- T1w = 14
- T1wCE = 15

In [None]:
# Central experiment settings, read by the data pipeline and the model builders.
config = dict(
    # --- dataset locations ---
    images_source_path='../input/rsna-miccai-brain-tumor-radiogenomic-classification/train',
    test_images_source_path='../input/rsna-miccai-brain-tumor-radiogenomic-classification/test',
    csv_path='../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv',
    data_path='../input/rsna-miccai-brain-tumor-radiogenomic-classification',
    output_path='./crnn/',
    # --- cross-validation / reproducibility ---
    nfolds=3,
    global_seed=42,
    # --- input pipeline ---
    batch_size=4,
    frames_per_seq=24,
    img_size=224,
    # --- optimisation / model ---
    learning_rate=0.0001,
    rnn_cells=16,
    num_epochs=10,
    channels=3,
    # fraction of each image side kept by the centre crop
    scale=0.75,
)

# MRI modalities to train on; use a single-entry list such as ['T2w'] for a quick run.
mri_types = ['FLAIR', 'T1w', 'T1wCE', 'T2w']

## Data Preprocessing

In [None]:
class BrainTumor_GeneticSequence():
    """Prepares the train and the validation data pipeline for mri_type, for ex: mri_type = FLAIR.

    One instance handles a single MRI modality. It builds a per-patient
    dataframe (``prepare_dataframe``), turns each patient folder into a fixed
    length clip of preprocessed slices (``get_img_path_3d`` / ``read_mri``),
    and runs k-fold training (``train``) or inference (``predict``).
    """
    mri_type = "FLAIR"
    df_data = None
    # Loaded once at class-definition time and shared by every instance.
    df_train_labels = pd.read_csv(config['csv_path'])

    # Patient folders skipped by prepare_dataframe.
    # NOTE(review): presumably cases known to be unusable - confirm with the data owners.
    _EXCLUDED_CASES = ("00109", "00123", "00709")
    # Per-modality count columns kept in df_data.
    _MODALITIES = ('FLAIR', 'T1w', 'T1wCE', 'T2w')

    def __init__(self, mri_type):
        """Store the modality and create an empty, integer-typed dataframe."""
        self.mri_type = mri_type
        self.df_data = pd.DataFrame(columns=['BraTS21ID'] + mri_types).astype(int)

    def prepare_dataframe(self, mode='train'):
        """Scan the patient folders and (re)build ``self.df_data``.

        Args:
            mode: ``'test'`` reads ``config['test_images_source_path']``; any
                other value reads ``config['images_source_path']``.

        The frame is rebuilt from scratch on every call, so calling this
        method repeatedly does not accumulate duplicate patients. Patients
        without any ``*.dcm`` file for ``self.mri_type`` are left out.
        """
        folders_path = "test_images_source_path" if mode == 'test' else "images_source_path"
        source_dir = config[folders_path]

        rows = []
        for f in tqdm(os.listdir(source_dir + '/')):
            # Skip excluded cases and anything that is not a numeric patient folder.
            if f in self._EXCLUDED_CASES or not f.isdigit():
                continue
            BraTS21ID = int(f)
            pattern = f'{source_dir}/{format(BraTS21ID, "05d")}/{self.mri_type}/*.dcm'
            files_len = len(glob.glob(pattern))
            if files_len == 0:
                continue
            row = {'BraTS21ID': BraTS21ID}
            row.update({name: 0 for name in self._MODALITIES})
            row[self.mri_type] = files_len
            rows.append(row)

        columns = ['BraTS21ID'] + list(self._MODALITIES)
        if self.mri_type not in columns:
            columns.append(self.mri_type)
        # Build the frame in one go: DataFrame.append was removed in pandas 2.0
        # and copies the whole frame on every call.
        df = pd.DataFrame(rows, columns=columns).astype(int)
        df["folder_name"] = [format(x, '05d') for x in df["BraTS21ID"]]
        df["folder_path"] = [os.path.join(source_dir, x) for x in df["folder_name"]]
        # Left merge: patients absent from the label file (e.g. test data) get NaN labels.
        self.df_data = pd.merge(df, self.df_train_labels, how='left', on='BraTS21ID')

    def rotate_image(self, image, angle):
        """Rotate ``image`` by ``angle`` degrees around its centre, keeping its size."""
        image_center = tuple(np.array(image.shape[1::-1]) / 2)
        rot_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)
        result = cv2.warpAffine(image, rot_mat, image.shape[1::-1], flags=cv2.INTER_LINEAR)
        return result

    def normalize(self, image):
        """Min-max scale ``image`` to the range [0, 1] as float32."""
        result = cv2.normalize(image, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
        return result

    def read_mri(self, path, voi_lut = True, fix_monochrome = True, augment = True):
        """Read one DICOM slice and return it as a preprocessed float32 image.

        Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

        Args:
            path: path of the ``.dcm`` file.
            voi_lut: apply the VOI LUT stored in the DICOM file.
            fix_monochrome: invert the intensities of MONOCHROME1 images.
            augment: apply a random rotation of 0-19 degrees. Defaults to
                True, which is what this method always did; pass False for
                validation and inference so the output is deterministic.

        Returns:
            Array of shape ``(img_size, img_size, channels)`` with values in [0, 1].
        """
        # dcmread is the supported name; read_file is a deprecated alias.
        dicom = pydicom.dcmread(path)
        pixels = dicom.pixel_array
        data = apply_voi_lut(pixels, dicom) if voi_lut else pixels
        if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
            data = np.amax(data) - data
        data = data - np.min(data)
        peak = np.max(data)
        # A blank slice has peak == 0; dividing would produce NaNs.
        if peak > 0:
            data = data / peak
        data = (data * 255).astype(np.uint8)
        data = self.normalize(data)
        if augment:
            data = self.rotate_image(data, np.random.randint(0, 20))
        data = self.crop_center_square(data)
        data = cv2.resize(data, (config['img_size'], config['img_size']))
        # Replicate the single grey channel so the image matches the model input.
        data = np.repeat(data[..., np.newaxis], config['channels'], -1)
        return data

    def crop_center_square(self, frame, scale=config['scale']):
        """Keep the central ``scale`` fraction of ``frame`` along both axes."""
        y, x = frame.shape[0:2]
        center_x, center_y = x / 2, y / 2
        width_scaled, height_scaled = x * scale, y * scale
        left_x, right_x = center_x - width_scaled / 2, center_x + width_scaled / 2
        top_y, bottom_y = center_y - height_scaled / 2, center_y + height_scaled / 2
        return frame[int(top_y):int(bottom_y), int(left_x):int(right_x)]

    def get_img_path_3d(self, dir_path, augment=True):
        """Load the central slices of one patient as a fixed-length clip.

        Args:
            dir_path: patient folder, as ``bytes`` (what ``tf.numpy_function``
                passes) or ``str``.
            augment: forwarded to ``read_mri``.

        Returns:
            float32 ndarray of shape ``(frames_per_seq, img_size, img_size,
            channels)``; missing trailing frames are zero-filled.
        """
        if isinstance(dir_path, bytes):
            dir_path = dir_path.decode('utf8')
        modality_path = os.path.join(dir_path, self.mri_type)
        # Natural sort so that Image-10 comes after Image-9.
        files = sorted(
            glob.glob(f"{modality_path}/*.dcm"),
            key=lambda var: [int(x) if x.isdigit() else x for x in re.findall(r'[^0-9]|[0-9]+', var)],
        )
        mid_num = len(files) // 2
        num_3d2 = config['frames_per_seq'] // 2
        start_idx = max(0, mid_num - num_3d2)
        end_idx = min(len(files), mid_num + num_3d2)

        frames = [self.read_mri(p, augment=bool(augment)) for p in files[start_idx:end_idx]]

        # Pre-allocate the zero-padded clip so the return type and shape are
        # always the same, whether or not padding is needed.
        volume = np.zeros(
            (config['frames_per_seq'], config['img_size'], config['img_size'], config['channels']),
            dtype=np.float32,
        )
        if frames:
            volume[:len(frames)] = np.stack(frames)
        return volume

    def load_frame(self, df_dict, augment=True):
        """Map one dataframe row (as a dict of tensors) to ``(clip, label)``.

        Args:
            df_dict: row with at least ``folder_path`` and ``MGMT_value``.
            augment: forwarded to ``get_img_path_3d``.
        """
        dirname = df_dict['folder_path']
        paths = tf.numpy_function(self.get_img_path_3d, [dirname, augment], tf.float32)
        # numpy_function outputs have no static shape; restore it for Keras.
        paths.set_shape(
            (config['frames_per_seq'], config['img_size'], config['img_size'], config['channels'])
        )
        label = tf.cast(df_dict['MGMT_value'], tf.float32)
        return paths, label

    def _make_loader(self, df, training):
        """Build a batched, prefetched dataset; shuffle and augment only when training."""
        AUTOTUNE = tf.data.AUTOTUNE
        loader = tf.data.Dataset.from_tensor_slices(dict(df))
        if training:
            loader = loader.shuffle(1024)
        return (
            loader
            .map(lambda row: self.load_frame(row, augment=training), num_parallel_calls=AUTOTUNE)
            .batch(config['batch_size'])
            .prefetch(AUTOTUNE)
        )

    def _log_sample_gifs(self, loader):
        """Write the first batch of clips as GIFs and log them to W&B in one call."""
        frames, labels = next(iter(loader))
        examples = []
        for i, (clip, label) in enumerate(zip(frames, labels)):
            gif_path = f'gifs/out_{i}.gif'
            # Clips are floats in [0, 1]; rescale before the uint8 cast,
            # otherwise every pixel becomes 0 or 1 and the GIF is black.
            imageio.mimsave(gif_path, (clip.numpy() * 255).astype('uint8'))
            examples.append(wandb.Image(gif_path, caption=f'{label.numpy()}'))
        wandb.log({'examples': examples})

    def train(self, epochs=config['num_epochs'], output_path = config['output_path'], cnn_model_arch='custom', wandblogging = False):
        """Run stratified k-fold training for ``self.mri_type``.

        Args:
            epochs: maximum number of epochs per fold.
            output_path: currently unused; checkpoints are written to the
                working directory.
            cnn_model_arch: architecture name passed to ``create_custom_crnn_model``.
            wandblogging: log a batch of sample clips to Weights & Biases.

        Returns:
            dict mapping the fold number (starting at 1) to the Keras
            ``History`` object of that fold.
        """
        tf.keras.backend.clear_session()
        self.prepare_dataframe()
        # Fixed random_state makes the fold assignment reproducible.
        skf = StratifiedKFold(n_splits=config['nfolds'], shuffle=True, random_state=config['global_seed'])
        history = {}
        run = None

        if wandblogging:
            wandb.login()
            run = wandb.init(project='brain-tumor-video', job_type='dataloader-viz')
            os.makedirs('gifs/', exist_ok=True)

        try:
            splits = skf.split(self.df_data, self.df_data.MGMT_value.values)
            for fold, (train_index, valid_index) in enumerate(splits, start=1):
                # split() yields positions, so index with iloc.
                train_df = self.df_data.iloc[train_index]
                valid_df = self.df_data.iloc[valid_index]
                print(f'Size of train_df: {len(train_df)}; valid_df: {len(valid_df)}')

                trainloader = self._make_loader(train_df, training=True)
                # No shuffling or random rotation for the validation split.
                validloader = self._make_loader(valid_df, training=False)

                if wandblogging:
                    self._log_sample_gifs(trainloader)

                model = create_custom_crnn_model(cnn_model_arch)
                optimizer = tf.keras.optimizers.Adam(learning_rate=config['learning_rate'])
                loss_fn = tf.keras.losses.BinaryCrossentropy()
                early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=0, mode='max', restore_best_weights=True)
                # NOTE(review): the file name does not contain self.mri_type, so
                # training several MRI types with the same architecture overwrites
                # earlier checkpoints. Kept unchanged because other cells load
                # this exact file name.
                model_checkpoint = tf.keras.callbacks.ModelCheckpoint(f'{cnn_model_arch}_{fold}.h5', save_best_only=True, save_weights_only=False)
                LR = ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, patience=5, min_lr=0.000001, verbose=1, mode='max')
                model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
                print(f'Training for fold...{fold}')
                history[fold] = model.fit(trainloader, epochs=epochs, validation_data=validloader, callbacks=[LR, early_stopping, model_checkpoint])
        finally:
            # Close the W&B run even when training raises.
            if run is not None:
                run.finish()
        return history

    def predict(self, model):
        """Return ``model`` predictions for every row of ``self.df_data``.

        ``prepare_dataframe`` must have been called first. Slices are loaded
        without random rotation, so repeated calls see the same inputs.
        """
        testloader = self._make_loader(self.df_data, training=False)
        proba = model.predict(testloader, verbose=1)
        return proba

    # Backward-compatible alias for the original (misspelled) method name.
    predit = predict

## Test Transformations
Here we compare an original image with its transformed (preprocessed) version.

In [None]:
# Show a raw DICOM slice next to the output of the preprocessing pipeline.
sample_path = f'{config["images_source_path"]}/00068/FLAIR/Image-132.dcm'

dp = BrainTumor_GeneticSequence('FLAIR')
# dcmread is the supported reader; pydicom.read_file is a deprecated alias.
sample_img = pydicom.dcmread(sample_path).pixel_array
processed_img = dp.read_mri(sample_path)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

ax1.imshow(sample_img, cmap="gray")
ax1.set_title(f"Original image shape = {sample_img.shape}")

# All channels of the processed image are identical, so show the first one.
ax2.imshow(processed_img[:, :, 0], cmap="gray")
ax2.set_title(f"Preproc image shape = {processed_img.shape}")
plt.show()

In [None]:
from tensorflow.keras.applications import *

def FeatureExtractor(model_arch='custom', trainable_tail=None):
    """Build the per-frame CNN used inside the CRNN.

    Args:
        model_arch: ``'custom'`` for the small CNN defined here, or one of
            ``'resnet50'``, ``'efb7'``, ``'xception'`` for an ImageNet
            pre-trained backbone without its classification head.
        trainable_tail: only used for pre-trained backbones. ``None``
            (default) leaves every layer trainable, which is what the previous
            code did in effect: it set ``trainable = True`` on layers that are
            already trainable by default. An integer ``n`` freezes all layers
            except the last ``n``; ``0`` freezes the whole backbone.

    Returns:
        A Keras model taking ``(img_size, img_size, channels)`` images.

    Raises:
        ValueError: if ``model_arch`` is not a supported name or
            ``trainable_tail`` is negative.
    """
    shape = (config['img_size'], config['img_size'], config['channels'])

    if model_arch == 'custom':
        return Sequential([
            Conv2D(64, (3, 3), padding='same', activation='relu', input_shape=shape),
            MaxPooling2D(pool_size=(2, 2)),
            BatchNormalization(),

            Conv2D(64, (3, 3), padding='same', activation='relu'),
            MaxPooling2D(pool_size=(2, 2)),
            BatchNormalization(),

            Conv2D(128, (3, 3), padding='same', activation='relu'),
            MaxPooling2D(pool_size=(2, 2)),
            BatchNormalization(),

            Conv2D(128, (3, 3), padding='same', activation='relu'),
            MaxPooling2D(pool_size=(2, 2)),

            Flatten(),
            Dense(512, activation='relu'),
            Dense(256),
        ])

    backbones = {
        'resnet50': ResNet50,
        'efb7': EfficientNetB7,
        'xception': Xception,
    }
    if model_arch not in backbones:
        # Previously an unknown name silently produced an empty Sequential.
        raise ValueError(
            f"Unknown model_arch {model_arch!r}; expected 'custom' or one of {sorted(backbones)}"
        )

    model = backbones[model_arch](include_top=False, weights='imagenet', input_shape=shape)

    if trainable_tail is not None:
        if trainable_tail < 0:
            raise ValueError("trainable_tail must be None or a non-negative integer")
        # layers[:-0] would be empty, so handle "freeze everything" explicitly.
        frozen = model.layers if trainable_tail == 0 else model.layers[:-trainable_tail]
        for layer in frozen:
            layer.trainable = False
    return model

def create_custom_crnn_model(cnn_model_arch='custom'):
    """Assemble the CNN + GRU classifier for one clip of MRI frames.

    The feature extractor selected by ``cnn_model_arch`` is applied to every
    frame, the per-frame features are flattened and fed through two stacked
    GRU layers, and a small dense head outputs a single sigmoid probability.
    """
    clip_shape = (
        config['frames_per_seq'],
        config['img_size'],
        config['img_size'],
        config['channels'],
    )
    n_units = config['rnn_cells']
    per_frame_cnn = FeatureExtractor(cnn_model_arch)

    stack = [
        TimeDistributed(per_frame_cnn, input_shape=clip_shape),
        TimeDistributed(Flatten()),
        # GRU is used here because it has fewer parameters than LSTM.
        GRU(n_units, return_sequences=True),
        GRU(n_units // 2),
        Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        Dropout(0.4),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(stack)

def plot(history):
    """Draw train/validation curves of accuracy, loss and AUC for every fold.

    ``history`` maps a fold identifier to a Keras ``History`` object; each
    metric gets its own subplot, with validation curves drawn dotted.
    """
    metric_names = ("accuracy", "loss", "auc")
    _, axes = plt.subplots(1, 3, figsize=(20, 7))
    axes = axes.ravel()

    for fold, record in history.items():
        curves = record.history
        for axis, metric in zip(axes, metric_names):
            axis.plot(curves[metric], label="train " + str(fold))
            axis.plot(curves["val_" + metric], linestyle="dotted", label="val " + str(fold))
            axis.set_title("Model {}".format(metric))
            axis.set_xlabel("epochs")
            axis.set_ylabel(metric)
            axis.legend()
    
def train_each_mri_type(mri_types, cnn_model_arch, epochs=config['num_epochs']):
    """Train one k-fold model set per MRI type and plot the learning curves.

    Args:
        mri_types: iterable of modality names, e.g. ``['FLAIR', 'T1w']``.
        cnn_model_arch: feature-extractor name forwarded to
            ``BrainTumor_GeneticSequence.train``.
        epochs: maximum epochs per fold. Defaults to ``config['num_epochs']``
            instead of a second hard-coded value, so changing the config
            actually changes the training length.
    """
    banner = "*" * 100
    for m_type in mri_types:
        dp = BrainTumor_GeneticSequence(m_type)
        print(banner)
        print(f"Training for {m_type} with {cnn_model_arch} as feature extractor, configuration: {config}")
        print(banner)
        history = dp.train(epochs=epochs, cnn_model_arch=cnn_model_arch)
        plot(history)

## Training

In [None]:
%%time
# Train one set of fold models per entry of mri_types, using the small
# 'custom' CNN as the per-frame feature extractor.
model_arch = 'custom'
train_each_mri_type(mri_types, model_arch)

## Prediction

In [None]:
# Load the fold-1 checkpoint written during training.
model = tf.keras.models.load_model(f'./{model_arch}_1.h5')
# The checkpoint file name does not include the MRI type, so after training
# every type in order the file on disk belongs to the LAST entry of mri_types.
# Predict on that same modality rather than on mri_types[0].
dp = BrainTumor_GeneticSequence(mri_types[-1])
dp.prepare_dataframe(mode='test')
dp.df_data['MGMT_pred'] = 0
proba = dp.predit(model)
sample_submission_path = os.path.join(config['data_path'], 'sample_submission.csv')
sample_df = pd.read_csv(sample_submission_path)
dp.df_data['MGMT_pred'] += proba.squeeze()
dp.df_data[['BraTS21ID','MGMT_pred']].head()