# RSNA-MICCAI Brain Tumor Radiogenomic Classification - Exploratory Data Analysis and Modeling


### Predict the status of a genetic biomarker important for brain cancer treatment

Quick Exploratory Data Analysis for [RSNA-MICCAI Brain Tumor Radiogenomic Classification](https://www.kaggle.com/c/rsna-miccai-brain-tumor-radiogenomic-classification) challenge    




![](https://storage.googleapis.com/kaggle-competitions/kaggle/29653/logos/header.png)

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='color:white; background:darkviolet; border:0' role="tab" aria-controls="home"><center>Quick Navigation</center></h3>

* [Overview](#1)
* [Data Visualization](#2)
    

* [Competition Metric](#10)
* [Sample Submission](#20)
    

* [Modeling](#100)

<a id="1"></a>
<h2 style='background:darkviolet; border:0; color:white'><center>Overview</center></h2>

This notebook builds on ideas from the following works:
- https://www.kaggle.com/avloss/eda-with-animation - animation technique
- https://www.kaggle.com/victorfernandezalbor/brats-20-win-nnunet-segment-with-brats-21-rsna - nnUnet

Research papers to try out: 
<https://link.springer.com/article/10.1007/s40998-021-00426-9>

In [None]:
!git clone https://github.com/MIC-DKFZ/nnUNet.git
!git clone https://github.com/NVIDIA/apex
!pip install -e ./nnUNet
!pip install --upgrade git+https://github.com/nanohanno/hiddenlayer.git@bugfix/get_trace_graph#egg=hiddenlayer

In [None]:
!pip install git+https://github.com/shijianjian/EfficientNet-PyTorch-3D
!pip install efficientnet_pytorch

In [None]:
import sys
sys.path.append('../input/efficientnetpyttorch3d/EfficientNet-PyTorch-3D')
from efficientnet_pytorch_3d import EfficientNet3D
package_path = "../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master/"
sys.path.append(package_path)

In [None]:
import os
import glob
import json
import glob
import random
import collections
import re
import numpy as np
import pandas as pd
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import time
import torch
from torch import nn
from torch.utils import data as torch_data
from sklearn import model_selection as sk_model_selection
from torch.nn import functional as torch_functional
import efficientnet_pytorch
from sklearn.model_selection import StratifiedKFold
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot
from sklearn.multioutput import MultiOutputClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score
from sklearn import metrics
import optuna
# from boostaroota import BoostARoota
from sklearn.metrics import log_loss
from optuna.samplers import TPESampler
import functools
from functools import partial
import xgboost as xgb
from sklearn.metrics import confusion_matrix, average_precision_score, recall_score, accuracy_score, f1_score
import pylab as pl
from tensorflow import keras 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import torch.nn as nn
import tensorflow
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence
from random import shuffle
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import AUC
import math
import joblib
DEVICE = "GPU"

In [None]:
# Download pretrained Task001_BrainTumour, and setting up the nnUnet environment.
os.mkdir('./nnUNet_raw_data_base')
os.mkdir('./nnUNet_raw_data_base/nnUNet_raw_data')
os.mkdir('./RESULTS_FOLDER')
os.environ['nnUNet_raw_data_base'] = './nnUNet_raw_data_base/nnUNet_raw_data'
os.environ['RESULTS_FOLDER'] = './RESULTS_FOLDER'
os.environ['nnUNet_preprocessed'] = './nnUNet_preprocessed'
!nnUNet_download_pretrained_model Task001_BrainTumour

In [None]:
# Choose where the competition data and the EfficientNet-PyTorch-3D checkout live:
# the Kaggle input mount when that folder exists, otherwise hard-coded local paths.
if os.path.exists("../input/rsna-miccai-brain-tumor-radiogenomic-classification"):
    data_directory = '../input/rsna-miccai-brain-tumor-radiogenomic-classification'
    pytorch3dpath = "../input/efficientnetpyttorch3d/EfficientNet-PyTorch-3D"
else:
    data_directory = '/media/roland/data/kaggle/rsna-miccai-brain-tumor-radiogenomic-classification'
    pytorch3dpath = "EfficientNet-PyTorch-3D"

**train/** - folder containing the training files, with each top-level folder representing a subject  
**train_labels.csv** - file containing the target MGMT_value for each subject in the training data (e.g. the presence of MGMT promoter methylation)   
**test/** - the test files, which use the same structure as train/; your task is to predict the MGMT_value for each subject in the test data. NOTE: the total size of the rerun test set (Public and Private) is ~5x the size of the Public test set   
**sample_submission.csv** - a sample submission file in the correct format

<a id="2"></a>
<h2 style='background:darkviolet; border:0; color:white'><center>Data Visualization</center></h2>

In [None]:
train_df = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv")
train_df

In [None]:
plt.figure(figsize=(5, 5))
sns.countplot(data=train_df, x="MGMT_value");

In [None]:
def crop_image_pixel_array(arr, margin = 5, crop = True):
    """Crop a 2-D image to a square window around its non-background pixels.

    The image is min-max normalised on a working copy; every pixel that is
    strictly greater than the minimum counts as foreground. The bounding box
    of the foreground is widened by ``margin`` pixels on each side (clipped to
    the image) and then extended along its shorter side until it is square.

    Args:
        arr: 2-D numpy array with the pixel data. It is never modified.
        margin: number of pixels added around the foreground bounding box.
        crop: when falsy the image is returned uncropped.

    Returns:
        ``(arr_crop, [first_row, last_row, first_column, last_column])``.
        ``arr_crop`` is ``arr[first_row:last_row, first_column:last_column]``.
        When no crop is applied (``crop`` is falsy, at most 5 % of the pixels
        are foreground, or the padded box is not smaller than both image
        sides) ``arr_crop`` is a copy of ``arr`` and all four indices are NaN.
    """
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    not_cropped = [np.nan, np.nan, np.nan, np.nan]

    # Min-max normalise so that background is exactly 0.
    arr_formatted = arr - np.min(arr)
    peak = np.max(arr_formatted)
    if peak != 0:
        arr_formatted = arr_formatted / peak

    height, width = arr_formatted.shape[0], arr_formatted.shape[1]
    percent_pixels = ((arr_formatted > 0).sum() / (height * width)) * 100

    # Nearly empty slices are returned untouched.
    if not (crop and percent_pixels > 5):
        return arr.copy(), not_cropped

    # Rows / columns that contain at least one foreground pixel.
    occupied_rows = np.flatnonzero(arr_formatted.sum(axis=1) > 0)
    occupied_columns = np.flatnonzero(arr_formatted.sum(axis=0) > 0)

    first_row_index = max(int(occupied_rows[0]) - margin, 0)
    last_row_index = min(int(occupied_rows[-1]) + 1 + margin, height)
    first_column_index = max(int(occupied_columns[0]) - margin, 0)
    last_column_index = min(int(occupied_columns[-1]) + 1 + margin, width)

    num_rows = last_row_index - first_row_index
    num_columns = last_column_index - first_column_index

    # Only crop when the padded box is strictly smaller than both image sides;
    # this also guarantees the squaring step below stays inside the image.
    if max(num_rows, num_columns) >= min(height, width):
        return arr.copy(), not_cropped

    # Grow the shorter side so the window becomes square. Extend towards the
    # bottom/right first; if that would leave the image, anchor the window to
    # the border and move the opposite edge instead.
    if num_columns > num_rows:
        if height < last_row_index + (num_columns - num_rows):
            last_row_index = height
            first_row_index = last_row_index - num_columns
        else:
            last_row_index = last_row_index + (num_columns - num_rows)
    elif num_columns < num_rows:
        if width < last_column_index + (num_rows - num_columns):
            last_column_index = width
            first_column_index = last_column_index - num_rows
        else:
            last_column_index = last_column_index + (num_rows - num_columns)

    arr_crop = arr[first_row_index:last_row_index, first_column_index:last_column_index]
    return arr_crop, [first_row_index, last_row_index, first_column_index, last_column_index]

In [None]:
def get_image_plane(data):
    """Name the acquisition plane from ``data.ImageOrientationPatient``.

    The six orientation values are rounded and the first two components of
    each triple are matched against three known patterns; anything else
    yields ``'Unknown'``.
    """
    x1, y1, _, x2, y2, _ = (round(component) for component in data.ImageOrientationPatient)

    plane_by_pattern = {
        (1, 0, 0, 0): 'Coronal',
        (1, 0, 0, 1): 'Axial',
        (0, 1, 0, 0): 'Sagittal',
    }
    return plane_by_pattern.get((x1, y1, x2, y2), 'Unknown')

In [None]:
def get_voxel(dcm_path):
    """Read one DICOM file and return ``(pixels, plane)``.

    The pixel array is wrapped in a one-element stack, re-oriented according
    to the plane reported by ``get_image_plane`` and then squeezed back along
    its shortest axis. Because only a single file is read, the first and last
    positions are the same object, so the slice-order reversals never fire.
    """
    ds = pydicom.dcmread(str(dcm_path))
    slices = [ds.pixel_array]
    positions = [ds.ImagePositionPatient]

    plane = get_image_plane(ds)
    voxel = np.stack(slices)

    # reorder planes if needed and rotate voxel
    first_pos, last_pos = positions[0], positions[-1]
    if plane == "Coronal":
        if first_pos[1] < last_pos[1]:
            voxel = voxel[::-1]
        voxel = voxel.transpose((1, 0, 2))
    elif plane == "Sagittal":
        if first_pos[0] < last_pos[0]:
            voxel = voxel[::-1]
        voxel = voxel.transpose((1, 2, 0))
        voxel = np.rot90(voxel, 2, axes=(1, 2))
    elif plane == "Axial":
        if first_pos[2] > last_pos[2]:
            voxel = voxel[::-1]
        voxel = np.rot90(voxel, 2)

    # Drop the shortest of the first three axes by taking its first entry.
    min_index = np.argmin(voxel.shape)
    if min_index < 3:
        selector = [slice(None)] * 3
        selector[min_index] = 0
        voxel = voxel[tuple(selector)]

    return voxel, plane

In [None]:
def load_dicom_for_saving_into_png(path, voi_lut = False, fix_monochrome = True, rotate = 0):
    """Load one DICOM slice as a uint8 image scaled to 0-255.

    Args:
        path: path of the DICOM file.
        voi_lut: when true, apply the file's VOI LUT via ``apply_voi_lut``.
        fix_monochrome: when true, invert MONOCHROME1 images.
        rotate: 0 for no rotation, otherwise an index into
            (90 clockwise, 90 counter-clockwise, 180) as 1, 2, 3.

    Returns:
        The pixel data from ``get_voxel`` after the optional steps above,
        min-max normalised and converted to ``np.uint8``.
    """
    # dcmread replaces the deprecated pydicom.read_file alias (removed in pydicom 3)
    # and matches what get_voxel already uses.
    dicom = pydicom.dcmread(path)
    data, _plane = get_voxel(path)

    if voi_lut:
        data = apply_voi_lut(data, dicom)

    if rotate > 0:
        # Index 0 is a placeholder so that rotate can be used as the index directly.
        rot_choices = [0, cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE, cv2.ROTATE_180]
        data = cv2.rotate(data, rot_choices[rotate])

    # MONOCHROME1 indicates that the greyscale ranges from bright to dark with ascending pixel values, whereas MONOCHROME2 ranges from dark to bright with ascending pixel values
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Min-max normalise; a constant image stays all zeros.
    data = data - np.min(data)
    peak = np.max(data)
    if peak != 0:
        data = data / peak

    return (data * 255).astype(np.uint8)

In [None]:
from matplotlib import animation, rc
rc('animation', html='jshtml')

def create_animation(ims):
    """Build a matplotlib animation that steps through ``ims`` in grayscale."""
    fig = plt.figure()
    plt.axis('off')
    artist = plt.imshow(ims[0], cmap="gray")

    def animate_func(frame_index):
        # Swap the displayed frame in place instead of redrawing the axes.
        artist.set_array(ims[frame_index])
        return [artist]

    frame_delay_ms = 1000//24
    return animation.FuncAnimation(fig, animate_func, frames = len(ims), interval = frame_delay_ms)

def get_gray(org_img):
    """Return a single-channel copy of ``org_img`` (cv2 RGB-to-gray conversion)."""
    return cv2.cvtColor(org_img.copy(), cv2.COLOR_RGB2GRAY)

def get_RGB(gray_img):
    """Return a three-channel copy of ``gray_img`` (cv2 gray-to-RGB conversion)."""
    return cv2.cvtColor(gray_img.copy(), cv2.COLOR_GRAY2RGB)

def get_threshold(org_img,blur=False,erode=False,dilate=False):
    """Return a binary mask of ``org_img`` (pixels above 5 become 255).

    Args:
        org_img: three-channel image; it is converted to grayscale first.
        blur: apply a 5x5 Gaussian blur before thresholding.
        erode: erode the mask (2 iterations) after thresholding.
        dilate: dilate the mask (2 iterations) after the optional erosion.
    """
    # Start from the grayscale image so thresholding also works when blur is
    # off; previously `img` was only bound inside the blur branch.
    img = get_gray(org_img.copy())
    if blur:
        img = cv2.GaussianBlur(img, (5, 5), 0)
    img = cv2.threshold(img, 5, 255, cv2.THRESH_BINARY)[1]
    if erode:
        img = cv2.erode(img, None, iterations=2)
    if dilate:
        img = cv2.dilate(img, None, iterations=2)
    return img

def get_contours(th_img):
    """Return the contours cv2 finds in a copy of the binary image ``th_img``."""
    # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
    # (image, contours, hierarchy) on 3.x; the contours are always second to last.
    return cv2.findContours(th_img.copy(),cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)[-2]
   
def edge_smoothing(org_img,cnts):
    """Black out a band around the largest contour of ``org_img``.

    The largest contour (by area) is drawn on a black canvas, thickened with
    ten dilation iterations, inverted and AND-ed with the original image.

    NOTE(review): with no contours the *grayscale* image is returned, while
    the normal path returns a three-channel image - confirm callers cope.
    """
    gray_img = get_gray(org_img.copy())
    if len(cnts) == 0:
        return gray_img

    largest = max(cnts, key=cv2.contourArea)
    canvas = np.zeros_like(gray_img)
    band = cv2.drawContours(canvas.copy(), largest, -1, (255, 255, 255), 2)
    band = cv2.dilate(band.copy(), None, iterations=10)
    # Invert so the band is 0 and everything else 255, then use it as a mask.
    keep_mask = get_RGB(cv2.bitwise_not(band.copy()))
    return cv2.bitwise_and(keep_mask.copy(), org_img.copy())

def get_iou(bb1, bb2):
    """Intersection-over-union of two boxes given as ``[x1, y1, x2, y2]``.

    Both boxes must have strictly positive width and height (checked with
    assertions). Returns 0.0 for disjoint boxes, otherwise a float in [0, 1].
    """
    a_left, a_top, a_right, a_bottom = bb1[0], bb1[1], bb1[2], bb1[3]
    b_left, b_top, b_right, b_bottom = bb2[0], bb2[1], bb2[2], bb2[3]
    assert a_left < a_right
    assert a_top < a_bottom
    assert b_left < b_right
    assert b_top < b_bottom

    # Corners of the overlap rectangle (if there is one).
    x_left, x_right = max(a_left, b_left), min(a_right, b_right)
    y_top, y_bottom = max(a_top, b_top), min(a_bottom, b_bottom)
    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    area_a = (a_right - a_left) * (a_bottom - a_top)
    area_b = (b_right - b_left) * (b_bottom - b_top)
    iou = intersection_area / float(area_a + area_b - intersection_area)
    assert iou >= 0.0
    assert iou <= 1.0
    return iou

def check_overlapping(prev_cnt,curr_cnt):
    """Return the IoU of the bounding rectangles of two contours."""
    def _corners(cnt):
        # cv2 reports (x, y, width, height); get_iou wants two corner points.
        left, top, width, height = cv2.boundingRect(cnt)
        return [left, top, left + width, top + height]

    previous_box = _corners(prev_cnt)
    current_box = _corners(curr_cnt)
    return get_iou(previous_box, current_box)
    

def load_patient_images(path,threshold=False,roi_threshold=False,matchpattern=False,template_path=None):
    """Load every slice of one series and optionally annotate each slice.

    Args:
        path: folder with the DICOM files of one series; files are ordered by
            the integer after the last "-" in their name.
        threshold: outline the largest bright region of each slice.
        roi_threshold: return each slice side by side with a thresholded copy.
        matchpattern: run template matching against the images listed in
            ``template_path``.
        template_path: iterable of image file paths used as templates.

    Returns:
        A list of images. Slices that are completely black, or whose
        thresholded bounding box covers less than 40 % of the middle slice's
        box, are dropped. An empty list is returned when nothing is usable.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been run first.
    import imageio

    t_paths = sorted(
        glob.glob(os.path.join(path, "*")),
        key=lambda x: int(x[:-4].split("-")[-1]),
    )
    templates = []
    if template_path is not None:
        templates = [cv2.imread(t_path) for t_path in template_path]

    images = []
    for filename in t_paths:
        # Round-trip through a PNG so cv2 hands back a 3-channel uint8 image.
        imageio.imsave('temp_png.png',load_dicom_for_saving_into_png(filename))
        data = cv2.imread('temp_png.png')
        if data.max() == 0:
            continue
        images.append(data)

    # Nothing to annotate: avoid indexing into an empty list below.
    if not images:
        return []

    # The middle slice provides the reference bounding-box area.
    mid_idx = len(images)//2
    th_middle_image = get_threshold(images[mid_idx].copy(),blur=True,erode=True,dilate=True)
    x1,y1,w1,h1 = cv2.boundingRect(th_middle_image)
    print('-->',x1,y1,w1,h1)
    max_area = w1*h1

    images = np.array(images)
    new_images = []
    # Font face index for the slice label, chosen from the height of the first
    # (up to) two slices; the slice keeps this working for a single image.
    heights = [img.shape[0] for img in images[:2]]
    if any(h <= 256 for h in heights):
        t_size = 1
    elif any(h >= 500 for h in heights):
        t_size = 3
    else:
        t_size = 2
    print('text size: ',t_size)

    for i,data in enumerate(images):
        th_data = get_threshold(data.copy(),blur=True,erode=True,dilate=True)
        x,y,w,h = cv2.boundingRect(th_data)
        # Skip slices much smaller than the middle slice. When the reference
        # area is 0 the ratio is undefined, so no slice is filtered out.
        if max_area and (w*h)/max_area < 0.4:
            continue

        org_cnts = get_contours(th_data.copy())

        if matchpattern:
            # NOTE(review): the match location is computed but not used by
            # anything downstream yet.
            for template in templates:
                result = cv2.matchTemplate(data.copy(), template, cv2.TM_CCOEFF_NORMED)
                (minVal, maxVal, minLoc, maxLoc) = cv2.minMaxLoc(result)
                (startX, startY) = maxLoc
                endX = startX + template.shape[1]
                endY = startY + template.shape[0]

        if roi_threshold:
            image = data.copy()
            g_image = cv2.cvtColor(image.copy(),cv2.COLOR_BGR2GRAY)
            # Threshold one third of the way from the mean to the maximum.
            thresh = g_image.mean()+((g_image.max()-g_image.mean())//3)
            th_data = cv2.threshold(g_image,thresh,g_image.max(),cv2.THRESH_BINARY)[1]
            g_image = cv2.putText(g_image,f"{i}",(20,25),3,1,(255,255,0),2)
            data = np.hstack([g_image,th_data])

        if threshold:
            image = data.copy()

            g_image = cv2.cvtColor(image.copy(),cv2.COLOR_BGR2GRAY)
            # Threshold halfway between the mean and the maximum of the non-zero pixels.
            mean_values = g_image[np.nonzero(g_image)]
            thresh = mean_values.mean()+((mean_values.max()-mean_values.mean())//2)
            smooth_image = edge_smoothing(image,org_cnts)
            # The label is drawn just above the threshold value.
            smooth_image = cv2.putText(smooth_image,f"{i}",(20,25),t_size,1,(thresh+1,thresh+1,thresh+1),2)
            gray_smooth_image = get_gray(smooth_image)

            th_image = cv2.threshold(gray_smooth_image.copy(),thresh,g_image.max(),cv2.THRESH_BINARY)[1]
            cnts = cv2.findContours(th_image.copy(),cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE)[-2]
            if len(cnts) > 0:
                c = max(cnts, key=cv2.contourArea)
                data = cv2.drawContours(image.copy(),c,-1, (0, 255, 255), 2)
            else:
                # Nothing above the threshold: keep the slice without an outline.
                data = image

        new_images.append(data)

    return new_images

def get_images(i,mri_type,threshold=False,roi_threshold=False,matchpattern=False,template_path=None):
    """Load the annotated slices of one training patient.

    ``i`` is looked up in the module-level ``train_df``; the series folder is
    built from ``data_directory``, the zero-padded patient id and ``mri_type``.
    Prints the folder, its file count, the MGMT label and the resulting array
    shape, then returns the list produced by ``load_patient_images``.
    """
    patient_id = str(train_df['BraTS21ID'][i]).zfill(5)
    path = f'{data_directory}/train/{patient_id}/{mri_type}'

    print('Path: ',path)
    print('# Images: ',len(os.listdir(path)))
    print('MGMT: ',train_df['MGMT_value'][i])

    images = load_patient_images(
        path,
        threshold,
        roi_threshold,
        matchpattern,
        template_path,
    )
    print(np.array(images).shape)
    return images

In [None]:
import imageio 
i=80
mri_type='FLAIR'
images=get_images(i,mri_type,threshold=True)
create_animation(images)

In [None]:
def load_dicom(path, voi_lut = False, fix_monochrome = True, rotate = 0):
    """Load the pixel data of one DICOM file without intensity normalisation.

    Args:
        path: path of the DICOM file.
        voi_lut: when true, apply the file's VOI LUT via ``apply_voi_lut``.
        fix_monochrome: when true, invert MONOCHROME1 images.
        rotate: 0 for no rotation, otherwise an index into
            (90 clockwise, 90 counter-clockwise, 180) as 1, 2, 3.

    Returns:
        The pixel array after the optional steps above; values are not
        rescaled, so callers normalise them as needed.
    """
    # dcmread replaces the deprecated pydicom.read_file alias (removed in pydicom 3).
    dicom = pydicom.dcmread(path)
    data = dicom.pixel_array

    if voi_lut:
        data = apply_voi_lut(data, dicom)

    if rotate > 0:
        # Index 0 is a placeholder so that rotate can be used as the index directly.
        rot_choices = [0, cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE, cv2.ROTATE_180]
        data = cv2.rotate(data, rot_choices[rotate])

    # MONOCHROME1 indicates that the greyscale ranges from bright to dark with ascending pixel values, whereas MONOCHROME2 ranges from dark to bright with ascending pixel values
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    return data

In [None]:
def visualize_sample(
    brats21id, 
    slice_i,
    mgmt_value,
    types=("FLAIR", "T1w", "T1wCE", "T2w"), 
    rotate = 0
):
    """Plot one slice of every requested MRI type of a training patient.

    Args:
        brats21id: patient id; zero-padded to five digits to find the folder.
        slice_i: relative position of the slice within each series
            (0.5 picks the middle one).
        mgmt_value: label shown in the figure title.
        types: series folder names to show, one subplot each.
        rotate: forwarded to ``load_dicom``.
    """
    plt.figure(figsize=(16, 5))
    # Use data_directory (like the other loaders) instead of a hard-coded
    # Kaggle path so the function also works on a local copy of the data.
    patient_path = os.path.join(
        data_directory,
        "train",
        str(brats21id).zfill(5),
    )
    for i, t in enumerate(types, 1):
        t_paths = sorted(
            glob.glob(os.path.join(patient_path, t, "*")), 
            key=lambda x: int(x[:-4].split("-")[-1]),
        )
        # Clamp so that slice_i == 1.0 selects the last slice instead of
        # indexing one past the end.
        slice_index = min(int(len(t_paths) * slice_i), len(t_paths) - 1)
        data = load_dicom(t_paths[slice_index], rotate = rotate)
        data = cv2.resize(data, (256, 256)) / 255
        # One column per requested type rather than a fixed four.
        plt.subplot(1, len(types), i)
        plt.imshow(data, cmap="gray")
        plt.title(f"{t}", fontsize=16)
        plt.axis("off")

    plt.suptitle(f"MGMT_value: {mgmt_value}", fontsize=16)
    plt.show()

In [None]:
print('Before cropping')
data = load_dicom('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/FLAIR/Image-101.dcm', rotate = 0)
print(data.shape)
plt.imshow(data, cmap="gray")

In [None]:
print('After cropping')
data = load_dicom('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/FLAIR/Image-101.dcm', rotate = 0)
data,_ = crop_image_pixel_array(data, crop = True)
print(data.shape)
plt.imshow(data, cmap="gray")

In [None]:
NUM_IMAGES = 64

def load_dicom_images_3d(scan_id, num_imgs=NUM_IMAGES, img_size=256, mri_type="FLAIR", split="train", rotate=0):
    """Build a normalised 3-D volume from the central slices of one series.

    Args:
        scan_id: patient folder name (for example "00012").
        num_imgs: depth of the returned volume.
        img_size: every slice is resized to ``img_size`` x ``img_size``.
        mri_type: series folder name inside the patient folder.
        split: "train" or "test" sub-folder of ``data_directory``.
        rotate: forwarded to ``load_dicom`` for the selected slices.

    Returns:
        Array of shape ``(1, img_size, img_size, num_imgs)``. Slices whose
        pixel sum is 0 are ignored, a window of ``2 * (num_imgs // 2)`` slices
        around the middle is kept, cropped to the brain region, resized,
        zero-padded along the last axis and min-max scaled to [0, 1].
        An all-zero volume is returned when no slice has any signal.
    """
    # Natural sort so that Image-10 comes after Image-9.
    files = sorted(glob.glob(f"{data_directory}/{split}/{scan_id}/{mri_type}/*.dcm"), 
               key=lambda var:[int(x) if x.isdigit() else x for x in re.findall(r'[^0-9]|[0-9]+', var)])

    files_with_pixels = [f for f in files if load_dicom(f).sum() > 0]
    if not files_with_pixels:
        # Missing or completely empty series: the cropping step below cannot
        # handle an empty stack, so return an empty volume of the right shape.
        return np.zeros((1, img_size, img_size, num_imgs))

    # Window of slices centred on the middle of the non-empty slices.
    middle = len(files_with_pixels)//2
    num_imgs2 = num_imgs//2
    p1 = max(0, middle - num_imgs2)
    p2 = min(len(files_with_pixels), middle + num_imgs2)
    selected_files = files_with_pixels[p1:p2]

    images_list_array = [load_dicom(f, rotate=rotate) for f in selected_files]

    # Cropping: derive one crop window from the mean slice and apply it to all slices.
    mean_image_for_cropping = np.array(images_list_array).mean(axis = 0)
    _, dim_for_cropping = crop_image_pixel_array(mean_image_for_cropping)
    first_row_index, last_row_index, first_column_index, last_column_index = dim_for_cropping
    # NaN indices mean crop_image_pixel_array decided not to crop.
    if not any(pd.isnull(v) for v in dim_for_cropping):
        images_list_array = [f[first_row_index:last_row_index,first_column_index:last_column_index] for f in images_list_array]

    images_list_array = [cv2.resize(f, (img_size, img_size)) for f in images_list_array]
    # NOTE(review): .T reverses all three axes, so each slice is transposed as
    # well as the slice axis being moved last - kept as is, confirm intended.
    img3d = np.stack(images_list_array).T

    # Pad with empty slices up to the requested depth.
    if img3d.shape[-1] < num_imgs:
        n_zero = np.zeros((img_size, img_size, num_imgs - img3d.shape[-1]))
        img3d = np.concatenate((img3d,  n_zero), axis = -1)

    if np.min(img3d) < np.max(img3d):
        img3d = img3d - np.min(img3d)
        img3d = img3d / np.max(img3d)

    return np.expand_dims(img3d,0)

a = load_dicom_images_3d("00012",mri_type = 'FLAIR')
print(a.shape)
print(np.min(a), np.max(a), np.mean(a), np.median(a))
print(a.shape)
plt.imshow(a[0,:,:,10], cmap="gray")

In [None]:
for i in random.sample(range(train_df.shape[0]), 10):
    _brats21id = train_df.iloc[i]["BraTS21ID"]
    _mgmt_value = train_df.iloc[i]["MGMT_value"]
    visualize_sample(brats21id=_brats21id, mgmt_value=_mgmt_value, slice_i=0.5)

In [None]:
from matplotlib import animation, rc
rc('animation', html='jshtml')

def create_animation(ims):
    """Build a 6x6-inch matplotlib animation that steps through ``ims`` in grayscale."""
    fig = plt.figure(figsize=(6, 6))
    plt.axis('off')
    artist = plt.imshow(ims[0], cmap="gray")

    def animate_func(frame_index):
        # Swap the displayed frame in place instead of redrawing the axes.
        artist.set_array(ims[frame_index])
        return [artist]

    frame_delay_ms = 1000//24
    return animation.FuncAnimation(fig, animate_func, frames = len(ims), interval = frame_delay_ms)

In [None]:
def load_dicom_line(path, crop = False):
    """Load all slices of one series as 256x256 images scaled to [0, 1].

    Args:
        path: folder with the DICOM files of one series; files are ordered by
            the integer after the last "-" in their name.
        crop: when true, crop each non-empty slice with
            ``crop_image_pixel_array`` before resizing.

    Returns:
        List of 2-D arrays; slices that are entirely zero after resizing are
        left out.
    """
    t_paths = sorted(
        glob.glob(os.path.join(path, "*")), 
        key=lambda x: int(x[:-4].split("-")[-1]),
    )
    images = []
    for filename in t_paths:
        data = load_dicom(filename)

        # Min-max normalise; a constant slice stays all zeros.
        data = data - np.min(data)
        if np.max(data) != 0:
            data = data / np.max(data)

        if crop and data.sum() != 0:
            # crop_image_pixel_array returns (image, indices); previously the
            # whole tuple was assigned to `data` (and cropping was switched
            # off), which made cv2.resize fail.
            data, _ = crop_image_pixel_array(data, crop = True)

        data = cv2.resize(data, (256, 256))
        if data.max() == 0:
            continue
        images.append(data)

    return images

In [None]:
images = load_dicom_line("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/FLAIR")
create_animation(images)

In [None]:
images = load_dicom_line("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/T1w")
create_animation(images)

In [None]:
images = load_dicom_line("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/T1wCE")
create_animation(images)

In [None]:
images = load_dicom_line("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/T2w")
create_animation(images)

<a id="10"></a>
<h2 style='background:darkviolet; border:0; color:white'><center>Competition Metric</center></h2>

Submissions are evaluated on [area under the ROC curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) between the predicted probability and the observed target.

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, auc

list_y_true = [
    [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
    [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
    [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.], #  IMBALANCE
]
list_y_pred = [
    [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
    [0.9, 0.9, 0.9, 0.9, 0.1, 0.9, 0.9, 0.1, 0.9, 0.1, 0.1, 0.5],
    [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], #  IMBALANCE
]

for y_true, y_pred in zip(list_y_true, list_y_pred):
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(5, 5))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([-0.01, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

<a id="20"></a>
<h2 style='background:darkviolet; border:0; color:white'><center>Sample Submission</center></h2>

In [None]:
submission = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv")
submission

<a id="100"></a>
<h2 style='background:darkviolet; border:0; color:white'><center>Modeling</center></h2>

In [None]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch random number generators.

    Also exports PYTHONHASHSEED and, when CUDA is available, seeds all GPUs
    and switches cuDNN to deterministic mode.
    """
    random.seed(seed)
    # NOTE: this only affects processes started after this point; the hash
    # seed of the running interpreter is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        # The cuDNN autotuner may pick different kernels from run to run;
        # turn it off so results are reproducible.
        torch.backends.cudnn.benchmark = False

# set_seed(42)
set_seed(3407)

In [None]:
df = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv")
df = df[~(df['BraTS21ID'].isin([109, 123, 709]))].reset_index(drop = True)
df_train, df_valid = sk_model_selection.train_test_split(
    df, 
    test_size=0.2, 
    random_state=42, 
    stratify=df["MGMT_value"])

In [None]:
df_train.head()

Ratio 'T1w/T2w' : <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6465519/>

# nnUnet

In [None]:
!mkdir -p ./RESULTS_FOLDER/nnUNet/2d/image_raw
!cp ../input/brats2021dataset/nnUNet_raw_data_base/nnUNet_raw_data/Task101_BrainTumour/imagesTs/BRATS_188_0000.nii ./RESULTS_FOLDER/nnUNet/2d/image_raw/BRATS_188_0000.nii.gz
!cp ../input/brats2021dataset/nnUNet_raw_data_base/nnUNet_raw_data/Task101_BrainTumour/imagesTs/BRATS_188_0001.nii ./RESULTS_FOLDER/nnUNet/2d/image_raw/BRATS_188_0001.nii.gz
!cp ../input/brats2021dataset/nnUNet_raw_data_base/nnUNet_raw_data/Task101_BrainTumour/imagesTs/BRATS_188_0002.nii ./RESULTS_FOLDER/nnUNet/2d/image_raw/BRATS_188_0002.nii.gz
!cp ../input/brats2021dataset/nnUNet_raw_data_base/nnUNet_raw_data/Task101_BrainTumour/imagesTs/BRATS_188_0003.nii ./RESULTS_FOLDER/nnUNet/2d/image_raw/BRATS_188_0003.nii.gz
!nnUNet_predict -i ./RESULTS_FOLDER/nnUNet/2d/image_raw -o ./RESULTS_FOLDER/nnUNet/2d/ -t "001" -tr nnUNetTrainerV2 -m 2d

In [None]:
!ls ./RESULTS_FOLDER/nnUNet/2d/image_raw/

In [None]:
path_raw="../input/brats2021dataset/nnUNet_raw_data_base/nnUNet_raw_data/Task101_BrainTumour/imagesTs/BRATS_188_0000.nii"
path="./RESULTS_FOLDER/nnUNet/2d/"

import nibabel as nib
plt.figure(figsize=(12,6))
plt.subplot(121)
flair_nib = nib.load(path_raw)
flair_nib_array = flair_nib.get_fdata()
plt.imshow(flair_nib_array[:,:,flair_nib_array.shape[2]//2], cmap = 'gray')
plt.subplot(122)
flair_nib = nib.load(path+"BRATS_188.nii.gz")
flair_nib_array = flair_nib.get_fdata()
plt.imshow(flair_nib_array[:,:,flair_nib_array.shape[2]//2], cmap = 'gray')

In [None]:
import SimpleITK as sitk
from skimage.transform import resize
from tqdm import tqdm
from os.path import join
from fastai.vision.all import *
import nibabel as nib

In [None]:
!mkdir -p ./test/post/
reader = sitk.ImageSeriesReader()
reader.LoadPrivateTagsOn()
train_path="../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/"
# studi_id=["00000","00002","00003"]
studi_id = [str(x).zfill(5) for x in df[df['BraTS21ID'].isin([840])]['BraTS21ID'].unique().tolist()]
mri_types = ["FLAIR","T1w","T1wCE","T2w"]
pixel_size_h=240
pixel_size_w=240
inner_count=0

def dicom2nifti(image_dir, out_dir, save=True):
    """Read a DICOM series folder into one float32 SimpleITK image.

    Args:
        image_dir: folder containing the DICOM series (str or Path).
        out_dir: folder the ``.nii`` file is written to when ``save`` is true.
            The file is named after the parent folder of ``image_dir``.
        save: write the image to ``out_dir``.

    Returns:
        The loaded image. It is now returned in both modes, matching
        ``resample_nifti``; previously ``save=True`` returned ``None``.
    """
    reader = sitk.ImageSeriesReader()
    reader.LoadPrivateTagsOn()
    filenamesDICOM = reader.GetGDCMSeriesFileNames(str(image_dir))
    reader.SetFileNames(filenamesDICOM)
    img = sitk.Cast(reader.Execute(), sitk.sitkFloat32)

    if save:
        # Same name as image_dir.parent.name, but also valid for plain strings.
        parent_name = os.path.basename(os.path.dirname(os.path.normpath(str(image_dir))))
        sitk.WriteImage(img, f'{out_dir}/{parent_name}.nii')
    return img

def resample_nifti(image_dir, ref_image, fn, save=True):
    """Resample an image file onto the grid of a reference image.

    Args:
        image_dir: path of the image to read (as float32).
        ref_image: SimpleITK image whose size, spacing, origin and direction
            define the output grid.
        fn: output file name, used when ``save`` is true.
        save: write the resampled image to ``fn``.

    Returns:
        The resampled SimpleITK image.
    """
    image = sitk.ReadImage(str(image_dir), sitk.sitkFloat32)

    # Align the geometric centres of both volumes; no registration is run.
    initial_transform = sitk.CenteredTransformInitializer(ref_image, 
                                                          image, 
                                                          sitk.Euler3DTransform(), 
                                                          sitk.CenteredTransformInitializerFilter.GEOMETRY)

    resampler = sitk.ResampleImageFilter()
    # SetReferenceImage copies size, spacing, origin and direction from ref_image.
    resampler.SetReferenceImage(ref_image)
    resampler.SetInterpolator(sitk.sitkLinear)
    resampler.SetTransform(initial_transform)
    # Voxels outside the moving image become 0 (background). The previous code
    # passed image.GetPixelIDValue(), which is the pixel *type* id, not an intensity.
    resampler.SetDefaultPixelValue(0)
    resamped_image = resampler.Execute(image)

    if save:
        sitk.WriteImage(resamped_image, fn)

    return resamped_image

# Convert every selected study to NIfTI and resample each modality onto the
# reference atlas, writing <patient>_000N.nii.gz files into ./test/post/.
ref_image = sitk.ReadImage('../input/sri24-dataset/sri24/spgr.nii', sitk.sitkFloat32)

# File-name suffix each modality gets in the nnU-Net input folder.
channel_suffix = {"FLAIR": "_0000", "T1w": "_0001", "T1wCE": "_0002", "T2w": "_0003"}

os.makedirs('./test/post', exist_ok=True)
for b in mri_types:
    os.makedirs(join('./tmp', b), exist_ok=True)

for c in tqdm(studi_id):
    study_dir = Path(join(train_path, c))
    # Address every series by its folder name. Picking them by position in a
    # directory listing depends on listing order, and the T2w volume used to
    # be built from the T1w series.
    for b in mri_types:
        dicom2nifti(study_dir / b, join('./tmp', b))
    for b in tqdm(mri_types):
        for fn2 in sorted(glob.glob(join('./tmp', b, '*'))):
            pat_id = os.path.basename(fn2).split('.')[0]
            final_fn = f"./test/post/{pat_id}{channel_suffix[b]}.nii.gz"
            resample_nifti(fn2, ref_image, final_fn, True)
            # Remove the intermediate file so the next study starts clean.
            os.remove(fn2)

In [None]:
# import shutil
# shutil.rmtree("./RESULTS_FOLDER")
# shutil.rmtree("./tmp")
# shutil.rmtree("./apex")
# shutil.rmtree("./nnUNet")
# shutil.rmtree("./nnUNet_preprocessed")

In [None]:
!nnUNet_predict -i ./test/post/ -o ./RESULTS_FOLDER/nnUNet/2d/ -t 001 -tr nnUNetTrainerV2 -m 3d_fullres --disable_tta

In [None]:
# !ls ./RESULTS_FOLDER/nnUNet/2d/

In [None]:
# !ls ./test/post/

In [None]:
def get_array(fn):
    """Read the image file at ``fn`` with SimpleITK and return its data as an array."""
    return sitk.GetArrayFromImage(sitk.ReadImage(str(fn)))

def plot_slice(imgd, sli):
    """Draw slice ``sli`` of a (slices, height, width) volume in grayscale, axes hidden."""
    selected = imgd[sli]
    plt.imshow(selected, cmap='gray')
    plt.axis('off')
    
def get_array_plot(fn, sli):
    """Load the volume stored at ``fn`` and display its slice number ``sli``."""
    plot_slice(get_array(fn), sli)

In [None]:
# Slice 125 of the nnU-Net prediction written for case 00003.
get_array_plot(f'./RESULTS_FOLDER/nnUNet/2d/00003.nii.gz', 125)

In [None]:
# Same slice of the resampled input with suffix _0000 (FLAIR in the preprocessing loop).
get_array_plot(f'./test/post/00003_0000.nii.gz', 125)

In [None]:
# Suffix _0001 (T1w in the preprocessing loop).
get_array_plot(f'./test/post/00003_0001.nii.gz', 125)

In [None]:
# Suffix _0002 (T1wCE in the preprocessing loop).
get_array_plot(f'./test/post/00003_0002.nii.gz', 125)

In [None]:
# Suffix _0003 (T2w in the preprocessing loop).
get_array_plot(f'./test/post/00003_0003.nii.gz', 125)

In [None]:
# Show the metadata row for case `x`, then one slice of the nnU-Net prediction
# next to the same slice of each of the four resampled input volumes.
x = 0
slice_num = 70
display(df[df['BraTS21ID']==x])
plt.figure(figsize=(30, 5))
case_id = str(x).zfill(5)
panel_paths = [f'./RESULTS_FOLDER/nnUNet/2d/{case_id}.nii.gz']
panel_paths += [f'./test/post/{case_id}{suffix}.nii.gz' for suffix in ('_0000', '_0001', '_0002', '_0003')]
for position, nii_path in enumerate(panel_paths, start=1):
    plt.subplot(1, 5, position)
    get_array_plot(nii_path, slice_num)

In [None]:
# Mask each input modality with the binarised nnU-Net prediction for case `x`
# (uses `x` and `slice_num` from the comparison cell above).
imgd = get_array(f'./RESULTS_FOLDER/nnUNet/2d/{str(x).zfill(5)}.nii.gz')
img_slice = imgd[slice_num]
# Collapse every predicted label to 1 so the product below is a plain mask.
img_slice[img_slice > 0] = 1

# One figure only: the earlier extra plt.figure() call rendered an empty figure.
plt.figure(figsize=(30, 5))
for panel, suffix in enumerate(('_0000', '_0001', '_0002', '_0003'), start=1):
    plt.subplot(1, 4, panel)
    temp = get_array(f'./test/post/{str(x).zfill(5)}{suffix}.nii.gz')
    plt.imshow(img_slice*temp[slice_num], cmap = 'gray')
plt.show()

In [None]:
# Same overlay as the previous cell, but the prediction is NOT binarised: each
# pixel of the input slice is multiplied by the raw predicted label value.
imgd = get_array(f'./RESULTS_FOLDER/nnUNet/2d/{str(x).zfill(5)}.nii.gz')
img_slice = imgd[slice_num]

# One figure only: the earlier extra plt.figure() call rendered an empty figure.
plt.figure(figsize=(30, 5))
for panel, suffix in enumerate(('_0000', '_0001', '_0002', '_0003'), start=1):
    plt.subplot(1, 4, panel)
    temp = get_array(f'./test/post/{str(x).zfill(5)}{suffix}.nii.gz')
    plt.imshow(img_slice*temp[slice_num], cmap = 'gray')
plt.show()

### Brain tumor identification model

In [None]:
# Load one sample image from the CT head-scan dataset and display it.
# NOTE(review): OpenCV returns colour images in BGR order while plt.imshow
# expects RGB; harmless if the scan is grayscale - confirm.
image_data = cv2.imread('../input/ct-head-scans/Necrosis/Necrosis14.jpg')
print(image_data.shape)
plt.imshow(image_data)

In [None]:
# Same image resized to 256x256, the size used by the loaders further down.
plt.imshow(cv2.resize(image_data,(256,256)))

In [None]:
# Shape and basic intensity statistics of the sample image.
image_data.shape, image_data.mean(), image_data.min(), image_data.max()

In [None]:
# Per-channel sums; presumably a check that the three channels carry the same data.
image_data[:,:,0].sum(), image_data[:,:,1].sum(), image_data[:,:,2].sum()

In [None]:
# Build the label table for the CT head-scan dataset: one row per image file,
# with 'Tumor flag' = 1 for the Tumor folder and 0 for Control and Necrosis.
class_frames = []
for folder, flag in (('Tumor', 1), ('Control', 0), ('Necrosis', 0)):
    data = pd.DataFrame(data=None, columns = ['ID','Tumor flag'])
    data['ID'] = [folder + '/' + x for x in os.listdir('../input/ct-head-scans/' + folder)]
    data['Tumor flag'] = flag
    class_frames.append(data)
data_tumor_identification = pd.concat(class_frames, axis = 0).reset_index(drop = True)

In [None]:
# Size and first rows of the combined label table.
print(data_tumor_identification.shape)
data_tumor_identification.head()

In [None]:
# Shuffle the rows.
# NOTE(review): no random_state is passed, so the row order differs between runs.
data_tumor_identification = data_tumor_identification.sample(frac=1).reset_index(drop=True)
data_tumor_identification.head()

In [None]:
# Class balance of the whole table.
plt.figure(figsize=(5, 5))
sns.countplot(data=data_tumor_identification, x="Tumor flag");

In [None]:
# 80/20 train/validation split, stratified on the label.
train_data_tumor_identification, valid_data_tumor_identification = sk_model_selection.train_test_split(
    data_tumor_identification, 
    test_size=0.2, 
    random_state=42, 
    stratify=data_tumor_identification["Tumor flag"])

In [None]:
# Class balance of the training part.
print('Training data')
plt.figure(figsize=(5, 5))
sns.countplot(data=train_data_tumor_identification, x="Tumor flag");

In [None]:
# Class balance of the validation part.
print('Validation data')
plt.figure(figsize=(5, 5))
sns.countplot(data=valid_data_tumor_identification, x="Tumor flag");

In [None]:
def load_data_brain_tumor_identification(data):
    """Load the CT images listed in ``data`` as normalised 256x256x1 arrays.

    Args:
        data: DataFrame with an 'ID' column (image path relative to
            ``../input/ct-head-scans/``) and a 'Tumor flag' column.

    Returns:
        Tuple ``(images, labels)``: ``images`` is a list of float32 arrays of
        shape (256, 256, 1) scaled by 1/255, ``labels`` is ``data['Tumor flag']``.

    Raises:
        FileNotFoundError: if an image cannot be read by OpenCV.
    """
    image_data_list = []
    for relative_id in data['ID']:
        path = '../input/ct-head-scans/' + relative_id
        raw = cv2.imread(path)
        # cv2.imread signals failure by returning None instead of raising.
        if raw is None:
            raise FileNotFoundError(f"Could not read image: {path}")
        # Only the first channel is used.
        image_data = cv2.resize(raw[:, :, 0].astype('float32'), (256, 256)) / 255
        image_data_list.append(image_data.reshape((256, 256, 1)))
    return image_data_list, data['Tumor flag']

train_batches_tumor_identification = load_data_brain_tumor_identification(train_data_tumor_identification)
valid_batches_tumor_identification = load_data_brain_tumor_identification(valid_data_tumor_identification)

In [None]:
# Number of loaded training images, shape of the first one, and shape of the label column.
print(len(train_batches_tumor_identification[0]))
train_batches_tumor_identification[0][0].shape, train_batches_tumor_identification[1].shape

In [None]:
# Shapes after converting the image list and the labels to NumPy arrays.
np.array(train_batches_tumor_identification[0]).shape, np.array(train_batches_tumor_identification[1]).shape

In [None]:
# ##model building
# model = Sequential()
# #convolutional layer with rectified linear unit activation
# model.add(layers.Conv2D(32, kernel_size=(3, 3),
#                  activation='relu',
#                  input_shape=(256,256,1)))
# #32 convolution filters used each of size 3x3
# #choose the best features via pooling
# model.add(layers.MaxPooling2D(pool_size=(2, 2)))
# #randomly turn neurons on and off to improve convergence
# model.add(layers.Dropout(0.25))
# #flatten since too many dimensions, we only want a classification output
# model.add(layers.Flatten())
# #fully connected to get all relevant data
# model.add(layers.Dense(128, activation='relu'))
# #one more dropout for convergence' sake :) 
# model.add(layers.Dropout(0.5))
# #output a softmax to squash the matrix into output probabilities
# model.add(layers.Dense(1, activation='sigmoid'))
# model.summary()

In [None]:
# Image count, label count, and the minimum/maximum label value of the training data.
len(train_batches_tumor_identification[0]), len(train_batches_tumor_identification[1]), train_batches_tumor_identification[1].min(), train_batches_tumor_identification[1].max()

In [None]:
# Training configuration for the Keras tumor-identification model.
# NOTE(review): the compile/fit code below is commented out, so `epochs` and
# `lr_schedule` are only defined here and not used in this cell.
epochs = 500

initial_learning_rate = 0.001
# Staircase exponential decay of the learning rate (factor 0.96 per 100000 steps).
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=100000, decay_rate=0.96, staircase=True
)

# model.compile(
#     loss="binary_crossentropy",
#     optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
#     metrics=[AUC(name='auc'),"acc"])
    
# model_save = ModelCheckpoint(f'brain_tumor_identification_model.h5', 
#                              save_best_only = True, 
#                              monitor = 'val_auc', 
#                              mode = 'max', verbose = 1)
# early_stop = EarlyStopping(monitor = 'val_auc', 
#                            patience = 50, mode = 'max', verbose = 1,
#                            restore_best_weights = True)
# model.fit(
#     np.array(train_batches_tumor_identification[0]),np.array(train_batches_tumor_identification[1]),
#     validation_data=(np.array(valid_batches_tumor_identification[0]),np.array(valid_batches_tumor_identification[1])),
#     batch_size=10,
#     epochs=epochs,
#     shuffle=True,
#     verbose=1,
#     callbacks = [model_save, early_stop])

In [None]:
class Model_2D_CNN_brain_tumor_identification(nn.Module):
    """EfficientNet-B0 backbone whose final layer is replaced by a single-output linear head."""

    def __init__(self):
        super().__init__()
        backbone = efficientnet_pytorch.EfficientNet.from_name("efficientnet-b0")
        # No checkpoint is loaded here; only the classifier head is swapped
        # for a one-output linear layer.
        backbone._fc = nn.Linear(
            in_features=backbone._fc.in_features, out_features=1, bias=True
        )
        self.net = backbone

    def forward(self, x):
        """Return the backbone output for the batch ``x``."""
        return self.net(x)

In [None]:
class DataRetriever_2D_CNN_brain_tumor_identification(torch_data.Dataset):
    """Dataset of CT head-scan images for the tumor-identification model.

    Args:
        paths: image paths relative to ``../input/ct-head-scans/``.
        targets: labels aligned with ``paths``; pass an empty sequence for
            unlabeled (test) data.
        label_smoothing: each label ``t`` is returned as ``abs(t - label_smoothing)``.
        rotate: stored on the instance; not used by ``__getitem__``.
    """

    def __init__(self, paths, targets, label_smoothing=0.01, rotate= 0):
        self.paths = paths
        self.targets = targets
        self.label_smoothing = label_smoothing
        self.rotate = rotate
        # No targets means the dataset is used for inference.
        self.target_flag = len(targets) != 0
        self.train_flag = 'train' if self.target_flag else 'test'

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        """Return ``{"X": tensor(3, 256, 256), "y": label}``, or ``"id"`` instead of ``"y"`` without targets.

        Raises:
            FileNotFoundError: if the image cannot be read by OpenCV.
        """
        _id = self.paths[index]
        patient_path = '../input/ct-head-scans/' + _id

        raw = cv2.imread(patient_path)
        # cv2.imread signals failure by returning None instead of raising.
        if raw is None:
            raise FileNotFoundError(f"Could not read image: {patient_path}")
        image_data = cv2.resize(raw.astype('float32'), (256, 256)) / 255

        # Min-max normalise over the whole image.
        image_data = image_data - np.min(image_data)
        peak = np.max(image_data)
        if peak != 0:
            image_data = image_data / peak

        # The first channel is repeated three times to form the input.
        # Stacking into one ndarray first avoids the slow list-of-ndarrays
        # path of torch.tensor().
        plane = image_data[:, :, 0]
        X = torch.from_numpy(np.stack([plane, plane, plane])).float()

        if self.target_flag:
            y = torch.tensor(abs(self.targets[index]-self.label_smoothing), dtype=torch.float)
            return {"X": X, "y": y}
        return {"X": X, "id": _id}

In [None]:
# train_data_retriever = DataRetriever_2D_CNN_brain_tumor_identification(
#     data_tumor_identification["ID"].values, 
#     data_tumor_identification["Tumor flag"].values, rotate = 0)

In [None]:
# train_data_retriever[0]['X'].shape, train_data_retriever[0]['y']

In [None]:
# class Trainer_brain_tumor_identification:
#     def __init__(
#         self, 
#         model, 
#         device, 
#         optimizer, 
#         criterion,
#         best_valid_score
#     ):
#         self.model = model
#         self.device = device
#         self.optimizer = optimizer
#         self.criterion = criterion
        
#         self.best_valid_score = best_valid_score
#         self.n_patience = 0
        
#         self.messages = {
#             "epoch": "[Epoch {}: {}] loss: {:.5f}, score: {:.5f}, time: {} s",
#             "checkpoint": "The score improved from {:.5f} to {:.5f}. Save model to '{}'",
#             "patience": "\nValid loss didn't improve last {} epochs."
#         }
    
#     def fit(self, epochs, train_loader, valid_loader, save_path, patience):        
#         for n_epoch in range(1, epochs + 1):
#             self.info_message("EPOCH: {}", n_epoch)
            
#             train_loss, train_score, train_time = self.train_epoch(train_loader)
#             valid_loss, valid_score, valid_time = self.valid_epoch(valid_loader)
            
#             self.info_message(
#                 self.messages["epoch"], "Train", n_epoch, train_loss, train_score, train_time
#             )
            
#             self.info_message(
#                 self.messages["epoch"], "Valid", n_epoch, valid_loss, valid_score, valid_time
#             )

#             if (self.best_valid_score < valid_score):
#                 self.info_message(
#                     self.messages["checkpoint"], self.best_valid_score, valid_score, save_path
#                 )
#                 self.best_valid_score = valid_score
#                 self.save_model(n_epoch, save_path)
#                 self.n_patience = 0
#             else:
#                 self.n_patience += 1
            
#             if self.n_patience >= patience:
#                 self.info_message(self.messages["patience"], patience)
#                 break
            
#     def train_epoch(self, train_loader):
#         self.model.train()
#         t = time.time()
#         sum_loss = 0
#         y_all =[]
#         outputs_all = []
        
#         for step, batch in enumerate(train_loader, 1):
#             X = batch["X"].to(self.device)
#             targets = batch["y"].to(self.device)
#             self.optimizer.zero_grad()
#             outputs = self.model(X).squeeze(1)
            
#             loss = self.criterion(outputs, targets)
#             loss.backward()

#             sum_loss += loss.detach().item()
#             y_all.extend(batch["y"].tolist())
#             outputs_all.extend(torch.sigmoid(outputs).tolist())

#         y_all = [1 if x > 0.5 else 0 for x in y_all]
#         auc = roc_auc_score(y_all, outputs_all)

#         _loss, _score = sum_loss/step, auc
#         message = 'Train Step {}/{}, train_loss: {:.5f}, train_roc_auc: {:.5f}'
#         self.info_message(message, step, len(train_loader), _loss, _score, end="\r")
        
#         return _loss, _score, int(time.time() - t)
    
#     def valid_epoch(self, valid_loader):
#         self.model.eval()
#         t = time.time()
#         sum_loss = 0
#         y_all =[]
#         outputs_all = []

#         for step, batch in enumerate(valid_loader, 1):
#             with torch.no_grad():
#                 X = batch["X"].to(self.device)
#                 targets = batch["y"].to(self.device)

#                 outputs = self.model(X).squeeze(1)
#                 loss = self.criterion(outputs, targets)

#                 sum_loss += loss.detach().item()
#                 y_all.extend(batch["y"].tolist())
#                 outputs_all.extend(torch.sigmoid(outputs).tolist())
            
#         y_all = [1 if x > 0.5 else 0 for x in y_all]
#         auc = roc_auc_score(y_all, outputs_all)

#         _loss, _score = sum_loss/step, auc
#         message = 'Valid Step {}/{}, valid_loss: {:.5f}, valid_roc_auc: {:.5f}'
#         self.info_message(message, step, len(valid_loader), _loss, _score, end="\r")

#         return _loss, _score, int(time.time() - t)
    
#     def save_model(self, n_epoch, save_path):
#         torch.save(
#             {
#                 "model_state_dict": self.model.state_dict(),
#                 "optimizer_state_dict": self.optimizer.state_dict(),
#                 "best_valid_score": self.best_valid_score,
#                 "n_epoch": n_epoch,
#             },
#             save_path,
#         )
    
#     @staticmethod
#     def info_message(message, *args, end="\n"):
#         print(message.format(*args), end=end)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# gc.collect()
# torch.cuda.empty_cache()
# model = Model_2D_CNN_brain_tumor_identification()
# model.to(device)

# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# criterion = torch_functional.binary_cross_entropy_with_logits

# best_valid_score = -np.inf

# trainer = Trainer_brain_tumor_identification(
#     model, 
#     device, 
#     optimizer, 
#     criterion,
#     best_valid_score
# )

# train_data_retriever = DataRetriever_2D_CNN_brain_tumor_identification(
#     train_data_tumor_identification["ID"].values, 
#     train_data_tumor_identification["Tumor flag"].values, rotate = 0)

# valid_data_retriever = DataRetriever_2D_CNN_brain_tumor_identification(
#     valid_data_tumor_identification["ID"].values, 
#     valid_data_tumor_identification["Tumor flag"].values, rotate = 0)

# train_loader = torch_data.DataLoader(
#     train_data_retriever,
#     batch_size=2,
#     shuffle=True,
#     num_workers=8,
# )

# valid_loader = torch_data.DataLoader(
#     valid_data_retriever, 
#     batch_size=2,
#     shuffle=False,
#     num_workers=8,
# )

# history = trainer.fit(
#     30, 
#     train_loader,
#     valid_loader, 
#     f"effnet-best-model-brain-tumor-identification.pth",
#     10,
# )

In [None]:
# Remove the CT-scan label frames from the namespace (presumably to free memory;
# re-run the cells above before using them again).
del train_data_tumor_identification, valid_data_tumor_identification, data_tumor_identification

### Identify images with tumor

In [None]:
##brain_tumor_identification_model building
brain_tumor_identification_model = Sequential()
#convolutional layer with rectified linear unit activation
brain_tumor_identification_model.add(layers.Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=(256,256,1)))
#32 convolution filters used each of size 3x3
#choose the best features via pooling
brain_tumor_identification_model.add(layers.MaxPooling2D(pool_size=(2, 2)))
#randomly turn neurons on and off to improve convergence
brain_tumor_identification_model.add(layers.Dropout(0.25))
#flatten since too many dimensions, we only want a classification output
brain_tumor_identification_model.add(layers.Flatten())
#fully connected to get all relevant data
brain_tumor_identification_model.add(layers.Dense(128, activation='relu'))
#one more dropout for convergence' sake :) 
brain_tumor_identification_model.add(layers.Dropout(0.5))
#output a softmax to squash the matrix into output probabilities
brain_tumor_identification_model.add(layers.Dense(1, activation='sigmoid'))
brain_tumor_identification_model.summary()

In [None]:
# Load previously saved weights into the architecture defined above.
brain_tumor_identification_model.load_weights('../input/brain-tumor-identification-model/brain_tumor_identification_model.h5')

In [None]:
# Spot-check: score single slices with the tumor-identification model.
# Each cell loads one study, prints its MGMT_value, shows slice `img_num` of the
# returned volume and prints the model output for that slice as a percentage.
# NOTE(review): relies on a load_dicom_images_3d definition already in scope
# when these cells run.
img_num = 63
a = load_dicom_images_3d("00012")
print('MGMT_value:',train_df[train_df['BraTS21ID']==12]['MGMT_value'].iloc[0])
plt.imshow(a[0,:,:,img_num], cmap="gray")
print('Model result on brain tumor identification',brain_tumor_identification_model.predict(a[0,:,:,img_num].reshape((1,256,256,1)))[0][0]*100,'%')

In [None]:
# Same study, slice 10.
img_num = 10
a = load_dicom_images_3d("00012")
print('MGMT_value:',train_df[train_df['BraTS21ID']==12]['MGMT_value'].iloc[0])
plt.imshow(a[0,:,:,img_num], cmap="gray")
print('Model result on brain tumor identification',brain_tumor_identification_model.predict(a[0,:,:,img_num].reshape((1,256,256,1)))[0][0]*100,'%')

In [None]:
# Study 00002, slice 0.
img_num = 0
a = load_dicom_images_3d("00002")
print('MGMT_value:',train_df[train_df['BraTS21ID']==2]['MGMT_value'].iloc[0])
plt.imshow(a[0,:,:,img_num], cmap="gray")
print('Model result on brain tumor identification',brain_tumor_identification_model.predict(a[0,:,:,img_num].reshape((1,256,256,1)))[0][0]*100,'%')

In [None]:
# Study 00002, slice 56.
img_num = 56
a = load_dicom_images_3d("00002")
print('MGMT_value:',train_df[train_df['BraTS21ID']==2]['MGMT_value'].iloc[0])
plt.imshow(a[0,:,:,img_num], cmap="gray")
print('Model result on brain tumor identification',brain_tumor_identification_model.predict(a[0,:,:,img_num].reshape((1,256,256,1)))[0][0]*100,'%')

In [None]:
# from tqdm import tqdm

# all_files_list = []
# for i in tqdm(os.listdir('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train')):
#     for j in os.listdir('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/'+i):
#         for f in os.listdir('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/'+i+'/'+j):
#             all_files_list.append('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/' + i + '/' + j + '/' + f)
            
# for i in tqdm(os.listdir('../input/rsna-miccai-brain-tumor-radiogenomic-classification/test')):
#     for j in os.listdir('../input/rsna-miccai-brain-tumor-radiogenomic-classification/test/'+i):
#         for f in os.listdir('../input/rsna-miccai-brain-tumor-radiogenomic-classification/test/'+i+'/'+j):
#             all_files_list.append('../input/rsna-miccai-brain-tumor-radiogenomic-classification/test/' + i + '/' + j + '/' + f)
            
# print(len(all_files_list))

In [None]:
def get_img_data(f):
    """Load the DICOM file ``f``, resize it to 256x256 and min-max scale it.

    The minimum is subtracted first; the result is divided by its maximum
    unless that maximum is zero.
    """
    resized = cv2.resize(load_dicom(f, rotate=0), (256, 256))
    shifted = resized - np.min(resized)
    peak = np.max(shifted)
    return shifted / peak if peak != 0 else shifted

In [None]:
# brain_tumor_identification_model_pred_df = pd.DataFrame(data = None, columns = ['Filename','Brain tumor pred'])
# tqdm.pandas()
# brain_tumor_identification_model_pred_df['Filename'] = all_files_list
# brain_tumor_identification_model_pred_df['Brain tumor pred'] = brain_tumor_identification_model_pred_df['Filename'].progress_apply(lambda x: brain_tumor_identification_model.predict(get_img_data(x).reshape((1,256,256,1)))[0][0])
# joblib.dump(brain_tumor_identification_model_pred_df,'brain_tumor_identification_model_pred_df.pkl')

In [None]:
from tqdm import tqdm

# Collect the path of every file under test/<study>/<series>/.
test_root = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/test'
all_files_list = []
for i in tqdm(os.listdir(test_root)):
    study_dir = test_root + '/' + i
    for j in os.listdir(study_dir):
        series_dir = study_dir + '/' + j
        all_files_list.extend(series_dir + '/' + name for name in os.listdir(series_dir))

print(len(all_files_list))

In [None]:
# Load the cached per-file model outputs; later cells read its 'Filename' and
# 'Brain tumor pred' columns.
# NOTE(review): joblib.load unpickles the file - only load from a trusted source.
brain_tumor_identification_model_pred_df = joblib.load('../input/brain-tumor-predictions/brain_tumor_identification_model_pred_df.pkl')
print(brain_tumor_identification_model_pred_df.shape)
brain_tumor_identification_model_pred_df.tail()

In [None]:
# Keep only the test files that have no cached prediction yet.
already_scored = set(brain_tumor_identification_model_pred_df['Filename'].tolist())
intersection_files = set(all_files_list) & already_scored
all_files_list = list(set(all_files_list) - intersection_files)

In [None]:
len(all_files_list)

In [None]:
print(brain_tumor_identification_model_pred_df.shape)
for i in range(0, len(all_files_list)):
    brain_tumor_identification_model_pred_df.loc[brain_tumor_identification_model_pred_df.shape[0]] = all_files_list[i], brain_tumor_identification_model.predict(get_img_data(all_files_list[i]).reshape((1,256,256,1)))[0][0]
print(brain_tumor_identification_model_pred_df.shape)

In [None]:
# The model and its preprocessing helper are removed here; from this point on
# only the prediction table is used (presumably to free memory).
del brain_tumor_identification_model, get_img_data

In [None]:
# def select_best_images(files,img_size,num_imgs):
#     model_pred_df = pd.DataFrame(data = None, columns = ['Order','Filename','Brain tumor pred'])
#     count = 0

#     for f in files:
#         image_data = load_dicom(f, rotate=0)
#         image_data = cv2.resize(image_data, (img_size, img_size))
#         image_data = image_data - np.min(image_data)
#         if np.max(image_data) != 0:
#             image_data = image_data / np.max(image_data)
#         model_pred_df.loc[count] = (count+1), f, brain_tumor_identification_model.predict(image_data.reshape((1,256,256,1)))[0][0]
#         count += 1
    
#     selected_image_paths = model_pred_df.sort_values(by = ['Brain tumor pred'], ascending = False).head(num_imgs)
#     selected_image_paths = selected_image_paths.sort_values(by = 'Order', ascending = True).reset_index(drop = True)
    
#     return selected_image_paths['Filename'].tolist()

In [None]:
# img_num = 20
# a = load_dicom_images_3d("00002")
# print('MGMT_value:',train_df[train_df['BraTS21ID']==2]['MGMT_value'].iloc[0])
# plt.imshow(a[0,:,:,img_num], cmap="gray")
# print('Model result on brain tumor identification:',brain_tumor_identification_model.predict(a[0,:,:,img_num].reshape((1,256,256,1)))[0][0]*100,'%')

In [None]:
# Update function to include top images containing tumor

# Default number of slices kept per scan.
NUM_IMAGES = 64

def load_dicom_images_3d(scan_id, num_imgs=NUM_IMAGES, img_size=256, mri_type="FLAIR", split="train", rotate=0):
    """Build a normalised volume from the slices scored highest by the tumor model.

    Reads the module-level ``data_directory`` and
    ``brain_tumor_identification_model_pred_df`` and the helpers ``load_dicom``
    and ``crop_image_pixel_array`` defined elsewhere in the notebook.

    Args:
        scan_id: study folder name under ``{data_directory}/{split}``.
        num_imgs: maximum number of slices kept; shorter stacks are zero-padded.
        img_size: side length every slice is resized to.
        mri_type: series sub-folder to read.
        split: dataset sub-folder.
        rotate: forwarded to ``load_dicom`` when loading the selected slices.

    Returns:
        Array with a leading axis of size 1 followed by the stacked, transposed
        slices; normally shaped (1, img_size, img_size, num_imgs).
    """

    # Natural sort: digit runs in the path are compared as integers.
    files = sorted(glob.glob(f"{data_directory}/{split}/{scan_id}/{mri_type}/*.dcm"), 
               key=lambda var:[int(x) if x.isdigit() else x for x in re.findall(r'[^0-9]|[0-9]+', var)])

    # Drop slices whose loaded pixel data sums to zero.
    files = [x for x in files if load_dicom(x).sum()!=0]
    
    # ---- Selecting best images START ----

    # Remember each file's position so the selection can be put back in order.
    files_df = pd.DataFrame(files)
    files_df.columns = ['Filename']
    files_df['Order'] = files_df.index + 1
    
    # Keep the num_imgs files with the highest cached 'Brain tumor pred', then
    # restore their original order.
    selected_image_paths = brain_tumor_identification_model_pred_df[brain_tumor_identification_model_pred_df['Filename'].isin(files)].sort_values(by = ['Brain tumor pred'], ascending = False).head(num_imgs)
    selected_image_paths = selected_image_paths.merge(files_df, on = 'Filename', how = 'left')
    selected_image_paths = selected_image_paths.sort_values(by = 'Order', ascending = True).reset_index(drop = True)
    selected_files = selected_image_paths['Filename'].tolist()
    
    # ---- Selecting best images END ----

    images_list_array = [load_dicom(f, rotate=rotate) for f in selected_files]
    
    # Cropping: the crop window is computed once on the mean image and then
    # applied to every slice.
    # NOTE(review): fails if selected_files is empty - confirm this cannot happen.
    mean_image_for_cropping = np.array(images_list_array).mean(axis = 0)
    img_cropped,dim_for_cropping = crop_image_pixel_array(mean_image_for_cropping)
    first_row_index = dim_for_cropping[0]
    last_row_index = dim_for_cropping[1]
    first_column_index = dim_for_cropping[2]
    last_column_index = dim_for_cropping[3]
    # Crop only when all four bounds are available.
    if ((not pd.isnull(first_row_index)) & (not pd.isnull(last_row_index)) & (not pd.isnull(first_column_index)) & (not pd.isnull(last_column_index))):
        images_list_array = [f[first_row_index:last_row_index,first_column_index:last_column_index] for f in images_list_array]

    images_list_array = [cv2.resize(f, (img_size, img_size)) for f in images_list_array]
    # .T reverses the axes, so the slice axis ends up last.
    img3d = np.stack(images_list_array).T

    # Zero-pad along the slice axis up to num_imgs.
    if img3d.shape[-1] < num_imgs:
        n_zero = np.zeros((img_size, img_size, num_imgs - img3d.shape[-1]))
        img3d = np.concatenate((img3d,  n_zero), axis = -1)
        
    # Min-max scale the whole volume unless it is constant.
    if np.min(img3d) < np.max(img3d):
        img3d = img3d - np.min(img3d)
        img3d = img3d / np.max(img3d)

    return np.expand_dims(img3d,0)

## 2D CNN

In [None]:
# Three-modality combinations; DataRetriever_2D_CNN builds one input channel per listed modality.
valid_combinations = [['T2w', 'T1wCE', 'T1w'], ['FLAIR', 'T2w', 'T1w'], ['FLAIR', 'T2w', 'T1wCE'], ['FLAIR', 'T1wCE', 'T1w']]

In [None]:
class DataRetriever_2D_CNN(torch_data.Dataset):
    """Dataset that turns each study into one 2D image per requested modality.

    For every modality in ``list_combinations`` a few evenly spaced slices are
    loaded, normalised, cropped, resized to 256x256 and averaged into a single
    channel.

    Args:
        paths: study ids (zero-padded to 5 digits to form the folder name).
        targets: labels aligned with ``paths``; pass an empty sequence for
            unlabeled (test) data, which also switches the folder to ``test``.
        list_combinations: modality folder names, one output channel each.
        label_smoothing: each label ``t`` is returned as ``abs(t - label_smoothing)``.
        rotate: forwarded to ``load_dicom``.
    """

    def __init__(self, paths, targets, list_combinations, label_smoothing=0.001, rotate= 0):
        self.paths = paths
        self.targets = targets
        self.label_smoothing = label_smoothing
        self.list_combinations = list_combinations
        self.rotate = rotate
        # No targets means the dataset is used for inference on the test folder.
        self.target_flag = len(targets) != 0
        self.train_flag = 'train' if self.target_flag else 'test'

    def __len__(self):
        return len(self.paths)

    def _load_channel(self, patient_path, mri_type, num_images=5):
        """Return the mean of the sampled, preprocessed slices of one modality as a (256, 256) array."""
        # Sort by the integer after the last '-' in the file name (extension stripped).
        t_paths = sorted(
            glob.glob(os.path.join(patient_path, mri_type, "*")),
            key=lambda x: int(x[:-4].split("-")[-1]),
        )

        x = len(t_paths)
        if x < num_images:
            r = range(x)
        else:
            # Evenly spaced interior slices; the first and last stretch are skipped.
            d = x // num_images
            r = range(d, x - d, d)

        slices = []
        for i in r:
            pixel_array_processed = load_dicom(t_paths[i], rotate = self.rotate)
            pixel_array_processed = pixel_array_processed - np.min(pixel_array_processed)
            peak = np.max(pixel_array_processed)
            if peak != 0:
                pixel_array_processed = pixel_array_processed / peak
            # Crop only slices that contain signal.
            if pixel_array_processed.sum() != 0:
                pixel_array_processed, _ = crop_image_pixel_array(pixel_array_processed)
            pixel_array_processed = (pixel_array_processed * 255).astype(np.uint8)
            pixel_array_processed = cv2.resize(pixel_array_processed, (256, 256)) / 255
            slices.append(pixel_array_processed)

        if not slices:
            # An empty series would make np.mean return NaN and break the
            # channel stack; use a blank channel instead.
            return np.zeros((256, 256))
        return np.mean(slices, axis=0)

    def __getitem__(self, index):
        """Return ``{"X": tensor(C, 256, 256), "y": label}``, or ``"id"`` instead of ``"y"`` without targets."""
        _id = self.paths[index]
        patient_path = f"../input/rsna-miccai-brain-tumor-radiogenomic-classification/{self.train_flag}/{str(_id).zfill(5)}/"
        channels = [self._load_channel(patient_path, t) for t in self.list_combinations]
        # Stacking into one ndarray first avoids the slow list-of-ndarrays
        # path of torch.tensor().
        X = torch.from_numpy(np.stack(channels)).float()

        if self.target_flag:
            y = torch.tensor(abs(self.targets[index]-self.label_smoothing), dtype=torch.float)
            return {"X": X, "y": y}
        return {"X": X, "id": _id}

In [None]:
# Datasets for the first modality combination, built from the train and validation frames.
train_data_retriever_comb_1 = DataRetriever_2D_CNN(
    df_train["BraTS21ID"].values,
    df_train["MGMT_value"].values, 
    list(valid_combinations[0]), rotate = 0)

valid_data_retriever_comb_1 = DataRetriever_2D_CNN(
    df_valid["BraTS21ID"].values, 
    df_valid["MGMT_value"].values,
    list(valid_combinations[0]), rotate = 0)

In [None]:
# Sanity check: tensor shape of the first training sample and a view of its second channel.
img = train_data_retriever_comb_1[0]['X']
print(img.shape)
plt.imshow(img[1], cmap="gray")

In [None]:
num = 230
plt.figure(figsize=(16, 6))
if (df_train.iloc[num]['MGMT_value'])==0:
    print('Patient without tumor')
else:
    print('Patient with tumor')
for i in range(3):
    plt.subplot(1, 3, i + 1)
    plt.imshow(train_data_retriever_comb_1[num]["X"].numpy()[i], cmap="gray")

In [None]:
plt.figure(figsize=(16, 6))
if (df_train.iloc[101]['MGMT_value'])==0:
    print('Patient without tumor')
else:
    print('Patient with tumor')
for i in range(3):
    plt.subplot(1, 3, i + 1)
    plt.imshow(train_data_retriever_comb_1[101]["X"].numpy()[i], cmap="gray")

In [None]:
# The preview datasets are no longer needed after the plots above.
del train_data_retriever_comb_1, valid_data_retriever_comb_1

In [None]:
class Model_2D_CNN(nn.Module):
    """2D EfficientNet-B7 whose classifier is replaced by a single-output linear layer."""

    def __init__(self):
        super().__init__()
        backbone = efficientnet_pytorch.EfficientNet.from_name("efficientnet-b7")
#         checkpoint = torch.load('../input/efficientnet-pytorch/efficientnet-b7-dcc49843.pth')
#         backbone.load_state_dict(checkpoint)
        # Keep the classifier's input width, but emit one value per sample.
        backbone._fc = nn.Linear(
            in_features=backbone._fc.in_features,
            out_features=1,
            bias=True,
        )
        self.net = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its output."""
        return self.net(x)

In [None]:
# class LossMeter:
#     def __init__(self):
#         self.avg = 0
#         self.n = 0

#     def update(self, val):
#         self.n += 1
#         # incremental update
#         self.avg = val / self.n + (self.n - 1) / self.n * self.avg

        
# class AccMeter:
#     def __init__(self):
#         self.true_count = 0
#         self.n = 0
#         self.avg = 0
        
#     def update(self, y_true, y_pred):
#         y_true = y_true.cpu().numpy() >= 0.5
#         y_pred = y_pred.cpu().numpy() >= 0
#         self.n += len(y_true)
#         self.true_count += np.sum(y_true == y_pred)
#         # incremental update
#         if self.n != 0:
#             self.avg = self.true_count / self.n
#         else:
#             self.avg = 0

In [None]:
# class Trainer:
#     def __init__(
#         self, 
#         model, 
#         device, 
#         optimizer, 
#         criterion,
#         best_valid_score
#     ):
#         self.model = model
#         self.device = device
#         self.optimizer = optimizer
#         self.criterion = criterion
        
#         self.best_valid_score = best_valid_score
#         self.n_patience = 0
        
#         self.messages = {
#             "epoch": "[Epoch {}: {}] loss: {:.5f}, score: {:.5f}, time: {} s",
#             "checkpoint": "The score improved from {:.5f} to {:.5f}. Save model to '{}'",
#             "patience": "\nValid loss didn't improve last {} epochs."
#         }
    
#     def fit(self, epochs, train_loader, valid_loader, save_path, patience):        
#         for n_epoch in range(1, epochs + 1):
#             self.info_message("EPOCH: {}", n_epoch)
            
#             train_loss, train_score, train_time = self.train_epoch(train_loader)
#             valid_loss, valid_score, valid_time = self.valid_epoch(valid_loader)
            
#             self.info_message(
#                 self.messages["epoch"], "Train", n_epoch, train_loss, train_score, train_time
#             )
            
#             self.info_message(
#                 self.messages["epoch"], "Valid", n_epoch, valid_loss, valid_score, valid_time
#             )

#             if (self.best_valid_score < valid_score):
#                 self.info_message(
#                     self.messages["checkpoint"], self.best_valid_score, valid_score, save_path
#                 )
#                 self.best_valid_score = valid_score
#                 self.save_model(n_epoch, save_path)
#                 self.n_patience = 0
#             else:
#                 self.n_patience += 1
            
#             if self.n_patience >= patience:
#                 self.info_message(self.messages["patience"], patience)
#                 break
            
#     def train_epoch(self, train_loader):
#         self.model.train()
#         t = time.time()
#         sum_loss = 0
#         y_all =[]
#         outputs_all = []
        
#         for step, batch in enumerate(train_loader, 1):
#             X = batch["X"].to(self.device)
#             targets = batch["y"].to(self.device)
#             self.optimizer.zero_grad()
#             outputs = self.model(X).squeeze(1)
            
#             loss = self.criterion(outputs, targets)
#             loss.backward()

#             sum_loss += loss.detach().item()
#             y_all.extend(batch["y"].tolist())
#             outputs_all.extend(torch.sigmoid(outputs).tolist())

#         y_all = [1 if x > 0.5 else 0 for x in y_all]
#         auc = roc_auc_score(y_all, outputs_all)

#         _loss, _score = sum_loss/step, auc
#         message = 'Train Step {}/{}, train_loss: {:.5f}, train_roc_auc: {:.5f}'
#         self.info_message(message, step, len(train_loader), _loss, _score, end="\r")
        
#         return _loss, _score, int(time.time() - t)
    
#     def valid_epoch(self, valid_loader):
#         self.model.eval()
#         t = time.time()
#         sum_loss = 0
#         y_all =[]
#         outputs_all = []

#         for step, batch in enumerate(valid_loader, 1):
#             with torch.no_grad():
#                 X = batch["X"].to(self.device)
#                 targets = batch["y"].to(self.device)

#                 outputs = self.model(X).squeeze(1)
#                 loss = self.criterion(outputs, targets)

#                 sum_loss += loss.detach().item()
#                 y_all.extend(batch["y"].tolist())
#                 outputs_all.extend(torch.sigmoid(outputs).tolist())
            
#         y_all = [1 if x > 0.5 else 0 for x in y_all]
#         auc = roc_auc_score(y_all, outputs_all)

#         _loss, _score = sum_loss/step, auc
#         message = 'Valid Step {}/{}, valid_loss: {:.5f}, valid_roc_auc: {:.5f}'
#         self.info_message(message, step, len(valid_loader), _loss, _score, end="\r")

#         return _loss, _score, int(time.time() - t)
    
#     def save_model(self, n_epoch, save_path):
#         torch.save(
#             {
#                 "model_state_dict": self.model.state_dict(),
#                 "optimizer_state_dict": self.optimizer.state_dict(),
#                 "best_valid_score": self.best_valid_score,
#                 "n_epoch": n_epoch,
#             },
#             save_path,
#         )
    
#     @staticmethod
#     def info_message(message, *args, end="\n"):
#         print(message.format(*args), end=end)

In [None]:
# Display the contents of `valid_combinations` as the cell output.
valid_combinations

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# for i in range(0,len(valid_combinations)):
#     print('Combination :', list(valid_combinations[i]))
    
#     gc.collect()
#     torch.cuda.empty_cache()
#     model = Model_2D_CNN()
#     model.to(device)

#     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#     criterion = torch_functional.binary_cross_entropy_with_logits

#     best_valid_score = -np.inf

#     trainer = Trainer(
#         model, 
#         device, 
#         optimizer, 
#         criterion,
#         best_valid_score
#     )
    
#     for rotate_dir in [0,2]:
#         train_data_retriever = DataRetriever_2D_CNN(
#             df_train["BraTS21ID"].values, 
#             df_train["MGMT_value"].values, 
#             list(valid_combinations[i]), rotate = rotate_dir)

#         valid_data_retriever = DataRetriever_2D_CNN(
#             df_valid["BraTS21ID"].values, 
#             df_valid["MGMT_value"].values,
#             list(valid_combinations[i]), rotate = rotate_dir)

#         train_loader = torch_data.DataLoader(
#             train_data_retriever,
#             batch_size=4,
#             shuffle=True,
#             num_workers=8,
#         )

#         valid_loader = torch_data.DataLoader(
#             valid_data_retriever, 
#             batch_size=4,
#             shuffle=False,
#             num_workers=8,
#         )

#         history = trainer.fit(
#             30, 
#             train_loader,
#             valid_loader, 
#             f"best-model-{i}.pth",
#             10,
#         )

In [None]:
# from __future__ import print_function  # for Python2

# local_vars = list(locals().items())
# for var, obj in local_vars:
#     print(var, sys.getsizeof(obj))

In [None]:
# del image_data, intersection_files, brain_tumor_identification_model

### User defined 3D CNN

In [None]:
def get_model(width=128, height=128, depth=64):
    """Build a 3D convolutional neural network model.

    The input has shape ``(width, height, depth, 1)``. It passes through four
    Conv3D -> MaxPool3D -> BatchNormalization stages (64, 64, 128 and 256
    filters), then global average pooling, a 512-unit dense layer, dropout
    (rate 0.3) and a single sigmoid output unit.

    Returns:
        An uncompiled ``tensorflow.keras.Model`` named ``"3dcnn"``.
    """
    inputs = tensorflow.keras.Input((width, height, depth, 1))

    # Convolutional feature extractor: one stage per filter count.
    x = inputs
    for n_filters in (64, 64, 128, 256):
        x = layers.Conv3D(filters=n_filters, kernel_size=3, activation="relu")(x)
        x = layers.MaxPool3D(pool_size=2)(x)
        x = layers.BatchNormalization()(x)

    # Classification head.
    x = layers.GlobalAveragePooling3D()(x)
    x = layers.Dense(units=512, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(units=1, activation="sigmoid")(x)

    return tensorflow.keras.Model(inputs, outputs, name="3dcnn")

In [None]:
# Add a zero-padded, five-character string version of every patient ID.
df_train['BraTS21ID5'] = df_train['BraTS21ID'].apply(lambda case_id: f"{case_id:05d}")
df_valid['BraTS21ID5'] = df_valid['BraTS21ID'].apply(lambda case_id: f"{case_id:05d}")

In [None]:
class Dataset(Sequence):
    """Keras ``Sequence`` serving 3D MRI volumes loaded by ``load_dicom_images_3d``.

    Args:
        df: frame with the columns ``BraTS21ID``, ``BraTS21ID5`` (zero-padded
            ID string passed to the loader) and ``MGMT_value`` (label).
        mri_type: MRI sequence name forwarded to ``load_dicom_images_3d``.
        is_train: when True, ``__getitem__`` returns ``(batch_X, batch_y)``
            for a whole batch; otherwise it returns the volume of one case.
        batch_size: number of cases per training batch.
        shuffle: reshuffle the cases at the end of every epoch (training only).
    """

    def __init__(self,df,mri_type,is_train=True,batch_size=4,shuffle=True):
        self.idx = df["BraTS21ID"].values
        self.paths = df["BraTS21ID5"].values
        self.y =  df["MGMT_value"].values
        self.is_train = is_train
        self.batch_size = batch_size
        self.mri_type = mri_type
        self.shuffle = shuffle

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        return math.ceil(len(self.idx)/self.batch_size)

    def __getitem__(self,ids):
        """Return batch number ``ids`` (training) or the single case ``ids`` (inference)."""
        if self.is_train:
            start = ids * self.batch_size
            stop = start + self.batch_size
            batch_paths = self.paths[start:stop]
            batch_y = self.y[start:stop] if self.y is not None else None

            # Forward the configured MRI type so that training reads the same
            # sequence as the inference branch below.
            list_x = [
                load_dicom_images_3d(x, split="train", mri_type=self.mri_type)
                for x in batch_paths
            ]
            # NOTE(review): stacking on axis 4 depends on the shape returned by
            # load_dicom_images_3d -- confirm against that helper.
            batch_X = np.stack(list_x, axis=4)
            return batch_X,batch_y

        # Inference: `ids` addresses one case rather than a batch.
        list_x = load_dicom_images_3d(self.paths[ids], split="test", mri_type=self.mri_type)
        return np.stack(list_x)

    def on_epoch_end(self):
        """Reshuffle the cases, keeping IDs, paths and labels aligned."""
        if self.shuffle and self.is_train:
            # One permutation applied to all three arrays: the volumes are
            # looked up through `paths`, so it must move together with `y`.
            order = np.random.permutation(len(self.idx))
            self.idx = np.asarray(self.idx)[order]
            self.paths = np.asarray(self.paths)[order]
            self.y = np.asarray(self.y)[order]

In [None]:
# MRI sequence names used to select which scans are loaded.
mri_types = [
    'FLAIR',
    'T1w',
    'T1wCE',
    'T2w',
]

In [None]:
# Exponential-decay learning-rate schedule for the Keras optimizer.
initial_learning_rate = 0.0001
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True,
)

# for i in range(0, len(mri_types)):
#     train_dataset = Dataset(df_train,mri_type = mri_types[i], batch_size=4)
#     valid_dataset = Dataset(df_valid,mri_type = mri_types[i], batch_size=4)

#     # Define callbacks
#     model_save = ModelCheckpoint(f'{mri_types[i]}.h5', 
#                                  save_best_only = True, 
#                                  monitor = 'val_auc', 
#                                  mode = 'max', verbose = 1)
#     early_stop = EarlyStopping(monitor = 'val_auc', 
#                                patience = 10, mode = 'max', verbose = 1,
#                                restore_best_weights = True)

#     # Train the model, doing validation at the end of each epoch
#     epochs = 1

#     # Build model.
#     model = get_model(width=256, height=256, depth=64)
#     model.summary()

#     model.compile(
#         loss="binary_crossentropy",
#         optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
#         metrics=[AUC(name='auc'),"acc"],
#     )

#     model.fit(
#         train_dataset,
#         validation_data=valid_dataset,
#         epochs=epochs,
#         shuffle=True,
#         verbose=1,
#         callbacks = [model_save, early_stop],
#     )

### Transfer learning - 3D CNN

In [None]:
class DataRetriever(torch_data.Dataset):
    """Torch dataset yielding one 3D volume per case via ``load_dicom_images_3d``.

    When ``targets`` is empty the dataset switches to the ``'test'`` split and
    returns the case id instead of a label. Labels are smoothed as
    ``abs(target - label_smoothing)``.
    """

    def __init__(self, paths, targets, mri_type, label_smoothing=0.01, augment = True):
        self.paths = paths
        self.targets = targets
        self.mri_type = mri_type
        self.label_smoothing = label_smoothing
        self.augment = augment
        # No targets means inference on the test split.
        has_targets = len(targets) != 0
        self.target_flag = has_targets
        self.train_flag = 'train' if has_targets else 'test'

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        """Return ``{"X", "y"}`` when targets exist, otherwise ``{"X", "id"}``."""
        _id = self.paths[index]
        # With augmentation, draw a rotation code from {0, 1, 2, 3}.
        rotation = np.random.randint(0,4) if self.augment else 0
        volume = load_dicom_images_3d(
            str(_id).zfill(5),
            mri_type=self.mri_type,
            split=self.train_flag,
            rotate=rotation,
        )

        sample = {"X": torch.tensor(volume).float()}
        if self.target_flag:
            smoothed = abs(self.targets[index]-self.label_smoothing)
            sample["y"] = torch.tensor(smoothed, dtype=torch.float)
        else:
            sample["id"] = _id
        return sample

In [None]:
class Model_3D_effnet(nn.Module):
    """3D EfficientNet-B0 (one input channel) with a single-output linear head."""

    def __init__(self):
        super().__init__()
        backbone = EfficientNet3D.from_name(
            "efficientnet-b0",
            override_params={'num_classes': 2},
            in_channels=1,
        )
        # Replace the classifier built above with a one-output layer of the
        # same input width.
        backbone._fc = nn.Linear(
            in_features=backbone._fc.in_features,
            out_features=1,
            bias=True,
        )
        self.net = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its output."""
        return self.net(x)

In [None]:
class Trainer:
    """Training/validation loop with early stopping on the validation loss.

    A checkpoint is written whenever an epoch's mean validation loss drops
    below the best value seen so far.
    """

    def __init__(
        self, 
        model, 
        device, 
        optimizer, 
        criterion
    ):
        """Store the training components.

        Args:
            model: network to optimise; called as ``model(X)``.
            device: device every batch is moved to.
            optimizer: optimizer stepped once per training batch.
            criterion: callable ``criterion(outputs, targets)`` returning the loss.
        """
        self.model = model
        self.device = device
        self.optimizer = optimizer
        self.criterion = criterion

        # Lowest validation loss seen so far (lower is better).
        self.best_valid_score = np.inf
        self.n_patience = 0
        self.lastmodel = None
        
    def fit(self, epochs, train_loader, valid_loader, save_path, patience):
        """Train for up to ``epochs`` epochs.

        Args:
            epochs: maximum number of epochs.
            train_loader: iterable of batches (dicts with ``"X"`` and ``"y"``).
            valid_loader: iterable of validation batches of the same form.
            save_path: file the best checkpoint is written to.
            patience: stop after this many consecutive epochs without a new
                best validation loss.
        """
        for n_epoch in range(1, epochs + 1):
            self.info_message("EPOCH: {}", n_epoch)
            
            train_loss, train_time = self.train_epoch(train_loader)
            valid_loss, valid_auc, valid_time = self.valid_epoch(valid_loader)
            
            self.info_message(
                "[Epoch Train: {}] loss: {:.4f}, time: {:.2f} s            ",
                n_epoch, train_loss, train_time
            )
            
            self.info_message(
                "[Epoch Valid: {}] loss: {:.4f}, auc: {:.4f}, time: {:.2f} s",
                n_epoch, valid_loss, valid_auc, valid_time
            )

            if self.best_valid_score > valid_loss:
                # Record the new best *before* saving so the checkpoint stores
                # the score of the weights it contains, not the previous best.
                previous_best = self.best_valid_score
                self.best_valid_score = valid_loss
                self.save_model(n_epoch, save_path, valid_loss, valid_auc)
                self.info_message(
                     "Valid loss improved from {:.4f} to {:.4f}. Saved model to '{}'", 
                    previous_best, valid_loss, self.lastmodel
                )
                self.n_patience = 0
            else:
                self.n_patience += 1
            
            if self.n_patience >= patience:
                self.info_message("\nValid loss didn't improve last {} epochs.", patience)
                break
            
    def train_epoch(self, train_loader):
        """Run one optimisation pass; return ``(mean_loss, elapsed_seconds)``."""
        self.model.train()
        t = time.time()
        sum_loss = 0

        for step, batch in enumerate(train_loader, 1):
            X = batch["X"].to(self.device)
            targets = batch["y"].to(self.device)
            self.optimizer.zero_grad()
            # Drop dimension 1 of the model output so it lines up with `targets`.
            outputs = self.model(X).squeeze(1)
            
            loss = self.criterion(outputs, targets)
            loss.backward()

            sum_loss += loss.detach().item()

            self.optimizer.step()
            
            # Running mean, rewritten in place on one console line.
            message = 'Train Step {}/{}, train_loss: {:.4f}'
            self.info_message(message, step, len(train_loader), sum_loss/step, end="\r")
        
        return sum_loss/len(train_loader), int(time.time() - t)
    
    def valid_epoch(self, valid_loader):
        """Evaluate without gradients; return ``(mean_loss, roc_auc, elapsed_seconds)``."""
        self.model.eval()
        t = time.time()
        sum_loss = 0
        y_all = []
        outputs_all = []

        for step, batch in enumerate(valid_loader, 1):
            with torch.no_grad():
                X = batch["X"].to(self.device)
                targets = batch["y"].to(self.device)

                outputs = self.model(X).squeeze(1)
                loss = self.criterion(outputs, targets)

                sum_loss += loss.detach().item()
                y_all.extend(batch["y"].tolist())
                outputs_all.extend(torch.sigmoid(outputs).tolist())

            message = 'Valid Step {}/{}, valid_loss: {:.4f}'
            self.info_message(message, step, len(valid_loader), sum_loss/step, end="\r")
            
        # Targets may be non-binary floats (e.g. after label smoothing), so
        # threshold them back to 0/1 before computing the AUC.
        y_all = [1 if x > 0.5 else 0 for x in y_all]
        auc = roc_auc_score(y_all, outputs_all)
        
        return sum_loss/len(valid_loader), auc, int(time.time() - t)
    
    def save_model(self, n_epoch, save_path, loss, auc):
        """Write model/optimizer state, the best score and the epoch to ``save_path``.

        ``loss`` and ``auc`` are accepted for interface compatibility but are
        not stored.
        """
        self.lastmodel = f"{save_path}"
        torch.save(
            {
                "model_state_dict": self.model.state_dict(),
                "optimizer_state_dict": self.optimizer.state_dict(),
                "best_valid_score": self.best_valid_score,
                "n_epoch": n_epoch,
            },
            self.lastmodel,
        )
    
    @staticmethod
    def info_message(message, *args, end="\n"):
        """Print ``message`` formatted with ``args``."""
        print(message.format(*args), end=end)

In [None]:
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

mri_types = ['FLAIR', 'T1w', 'T1wCE', 'T2w']

# Train one 3D EfficientNet per MRI type; each run writes its own checkpoint.
for i in tqdm(range(len(mri_types))):
    print('MRI type :', mri_types[i])

    # Run garbage collection and empty the CUDA cache before building a new model.
    gc.collect()
    torch.cuda.empty_cache()

    model = Model_3D_effnet()
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch_functional.binary_cross_entropy_with_logits

    trainer = Trainer(model, device, optimizer, criterion)

    # Augmentation is enabled for training only.
    train_data_retriever = DataRetriever(
        df_train["BraTS21ID"].values,
        df_train["MGMT_value"].values,
        mri_types[i],
        augment=True,
    )
    valid_data_retriever = DataRetriever(
        df_valid["BraTS21ID"].values,
        df_valid["MGMT_value"].values,
        mri_types[i],
        augment=False,
    )

    train_loader = torch_data.DataLoader(
        train_data_retriever,
        batch_size=4,
        shuffle=True,
        num_workers=8,
        pin_memory=True,
    )
    valid_loader = torch_data.DataLoader(
        valid_data_retriever,
        batch_size=4,
        shuffle=False,
        num_workers=8,
        pin_memory=True,
    )

    # Up to 30 epochs, stopping after 10 epochs without improvement.
    history = trainer.fit(
        epochs=30,
        train_loader=train_loader,
        valid_loader=valid_loader,
        save_path=f"best-model-{mri_types[i]}.pth",
        patience=10,
    )

# Transfer learning - nnUnet + CNN

In [None]:
class DataRetriever_unet(torch_data.Dataset):
    """Torch dataset of nnUNet-masked slices for a single MRI type.

    For every case the ``num_imgs`` slices with the largest mask sum are
    selected, multiplied by the mask, filtered for non-empty results, stacked
    along the last axis and zero-padded to ``num_imgs`` slices.

    Images are always read from ``../input/preprocessed-train-images`` and
    masks from ``../input/nnunet-masks``, whatever the value of ``train_flag``.

    NOTE: ``augment`` is accepted and stored for interface compatibility, but
    no augmentation is applied to the returned volume.
    """

    # File-name suffix of the preprocessed NIfTI image for each MRI type.
    _MODALITY_SUFFIX = {
        'FLAIR': '0000',
        'T1w': '0001',
        'T1wCE': '0002',
        'T2w': '0003',
    }

    def __init__(self, paths, targets, label_smoothing=0.01, augment = True, num_imgs = 64, mri_type = 'FLAIR'):
        self.paths = paths
        self.targets = targets
        self.target_flag = True
        self.train_flag = 'train'
        self.label_smoothing = label_smoothing
        self.augment = augment
        self.num_imgs = num_imgs
        self.mri_type = mri_type
        # No targets means inference: return ids instead of labels.
        if len(targets)==0:
            self.target_flag = False
            self.train_flag = 'test'
          
    def __len__(self):
        return len(self.paths)

    @staticmethod
    def _select_slices(unet_mask, count):
        """Return, in ascending order, the indices of the ``count`` slices with the largest mask sum."""
        per_slice = unet_mask.sum(axis=tuple(range(1, unet_mask.ndim)))
        # Cast before negating so unsigned sums cannot wrap around; the stable
        # sort resolves ties in favour of the lower slice index.
        ranked = np.argsort(-per_slice.astype(np.float64), kind='stable')
        return sorted(int(i) for i in ranked[:count])

    @staticmethod
    def _masked_volume(unet_mask, image, slice_ids, num_imgs):
        """Stack the non-empty masked slices and zero-pad to ``num_imgs``; shape ``(1, H, W, num_imgs)``."""
        masked = [unet_mask[s] * image[s] for s in slice_ids]
        masked = [m for m in masked if m.sum() != 0]

        if masked:
            volume = np.stack(masked, axis = 2)
        else:
            # Nothing survived the mask: start from an empty stack so the
            # padding below yields an all-zero volume instead of crashing.
            volume = np.zeros(tuple(unet_mask.shape[1:3]) + (0,))

        missing = num_imgs - volume.shape[-1]
        if missing > 0:
            # Pad with the volume's own height/width rather than a fixed size.
            padding = np.zeros(volume.shape[:2] + (missing,))
            volume = np.concatenate((volume, padding), axis = -1)

        return np.expand_dims(volume, 0)
    
    def __getitem__(self, index):
        """Return ``{"X", "y"}`` when targets exist, otherwise ``{"X", "id"}``.

        Raises:
            ValueError: if ``mri_type`` is not one of the supported types.
        """
        _id = self.paths[index]

        try:
            suffix = self._MODALITY_SUFFIX[self.mri_type]
        except KeyError:
            raise ValueError(
                f"Unsupported mri_type {self.mri_type!r}; "
                f"expected one of {sorted(self._MODALITY_SUFFIX)}"
            ) from None

        case = str(_id).zfill(5)
        unet_mask = get_array(f'../input/nnunet-masks/{case}.nii.gz')
        temp = get_array(f'../input/preprocessed-train-images/{case}_{suffix}.nii.gz')

        selected_slices = self._select_slices(unet_mask, int(self.num_imgs))
        data = self._masked_volume(unet_mask, temp, selected_slices, self.num_imgs)

        X = torch.tensor(data).float()
        if (self.target_flag):
            y = torch.tensor(abs(self.targets[index]-self.label_smoothing), dtype=torch.float)
            return {"X": X, "y": y}
        else:
            return {"X": X, "id": _id}

In [None]:
class DataRetriever_unet_all_modalities(torch_data.Dataset):
    """Torch dataset interleaving nnUNet-masked slices of all four MRI types.

    For every case the ``num_imgs / 4`` slices with the largest mask sum are
    selected; each contributes four masked images (FLAIR, T1w, T1wCE, T2w, in
    that order). The result is stacked along the last axis and zero-padded to
    ``num_imgs`` images.

    Images are always read from ``../input/preprocessed-train-images`` and
    masks from ``../input/nnunet-masks``, whatever the value of ``train_flag``.

    NOTE: ``augment`` is accepted and stored for interface compatibility, but
    no augmentation is applied to the returned volume.
    """

    # File-name suffixes of the preprocessed images: FLAIR, T1w, T1wCE, T2w.
    _MODALITY_SUFFIXES = ('0000', '0001', '0002', '0003')

    def __init__(self, paths, targets, label_smoothing=0.01, augment = True, num_imgs = 64):
        self.paths = paths
        self.targets = targets
        self.target_flag = True
        self.train_flag = 'train'
        self.label_smoothing = label_smoothing
        self.augment = augment
        self.num_imgs = num_imgs
        # No targets means inference: return ids instead of labels.
        if len(targets)==0:
            self.target_flag = False
            self.train_flag = 'test'
          
    def __len__(self):
        return len(self.paths)

    @staticmethod
    def _select_slices(unet_mask, count):
        """Return, in ascending order, the indices of the ``count`` slices with the largest mask sum."""
        per_slice = unet_mask.sum(axis=tuple(range(1, unet_mask.ndim)))
        # Cast before negating so unsigned sums cannot wrap around; the stable
        # sort resolves ties in favour of the lower slice index.
        ranked = np.argsort(-per_slice.astype(np.float64), kind='stable')
        return sorted(int(i) for i in ranked[:count])

    @staticmethod
    def _interleave_modalities(unet_mask, volumes, slice_ids, num_imgs):
        """Mask every volume at each selected slice, stack and zero-pad; shape ``(1, H, W, num_imgs)``."""
        masked = [
            unet_mask[s] * volume[s]
            for s in slice_ids
            for volume in volumes
        ]

        if masked:
            data = np.stack(masked, axis = 2)
        else:
            # No slice selected: start from an empty stack so the padding
            # below yields an all-zero volume instead of crashing.
            data = np.zeros(tuple(unet_mask.shape[1:3]) + (0,))

        missing = num_imgs - data.shape[-1]
        if missing > 0:
            # Pad with the data's own height/width rather than a fixed size.
            padding = np.zeros(data.shape[:2] + (missing,))
            data = np.concatenate((data, padding), axis = -1)

        return np.expand_dims(data, 0)
    
    def __getitem__(self, index):
        """Return ``{"X", "y"}`` when targets exist, otherwise ``{"X", "id"}``."""
        _id = self.paths[index]
        case = str(_id).zfill(5)

        unet_mask = get_array(f'../input/nnunet-masks/{case}.nii.gz')
        volumes = [
            get_array(f'../input/preprocessed-train-images/{case}_{suffix}.nii.gz')
            for suffix in self._MODALITY_SUFFIXES
        ]

        # Each slice yields four images, so a quarter of num_imgs slices are used.
        selected_slices = self._select_slices(unet_mask, int(self.num_imgs / 4))
        data = self._interleave_modalities(unet_mask, volumes, selected_slices, self.num_imgs)

        X = torch.tensor(data).float()
        if (self.target_flag):
            y = torch.tensor(abs(self.targets[index]-self.label_smoothing), dtype=torch.float)
            return {"X": X, "y": y}
        else:
            return {"X": X, "id": _id}

In [None]:
class Model_3D_effnet(nn.Module):
    """3D EfficientNet-B0 (one input channel) with a single-output linear head."""

    def __init__(self):
        super().__init__()
        backbone = EfficientNet3D.from_name(
            "efficientnet-b0",
            override_params={'num_classes': 2},
            in_channels=1,
        )
        # Replace the classifier built above with a one-output layer of the
        # same input width.
        backbone._fc = nn.Linear(
            in_features=backbone._fc.in_features,
            out_features=1,
            bias=True,
        )
        self.net = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its output."""
        return self.net(x)

In [None]:
class Trainer:
    """Training/validation loop with early stopping on the validation AUC.

    A checkpoint is written when an epoch's validation AUC beats the best
    value so far *and* the train and validation losses differ by less
    than 0.05.
    """

    def __init__(
        self, 
        model, 
        device, 
        optimizer, 
        criterion
    ):
        """Store the training components.

        Args:
            model: network to optimise; called as ``model(X)``.
            device: device every batch is moved to.
            optimizer: optimizer stepped once per training batch.
            criterion: callable ``criterion(outputs, targets)`` returning the loss.
        """
        self.model = model
        self.device = device
        self.optimizer = optimizer
        self.criterion = criterion

        # Highest validation AUC seen so far (higher is better).
        self.best_valid_score = -np.inf
        self.n_patience = 0
        self.lastmodel = None
        
    def fit(self, epochs, train_loader, valid_loader, save_path, patience):
        """Train for up to ``epochs`` epochs.

        Args:
            epochs: maximum number of epochs.
            train_loader: iterable of batches (dicts with ``"X"`` and ``"y"``).
            valid_loader: iterable of validation batches of the same form.
            save_path: file the best checkpoint is written to.
            patience: stop after this many consecutive epochs without a saved
                improvement.
        """
        for n_epoch in range(1, epochs + 1):
            self.info_message("EPOCH: {}", n_epoch)
            
            train_loss, train_time = self.train_epoch(train_loader)
            valid_loss, valid_auc, valid_time = self.valid_epoch(valid_loader)
            
            self.info_message(
                "[Epoch Train: {}] loss: {:.4f}, time: {:.2f} s            ",
                n_epoch, train_loss, train_time
            )
            
            self.info_message(
                "[Epoch Valid: {}] loss: {:.4f}, auc: {:.4f}, time: {:.2f} s",
                n_epoch, valid_loss, valid_auc, valid_time
            )

            # Accept a better AUC only when the train and validation losses
            # are within 0.05 of each other.
            auc_improved = self.best_valid_score < valid_auc
            losses_close = np.abs(train_loss - valid_loss) < 0.05
            if auc_improved and losses_close:
                # Record the new best *before* saving so the checkpoint stores
                # the score of the weights it contains, not the previous best.
                previous_best = self.best_valid_score
                self.best_valid_score = valid_auc
                self.save_model(n_epoch, save_path, valid_loss, valid_auc)
                self.info_message(
                     "Valid AUC improved from {:.4f} to {:.4f}. Saved model to '{}'", 
                    previous_best, valid_auc, self.lastmodel
                )
                self.n_patience = 0
            else:
                self.n_patience += 1
            
            if self.n_patience >= patience:
                self.info_message("\nValid auc didn't improve last {} epochs.", patience)
                break
            
    def train_epoch(self, train_loader):
        """Run one optimisation pass; return ``(mean_loss, elapsed_seconds)``."""
        self.model.train()
        t = time.time()
        sum_loss = 0

        for step, batch in enumerate(train_loader, 1):
            X = batch["X"].to(self.device)
            targets = batch["y"].to(self.device)
            self.optimizer.zero_grad()
            # Drop dimension 1 of the model output so it lines up with `targets`.
            outputs = self.model(X).squeeze(1)
            
            loss = self.criterion(outputs, targets)
            loss.backward()

            sum_loss += loss.detach().item()

            self.optimizer.step()
            
            # Running mean, rewritten in place on one console line.
            message = 'Train Step {}/{}, train_loss: {:.4f}'
            self.info_message(message, step, len(train_loader), sum_loss/step, end="\r")
        
        return sum_loss/len(train_loader), int(time.time() - t)
    
    def valid_epoch(self, valid_loader):
        """Evaluate without gradients; return ``(mean_loss, roc_auc, elapsed_seconds)``."""
        self.model.eval()
        t = time.time()
        sum_loss = 0
        y_all = []
        outputs_all = []

        for step, batch in enumerate(valid_loader, 1):
            with torch.no_grad():
                X = batch["X"].to(self.device)
                targets = batch["y"].to(self.device)

                outputs = self.model(X).squeeze(1)
                loss = self.criterion(outputs, targets)

                sum_loss += loss.detach().item()
                y_all.extend(batch["y"].tolist())
                outputs_all.extend(torch.sigmoid(outputs).tolist())

            message = 'Valid Step {}/{}, valid_loss: {:.4f}'
            self.info_message(message, step, len(valid_loader), sum_loss/step, end="\r")
            
        # Targets may be non-binary floats (e.g. after label smoothing), so
        # threshold them back to 0/1 before computing the AUC.
        y_all = [1 if x > 0.5 else 0 for x in y_all]
        auc = roc_auc_score(y_all, outputs_all)
        
        return sum_loss/len(valid_loader), auc, int(time.time() - t)
    
    def save_model(self, n_epoch, save_path, loss, auc):
        """Write model/optimizer state, the best score and the epoch to ``save_path``.

        ``loss`` and ``auc`` are accepted for interface compatibility but are
        not stored.
        """
        self.lastmodel = f"{save_path}"
        torch.save(
            {
                "model_state_dict": self.model.state_dict(),
                "optimizer_state_dict": self.optimizer.state_dict(),
                "best_valid_score": self.best_valid_score,
                "n_epoch": n_epoch,
            },
            self.lastmodel,
        )
    
    @staticmethod
    def info_message(message, *args, end="\n"):
        """Print ``message`` formatted with ``args``."""
        print(message.format(*args), end=end)

In [None]:
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

mri_types = ['FLAIR', 'T1w', 'T1wCE', 'T2w']

# Train one 3D EfficientNet per MRI type on the nnUNet-masked slices; each
# run writes its own checkpoint.
for i in tqdm(range(len(mri_types))):
    print('MRI type :', mri_types[i])

    # Run garbage collection and empty the CUDA cache before building a new model.
    gc.collect()
    torch.cuda.empty_cache()

    model = Model_3D_effnet()
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch_functional.binary_cross_entropy_with_logits

    trainer = Trainer(model, device, optimizer, criterion)

    train_data_retriever = DataRetriever_unet(
        df_train["BraTS21ID"].values,
        df_train["MGMT_value"].values,
        augment=True,
        mri_type=mri_types[i],
        num_imgs=64,
    )
    valid_data_retriever = DataRetriever_unet(
        df_valid["BraTS21ID"].values,
        df_valid["MGMT_value"].values,
        augment=False,
        mri_type=mri_types[i],
        num_imgs=64,
    )

    train_loader = torch_data.DataLoader(
        train_data_retriever,
        batch_size=4,
        shuffle=True,
        num_workers=8,
        pin_memory=True,
    )
    valid_loader = torch_data.DataLoader(
        valid_data_retriever,
        batch_size=4,
        shuffle=False,
        num_workers=8,
        pin_memory=True,
    )

    # Up to 50 epochs, stopping after 15 epochs without improvement.
    history = trainer.fit(
        epochs=50,
        train_loader=train_loader,
        valid_loader=valid_loader,
        save_path=f"unet_images_3d_cnn_best-model_{mri_types[i]}.pth",
        patience=15,
    )

# LSTM

In [None]:
# Train Model Class
class nn_model():
    """Train an ``LSTM_RSNA`` model on per-slice VGG19 embeddings.

    Each slice of an input volume is pushed through the VGG19 convolutional
    stack, its average pool and the first classifier layer, giving a
    4096-dimensional embedding per slice. The resulting sequence is fed to
    the model created by :meth:`model_arch`.

    NOTE(review): ``num_classes`` and ``embed_size`` are accepted but the
    embedding width is fixed at 4096 by the VGG layer used.
    """

    def __init__(self, loader,criterion,num_epochs=25,embed_size=4096, num_classes=2,device='cpu',debug=False):
        """Store the training configuration and build the frozen VGG19 encoder.

        :param loader: mapping with ``'train'`` and ``'val'`` data loaders; when it
            is not such a mapping, the module-level ``dataloaders`` is used instead
        :param criterion: loss callable invoked as ``criterion(outputs, labels)``
        :param num_epochs: number of epochs run by :meth:`train_model`
        :param device: device the LSTM model and batches are moved to
        :param debug: when true, only the first four batches of each phase are used
        """
        torch.cuda.empty_cache()

        self.loader = loader
        self.embed_size = embed_size
        self.device = device
        self.debug = debug

        vgg = torchvision.models.vgg19(pretrained = True)

        self.vgg_feat  = nn.Sequential(vgg.features)
        self.vgg_pool = nn.Sequential(vgg.avgpool)
        self.vgg_class = nn.Sequential(vgg.classifier[0])

        self.criterion = criterion
        self.num_epochs = num_epochs
        del vgg

    def _phase_loaders(self):
        """Return the ``{'train': ..., 'val': ...}`` loaders used for training."""
        loader = getattr(self, 'loader', None)
        if isinstance(loader, dict) and 'train' in loader and 'val' in loader:
            return loader
        # Backward compatible fallback: the notebook-level global.
        return dataloaders

    def encoder(self,imgs):
        """Embed every slice of every volume in ``imgs`` with the VGG19 layers.

        :param imgs: 4-D tensor indexed as ``[volume, slice, row, col]``
        :return: CPU float64 tensor of shape ``(volumes, slices, 4096)``; also
            stored on ``self.embed``
        """
        features = np.zeros((imgs.shape[0],imgs.shape[1],4096))
        # The VGG weights are never optimised, so no autograd graph is needed.
        with torch.no_grad():
            for i in range(imgs.shape[0]):
                for j in range(imgs.shape[1]):
                    single_slice = imgs[i,j:j+1,:,:]
                    # Replicate the single channel to the three channels VGG expects;
                    # the cast also brings the data to the CPU where the VGG layers live.
                    rgb = single_slice.repeat(3,1,1).type(torch.FloatTensor).unsqueeze(0)
                    pooled = self.vgg_pool(self.vgg_feat(rgb)).view(-1)
                    features[i,j,:] = self.vgg_class(pooled).detach().numpy()

        self.embed = torch.from_numpy(features)
        return self.embed

    def model_arch(self):
        """Instantiate the sequence model that :meth:`train_model` optimises."""
        self.model = LSTM_RSNA()

    def train_model(self):
        """Run the train/validation loop and keep the weights with the best validation AUC.

        Per-phase results are appended to ``self.track`` as
        ``[phase, loss, accuracy, precision, recall, f1, roc_auc]`` (the
        precision/recall/f1 values are those of class 1). Loss and accuracy are
        averaged over samples, and the classification metrics are computed over
        every batch of the epoch.
        """
        since = time.time()
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.005, momentum=0.9)
        # Decay LR by a factor of 0.1 every 3 epochs
        self.scheduler = lr_scheduler.StepLR(self.optimizer, step_size=3, gamma=0.1)

        loaders = self._phase_loaders()
        best_model_wts = copy.deepcopy(self.model.state_dict())
        best_acc = 0.0
        self.model.to(self.device)
        self.track = []
        for epoch in tqdm(range(self.num_epochs)):
            print('Epoch {}/{}'.format(epoch, self.num_epochs - 1))
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    self.model.train()  # Set model to training mode
                else:
                    self.model.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0
                n_samples = 0
                epoch_labels = []
                epoch_preds = []

                # Iterate over data.
                for i, (inputs, labels) in enumerate(loaders[phase]):
                    if self.debug and i > 3:
                        break
                    inputs = inputs.to(self.device)
                    labels = labels.to(self.device)[:,0,:]

                    # zero the parameter gradients
                    self.optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        # The encoder returns a CPU tensor; move it next to the model.
                        embeddings = self.encoder(inputs).float().to(self.device)
                        outputs, _states = self.model(embeddings)
                        _, preds = torch.max(outputs, 1)
                        loss = self.criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            self.optimizer.step()

                    # statistics
                    targets = labels[:,1]
                    batch_size = inputs.size(0)
                    running_loss += loss.item() * batch_size
                    running_corrects += int(torch.sum(preds == targets))
                    n_samples += batch_size
                    epoch_labels.append(targets.detach().cpu())
                    epoch_preds.append(preds.detach().cpu())

                if phase == 'train':
                    self.scheduler.step()

                if n_samples == 0:
                    raise RuntimeError("The '{}' data loader produced no batches".format(phase))

                # Average over samples (running_loss is already weighted by batch size).
                epoch_loss = running_loss / n_samples
                epoch_acc = running_corrects / n_samples

                y_true = torch.cat(epoch_labels).long().numpy()
                y_hat = torch.cat(epoch_preds).numpy()

                # labels=[0, 1] keeps the result 4 x 2 even if one class is absent.
                epoch_metrics = np.asarray(
                    precision_recall_fscore_support(y_true, y_hat, labels=[0, 1], zero_division=0)
                )
                try:
                    epoch_roc = roc_auc_score(y_true, y_hat)
                except ValueError:
                    # Raised when only one class is present (e.g. tiny debug runs).
                    epoch_roc = float('nan')
                self.track.append([phase, epoch_loss, epoch_acc,
                                   epoch_metrics[0,1],epoch_metrics[1,1], epoch_metrics[2,1],
                                   epoch_roc])

                print('{} Loss: {:.4f} Acc: {:.4f} Precision: {:.4f} Recall: {:.4f} F1 Score: {:.4f} F1 AUC: {:.4f}'.format(
                    phase, epoch_loss, epoch_acc, epoch_metrics[0,1],
                    epoch_metrics[1,1], epoch_metrics[2,1],epoch_roc))

                # deep copy the model (selection is based on validation ROC AUC)
                if phase == 'val' and epoch_roc > best_acc:
                    best_acc = epoch_roc
                    print('Current Best Model Epoch: ', epoch,'\n')
                    best_model_wts = copy.deepcopy(self.model.state_dict())
                    torch.save({'lstm': self.model.state_dict(),'optimizer': self.optimizer.state_dict()}, 'model1.path')

            print()

        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))

        print('Best val Acc: {:4f}'.format(best_acc))

        # load best model weights
        self.model.load_state_dict(best_model_wts)


# Pyradiomics

In [None]:
!pip install pyradiomics

In [None]:
import radiomics

<a id="100"></a>
<h2 style='background:darkviolet; border:0; color:white'><center>Stacking models</center></h2>

In [None]:
# models_2D_CNN = []
# for i in range(4):
#     gc.collect()
#     torch.cuda.empty_cache()
#     model = Model_2D_CNN()
#     model.to(device)
    
#     checkpoint = torch.load(f"../input/brain-tumor-classification-model/best-model-{i}.pth")
# #     checkpoint = torch.load(f"best-model-{i}.pth")
#     print(checkpoint['best_valid_score'])
#     model.load_state_dict(checkpoint["model_state_dict"])
#     model.eval()
    
#     models_2D_CNN.append(model)

In [None]:
# models_3D_user_defined_CNN = []
# for i in range(4):
#     model = get_model(width=256, height=256, depth=64)

#     model.load_weights(f'../input/brain-tumor-classification-model-3d-simple-cnn/{mri_types[i]}.h5')
    
#     models_3D_user_defined_CNN.append(model)

In [None]:
# Reclaim unreferenced Python objects, then release PyTorch's cached CUDA
# memory before the stacking models are loaded.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Load one trained 3D EfficientNet per MRI type, in the order of `mri_types`.
models_3D_CNN_effnet = []
for mri_type in mri_types:
    model = Model_3D_effnet()
    model.to(device)

    # map_location lets checkpoints saved on a GPU load on a CPU-only host too.
    checkpoint = torch.load(
        f"../input/brain-tumor-classification-model-efficientnet-v1/best-model-{mri_type} v1.pth",
        map_location=device,
    )
#     checkpoint = torch.load(f"best-model-{mri_type}.pth", map_location=device)

    print(checkpoint['best_valid_score'])
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()  # inference only

    models_3D_CNN_effnet.append(model)

In [None]:
# Score the training set with each 2D CNN; one list of probabilities per
# MRI-type combination is collected in `master_y_pred`.
master_y_pred = []
master_y_true = []
for i, combination in enumerate(valid_combinations):
    combination = list(combination)
    print('Combination :', combination)

    train_data_retriever = DataRetriever_2D_CNN(
        df_train["BraTS21ID"].values,
        df_train["MGMT_value"].values,
        combination, rotate = 0)

    # shuffle=False keeps predictions aligned with the rows of df_train.
    train_loader = torch_data.DataLoader(
        train_data_retriever,
        batch_size=4,
        shuffle=False,
        num_workers=8,
    )

    model = models_2D_CNN[i]  # same model for every batch of this combination
    y_pred = []
    y_true = []

    with torch.no_grad():
        for e, batch in enumerate(train_loader):
            print(f"{e}/{len(train_loader)}", end="\r")
            probs = torch.sigmoid(model(batch["X"].to(device))).cpu().numpy()
            # reshape(-1) stays 1-D even for a final batch holding a single sample.
            y_pred.extend(probs.reshape(-1).astype(np.float64))
            y_true.extend(batch["y"].numpy().tolist())

    master_y_pred.append(y_pred)
    master_y_true.append(y_true)

In [None]:
# Assemble ids, labels and the four 2D-CNN prediction vectors column by column.
# Building from a dict fails fast if any vector has a different length instead
# of silently padding with NaN.
# NOTE(review): the column names assume the order of `valid_combinations` - confirm.
train_pred_df = pd.DataFrame({
    'BraTS21ID': df_train["BraTS21ID"].values,
    'MGMT_value': df_train["MGMT_value"].values,
    'T2w + T1wCE + T1w pred': master_y_pred[0],
    'FLAIR + T2w + T1w pred': master_y_pred[1],
    'FLAIR + T2w + T1wCE pred': master_y_pred[2],
    'FLAIR + T1wCE + T1w pred': master_y_pred[3],
})
train_pred_df.head()

In [None]:
# Score the validation set with each 2D CNN; one list of probabilities per
# MRI-type combination is collected in `master_y_pred`.
master_y_pred = []
master_y_true = []
for i, combination in enumerate(valid_combinations):
    combination = list(combination)
    print('Combination :', combination)

    valid_data_retriever = DataRetriever_2D_CNN(
        df_valid["BraTS21ID"].values,
        df_valid["MGMT_value"].values,
        combination, rotate = 0)

    # shuffle=False keeps predictions aligned with the rows of df_valid.
    valid_loader = torch_data.DataLoader(
        valid_data_retriever,
        batch_size=4,
        shuffle=False,
        num_workers=8,
    )

    model = models_2D_CNN[i]  # same model for every batch of this combination
    y_pred = []
    y_true = []

    with torch.no_grad():
        for e, batch in enumerate(valid_loader):
            print(f"{e}/{len(valid_loader)}", end="\r")
            probs = torch.sigmoid(model(batch["X"].to(device))).cpu().numpy()
            # reshape(-1) stays 1-D even for a final batch holding a single sample.
            y_pred.extend(probs.reshape(-1).astype(np.float64))
            y_true.extend(batch["y"].numpy().tolist())

    master_y_pred.append(y_pred)
    master_y_true.append(y_true)

In [None]:
# Assemble ids, labels and the four 2D-CNN prediction vectors column by column.
# Building from a dict fails fast if any vector has a different length instead
# of silently padding with NaN.
# NOTE(review): the column names assume the order of `valid_combinations` - confirm.
valid_pred_df = pd.DataFrame({
    'BraTS21ID': df_valid["BraTS21ID"].values,
    'MGMT_value': df_valid["MGMT_value"].values,
    'T2w + T1wCE + T1w pred': master_y_pred[0],
    'FLAIR + T2w + T1w pred': master_y_pred[1],
    'FLAIR + T2w + T1wCE pred': master_y_pred[2],
    'FLAIR + T1wCE + T1w pred': master_y_pred[3],
})
valid_pred_df.head()

In [None]:
# Add one prediction column per MRI type from the user-defined 3D CNNs,
# computed over the training split.
for i, mri_type in enumerate(mri_types):
    data = Dataset(df_train, mri_type=mri_type, batch_size=1)
    model = models_3D_user_defined_CNN[i]
    y_pred = model.predict(data).reshape(-1)  # flatten to one value per row
    train_pred_df[f'{mri_type}_3D_user_defined_CNN_pred'] = y_pred

In [None]:
# Add one prediction column per MRI type from the user-defined 3D CNNs,
# computed over the validation split.
for i, mri_type in enumerate(mri_types):
    data = Dataset(df_valid, mri_type=mri_type, batch_size=1)
    model = models_3D_user_defined_CNN[i]
    y_pred = model.predict(data).reshape(-1)  # flatten to one value per row
    valid_pred_df[f'{mri_type}_3D_user_defined_CNN_pred'] = y_pred

In [None]:
# Score the training set with the 3D EfficientNet of each MRI type; one list
# of probabilities per type is collected in `master_y_pred`.
master_y_pred = []
master_y_true = []
for i, mri_type in enumerate(mri_types):
    print('MRI type :', mri_type)

    gc.collect()
    torch.cuda.empty_cache()
    train_data_retriever = DataRetriever(
        df_train["BraTS21ID"].values,
        df_train["MGMT_value"].values,
        mri_type,
        augment = False)

    # shuffle=False keeps predictions aligned with the rows of df_train.
    train_loader = torch_data.DataLoader(
        train_data_retriever,
        batch_size=2,
        shuffle=False,
        num_workers=8,
    )

    model = models_3D_CNN_effnet[i]  # same model for every batch of this type
    y_pred = []
    y_true = []

    with torch.no_grad():
        for e, batch in enumerate(train_loader):
            print(f"{e}/{len(train_loader)}", end="\r")
            probs = torch.sigmoid(model(batch["X"].to(device))).cpu().numpy()
            # reshape(-1) stays 1-D even for a final batch holding a single sample.
            y_pred.extend(probs.reshape(-1).astype(np.float64))
            y_true.extend(batch["y"].numpy().tolist())

    master_y_pred.append(y_pred)
    master_y_true.append(y_true)

In [None]:
# Start a fresh frame only when no earlier cell has built `train_pred_df`
# (the cells that create it rely on models whose loading is commented out).
# Otherwise keep the existing columns and just add the EfficientNet ones.
if 'train_pred_df' not in globals():
    train_pred_df = pd.DataFrame({
        'BraTS21ID': df_train['BraTS21ID'].values,
        'MGMT_value': df_train['MGMT_value'].values,
    })

# `master_y_pred` was filled in the order of `mri_types`.
for i, mri_type in enumerate(mri_types):
    train_pred_df[f'{mri_type} pred - 3D effnet'] = master_y_pred[i]
train_pred_df.head()

In [None]:
# Score the validation set with the 3D EfficientNet of each MRI type; one list
# of probabilities per type is collected in `master_y_pred`.
master_y_pred = []
master_y_true = []
for i, mri_type in enumerate(mri_types):
    print('MRI type :', mri_type)

    gc.collect()
    torch.cuda.empty_cache()
    valid_data_retriever = DataRetriever(
        df_valid["BraTS21ID"].values,
        df_valid["MGMT_value"].values,
        mri_type,
        augment = False)

    # shuffle=False keeps predictions aligned with the rows of df_valid.
    valid_loader = torch_data.DataLoader(
        valid_data_retriever,
        batch_size=2,
        shuffle=False,
        num_workers=8,
    )

    model = models_3D_CNN_effnet[i]  # same model for every batch of this type
    y_pred = []
    y_true = []

    with torch.no_grad():
        for e, batch in enumerate(valid_loader):
            print(f"{e}/{len(valid_loader)}", end="\r")
            probs = torch.sigmoid(model(batch["X"].to(device))).cpu().numpy()
            # reshape(-1) stays 1-D even for a final batch holding a single sample.
            y_pred.extend(probs.reshape(-1).astype(np.float64))
            y_true.extend(batch["y"].numpy().tolist())

    master_y_pred.append(y_pred)
    master_y_true.append(y_true)

In [None]:
# Do not reset an existing `valid_pred_df`: later cells read its 'MGMT_value'
# column and the prediction columns added by earlier models. A fresh frame
# (ids + labels) is created only when none exists yet.
if 'valid_pred_df' not in globals():
    valid_pred_df = pd.DataFrame({
        'BraTS21ID': df_valid['BraTS21ID'].values,
        'MGMT_value': df_valid['MGMT_value'].values,
    })

# `master_y_pred` was filled in the order of `mri_types`.
for i, mri_type in enumerate(mri_types):
    valid_pred_df[f'{mri_type} pred - 3D effnet'] = master_y_pred[i]
valid_pred_df.head()

In [None]:
# Short aliases for the per-type EfficientNet predictions on the training split.
flair_pred, t1w_pred, t1wce_pred, t2w_pred = (
    train_pred_df[column]
    for column in (
        'FLAIR pred - 3D effnet',
        'T1w pred - 3D effnet',
        'T1wCE pred - 3D effnet',
        'T2w pred - 3D effnet',
    )
)

In [None]:
# Combinations: compare the ROC AUC of three ensembling rules (mean, median,
# max over the four EfficientNets) with each single-type model, on the
# training split.
effnet_frame = train_pred_df[['FLAIR pred - 3D effnet','T1w pred - 3D effnet','T1wCE pred - 3D effnet','T2w pred - 3D effnet']]
y_true = df_train['MGMT_value']

candidates = (
    (' - mean', effnet_frame.mean(axis = 1)),
    (' - median', effnet_frame.median(axis = 1)),
    (' - max', effnet_frame.max(axis = 1)),
    (' - flair', flair_pred),
    (' - t1w', t1w_pred),
    (' - t1wce', t1wce_pred),
    (' - t2w', t2w_pred),
)

for tag, y_pred in candidates:
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
    print("Area under the ROC curve : %f" % roc_auc, tag)

In [None]:
# Short aliases for the per-type EfficientNet predictions on the validation split.
flair_pred, t1w_pred, t1wce_pred, t2w_pred = (
    valid_pred_df[column]
    for column in (
        'FLAIR pred - 3D effnet',
        'T1w pred - 3D effnet',
        'T1wCE pred - 3D effnet',
        'T2w pred - 3D effnet',
    )
)

In [None]:
# Combinations: compare the ROC AUC of three ensembling rules (mean, median,
# max over the four EfficientNets) with each single-type model, on the
# validation split.
effnet_frame = valid_pred_df[['FLAIR pred - 3D effnet','T1w pred - 3D effnet','T1wCE pred - 3D effnet','T2w pred - 3D effnet']]
y_true = df_valid['MGMT_value']

candidates = (
    (' - mean', effnet_frame.mean(axis = 1)),
    (' - median', effnet_frame.median(axis = 1)),
    (' - max', effnet_frame.max(axis = 1)),
    (' - flair', flair_pred),
    (' - t1w', t1w_pred),
    (' - t1wce', t1wce_pred),
    (' - t2w', t2w_pred),
)

for tag, y_pred in candidates:
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
    print("Area under the ROC curve : %f" % roc_auc, tag)

In [None]:
def classification_metrics(y_true, y_prob):
    '''
    Calculates binary classification metrics at a data-driven threshold.

    The decision threshold is the ROC threshold that maximises
    ``tpr - fpr`` on the supplied data.

    :param y_true: true binary labels (0/1)
    :param y_prob: predicted probabilities of the positive class (any array-like)
    :return: tuple ``(metrics_dict, optimal_threshold)``
    '''
    from sklearn.metrics import cohen_kappa_score

    # Accept lists as well as arrays/Series for the element-wise comparison below.
    y_prob = np.asarray(y_prob)

    # calculating auroc values and the threshold maximising tpr - fpr
    fpr_rf, tpr_rf, thresholds = roc_curve(y_true, y_prob)
    roc_auc_rf = auc(fpr_rf, tpr_rf)
    optimal_idx = np.argmax(tpr_rf - fpr_rf)
    optimal_threshold = thresholds[optimal_idx]

    # generating prediction on the basis of the chosen threshold
    y_pred = np.where(y_prob >= optimal_threshold, 1, 0)

    # calculating tp,tn,fp,fn from the confusion matrix; labels=[0, 1] keeps
    # it 2 x 2 even when one class is missing from y_true and y_pred
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # calculating auprc
    average_precision = average_precision_score(y_true, y_prob)

    # calculating precision, recall, f1 score and accuracy
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    f1_accuracy = f1_score(y_true, y_pred)
    kappa_score = cohen_kappa_score(y_true, y_pred, labels=None, weights=None)
    binary_cross_entropy = log_loss(y_true, y_prob, labels=[0, 1])

    # creating dictionary of classification metrics
    target_mean = np.mean(y_true)
    classification_metric_dict = {"True_negatives": tn,
                                  "False_positives": fp,
                                  "False_negatives": fn,
                                  "True_positives": tp,
                                  "Accuracy": accuracy,
                                  "Recall": recall,
                                  "Precision": precision,
                                  "f1_score": f1_accuracy,
                                  "PR_AUC": average_precision,
                                  "ROC_AUC": roc_auc_rf,
                                  "Kappa Score": kappa_score,
                                  "binary_cross_etropy":binary_cross_entropy,
                                  "target_imbalance":target_mean,
                                  "target_size":len(y_true)
                                  }

    return classification_metric_dict, optimal_threshold

def classification_metrics_train(y_true, y_prob,threshold):
    '''
    Calculates binary classification metrics at a caller-supplied threshold.

    :param y_true: true binary labels (0/1)
    :param y_prob: predicted probabilities of the positive class (any array-like)
    :param threshold: probabilities ``>= threshold`` are predicted as class 1
    :return: tuple ``(metrics_dict, threshold)``
    '''
    from sklearn.metrics import cohen_kappa_score

    # A plain list cannot be compared element-wise with a float threshold.
    y_prob = np.asarray(y_prob)

    # calculating auroc values (threshold independent)
    fpr_rf, tpr_rf, thresholds = roc_curve(y_true, y_prob)
    roc_auc_rf = auc(fpr_rf, tpr_rf)
    optimal_threshold = threshold

    # generating prediction on the basis of the supplied threshold
    y_pred = np.where(y_prob >= optimal_threshold, 1, 0)

    # calculating tp,tn,fp,fn from the confusion matrix; labels=[0, 1] keeps
    # it 2 x 2 even when one class is missing from y_true and y_pred
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # calculating auprc
    average_precision = average_precision_score(y_true, y_prob)

    # calculating precision, recall, f1 score and accuracy
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    f1_accuracy = f1_score(y_true, y_pred)
    kappa_score = cohen_kappa_score(y_true, y_pred, labels=None, weights=None)
    binary_cross_entropy = log_loss(y_true, y_prob, labels=[0, 1])

    # creating dictionary of classification metrics
    target_mean = np.mean(y_true)
    classification_metric_dict = {"True_negatives": tn,
                                  "False_positives": fp,
                                  "False_negatives": fn,
                                  "True_positives": tp,
                                  "Accuracy": accuracy,
                                  "Recall": recall,
                                  "Precision": precision,
                                  "f1_score": f1_accuracy,
                                  "PR_AUC": average_precision,
                                  "ROC_AUC": roc_auc_rf,
                                  "Kappa Score": kappa_score,
                                  "binary_cross_etropy":binary_cross_entropy,
                                  "target_imbalance":target_mean,
                                  "target_size":len(y_true)
                                  }

    return classification_metric_dict, optimal_threshold

In [None]:
def objective_classification(X_train, y_train, X_val, y_val, target_value, trial):
    """Optuna objective: fit an XGBoost classifier and return its validation ROC AUC.

    Details:
        Hyper-parameters are sampled from ``trial``, a classifier is fitted
        on the training data and scored on the validation data. The AUC is
        computed from the predicted probability of the positive class, not
        from hard 0/1 predictions.

    Args:
        X_train: training features
        y_train: training labels
        X_val: validation features
        y_val: validation labels
        target_value: unused; kept so existing ``functools.partial`` calls
            keep working
        trial: Optuna trial used to sample the hyper-parameters

    Returns:
        ROC AUC of the fitted model on the validation data.
    """
    # The sampling order is kept stable so seeded studies stay reproducible.
    tree_method = trial.suggest_categorical('tree_method', ['approx', 'hist', 'exact'])
    n_estimators = trial.suggest_int('n_estimators', 20, 120, step=10)
    max_depth = trial.suggest_int('max_depth', 1, 10)
    reg_alpha = trial.suggest_int('reg_alpha', 2, 7)
    reg_lambda = trial.suggest_int('reg_lambda', 2, 7)
    min_child_weight = trial.suggest_int('min_child_weight', 0, 5)
    gamma = trial.suggest_int('gamma', 0, 5)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.1)
    objective = trial.suggest_categorical('objective', ['binary:logistic'])
    colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.8, 1, 0.05)
    colsample_bynode = trial.suggest_discrete_uniform('colsample_bynode', 0.8, 1, 0.05)
    colsample_bylevel = trial.suggest_discrete_uniform('colsample_bylevel', 0.8, 1, 0.05)
    subsample = trial.suggest_discrete_uniform('subsample', 0.8, 1, 0.05)

    xgboost_tune = xgb.XGBClassifier(
        tree_method=tree_method,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        gamma=gamma,
        objective=objective,
        colsample_bynode=colsample_bynode,
        colsample_bylevel=colsample_bylevel,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        eval_metric='logloss',
        num_class=1,
        n_jobs=-1,
        random_state=SEED)
    xgboost_tune.fit(X_train, y_train)

    # Rank-based metric: score with probabilities of the positive class.
    positive_proba = xgboost_tune.predict_proba(X_val)[:, 1]

    return roc_auc_score(y_val, positive_proba)

In [None]:
# Feature columns for stacking: every column whose name mentions "pred".
cols = [column for column in train_pred_df.columns if 'pred' in column.lower()]
# get a list of models to evaluate
def get_models():
    """Return the candidate classifiers keyed by a short name."""
    return {
        'lr': LogisticRegression(),
        'knn': KNeighborsClassifier(),
        'cart': DecisionTreeClassifier(),
        'svm': SVC(),
        'bayes': GaussianNB(),
    }
 
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return the accuracy scores of ``model`` over 3 repeats of stratified 10-fold CV."""
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model, X, y,
        scoring='accuracy',
        cv=splitter,
        n_jobs=-1,
        error_score='raise',
    )
 
# define dataset: train and validation predictions stacked row-wise
X = pd.concat(
    [train_pred_df[cols], valid_pred_df[cols]], axis = 0
).reset_index(drop = True)
y = pd.concat(
    [train_pred_df['MGMT_value'], valid_pred_df['MGMT_value']], axis = 0
).reset_index(drop = True)

# get the models to evaluate
classifier_models = get_models()

# evaluate the models and store results
results = []
names = []
for name, model in classifier_models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

In [None]:
# Imported here because this cell uses joblib before the later cell that imports it.
import joblib

# Fit a logistic-regression stacker on the base-model predictions.
model = LogisticRegression()
model.fit(train_pred_df[cols], train_pred_df['MGMT_value'])

# Keep only the probability of the positive class (second predict_proba column).
train_pred = list(model.predict_proba(train_pred_df[cols])[:, 1])
valid_pred = list(model.predict_proba(valid_pred_df[cols])[:, 1])

# Each report picks its own threshold inside classification_metrics.
model_performance_results = classification_metrics(train_pred_df['MGMT_value'], train_pred)
print('\nTrain data performance:')
display(pd.DataFrame(model_performance_results[0].items(), columns = ['Metric', 'Value']).T)
print('\nValidation data performance:')
model_performance_results = classification_metrics(valid_pred_df['MGMT_value'], valid_pred)
display(pd.DataFrame(model_performance_results[0].items(), columns = ['Metric', 'Value']).T)
joblib.dump(model, 'lr_model.pkl')

In [None]:
# Baseline ensemble: unweighted mean of the four 3D EfficientNet predictions.
effnet_columns = ['FLAIR pred - 3D effnet','T1w pred - 3D effnet','T1wCE pred - 3D effnet','T2w pred - 3D effnet']
train_pred = train_pred_df[effnet_columns].mean(axis = 1)
valid_pred = valid_pred_df[effnet_columns].mean(axis = 1)

model_performance_results = classification_metrics(train_pred_df['MGMT_value'], train_pred)
print('\nTrain data performance:')
train_report = pd.DataFrame(model_performance_results[0].items(), columns = ['Metric', 'Value'])
display(train_report.T)

print('\nValidation data performance:')
model_performance_results = classification_metrics(valid_pred_df['MGMT_value'], valid_pred)
valid_report = pd.DataFrame(model_performance_results[0].items(), columns = ['Metric', 'Value'])
display(valid_report.T)

In [None]:
import joblib

SEED = 42
X_train = train_pred_df[cols]
y_train = train_pred_df[['MGMT_value']]
X_valid = valid_pred_df[cols]
y_valid = valid_pred_df[['MGMT_value']]

# # br = BoostARoota(metric='logloss')
# # br.fit(X_train,y_train)
# # X_train=X_train[br.keep_vars_.tolist()]
# # X_valid=X_valid[br.keep_vars_.tolist()]

# Tune the XGBoost stacker for 20 seconds, maximising the objective's score.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=SEED))
study.optimize(
    functools.partial(objective_classification, X_train, y_train, X_valid, y_valid,'trial'),
            timeout=20)

# 'boosting' may be sampled by the objective but is never passed to the
# classifier there, so it is not forwarded to the final model either.
best_params = {key: value for key, value in study.best_params.items() if key != 'boosting'}
model_xgb = xgb.XGBClassifier(**best_params, random_state=SEED)
model_xgb.fit(X_train,y_train)

# Keep only the probability of the positive class (second predict_proba column).
train_pred = list(model_xgb.predict_proba(train_pred_df[cols])[:, 1])
valid_pred = list(model_xgb.predict_proba(valid_pred_df[cols])[:, 1])

model_performance_results = classification_metrics(train_pred_df['MGMT_value'], train_pred)
print('\nTrain data performance:')
display(pd.DataFrame(model_performance_results[0].items(), columns = ['Metric', 'Value']).T)
print('\nValidation data performance:')
model_performance_results = classification_metrics(valid_pred_df['MGMT_value'], valid_pred)
display(pd.DataFrame(model_performance_results[0].items(), columns = ['Metric', 'Value']).T)

print("Saving model .. ",end=" ")
# Persist the tuned XGBoost model (previously the unrelated `model` was saved).
joblib.dump(model_xgb,r"XGBoost_model.pkl")

# Selecting best model

In [None]:
# Preview the first rows of the validation prediction frame.
valid_pred_df.head()

In [None]:
# List the available columns (labels plus one prediction column per base model).
valid_pred_df.columns

In [None]:
# Validation ROC AUC of the row-wise mean over the four
# three-sequence-combination prediction columns.
combination_pred_cols = [
    'T2w + T1wCE + T1w pred',
    'FLAIR + T2w + T1w pred',
    'FLAIR + T2w + T1wCE pred',
    'FLAIR + T1wCE + T1w pred',
]
y_pred = valid_pred_df[combination_pred_cols].mean(axis=1).tolist()
y_true = valid_pred_df['MGMT_value'].tolist()

fpr, tpr, thresholds = roc_curve(y_true, y_pred)
roc_auc = auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)

In [None]:
# Validation ROC AUC of the row-wise mean over the four per-sequence
# "3D_user_defined_CNN" prediction columns.
user_cnn_pred_cols = [
    'FLAIR_3D_user_defined_CNN_pred',
    'T1w_3D_user_defined_CNN_pred',
    'T1wCE_3D_user_defined_CNN_pred',
    'T2w_3D_user_defined_CNN_pred',
]
y_pred = valid_pred_df[user_cnn_pred_cols].mean(axis=1).tolist()
y_true = valid_pred_df['MGMT_value'].tolist()

fpr, tpr, thresholds = roc_curve(y_true, y_pred)
roc_auc = auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)

In [None]:
# Validation ROC AUC of the row-wise mean over the four per-sequence
# "3D effnet" prediction columns.
effnet_pred_cols = [
    'FLAIR pred - 3D effnet',
    'T1w pred - 3D effnet',
    'T1wCE pred - 3D effnet',
    'T2w pred - 3D effnet',
]
y_pred = valid_pred_df[effnet_pred_cols].mean(axis=1).tolist()
y_true = valid_pred_df['MGMT_value'].tolist()

fpr, tpr, thresholds = roc_curve(y_true, y_pred)
roc_auc = auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)

####################################
# Cut-off selection: a good threshold has a high tpr and a low fpr.
# The point where tpr - (1 - fpr) is closest to zero is taken as the
# optimal cut-off.
####################################
i = np.arange(len(tpr))  # one row per ROC point
roc = pd.DataFrame(
    {
        'fpr': fpr,
        'tpr': tpr,
        '1-fpr': 1 - fpr,
        'tf': tpr - (1 - fpr),
        'thresholds': thresholds,
    },
    index=i,
)
# Position of the ROC point with the smallest |tpr - (1 - fpr)|.
closest_to_zero = (roc.tf - 0).abs().argsort()[:1]
optimal_point = roc.iloc[closest_to_zero]

# Plot tpr and 1-fpr against the ROC point index; the curves cross at the cut-off.
fig, ax = pl.subplots()
ax.plot(roc['tpr'])
ax.plot(roc['1-fpr'], color='red')
ax.set_xlabel('Index')
ax.set_ylabel('TPR/(1-FPR)')
ax.set_title('Receiver operating characteristic')

optimal_threshold = optimal_point['thresholds'].iloc[0]
print('Optimal threshold:', optimal_threshold*100,'%')

In [None]:
# ROC curve for the current y_true / y_pred, with the chance diagonal and a
# vertical marker at the false-positive rate of the selected cut-off point.
fpr, tpr, _ = roc_curve(y_true, y_pred)
roc_auc = auc(fpr, tpr)

# FPR of the ROC row where |tpr - (1 - fpr)| is smallest.
optimal_fpr = roc.iloc[(roc.tf - 0).abs().argsort()[:1]]['fpr'].iloc[0]

plt.figure(figsize=(5, 5))
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=2,
    label='ROC curve (area = %0.2f)' % roc_auc,
)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
# Operating point at the optimal threshold.
plt.plot([optimal_fpr, optimal_fpr], [0, 1], color='green', lw=2, linestyle='--')
plt.xlim([-0.01, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

# Error analysis

In [None]:
# Final validation score = mean of the four "3D effnet" columns; report its
# AUC and list the false positives (label 0, score above 0.5).
effnet_pred_cols = [
    'FLAIR pred - 3D effnet',
    'T1w pred - 3D effnet',
    'T1wCE pred - 3D effnet',
    'T2w pred - 3D effnet',
]
valid_pred_df['Final pred'] = valid_pred_df[effnet_pred_cols].mean(axis=1)

fpr, tpr, thresholds = roc_curve(valid_pred_df['MGMT_value'], valid_pred_df['Final pred'])
roc_auc = auc(fpr, tpr)
print("\nArea under the ROC curve : %f" % roc_auc)

is_false_positive = (valid_pred_df['Final pred'] > 0.5) & (valid_pred_df['MGMT_value'] == 0)
false_positives = valid_pred_df[is_false_positive]
print('\n','# False positives:',false_positives.shape[0],'\n')
false_positives

In [None]:
# False negatives: label 1 but the averaged 'Final pred' score is below 0.5.
is_false_negative = (valid_pred_df['Final pred'] < 0.5) & (valid_pred_df['MGMT_value'] == 1)
false_negatives = valid_pred_df[is_false_negative]
print('\n','# False negatives:',false_negatives.shape[0],'\n')
false_negatives

In [None]:
# a = load_dicom_images_3d(scan_id = '00819')
# plt.imshow(a[0,:,:,49], cmap = 'gray')

In [None]:
# master_y_pred = []
# for i in range(0,len(mri_types)):
#     print('MRI type :', mri_types[i])

#     train_data_retriever = DataRetriever(
#         df_train["BraTS21ID"].values, 
#         df_train["MGMT_value"].values,
#         mri_types[i],
#         augment = False)

#     train_loader = torch_data.DataLoader(
#         train_data_retriever,
#         batch_size=4,
#         shuffle=False,
#         num_workers=8,
#     )

#     y_pred = []
    
#     for e, batch in enumerate(train_loader):
#         print(f"{e}/{len(train_loader)}", end="\r")
#         with torch.no_grad():
#             tmp_pred = np.zeros((batch["X"].shape[0], ))
#             model = models[i]
#             embeddings_df = model.base_model(batch["X"].to(device))
#             y_pred.extend(embeddings_df)
            
#     master_y_pred.append(y_pred)

In [None]:
# neurons_num = 1000

# for i in tqdm(range(0, len(mri_types))):
#     for j in range(0, neurons_num):
#         train_pred_df[mri_types[i]+'_'+str(j+1)] = [master_y_pred[i][m][0].cpu().numpy().tolist() for m in range(0, len(train_pred_df))]

In [None]:
# master_y_pred = []
# for i in range(0,len(mri_types)):
#     print('MRI type :', mri_types[i])

#     valid_data_retriever = DataRetriever(
#         df_valid["BraTS21ID"].values, 
#         df_valid["MGMT_value"].values,
#         mri_types[i],
#         augment = False)

#     valid_loader = torch_data.DataLoader(
#         valid_data_retriever,
#         batch_size=4,
#         shuffle=False,
#         num_workers=8,
#     )

#     y_pred = []
    
#     for e, batch in enumerate(valid_loader):
#         print(f"{e}/{len(valid_loader)}", end="\r")
#         with torch.no_grad():
#             tmp_pred = np.zeros((batch["X"].shape[0], ))
#             model = models[i]
#             embeddings_df = model.base_model(batch["X"].to(device))
#             y_pred.extend(embeddings_df)
            
#     master_y_pred.append(y_pred)

In [None]:
# for i in tqdm(range(0, len(mri_types))):
#     for j in range(0, neurons_num):
#         valid_pred_df[mri_types[i]+'_'+str(j+1)] = [master_y_pred[i][m][0].cpu().numpy().tolist() for m in range(0, len(valid_pred_df))]

In [None]:
# Neural network
# A small fully-connected Keras classifier trained on the `cols` feature
# columns, validated on the validation frame.

X_train = train_pred_df[cols]
y_train = train_pred_df['MGMT_value']

simple_nn_model = Sequential([
    Dense(100, input_dim=12, activation='relu'),
    Dense(50, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
simple_nn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[AUC(name='auc'), 'accuracy'],
)

# Callbacks: checkpoint the best epoch by validation AUC, and stop once it has
# not improved for 40 epochs (restoring the best weights in memory).
model_save = ModelCheckpoint(
    'simple_nn_model.h5',
    monitor='val_auc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
early_stop = EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=40,
    restore_best_weights=True,
    verbose=1,
)

validation_data = (valid_pred_df[cols], valid_pred_df['MGMT_value'])
simple_nn_model.fit(
    X_train,
    y_train,
    validation_data=validation_data,
    epochs=500,
    batch_size=10,
    shuffle=True,
    verbose=1,
    callbacks=[model_save, early_stop],
)

In [None]:
# ROC AUC of the simple neural network on the training and validation frames.
# predict() yields one row per sample; only its first column is used.
train_scores = simple_nn_model.predict(train_pred_df[cols])
y_pred = train_scores[:, 0].tolist()
fpr, tpr, thresholds = roc_curve(train_pred_df['MGMT_value'], y_pred)
roc_auc = auc(fpr, tpr)
print("\n Training data: Area under the ROC curve - %f" % roc_auc)

valid_scores = simple_nn_model.predict(valid_pred_df[cols])
y_pred = list(valid_scores[:, 0])
fpr, tpr, thresholds = roc_curve(valid_pred_df['MGMT_value'], y_pred)
roc_auc = auc(fpr, tpr)
print("\n Validation data: Area under the ROC curve - %f" % roc_auc)

# Predictions on test data

In [None]:
# Load the sample submission and add a zero-padded, five-character string
# version of each case id.
submission = pd.read_csv(
    "../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv"
)
submission['BraTS21ID5'] = submission['BraTS21ID'].apply('{:05d}'.format)

In [None]:
# Test-set inference with the 2D-CNN models: one model per entry of
# valid_combinations, collecting one list of sigmoid scores per model.
master_y_pred = []
master_y_true = []
for i, combination in enumerate(valid_combinations):
    print('Combination :', list(combination))

    # No labels exist for the test set, so an empty target array is passed.
    test_data_retriever = DataRetriever_2D_CNN(
        submission["BraTS21ID"].values,
        np.array([]),
        list(combination),
        rotate=0,
    )
    test_loader = torch_data.DataLoader(
        test_data_retriever,
        batch_size=4,
        shuffle=False,
        num_workers=8,
    )

    y_pred = []
    y_true = []
    model = models_2D_CNN[i]

    with torch.no_grad():
        for e, batch in enumerate(test_loader):
            print(f"{e}/{len(test_loader)}", end="\r")
            tmp_res = torch.sigmoid(model(batch["X"].to(device))).cpu().numpy().squeeze()
            # Adding to a zero vector of batch length keeps a 1-D result even
            # when squeeze() collapses a single-sample batch to a scalar.
            tmp_pred = np.zeros((batch["X"].shape[0], )) + tmp_res
            y_pred.extend(tmp_pred)

    master_y_pred.append(y_pred)

In [None]:
# Assemble the test prediction frame: case ids followed by one column per
# 2D-CNN sequence combination (first four entries of master_y_pred).
id_frame = pd.DataFrame(submission["BraTS21ID"].values)
combination_frames = [pd.DataFrame(master_y_pred[k]) for k in range(4)]
test_pred_df = pd.concat([id_frame, *combination_frames], axis=1).reset_index(drop=True)
test_pred_df.columns = [
    'BraTS21ID',
    'T2w + T1wCE + T1w pred',
    'FLAIR + T2w + T1w pred',
    'FLAIR + T2w + T1wCE pred',
    'FLAIR + T1wCE + T1w pred',
]
test_pred_df.head()

In [None]:
# Test-set predictions from the user-defined 3D CNNs: one model per MRI type,
# each stored as its own flattened column in test_pred_df.
for i, mri_type in enumerate(mri_types):
    data = Dataset(submission, mri_type=mri_type, batch_size=1, is_train=False)
    model = models_3D_user_defined_CNN[i]
    y_pred = model.predict(data).reshape(-1)
    test_pred_df[f'{mri_type}_3D_user_defined_CNN_pred'] = y_pred

In [None]:
# Test-set inference with the 3D effnet models: one model per MRI type,
# collecting one list of sigmoid scores per model.
master_y_pred = []
master_y_true = []
for i, mri_type in enumerate(mri_types):
    print('MRI type :', mri_type)

    # Free Python and CUDA memory before loading the next sequence.
    gc.collect()
    torch.cuda.empty_cache()

    # No labels exist for the test set, so an empty target array is passed.
    test_data_retriever = DataRetriever(
        submission["BraTS21ID"].values,
        np.array([]),
        mri_type,
        augment=False,
    )
    test_loader = torch_data.DataLoader(
        test_data_retriever,
        batch_size=2,
        shuffle=False,
        num_workers=8,
    )

    y_pred = []
    y_true = []
    model = models_3D_CNN_effnet[i]

    with torch.no_grad():
        for e, batch in enumerate(test_loader):
            print(f"{e}/{len(test_loader)}", end="\r")
            tmp_res = torch.sigmoid(model(batch["X"].to(device))).cpu().numpy().squeeze()
            # Adding to a zero vector of batch length keeps a 1-D result even
            # when squeeze() collapses a single-sample batch to a scalar.
            tmp_pred = np.zeros((batch["X"].shape[0], )) + tmp_res
            y_pred.extend(tmp_pred)

    master_y_pred.append(y_pred)

In [None]:
# Store the per-sequence 3D effnet test scores; master_y_pred is indexed in
# the order FLAIR, T1w, T1wCE, T2w.
for position, sequence_name in enumerate(['FLAIR', 'T1w', 'T1wCE', 'T2w']):
    test_pred_df[f'{sequence_name} pred - 3D effnet'] = master_y_pred[position]
test_pred_df.head()

In [None]:
# Score the test frame with the three stacked models (XGBoost, simple neural
# network, logistic regression), each reloaded from the artefact saved earlier.
model_xgb = joblib.load(r"./XGBoost_model.pkl")
test_pred_df['XGBoost pred'] = model_xgb.predict_proba(test_pred_df[cols])[:, 1]

# Rebuild the same architecture that was trained, then restore the best
# checkpoint written by ModelCheckpoint. Without load_weights the freshly
# constructed network has random weights and its predictions are meaningless.
simple_nn_model = Sequential()
simple_nn_model.add(Dense(100, input_dim=12, activation='relu'))
simple_nn_model.add(Dense(50, activation='relu'))
simple_nn_model.add(Dense(10, activation='relu'))
simple_nn_model.add(Dense(1, activation='sigmoid'))
simple_nn_model.load_weights('simple_nn_model.h5')

# predict() returns an (n, 1) array; flatten it to one score per row.
test_pred_df['Simple NN pred'] = simple_nn_model.predict(test_pred_df[cols]).reshape(-1)

model = joblib.load('./lr_model.pkl')
test_pred_df['lr pred'] = model.predict_proba(test_pred_df[cols])[:, 1]

In [None]:
# Preview the test frame with all base-model and stacked-model prediction columns.
test_pred_df.head()

In [None]:
# Final test score: row-wise mean of the four per-sequence 3D effnet columns.
effnet_pred_cols = [
    'FLAIR pred - 3D effnet',
    'T1w pred - 3D effnet',
    'T1wCE pred - 3D effnet',
    'T2w pred - 3D effnet',
]
y_pred = test_pred_df[effnet_pred_cols].mean(axis=1)
ids = test_pred_df['BraTS21ID']

In [None]:
# Build the two-column submission table and write it without the index.
submission_columns = {
    "BraTS21ID": ids,
    "MGMT_value": y_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)

In [None]:
# Sanity check: row/column count and the first rows of the submission table.
print(submission.shape)
submission.head()

In [None]:
# Distribution of the submitted MGMT scores.
fig, ax = plt.subplots(figsize=(5, 5))
ax.hist(submission["MGMT_value"]);