In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random

import pydicom #DICOM file
from pydicom.pixel_data_handlers.util import apply_voi_lut

import cv2 #openCV
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm #progress bar

import glob #glob

import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers
from tensorflow.keras.optimizers import SGD

In [None]:
# Labels table; later cells read its BraTS21ID and MGMT_value columns.
train_df = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv")
# The sample submission supplies the BraTS21ID values of the test patients to predict.
test_df = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv")

In this [discussion](https://www.kaggle.com/c/rsna-miccai-brain-tumor-radiogenomic-classification/discussion/262046), a competition host reported issues with the following three cases.   
Patient IDs:

1. 00109 (FLAIR images are blank)
2. 00123 (T1w images are blank)
3. 00709 (FLAIR images are blank)    

Hence, these three patients are excluded from the training data.


In [None]:
# Patient IDs whose FLAIR or T1w series are blank, as reported by the competition host.
# Reference: https://www.kaggle.com/arnabs007/part-1-rsna-miccai-btrc-understanding-the-data
EXCLUDE = [109, 123, 709]
# Keep only the rows whose patient ID is not in the exclusion list.
train_df = train_df.loc[~train_df["BraTS21ID"].isin(EXCLUDE)]

In [None]:
# Preview the first ten label rows after the excluded patients were removed.
train_df.head(10)

In [None]:
# NOTE(review): this cell repeats the preview in the cell directly above it; it can be deleted.
train_df.head(10)

In [None]:
# Names of the mpMRI scan types; get_all_image_paths asserts that its
# image_type argument is one of these.
TYPES = ["FLAIR", "T1w", "T1wCE", "T2w"] #mpMRI scans

In [None]:
def load_dicom(path, size=224):
    """Read one DICOM file and return its pixels as a square 8-bit image.

    Args:
        path: Path of the DICOM file to read.
        size: Edge length, in pixels, of the returned image.

    Returns:
        The result of ``cv2.resize`` applied to the uint8 pixel array with a
        target size of ``(size, size)``.
    """
    # ``pydicom.read_file`` is a deprecated alias that pydicom 3.0 removed;
    # ``dcmread`` is the supported reader and returns the same dataset.
    dicom = pydicom.dcmread(path)
    data = dicom.pixel_array  # numpy.ndarray holding the pixel data
    peak = np.max(data)
    # An all-zero (blank) slice is left untouched to avoid dividing by zero.
    if peak != 0:
        data = data / peak  # scale so the brightest pixel becomes 1.0
    data = (data * 255).astype(np.uint8)  # rescale to 0..255 and truncate to 8 bits
    return cv2.resize(data, (size, size))

In [None]:
def get_all_image_paths(BraTS21ID, image_type, folder="train"):
    """Return a subsample of slice file paths for one patient and scan type.

    Files are ordered by the integer that follows the last ``-`` in the file
    name (after dropping the final four characters). Only the middle half of
    that ordering (from 25% to 75%) is kept; when there are ten or more files,
    only every third one of that range is returned.

    Args:
        BraTS21ID: Patient ID; it is zero-padded to five characters to form
            the folder name.
        image_type: Scan type; must be one of ``TYPES``.
        folder: Sub-folder of the competition data ("train" by default; a
            different folder may be passed).

    Returns:
        A numpy array of the selected paths.
    """
    assert image_type in TYPES  # only the known scan types are accepted

    base_dir = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/%s/" % folder
    series_dir = os.path.join(base_dir, str(BraTS21ID).zfill(5), image_type)

    def slice_number(file_path):
        # Drop the last four characters, then take the integer after the last "-".
        return int(file_path[:-4].split("-")[-1])

    ordered = glob.glob(os.path.join(series_dir, "*"))
    ordered.sort(key=slice_number)

    total = len(ordered)
    step = 1 if total < 10 else 3
    first, last = int(total * 0.25), int(total * 0.75)

    return np.array(ordered[first:last:step])

In [None]:
def get_all_images(BraTS21ID, image_type, folder="train", size=225):
    """Load every selected slice of one patient and scan type.

    Args:
        BraTS21ID: Patient ID, forwarded to ``get_all_image_paths``.
        image_type: Scan type, forwarded to ``get_all_image_paths``.
        folder: Data sub-folder, forwarded to ``get_all_image_paths``.
        size: Edge length forwarded to ``load_dicom``.

    Returns:
        A list with one ``load_dicom`` result per selected path, in path order.
    """
    # NOTE(review): the default of 225 differs from load_dicom's default of 224;
    # the callers in this notebook always pass size explicitly.
    images = []
    for slice_path in get_all_image_paths(BraTS21ID, image_type, folder):
        images.append(load_dicom(slice_path, size))
    return images

In [None]:
IMAGE_SIZE = 128  # edge length (pixels) requested for every loaded slice


def get_all_data_train(image_type):
    """Collect the slices, labels and patient IDs for every row of ``train_df``.

    Each patient contributes one entry per slice returned by
    ``get_all_images``; the patient's ``MGMT_value`` and ID are repeated once
    per slice so the three outputs stay aligned.

    Args:
        image_type: Scan type forwarded to ``get_all_images``.

    Returns:
        A tuple ``(X, y, train_ids)`` of numpy arrays built from the collected
        slices, labels and patient IDs.
    """
    all_slices, all_labels, all_ids = [], [], []

    for row_label in tqdm(train_df.index):
        record = train_df.loc[row_label]
        patient_id = int(record["BraTS21ID"])
        slices = get_all_images(patient_id, image_type, "train", IMAGE_SIZE)

        all_slices.extend(slices)
        all_labels.extend([record["MGMT_value"]] * len(slices))
        all_ids.extend([patient_id] * len(slices))
        # Slices and labels must stay in lock-step.
        assert len(all_slices) == len(all_labels)

    return np.array(all_slices), np.array(all_labels), np.array(all_ids)

def get_all_data_test(image_type):
    """Collect the slices and patient IDs for every row of ``test_df``.

    Each patient contributes one entry per slice returned by
    ``get_all_images``; the patient ID is repeated once per slice so both
    outputs stay aligned.

    Args:
        image_type: Scan type forwarded to ``get_all_images``.

    Returns:
        A tuple ``(X, test_ids)`` of numpy arrays built from the collected
        slices and patient IDs.
    """
    all_slices, all_ids = [], []

    for row_label in tqdm(test_df.index):
        record = test_df.loc[row_label]
        patient_id = int(record["BraTS21ID"])
        slices = get_all_images(patient_id, image_type, "test", IMAGE_SIZE)

        all_slices.extend(slices)
        all_ids.extend([patient_id] * len(slices))

    return np.array(all_slices), np.array(all_ids)

In [None]:
# Build the slice-level arrays from the T1wCE scans only: images, labels and
# the patient ID of every slice for training, images and patient IDs for test.
X, y, train_idt = get_all_data_train("T1wCE")
X_test, test_idt = get_all_data_test("T1wCE")
# Display the array shapes as the cell output.
X.shape, y.shape, train_idt.shape

In [None]:
# NOTE(review): redundant; the cell directly above already displays these shapes.
X.shape, y.shape

In [None]:
# Hold out 20% of the slices for validation; the images, labels and per-slice
# patient IDs are split together so they stay aligned.
# NOTE(review): the split is made per slice, not per patient, so slices of the
# same patient can land on both sides. Confirm this matches how the loaded
# models were trained before trusting the validation score.
X_train, X_valid, y_train, y_valid, train_idt_train, train_idt_valid = train_test_split(
    X, y, train_idt, test_size=0.2, random_state=42
)

# Append a trailing axis of length 1 to the image arrays.
X_train = tf.expand_dims(X_train, axis=-1)
X_valid = tf.expand_dims(X_valid, axis=-1)

# One-hot encode the labels.
y_train = to_categorical(y_train)
y_valid = to_categorical(y_valid)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, train_idt_train.shape, train_idt_valid.shape

In [None]:
# Paths of the pre-trained model files. The "shape=" notes presumably record
# the input image size each model was trained with (TODO confirm).
file_path1 = "../input/rsna-model-2/rsna_model_data_augment_best_model_3.h5" #shape=128
#file_path2 = "../input/rsna-best-model-training2/best_model (2).h5"
file_path2 = "../input/rsna-model-model-1/rsna_model_data_augment_best_model_2.h5" #shape=128
# file_path3 is only referenced by the commented-out model3 load in a later cell.
file_path3 = "../input/best-model-trainingver3/best_model_trainingVer3.h5" #shape=32

In [None]:
#import tensorflow_hub as tfhub
#import tensorflow_addons as tfa
from tensorflow.keras import layers

In [None]:
# Load the two saved models that form the ensemble.
model1 = keras.models.load_model(file_path1)
model2 = keras.models.load_model(file_path2)
# A third model (file_path3) is currently disabled:
#model3 = keras.models.load_model(file_path3)

In [None]:
# Per-slice model outputs on the validation slices.
y_pred1 = model1.predict(X_valid)
y_pred2 = model2.predict(X_valid)

# Hard class decision (index of the largest output) for every slice.
# NOTE(review): argmax discards the model probabilities before blending;
# consider whether blending the class-1 scores directly was intended.
pred1 = y_pred1.argmax(axis=1)
pred2 = y_pred2.argmax(axis=1)

# Weighted blend of the two models' votes, tagged with each slice's patient ID.
result = pd.DataFrame({
    "BraTS21ID": train_idt_valid,
    "MGMT_value": 0.3 * pred1 + 0.7 * pred2,
})

# Average the blended votes per patient, then attach the true label from
# train_df (the merge yields MGMT_value_x = score, MGMT_value_y = label).
result_temp = (
    result.groupby("BraTS21ID", as_index=False)
    .mean()
    .merge(train_df, on="BraTS21ID")
)
result_temp

In [None]:
# Patient-level ROC AUC: MGMT_value_y is the label merged in from train_df,
# MGMT_value_x is the averaged ensemble score.
auc = roc_auc_score(
    y_true=result_temp["MGMT_value_y"],
    y_score=result_temp["MGMT_value_x"],
)

print(f"Validation AUC={auc}")

In [None]:
# Build the submission: one averaged ensemble score per test patient.
sample_sub = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv")

# Per-slice model outputs on the test slices.
y_pred1 = model1.predict(X_test)
y_pred2 = model2.predict(X_test)
#y_pred3 = model3.predict(X_test)

# Hard class decision (index of the largest output) for every slice.
pred1 = np.argmax(y_pred1, axis=1)
pred2 = np.argmax(y_pred2, axis=1)
#pred3 = np.argmax(y_pred3, axis = 1)

# Weighted blend of the two models' votes, tagged with each slice's patient ID.
result = pd.DataFrame(test_idt)
result[1] = pred1 * 0.3 + pred2 * 0.7
result.columns = ["BraTS21ID", "MGMT_value"]

# Average the blended votes per patient.
result_final = result.groupby("BraTS21ID", as_index=False).mean()

# Attach the scores to the submission IDs by key rather than by row position.
# groupby returns patients sorted by ID and omits patients with no slices, so
# overwriting the ID column positionally could pair scores with the wrong
# patient. The left merge keeps the row order and row count of sample_sub.
result_final = sample_sub[["BraTS21ID"]].merge(result_final, on="BraTS21ID", how="left")
# A patient with no usable slice gets the neutral score 0.5 instead of NaN.
result_final["MGMT_value"] = result_final["MGMT_value"].fillna(0.5)
result_final

In [None]:
# Write the per-patient scores to submission.csv without the DataFrame index.
result_final.to_csv("submission.csv",index=False)

In [None]:
# Histogram of the per-patient scores in result_final, labelled so the figure
# can be read on its own.
fig, ax = plt.subplots(figsize=(5, 5))
ax.hist(result_final["MGMT_value"])
ax.set(
    title="Predicted MGMT_value per test patient",
    xlabel="MGMT_value",
    ylabel="Number of patients",
);