In [None]:
#Define todas as bibliotecas que serão posteriormente utilizadas

#pacotes
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
import cv2
import concurrent.futures
import tensorflow as tf

#pacote de leitura dos arquivos dcm
import pydicom

#Pacote para salvar as imagens como .zip
from IPython.display import FileLink

#outros imports
from sklearn import model_selection as sk_model_selection
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from operator import itemgetter
from scipy import ndimage
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import Sequence
from tensorflow.keras import layers
from tensorflow.keras import activations
from tensorflow.keras import Model
from tensorflow.keras.callbacks import EarlyStopping

# Getting to know the data

In [None]:
# List the contents of the input folders used by this notebook
# (IPython shell escapes: the competition dataset root and the PNG dataset's train folder).
!ls ../input/rsna-miccai-brain-tumor-radiogenomic-classification/
!ls ../input/rsna-miccai-png/train/

In [None]:
# Carried over from the baseline notebook.
# Exams flagged as problematic in the dataset; they are filtered out further below.
EXCLUDED_INT = [109, 123, 709]
# The same ids as zero-padded 5-character strings (the form used for patient folder names).
EXCLUDED_STR = [format(exam_id, '05d') for exam_id in EXCLUDED_INT]

# Edge length (in pixels/voxels) the images and volumes are resized to.
IMG_SIZE = 256

In [None]:
# Read one DICOM slice and show its header: the bare `ds` on the last line makes
# the notebook display the dataset's representation as the cell output.
ds = pydicom.dcmread("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00008/FLAIR/Image-12.dcm")
ds

In [None]:
# Debugging: read individual fields from the DICOM header loaded in the previous cell.
print(ds.ImagePositionPatient[2])
# Tag (0020,0013) is the value read_patient_dcm uses as its slice sort key.
elem = ds[0x0020, 0x0013].value
# Show the Python type of the element value, then the value itself (one per line).
print(type(elem), elem, sep='\n')

In [None]:
# Baseline code: load the training labels and drop the excluded exams.
# .copy() makes the filtered frame an independent object, so adding a column
# below does not trigger pandas' SettingWithCopyWarning.
train_df = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv")
train_df = train_df[~train_df['BraTS21ID'].isin(EXCLUDED_INT)].copy()

# Patient id as a zero-padded 5-character string (matches the patient folder names)
train_df['BraTS21ID5'] = [format(x, '05d') for x in train_df.BraTS21ID]
print(len(train_df))

# Debugging: use only 1/10 of the dataset
#train_df = train_df[:int(len(train_df)/10)]

# Show the first rows
print(train_df.head(3))

In [None]:
class Config:
    """Namespace of notebook-wide constants plus a helper that seeds the RNGs.

    The class is used through its attributes (e.g. ``Config.SEED``); it is not
    meant to be instantiated.
    """
    INPUT_PATH_DCM = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/'
    INPUT_PATH_PNG = '../input/rsna-miccai-png/'
    TENSORBOARD_LOG_DIR = '../working/log_tensorboard/'
    SEED = 42
    # These should be removed from the dataset
    # (same values as the module-level EXCLUDED_STR / EXCLUDED_INT constants)
    EXCLUDED_STR = ['00109', '00123', '00709']
    EXCLUDED_INT = [109, 123, 709]

    # Defining target size of image
    # NOTE: this is 224 while the module-level IMG_SIZE constant is 256.
    IMG_SIZE = 224
    NUM_SLICES_3D = 64
    MIN_SLICES = 12

    BATCH_SIZE = 64

    CLASS_MODE = 'binary'
    COLOR_MODE = 'rgb'
    TARGET_SIZE = (224, 224)

    # No-op placeholder kept unchanged for backward compatibility. It takes no
    # arguments, so it can only be called on the class (Config.__self__()).
    def __self__():
        pass

    @staticmethod
    def set_seed(seed_val):
        """Seed TensorFlow, Python's `random`, PYTHONHASHSEED and NumPy with `seed_val`.

        Parameters
        ----------
        seed_val : int
            Seed applied to every random number source listed above.
        """
        # `random` is not imported by the notebook's import cell; without this
        # local import the call below raised NameError.
        import random

        tf.random.set_seed(seed_val)
        random.seed(seed_val)
        os.environ['PYTHONHASHSEED'] = str(seed_val)
        np.random.seed(seed_val)
        

In [None]:
# Stratified split into train / validation / test sets.
# 15% of the rows are held out as the test set first; 20% of the remainder then
# becomes the validation set. Both splits are stratified on MGMT_value.
df = train_df
df_trainval, df_test = sk_model_selection.train_test_split(
    df, test_size=0.15, random_state=Config.SEED, stratify=df["MGMT_value"])
df_train, df_val = sk_model_selection.train_test_split(
    df_trainval, test_size=0.2, random_state=Config.SEED, stratify=df_trainval["MGMT_value"])

# One shape per line: train, validation, test
print(df_train.shape, df_val.shape, df_test.shape, sep='\n')

In [None]:
#Disabled duplicate of the stratified train/validation/test split from the previous cell (kept commented out for reference)
#df_trainval, df_test = sk_model_selection.train_test_split(
#    df, 
#    test_size=0.15, 
#    random_state=Config.SEED, 
#    stratify=df["MGMT_value"],
#)
#df_train, df_val = sk_model_selection.train_test_split(
#    df_trainval, 
#    test_size=0.2, 
#    random_state=Config.SEED, 
#    stratify=df_trainval["MGMT_value"],
#)

#print(df_train.shape)
#print(df_val.shape)
#print(df_test.shape)

In [None]:
def normalize_img(img, option = 'minmax'):
    """Rescale an image or volume and cast it to uint8.

    Parameters
    ----------
    img : numpy.ndarray
        Array to normalize (any shape).
    option : str
        'minmax' maps the array linearly so its minimum becomes 0 and its
        maximum becomes 255. 'std' is not implemented: a notice is printed and
        the values are cast without rescaling. Any other value also casts
        without rescaling.

    Returns
    -------
    numpy.ndarray
        Array of dtype uint8 with the same shape as `img`.
    """
    if option == 'minmax':
        mi = np.min(img)
        ma = np.max(img)

        if ma == mi:
            # Constant input: the min-max ratio would be 0/0 (NaN, which has no
            # defined uint8 value), so return an all-zero image instead.
            return np.zeros(np.shape(img), dtype=np.uint8)

        img = (img - mi) / (ma - mi)
        img = 255 * img
    if option == 'std':
        print('not implemented')

    return img.astype(np.uint8)

def resize_volume(img, target_x, target_y, target_z):
    """Resample a 3-D array with linear interpolation.

    Axis 0 is scaled towards `target_x`, axis 1 towards `target_y` and the
    last axis towards `target_z`.

    Parameters
    ----------
    img : numpy.ndarray
        3-D input array.
    target_x, target_y, target_z : int
        Desired sizes of axis 0, axis 1 and the last axis.

    Returns
    -------
    numpy.ndarray
        The output of ``scipy.ndimage.zoom`` (order=1) for the computed factors.
    """
    wanted = (target_x, target_y, target_z)
    present = (img.shape[0], img.shape[1], img.shape[-1])
    # Per-axis zoom factor: the inverse of the current/desired size ratio.
    factors = tuple(1 / (have / want) for have, want in zip(present, wanted))
    return ndimage.zoom(img, factors, order=1)

def read_patient_dcm( p_id = '00000', t_type = 'FLAIR', dim = (32,32,32) , norm = None):
    """Load one patient's DICOM series as a resized 3-D array.

    Every file in the series folder is read with pydicom. The slices are
    ordered by the value of DICOM tag (0020,0013), stacked along a new first
    axis, resized with `resize_volume` and optionally normalized.

    Parameters
    ----------
    p_id : str
        Patient folder name (e.g. '00000').
    t_type : str
        Name of the series sub-folder (e.g. 'FLAIR').
    dim : tuple of 3 ints
        Target sizes passed to `resize_volume` as (target_x, target_y, target_z).
    norm : str or None
        When not None, forwarded to `normalize_img` as its `option`.

    Returns
    -------
    numpy.ndarray
        The resized (and optionally normalized) volume.

    Raises
    ------
    ValueError
        If no file in the folder could be read as a DICOM slice.
    """
    import warnings  # only needed to report skipped files

    base = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/'
    path = base + '/' + p_id + '/' + t_type + '/'
    path, _, files = next(os.walk(path))

    (target_x, target_y, target_z) = dim

    # Collect [sort key, pixel data] pairs; unreadable files are skipped
    # (best effort, as before) but now reported instead of silently ignored.
    dcms = []
    for file in files:
        file_path = path + file
        try:
            i_dcm = pydicom.dcmread(file_path)
            dcms.append([i_dcm[0x0020, 0x0013].value, i_dcm.pixel_array])
        except Exception as exc:
            warnings.warn('skipping unreadable DICOM file %s: %s' % (file_path, exc))

    if not dcms:
        raise ValueError('no readable DICOM slices found in ' + path)

    # Order the slices by their (0020,0013) value before stacking.
    dcms = sorted(dcms, key=itemgetter(0))
    arr3d = np.stack([pixels for _, pixels in dcms])

    arr3d = resize_volume(arr3d, target_x, target_y, target_z)

    if norm is not None:
        arr3d = normalize_img(arr3d, norm)

    return arr3d

def load_slices(path):
    """Read every 'Image-<n>.<ext>' file found in `path`, ordered by the integer <n>.

    Parameters
    ----------
    path : str
        Folder containing the image files.

    Returns
    -------
    list
        One entry per file, as returned by ``mpimg.imread``, in ascending
        order of the number embedded in the file name.
    """
    # Strip the 6-character 'Image-' prefix so each entry starts with its number.
    numbered = [name[6:] for name in os.listdir(path)]
    # Numeric (not lexicographic) ordering, so that 2 sorts before 10.
    numbered.sort(key=lambda tail: int(os.path.splitext(tail)[0]))
    return [mpimg.imread(path + '/Image-' + tail) for tail in numbered]


def read_patient_png_3d( p_id = '00000', t_type = 'FLAIR', dim = (IMG_SIZE,IMG_SIZE,IMG_SIZE) , norm = None):
    """Load one patient's PNG series as a resized 3-D array.

    The images are read with `load_slices`, stacked along a new first axis and
    resized with `resize_volume`.

    Parameters
    ----------
    p_id : str
        Patient folder name (e.g. '00000').
    t_type : str
        Name of the series sub-folder (e.g. 'FLAIR').
    dim : tuple of 3 ints
        Target sizes passed to `resize_volume` as (target_x, target_y, target_z).
    norm : str or None
        Accepted for signature parity with the other readers; not used here.

    Returns
    -------
    numpy.ndarray
        The resized volume.
    """
    base = '../input/rsna-miccai-png/train/'
    path = base + '/' + p_id + '/' + t_type + '/'

    (target_x, target_y, target_z) = dim

    # The former os.walk() call and the file_count / path_list locals were never
    # used; a missing folder now surfaces as the error raised inside load_slices.
    arr3d = np.stack(load_slices(path))

    return resize_volume(arr3d, target_x, target_y, target_z)


def read_patient_dcm_2d( p_id = '00000', t_type = 'FLAIR', dim = (32,32,32) , norm = None):
    """Load one patient's DICOM series and return a single 2-D plane from it.

    The volume is loaded, resized and optionally normalized by
    `read_patient_dcm`. Planes ``arr3d[:, i, :]`` for ``i`` in
    ``range(target_y)`` are then scored and the highest-scoring one is returned.

    Parameters
    ----------
    p_id : str
        Patient folder name (e.g. '00000').
    t_type : str
        Name of the series sub-folder (e.g. 'FLAIR').
    dim : tuple of 3 ints
        Target sizes (target_x, target_y, target_z) of the intermediate volume.
    norm : str or None
        When not None, the volume is normalized with `normalize_img`.

    Returns
    -------
    numpy.ndarray
        The selected 2-D plane ``arr3d[:, i_best, :]``.
    """
    (target_x , target_y , target_z) = dim

    # Load through the pydicom-based reader. The previous implementation went
    # through load_slices(), which reads files with matplotlib's image reader
    # and therefore cannot decode the .dcm files in this folder.
    arr3d = read_patient_dcm(p_id, t_type, dim, norm)

    # Pick the plane with the largest score. The score is the square root of
    # the plane's mean value (not a true root-mean-square); it is kept as-is so
    # the selection rule does not change. Ties keep the earliest plane.
    max_rms = 0
    i_max_rms = 0
    for i in range(target_y):
        rms = np.sqrt(np.mean(arr3d[:, i, :]))
        if rms > max_rms:
            max_rms = rms
            i_max_rms = i

    return arr3d[:, i_max_rms, :]

def get_patient_list():
    """Return the sorted patient folder names of the training set, minus excluded exams.

    Returns
    -------
    list of str
        Names of the immediate sub-folders of the training directory, sorted,
        with every id listed in EXCLUDED_STR left out.
    """
    path = '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/'

    _, dirs, _ = next(os.walk(path))

    # Filter instead of list.remove(): remove() raises ValueError when an
    # excluded id has no folder on disk.
    return [p for p in sorted(dirs) if p not in EXCLUDED_STR]

def print_from_list( dcms):
    """Display each image of a list of 2-item entries in its own figure.

    Parameters
    ----------
    dcms : iterable
        Entries that unpack into two items; the second item is the image that
        gets shown (with the 'bone' colormap), the first one is ignored.
    """
    for _, image in dcms:
        plt.imshow(image, cmap=plt.cm.bone)
        plt.show()
        
def retrieve_labels():
    """Read the training labels and return the MGMT_value column without excluded exams.

    Returns
    -------
    pandas.Series
        The 'MGMT_value' column of train_labels.csv for every row whose
        BraTS21ID is not listed in EXCLUDED_INT.
    """
    df = pd.read_csv ('../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv')
    # Filter on the integer id column read from the CSV. The previous code used
    # 'BraTS21ID5', a column that is only added to train_df elsewhere in the
    # notebook and does not exist in a freshly read file.
    df = df[~df.BraTS21ID.isin(EXCLUDED_INT)]
    y = df['MGMT_value']
    return y

def create_feat_vec( img ):
    """Histogram of the flattened image.

    Parameters
    ----------
    img : numpy.ndarray
        Image (any shape); it is flattened before binning.

    Returns
    -------
    tuple
        The ``(counts, bin_edges)`` pair returned by ``np.histogram`` with 255 bins.
    """
    return np.histogram(img.ravel(), bins=255)
    

def read_patient_png_2d( p_id = '00000', t_type = 'FLAIR', dim = (IMG_SIZE,IMG_SIZE,IMG_SIZE) , norm = None):
    """Load one patient's PNG series and return a single 2-D plane from it.

    The images are read with `load_slices`, stacked along a new first axis and
    resized with `resize_volume`. Planes ``arr3d[:, i, :]`` for ``i`` in
    ``range(target_y)`` are then scored and the highest-scoring one is returned.

    Parameters
    ----------
    p_id : str
        Patient folder name (e.g. '00000').
    t_type : str
        Name of the series sub-folder (e.g. 'FLAIR').
    dim : tuple of 3 ints
        Target sizes (target_x, target_y, target_z) of the intermediate volume.
    norm : str or None
        Accepted but currently ignored: normalization is disabled in this reader.

    Returns
    -------
    numpy.ndarray
        The selected 2-D plane ``arr3d[:, i_best, :]``.
    """
    base = '../input/rsna-miccai-png/train/'
    path = base + '/' + p_id + '/' + t_type + '/'

    (target_x, target_y, target_z) = dim

    # The former os.walk() call and the file_count / path_list locals were never
    # used; a missing folder now surfaces as the error raised inside load_slices.
    arr3d = np.stack(load_slices(path))
    arr3d = resize_volume(arr3d, target_x, target_y, target_z)

    #if norm is not None:
    #    arr3d = normalize_img( arr3d , norm )

    # Pick the plane with the largest score. The score is the square root of
    # the plane's mean value (not a true root-mean-square); it is kept as-is so
    # the selection rule does not change. Ties keep the earliest plane.
    max_rms = 0
    i_max_rms = 0
    for i in range(target_y):
        rms = np.sqrt(np.mean(arr3d[:, i, :]))
        if rms > max_rms:
            max_rms = rms
            i_max_rms = i

    return arr3d[:, i_max_rms, :]

In [None]:
# Load one patient's volume resized to a cube and show the central plane along
# each of the three array axes.
arr  = read_patient_dcm('00000','FLAIR',(IMG_SIZE,IMG_SIZE,IMG_SIZE))
print(np.shape(arr))

mid = IMG_SIZE // 2
central_planes = [
    ('arr[:, mid, :]', arr[:, mid, :]),
    ('arr[:, :, mid]', arr[:, :, mid]),
    ('arr[mid, :, :]', arr[mid, :, :]),
]
for plane_title, plane in central_planes:
    # Draw on the axes that was just created and title it, so the three
    # figures can be told apart.
    fig, ax = plt.subplots(figsize=(3,3))
    ax.imshow(plane, cmap=plt.cm.afmhot, aspect='auto')
    ax.set_title(plane_title)
plt.show()

In [None]:
# Now with read_patient_png_2d: show the selected 2-D plane for several patients.
# The loop variable is named patient_id so the builtin id() is not overwritten
# in the notebook namespace.
for patient_id in  ['00000', '00002', '00003','00009','00789','00044','00740','00360']:
    arr  = read_patient_png_2d(patient_id,'T2w',(IMG_SIZE,IMG_SIZE,IMG_SIZE))
    fig, ax = plt.subplots(figsize=(3,3))
    ax.imshow(arr,cmap=plt.cm.afmhot,aspect='auto')
    ax.set_title(patient_id)
print(np.shape(arr))

# Defining the data generator

In [None]:
class DataGenerator_2d_png(Sequence):
    """Keras Sequence that yields batches of 2-D planes (from the PNG data) and labels.

    Each sample is produced by `read_patient_png_2d` for the 'FLAIR' series of
    one patient id.
    """
    def __init__(self, list_IDs, labels, batch_size=4, dim=(IMG_SIZE,IMG_SIZE), n_channels=1,
                 n_classes=2, shuffle=True, data_type = 'train',):
        """Store the configuration and build the initial sample order.

        Parameters
        ----------
        list_IDs : list of str
            Patient ids, one per sample.
        labels : list
            Labels aligned index-by-index with `list_IDs`.
        batch_size : int
            Number of samples per batch.
        dim : tuple of 2 ints
            (rows, columns) of each generated image.
        n_channels : int
            Size of the trailing channel axis of the batch array.
        n_classes : int
            Stored on the instance; not used by the generator itself.
        shuffle : bool
            When True the sample order is reshuffled at construction and after
            every epoch.
        data_type : str
            Stored on the instance as `datatype`; not used by the generator itself.
        """
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.datatype = data_type
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number `index` as an ``(X, y)`` pair."""
        # Positions (into list_IDs / labels) of the samples of this batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Ids and labels for those positions
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        list_labels_temp = [self.labels[k] for k in indexes]

        # Load the images
        X, y = self.__data_generation(list_IDs_temp, list_labels_temp)

        return X, y

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when `shuffle` is True."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp, list_labels_temp):
        """Load the images for the given ids.

        Returns
        -------
        X : numpy.ndarray of shape (batch_size, *dim, n_channels)
        y : numpy.ndarray of shape (batch_size,), dtype int
        """
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        # read_patient_png_2d returns the plane arr3d[:, i, :], whose shape is
        # (target_x, target_z). Derive those two sizes from self.dim so the image
        # fits X for any `dim`; the size of the scanned axis stays IMG_SIZE, so
        # the default configuration behaves exactly as before.
        load_dim = (self.dim[0], IMG_SIZE, self.dim[1])

        for i, ID in enumerate(list_IDs_temp):
            # NOTE: `norm` is passed here but is currently ignored by read_patient_png_2d.
            img = read_patient_png_2d( p_id = ID, t_type = 'FLAIR', dim = load_dim, norm = 'minmax')
            img = np.expand_dims(img, -1)
            X[i,] = img
            y[i] = list_labels_temp[i]

        return X, y

In [None]:
BATCH_SIZE = 4

# Patient ids and labels of the training and validation splits, as plain lists
train_id = df_train.BraTS21ID5.tolist()
train_labels = df_train.MGMT_value.tolist()
val_id     = df_val.BraTS21ID5.tolist()
val_labels = df_val.MGMT_value.tolist()

print("--- Header do treino ---")
print(df_train.head())
print("--- Header da validação ---")
print(df_val.head())


train_gen_2d = DataGenerator_2d_png(data_type = 'train', list_IDs = train_id, labels = train_labels,batch_size = BATCH_SIZE)
# The validation generator must NOT shuffle: its predictions are later compared
# position-by-position with val_labels, which is only valid when the samples are
# produced in list order.
valid_gen_2d = DataGenerator_2d_png(data_type = 'val', list_IDs = val_id, labels = val_labels,batch_size = BATCH_SIZE,
                                    shuffle = False)

In [None]:
# ResNet50 backbone with ImageNet weights and without its classification head
model_resnet = tf.keras.applications.ResNet50(weights='imagenet', include_top=False)

# Single-channel input, repeated three times along the channel axis so it has
# the three channels the backbone is fed with
ipt = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 1), name="input")
x = tf.keras.layers.Concatenate()([ipt, ipt, ipt])
x = tf.cast(x, tf.float32)
# NOTE(review): resnet50.preprocess_input is documented for pixel values in the
# 0-255 range -- confirm that the generator's images use that range.
x = tf.keras.applications.resnet50.preprocess_input(x)
x = model_resnet(x)
# Classification head: global average pooling -> Dense(512, relu) -> Dropout(0.1)
# -> one sigmoid unit (binary output)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.1)(x)
out = layers.Dense(1, activation='sigmoid')(x)

model_ResNet50_2d = Model(inputs=ipt, outputs=out)
model_ResNet50_2d.summary()

In [None]:
# Train the 2-D model on the dataset
NUM_EPOCHS = 25

opt = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, decay=0.001, nesterov=True)

# Early stopping watches the validation loss: validation data is passed to fit(),
# and tracking the training loss would make restore_best_weights pick the weights
# that fit the training set best rather than those that generalize best.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
model_ResNet50_2d.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

history = model_ResNet50_2d.fit(train_gen_2d, validation_data=valid_gen_2d,
                    epochs = NUM_EPOCHS,
                    verbose = 1,
                    callbacks=[early_stopping],
                    workers = 4
                    )

In [None]:
# Mean label (fraction of positive cases) of the training and validation lists, one per line
print(np.mean(train_labels), np.mean(val_labels), sep='\n')

In [None]:
# Training and validation loss per epoch, drawn on the same axes
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Predict on the validation generator, then trim the label list to the number of
# predictions: the generator yields floor(n / batch_size) full batches, so the
# trailing samples that do not fill a batch get no prediction.
# NOTE(review): pairing predictions with val_labels by position is only valid if
# valid_gen_2d produces samples in list order; the generator shuffles its order
# when shuffle=True, which is its default.
val_predictions = model_ResNet50_2d.predict(valid_gen_2d)
val_labels = val_labels[:len(val_predictions)]

In [None]:
# ROC curve and area under the curve for the validation predictions, compared
# with a constant "no skill" baseline.

# Baseline: the same score (0) for every sample
ns_probs = [0 for _ in range(len(val_labels))]

ns_auc = roc_auc_score(val_labels, ns_probs)
lr_auc = roc_auc_score(val_labels, val_predictions)
print('No skill: ROC AUC=%.3f' % (ns_auc))
print('CNN: ROC AUC=%.3f' % (lr_auc))
lr_fpr, lr_tpr, _ = roc_curve(val_labels, val_predictions)
plt.plot(lr_fpr, lr_tpr, marker='.', label='CNN')

ns_fpr, ns_tpr, _ = roc_curve(val_labels, ns_probs)
plt.plot(ns_fpr, ns_tpr, marker='.', label='No Skill')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Both curves were given a label= above; without a legend those labels are never shown.
plt.legend()
plt.show()