# RSNA-MICCAI Brain Tumor Radiogenomic Classification

・ このノートブックでは、TensorFlowの学習のために、VGG16モデルを用いた予測モデルの構築を行いました。

・ TensorFlowのコーディングには、公式サイトや下記日本語サイトなどにお世話になりました。

　　[@IT; TensorFlow 2＋Keras（tf.keras）入門][1]

　　[TensorFlowの公式チュートリアル「tf.dataを使って画像をロードする」をもうちょっとスリムにして読む][2]


・ 今回の予測精度はサッパリですが、今後は用いるデータを変えたり、3DCNNモデルを用いるなどして改善できればと思っています。

・ 初学者なので、よからぬ書き方や間違っている箇所などもあるかと思います。お気づきの際には教えていただけるとありがたいです。

・ その他、ご意見ご感想などもいただけると嬉しく思います。よろしくお願いします。

[* The English version of this notebook][5]

---
【入力データについて】

・ VGG16の入力チャネルは3なので、"FLAIR", "T1w", "T2w"の画像3枚をconcatして1つのデータにしています。

・ そのために、各BraTS21IDの"FLAIR", "T1w", "T2w"フォルダから、ファイル名の番号順に並べたときに中央に位置する画像データを1枚ずつ取得しています。

---
EDAに関して、下記ノートブックを参考にさせていただきました。ありがとうございます。

1. [[TF]: 3D & 2D Model for Brain Tumor Classification][3]

2. [【Brain Tumor】EDA for starter(日本語version)][4]

---

[1]: https://atmarkit.itmedia.co.jp/ait/subtop/features/di/tf2keras_index.html
 
[2]: https://zenn.dev/tokyoyoshida/articles/5c3270ce0d4c91

[3]: https://www.kaggle.com/ipythonx/tf-3d-2d-model-for-brain-tumor-classification

[4]: https://www.kaggle.com/chumajin/brain-tumor-eda-for-starter-version

[5]: https://www.kaggle.com/masatomurakawamm/tensorflow-simple-prediction-with-2d-vgg16

# 0. Settings

In [None]:
# Import dependencies 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline

import os, sys, glob, gc 
import math, random, time
from tqdm import tqdm 
import cv2, pydicom

from sklearn.model_selection import StratifiedKFold 

import tensorflow as tf

In [None]:
# Notebook-wide parameters: data/model locations and training hyperparameters.
config = {
    # Competition data root; 'train', 'test' and the CSV file names are joined onto it below.
    'data_path': '../input/rsna-miccai-brain-tumor-radiogenomic-classification',
    # Weight file passed as `weights=` to VGG16 in build_model().
    'model_path': '../input/keras-pretrained-models/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5',
    # Not referenced by the cells visible in this notebook export.
    'input_path': '../input', 
    # Output location handed to the ModelCheckpoint callback in the training cell.
    'output_path': './',
    # Number of StratifiedKFold splits.
    'nfolds': 5, 
    # Batch size used by both the train/valid and the test tf.data pipelines.
    'batch_size': 16,
    # Adam learning rate.
    'learning_rate': 1e-4,
    # Maximum number of epochs passed to model.fit().
    'num_epochs': 10
}
# Shorthand for tf.data.AUTOTUNE (the loader functions below define their own local AUTOTUNE).
AUTO = tf.data.AUTOTUNE

# For reproducible results
def seed_all(s):
    """Seed every random number source used in this notebook with ``s``.

    Sets the ``PYTHONHASHSEED`` and ``TF_CUDNN_DETERMINISTIC`` environment
    variables and seeds Python's ``random``, NumPy and TensorFlow.

    Args:
        s: Integer seed shared by all generators.
    """
    os.environ.update({'TF_CUDNN_DETERMINISTIC': '1', 'PYTHONHASHSEED': str(s)})
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(s)
global_seed = 42
seed_all(global_seed)

# Every MRI sequence folder that exists per case, and the three that become the model's input channels.
input_modality = ["FLAIR", "T1w", "T1wCE", "T2w"]
modality_list = ["FLAIR", "T1w", "T2w"]

# Locations inside the competition data folder.
train_folder, test_folder = (
    os.path.join(config['data_path'], split_name) for split_name in ('train', 'test')
)
sample_submission_path = os.path.join(config['data_path'], 'sample_submission.csv')

# Load the label table and the submission template; the test frame starts as a copy of the template.
train_df = pd.read_csv(os.path.join(config['data_path'], 'train_labels.csv'))
print(train_df.shape)
sample_df = pd.read_csv(sample_submission_path)
print(sample_df.shape)
test_df = sample_df.copy()
print(test_df.shape)

# 1. EDA

## 1.1 Train DataFrame

In [None]:
# Split into stratified folds for cross-validation
# (this notebook itself only performs a single hold-out validation).

skf = StratifiedKFold(n_splits=config['nfolds'], shuffle=True, random_state=global_seed)

# Tag every row with the number of the fold in which it is held out.
fold_splits = skf.split(X=train_df.index, y=train_df['MGMT_value'])
for fold_id, (fit_rows, holdout_rows) in enumerate(fold_splits):
    train_df.loc[holdout_rows, 'fold'] = fold_id

# Class balance per fold.
print(train_df.groupby(['fold', train_df['MGMT_value']]).size())

In [None]:
# Derive the zero-padded folder name and the folder path of every BraTS21ID

train_df['imfolder'] = train_df['BraTS21ID'].map('{:05d}'.format)
train_df['path'] = train_df['imfolder'].map(lambda folder: os.path.join(train_folder, folder))
train_df

In [None]:
# For every BraTS21ID, count the files inside each of the "FLAIR", "T1w", "T1wCE", "T2w" folders
# (a missing folder counts as 0)

input_modality = ["FLAIR", "T1w", "T1wCE", "T2w"]
for modality in input_modality:
    modality_dirs = (os.path.join(case_dir, modality) for case_dir in train_df['path'])
    train_df[f'{modality}_count'] = [
        len(os.listdir(folder)) if os.path.exists(folder) else 0
        for folder in modality_dirs
    ]

train_df

In [None]:
# Pick, for each modality folder, the path of the image in the middle of the series

def get_mid_path(path, modality='FLAIR'):
    """Return the path of the middle image inside ``path/modality``.

    File names are split on ``-`` and ``.``: the text between the first ``-``
    and the following ``.`` is read as an integer image number. Files are
    ordered by that number and the one at position ``len(files) // 2`` wins.

    Args:
        path: Folder of one case.
        modality: Name of the sub-folder to look into.

    Returns:
        Full path (``str``) of the selected file.
    """
    modality_path = os.path.join(path, modality)

    def image_number(file_name):
        # 'Image-12.dcm' -> 12
        return int(file_name.split('-')[1].split('.')[0])

    ordered_names = sorted(os.listdir(modality_path), key=image_number)
    middle_name = ordered_names[len(ordered_names) // 2]
    return os.path.join(modality_path, middle_name)

# Store the middle-image path of every modality as a '<modality>_mid' column.
for modality in input_modality:
    mid_paths = [get_mid_path(case_dir, modality=modality) for case_dir in train_df['path']]
    train_df[f'{modality}_mid'] = mid_paths

train_df

## 1.2 Test DataFrame（先のTrain DataFrameと同様の流れ）

In [None]:
# Derive the zero-padded folder name and the folder path of every test BraTS21ID
test_df['imfolder'] = test_df['BraTS21ID'].map('{:05d}'.format)
test_df['path'] = test_df['imfolder'].map(lambda folder: os.path.join(test_folder, folder))
test_df

In [None]:
# Count the files inside each modality folder of every test case (a missing folder counts as 0)
input_modality = ["FLAIR", "T1w", "T1wCE", "T2w"]
for modality in input_modality:
    modality_dirs = (os.path.join(case_dir, modality) for case_dir in test_df['path'])
    test_df[f'{modality}_count'] = [
        len(os.listdir(folder)) if os.path.exists(folder) else 0
        for folder in modality_dirs
    ]

test_df

In [None]:
# Store the middle-image path of every modality as a '<modality>_mid' column.
for modality in input_modality:
    mid_paths = [get_mid_path(case_dir, modality=modality) for case_dir in test_df['path']]
    test_df[f'{modality}_mid'] = mid_paths

test_df

# 2. DataLoader

## 2.1 Train Dataset

・ はじめは単純にtf.data.Dataset.from_tensor_slices()を使おうと思っていたのですが、dcmファイルの読み込みが上手くいかず、Keras Sequenceを使いました。

・ 高速化のため、TFRecordの作成と読み込みも行っています。

In [None]:
@tf.function
def preprocessing_img(img, threashold=5):
    """Standardise an image tensor, shift it to start at 0 and cap large values.

    Steps: subtract the global mean, divide by the global standard deviation,
    subtract the minimum so the smallest value is 0, replace every value that
    is not below ``threashold`` with ``threashold``, then squeeze size-1
    dimensions.

    Args:
        img: Float image tensor (the caller in this file passes the stacked
            modalities resized to 224x224).
        threashold: Upper cap, in standard deviations above the minimum.
            The misspelling is kept because it is part of the signature.

    Returns:
        Tensor with values in ``[0, threashold]``; an input with zero spread
        (e.g. an all-black slice) maps to all zeros.
    """
    img = img - tf.math.reduce_mean(img)
    # Scale by the standard deviation (not the variance) so that the cap below
    # is expressed in sigma units. divide_no_nan yields 0 instead of NaN for a
    # constant image, which would otherwise come out saturated at the cap.
    img = tf.math.divide_no_nan(img, tf.math.reduce_std(img))
    img = img - tf.math.reduce_min(img)
    img = tf.where(img < threashold, img, threashold)    # cap inputs that would become too large
    img = tf.squeeze(img)    # drop superfluous size-1 dimensions
    return img

    
class ImageGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields one stacked multi-modality image per row.

    For row ``index`` it reads the DICOM file named in each
    ``'<modality>_mid'`` column, resizes it to 224x224, stacks the modalities
    along the last axis and runs ``preprocessing_img`` on the result.

    Args:
        df: DataFrame with one ``'<modality>_mid'`` path column per entry of
            ``modality_list``.
        modality_list: Modality names, in channel order.
    """

    def __init__(self, df, modality_list):
        super().__init__()
        self.df = df
        self.modality_list = modality_list

    def __len__(self):
        """Number of samples (one item per DataFrame row, not per batch)."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the preprocessed image of row ``index``.

        With three modalities the result has shape (224, 224, 3).
        Assumes every DICOM holds a single 2-D frame — TODO confirm.
        """
        channels = []
        for modality in self.modality_list:
            path = self.df[f'{modality}_mid'].iloc[index]
            # dcmread is the supported reader; read_file is only a legacy alias for it.
            dicom = pydicom.dcmread(path)
            img = np.expand_dims(dicom.pixel_array, -1)
            # Bring every modality to the same size so they can be concatenated.
            img = tf.image.resize(tf.constant(img), [224, 224])
            channels.append(img)
        multi_ch_img = tf.concat(channels, axis=-1)
        return preprocessing_img(multi_ch_img)
    
    
def parse(x):
    """Decode one serialized float32 tensor and give it the static shape (224, 224, 3)."""
    return tf.reshape(tf.io.parse_tensor(x, out_type=tf.float32), [224, 224, 3])


def build_3ch_train_dataloader(train_df, modality_list, p_fold=0):
    """Build the batched training and validation datasets for one hold-out fold.

    Rows whose ``fold`` differs from ``p_fold`` form the training split, the
    rows of fold ``p_fold`` the validation split. For each split the images
    are produced once by ``ImageGenerator``, written to
    ``'<mode>-<p_fold>-img.tfrec'`` and from then on read back from that file.
    An existing TFRecord file is reused as-is; it is not checked against
    ``df``.

    Args:
        train_df: DataFrame with ``fold``, ``MGMT_value`` and the
            ``'<modality>_mid'`` path columns.
        modality_list: Modalities stacked as image channels.
        p_fold: Fold number used for validation.

    Returns:
        ``[train_ds, valid_ds]``; both yield ``(images, labels)`` batches of
        ``config['batch_size']``. Only the training dataset is shuffled and
        drops its incomplete last batch.
    """
    p_train = train_df.query(f'fold != {p_fold}').reset_index(drop=True)
    p_valid = train_df.query(f'fold == {p_fold}').reset_index(drop=True)

    AUTOTUNE = tf.data.experimental.AUTOTUNE

    train_datasets = []
    for mode, df in zip(['train', 'valid'], [p_train, p_valid]):
        tfrec_path = f'{mode}-{p_fold}-img.tfrec'

        if not os.path.exists(tfrec_path):
            i_g = ImageGenerator(df, modality_list)
            # Bind the generator as a default argument so the lambda keeps this
            # iteration's generator even if it is invoked after the loop
            # variable has been rebound to the next split.
            gen_ds = tf.data.Dataset.from_generator(lambda gen=i_g: map(tuple, gen),
                                                    output_types=(tf.float32),
                                                    output_shapes=(tf.TensorShape([224, 224, 3])),
                                                    )
            img_tfrec = tf.data.experimental.TFRecordWriter(tfrec_path)
            img_tfrec.write(gen_ds.map(tf.io.serialize_tensor))

        # Always train from the TFRecord file, not from the slow DICOM generator.
        img_ds = tf.data.TFRecordDataset(tfrec_path)
        img_ds = img_ds.map(parse, num_parallel_calls=AUTOTUNE)

        labels = df['MGMT_value']
        label_ds = tf.data.Dataset.from_tensor_slices(tf.cast(labels, tf.int32))

        ds = tf.data.Dataset.zip((img_ds, label_ds))

        ds = ds.cache(filename=f'./cache.tf-{mode}-{p_fold}-data')
        is_train = mode == 'train'
        if is_train:
            ds = ds.shuffle(buffer_size=len(df))
        # Keep the incomplete last batch for validation so that no validation
        # sample is silently left out of the metrics.
        ds = ds.batch(config['batch_size'], drop_remainder=is_train)
        ds = ds.prefetch(buffer_size=AUTOTUNE)
        train_datasets.append(ds)

    return train_datasets

In [None]:
# Create the datasets
p_fold = 0
modality_list = ["FLAIR", "T1w", "T2w"]

train_datasets = build_3ch_train_dataloader(train_df, modality_list, p_fold=p_fold)
train_ds, valid_ds = train_datasets

# Peek at one batch of each split to confirm the tensor shapes.
for batch_images, batch_labels in train_ds.take(1):
    print('Train Data shape: ', batch_images.shape)
    print('Train Label shape: ', batch_labels.shape)

for batch_images, batch_labels in valid_ds.take(1):
    print('Valid Data shape: ', batch_images.shape)
    print('Valid Label shape: ', batch_labels.shape)

## 2.2 Test Dataset

In [None]:
# Build the test dataset (images only, no labels)
def build_3ch_test_dataloader(test_df, modality_list):
    """Return a batched dataset of test images.

    The images come from ``ImageGenerator``; they are written to
    ``'test-img.tfrec'`` when that file does not exist yet and are always read
    back from it, then cached, batched with ``config['batch_size']`` (the
    incomplete last batch is kept) and prefetched.

    Args:
        test_df: DataFrame with the ``'<modality>_mid'`` path columns.
        modality_list: Modalities stacked as image channels.
    """
    autotune = tf.data.experimental.AUTOTUNE
    tfrec_file = 'test-img.tfrec'

    generator = ImageGenerator(test_df, modality_list)
    serialized = tf.data.Dataset.from_generator(
        lambda: map(tuple, generator),
        output_types=(tf.float32),
        output_shapes=(tf.TensorShape([224, 224, 3])),
    ).map(tf.io.serialize_tensor)

    if not os.path.exists(tfrec_file):
        tf.data.experimental.TFRecordWriter(tfrec_file).write(serialized)

    return (
        tf.data.TFRecordDataset(tfrec_file)
        .map(parse, num_parallel_calls=autotune)
        .cache(filename='./cache.tf-test-data')
        .batch(config['batch_size'], drop_remainder=False)
        .prefetch(buffer_size=autotune)
    )

In [None]:
test_ds = build_3ch_test_dataloader(test_df, modality_list)

# Peek at one batch to confirm the tensor shape.
for test_batch in test_ds.take(1):
    print('Test Data shape: ', test_batch.shape)

# 3. Train

In [None]:
# Model factory
def build_model():
    """Return the classifier: frozen VGG16 backbone plus a small dense head.

    The backbone is VGG16 without its top layers, with global average pooling,
    loaded from ``config['model_path']`` and set to non-trainable. It is
    followed by batch normalisation ('vgg_norm'), a 256-unit ReLU layer
    ('d_1') and a single sigmoid output unit ('d_out').
    """
    backbone = tf.keras.applications.vgg16.VGG16(
        weights=config['model_path'],
        include_top=False,
        pooling='avg',
        input_shape=None,
    )
    backbone.trainable = False

    return tf.keras.Sequential([
        backbone,
        tf.keras.layers.BatchNormalization(name='vgg_norm'),
        tf.keras.layers.Dense(units=256, name='d_1', activation='relu'),
        tf.keras.layers.Dense(units=1, name='d_out', activation='sigmoid'),
    ])

In [None]:
# Build the model on the GPU when one is available, otherwise on the CPU.
# tf.test.gpu_device_name() returns an empty string when there is no GPU, so
# the deprecated tf.test.is_gpu_available() check is not needed.
device_name = tf.test.gpu_device_name() or 'cpu:0'

with tf.device(device_name):
    model = build_model()

model.summary()

In [None]:
# Train the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=config['learning_rate']),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=['accuracy'])

# Checkpoint to a named weights file inside the output folder instead of the
# bare folder path, which gives the checkpoint no file name of its own.
checkpoint_path = os.path.join(config['output_path'], 'best_model.weights.h5')

# restore_best_weights: otherwise the model is left with the weights of the
# last epoch, which is `patience` epochs past the best one when stopping early.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3,
                                                  restore_best_weights=True)
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

# No `shuffle` argument: Keras ignores it for tf.data.Dataset input, and the
# training dataset is already shuffled in its own pipeline.
history = model.fit(train_ds, epochs=config['num_epochs'],
                    validation_data=valid_ds,
                    callbacks=[early_stopping, model_checkpoint],
                    )

# Make sure the prediction step uses the best validation-loss weights even when
# training ran through all epochs without an early stop.
if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)

train_losses = history.history['loss']
valid_losses = history.history['val_loss']
num_epochs = config['num_epochs']

# 4. Prediction 

In [None]:
# Predict the MGMT probability of every test case.
# batch_size is not passed: test_ds is a tf.data.Dataset that is already batched.
proba = model.predict(test_ds, verbose=1)

# The single sigmoid unit yields shape (n_samples, 1); flatten it to one value per row.
test_df['prediction'] = proba.ravel()
sample_df['MGMT_value'] = test_df['prediction']
sample_df

In [None]:
# Write the submission file without the DataFrame index column.
sample_df.to_csv(path_or_buf="submission.csv", index=False)