In [2]:
import os
from pathlib import Path

import pandas as pd, numpy as np

from posthocos.preprocess import preprocess

In [3]:
# Root folder of the BraTS 2019 data. The default is the location used when
# this notebook was written; set the BRATS2019_DIR environment variable to run
# the notebook on another machine without editing this cell.
data_root = Path(os.environ.get('BRATS2019_DIR', '/home/renato/data/brats2019'))
path = data_root / 'raw'              # raw dataset, read by the cells below
out_path = data_root / 'preprocess'   # preprocessed output and metadata files

# Preprocess images

In [None]:
# Single source of truth for the `size` argument, so the three outputs below
# cannot drift apart.
# NOTE(review): presumably the edge length of the output volumes -- confirm in posthocos.preprocess.
IMG_SIZE = 128

# Training scans.
preprocess(path/'train', out_path/'train', size=IMG_SIZE)
# Validation scans are written twice: once with `crop` left at its default and
# once with crop=False (into a separate 'validation_nocrop' folder).
preprocess(path/'validation', out_path/'validation', size=IMG_SIZE, train_data=False)
preprocess(path/'validation', out_path/'validation_nocrop', size=IMG_SIZE, train_data=False, crop=False)

# Build metadata files

In [8]:
def _clean_data(df):
    '''Manages 2 problems on the raw data:
    1. On the 2019 dataset some rows appear as "ALIVE (361 days later)", this functions will clean it.
    2. Remove rows with missing Survival data.
    '''
    idx2clean = df['Survival'].str.contains('[A-Za-z]').fillna(False)
    df.loc[idx2clean, 'Survival'] = df.loc[idx2clean, 'Survival'].str.extract(r'.*\((\d+)\w.*')
    df.dropna(subset=['Survival'], inplace=True)
    df['Survival'] = df['Survival'].astype(int)
    return df

# Load the raw survival tables shipped with the dataset.
df_train = pd.read_csv(path / 'train' / 'survival_data.csv')
df_valid = pd.read_csv(path / 'validation' / 'survival_evaluation.csv')
# Only the training table is cleaned. _clean_data works in place and returns
# the same frame; binding the result makes the data flow explicit.
df_train = _clean_data(df_train)

print('Survival data\n', '-'*40)
print('Train     :', df_train.shape)
print('Validation:', df_valid.shape)

print('\nScans data\n', '-'*40)
# Path.iterdir() is used instead of .ls(): `ls` is not part of pathlib and only
# exists when another import has monkey-patched Path.
grade_counts = [f'{len(list((path / "train" / grade).iterdir()))} {grade} samples'
                for grade in ['LGG', 'HGG']]
print('Train     :', ', '.join(grade_counts))
# Only directories are counted in the validation folder.
n_valid = sum(1 for entry in (path / "validation").iterdir() if entry.is_dir())
print('Validation:', f'{n_valid} samples')

Survival data
 ----------------------------------------
Train     : (210, 4)
Validation: (125, 3)

Scans data
 ----------------------------------------
Train     : 76 LGG samples, 259 HGG samples
Validation: 125 samples


In [26]:
# List what the preprocessing step wrote. Uses pathlib's own iterdir() rather
# than .ls(), which only exists when another import has monkey-patched Path.
list((out_path / 'train').iterdir())

[PosixPath('/home/renato/data/brats2019/preprocess/train/mapping.csv'),
 PosixPath('/home/renato/data/brats2019/preprocess/train/metadata.csv'),
 PosixPath('/home/renato/data/brats2019/preprocess/train/log_metadata.csv'),
 PosixPath('/home/renato/data/brats2019/preprocess/train/labels'),
 PosixPath('/home/renato/data/brats2019/preprocess/train/data')]

In [31]:
# Attach the survival table to every preprocessed training subject. The left
# join keeps subjects that have no survival row (their columns stay NaN).
metadata_train = (
    pd.read_csv(out_path / 'train/mapping.csv')
      .merge(df_train, how='left', left_on='subject', right_on='BraTS19ID')
      .drop(columns='BraTS19ID')
      .assign(
          # pd.cut bins are right-inclusive by default:
          # (0, 300] -> short, (300, 450] -> mid, (450, inf] -> long.
          label=lambda frame: pd.cut(frame['Survival'],
                                     [0, 10*30, 15*30, np.inf],
                                     labels=['short', 'mid', 'long']),
          # Copy of Survival stored under the name `surv`.
          surv=lambda frame: frame['Survival'],
      )
)
metadata_train.to_csv(out_path / 'train/metadata.csv', index=False)
metadata_train.tail(3)

Unnamed: 0,modal,subject,Age,Survival,ResectionStatus,label,surv
332,HGG,BraTS19_TCIA06_409_1,69.265753,99.0,,short,99.0
333,HGG,BraTS19_CBICA_AYI_1,65.920548,387.0,GTR,mid,387.0
334,HGG,BraTS19_CBICA_BKV_1,64.808219,374.0,GTR,mid,374.0


In [9]:
# Same join as for the training set, without survival labels: each
# preprocessed validation subject gets the columns of `df_valid`.
metadata_valid = pd.merge(
    pd.read_csv(out_path / 'validation/mapping.csv'),
    df_valid,
    how='left',
    left_on='subject',
    right_on='BraTS19ID',
)
# The join key from the right-hand table duplicates `subject`, so drop it.
metadata_valid = metadata_valid.drop(columns='BraTS19ID')
metadata_valid.to_csv(out_path / 'validation/metadata.csv', index=False)
metadata_valid.tail(3)

Unnamed: 0,subject,Age,ResectionStatus
122,BraTS19_TCIA10_627_1,33.181,
123,BraTS19_TCIA02_230_1,49.658,
124,BraTS19_TCIA06_497_1,63.186,
