In [11]:
import nibabel as nib
import nibabel.imagestats as stats
import pandas as pd
from pathlib import Path
import os

# Locate the dataset under the current user's home directory.
# data_path is kept as a plain string because later cells build paths from it
# with string formatting.
home = str(Path.home())
data_path = os.path.join(home, 'Downloads/MICCAI_FeTS2022_TrainingData')

# Route every pandas .plot() call in this notebook through hvplot
pd.set_option('plotting.backend', 'hvplot')

In [2]:
# Echo the resolved dataset directory
data_path

'/Downloads/MICCAI_FeTS2022_TrainingData/'

In [12]:
# Read or create the volume series describing the area of each tumour.
# Scans have all been resized to 1x1x1 spacing so counting voxels is enough.
# Further refinement can be made by measuring only the core tumour area,
# or different morphological statistics altogether.
#
# The per-subject voxel counts are cached in volume_file.csv so the
# segmentation masks are only loaded on the first run.
volume_csv = os.path.join(data_path, "volume_file.csv")

if os.path.isfile(volume_csv):
    df = pd.read_csv(volume_csv)
else:
    def _tumour_volume(subject_id):
        """Return the number of non-zero voxels in one subject's segmentation mask."""
        subject_id = str(subject_id)
        # os.path.join supplies the separator between data_path and the subject
        # folder; plain concatenation produced an invalid path because
        # data_path has no trailing slash.
        seg_path = os.path.join(data_path, subject_id, f"{subject_id}_seg.nii.gz")
        return stats.count_nonzero_voxels(nib.load(seg_path))

    df = pd.read_csv(os.path.join(data_path, "partitioning_1.csv"))
    df['Volume'] = df['Subject_ID'].map(_tumour_volume)

    df.to_csv(volume_csv, index=False)

In [13]:
# Attach the organisers' original artificial split as a second partition column.
# NOTE(review): the assignment aligns on the row index, so it assumes
# partitioning_2.csv lists subjects in the same order as df — confirm.
org_partitioning = pd.read_csv(f"{data_path}/partitioning_2.csv")
df['org_split'] = org_partitioning['Partition_ID']

In [29]:
# Bar chart of the number of patients per collaborator in the organisers'
# original artificial split.
org_split_sizes = df.groupby('org_split').size()
org_split_sizes.plot(
    kind='bar',
    alpha=1,
    ylabel='Patients',
    xlabel='Original Artificial Split')

In [39]:
THRESHOLD = 100  # Only shard participants larger than this
CUTS = 10  # Number of shards

# Collaborators with more than THRESHOLD patients. The .copy() makes this an
# independent frame, so adding a column below never writes into a slice of df.
to_be_split = df.groupby('Partition_ID').filter(lambda x: len(x) > THRESHOLD).copy()

# Volume-quantile bin (0 .. CUTS-1) of every patient within its collaborator.
# transform() returns values aligned with to_be_split's own index, so the bins
# cannot be shuffled relative to the rows the way a group-ordered
# apply(...).to_numpy() result can be.
volume_bin = (to_be_split
              .groupby('Partition_ID')['Volume']
              .transform(lambda x: pd.qcut(x, CUTS, labels=False))
              .astype(int))

# A sharded collaborator occupies CUTS consecutive ids instead of one, so every
# Partition_ID is shifted by CUTS - 1 for each sharded collaborator that has a
# smaller id. With partitions 1 and 18 sharded this reproduces the previous
# hard-coded ">= 2" and ">= 19" shifts.
sharded_ids = pd.Index(sorted(to_be_split['Partition_ID'].unique()))


def _shard_offset(partition_ids):
    """Return the id shift caused by sharded collaborators with smaller ids."""
    n_sharded_before = sharded_ids.searchsorted(partition_ids.to_numpy(), side='left')
    return n_sharded_before * (CUTS - 1)


to_be_split['new_split'] = (to_be_split['Partition_ID']
                            + _shard_offset(to_be_split['Partition_ID'])
                            + volume_bin)

# Start from the shifted collaborator ids, then overwrite the sharded patients
# with their per-shard ids (to_be_split shares df's row index).
df['new_split'] = df['Partition_ID'] + _shard_offset(df['Partition_ID'])
df.loc[to_be_split.index, 'new_split'] = to_be_split['new_split']

# Overlay the sharded split on the organisers' original split
df.groupby('new_split').size().plot(kind='bar', alpha=0.8) \
* df.groupby('org_split').size().plot(kind='bar', alpha=0.8)

In [None]:
# Persist the sharded split once; an existing file is never overwritten.
new_split_csv = f"{data_path}/new_split.csv"
if not os.path.isfile(new_split_csv):
    df[['Subject_ID', 'new_split']].to_csv(new_split_csv, index=False)
else:
    print('Name/file exists, avoiding overwrite!')

# Prototyping Split
- Includes intact original collaborators with 10-30 patients
- 7 collaborators
- 117 patients (~10% of original)

In [40]:
# Keep only collaborators that have between 10 and 30 patients (inclusive)
tmp = df.groupby('Partition_ID').filter(lambda x: 9 < len(x) < 31)

# Persist the prototyping split once; an existing file is never overwritten.
prototyping_csv = f"{data_path}/prototyping_split.csv"
if not os.path.isfile(prototyping_csv):
    tmp[['Subject_ID', 'Partition_ID']].to_csv(prototyping_csv, index=False)
else:
    print('Name/file exists, avoiding overwrite!')

# Patients per retained collaborator
tmp.groupby('Partition_ID').size().plot(kind='bar', ylabel='Patients')

Name/file exists, avoiding overwrite!


# Medium split
- Split each of the two largest collaborators into 10 shards by tumor volume
- Keep only the 2 middle shards of each, plus all other collaborators
- 25 collaborators
- 536 patients (~42% of original)

In [43]:
# new_split ids to drop: 0-4, 7-10, 27-30 and 33-36. What remains of those
# ranges is 5, 6, 31 and 32 — presumably the two middle volume shards of each
# of the two sharded collaborators; the ids are hard-coded, so verify them
# whenever THRESHOLD or CUTS changes.
dropped_shards = [*range(5), *range(7, 11), *range(27, 31), *range(33, 37)]
tmp2 = df[~df.new_split.isin(dropped_shards)]

# Persist the medium split once; an existing file is never overwritten.
# NOTE(review): the CSV stores Partition_ID while the selection and the plot
# below use new_split — confirm which id downstream consumers expect.
medium_csv = f"{data_path}/medium_split.csv"
if not os.path.isfile(medium_csv):
    tmp2[['Subject_ID', 'Partition_ID']].to_csv(medium_csv, index=False)
else:
    print('Name/file exists, avoiding overwrite!')

# Patients per retained shard / collaborator
tmp2.groupby('new_split').size().plot(kind='bar', xlabel='Partition ID', ylabel='Patients')

Name/file exists, avoiding overwrite!
