In [19]:
import glob
import h5py
import os
import pandas as pd

## Constants

In [20]:
# Path to TMA HDF5 files
PATH_TO_PROCESSED_DATA = "/deep/group/aihc-bootcamp-fall2021/lymphoma/processed"
PATH_TO_TMA_PATCHES = os.path.join(PATH_TO_PROCESSED_DATA, "tma_patches")

# Path to train-val-test splits.
PATH_TO_RAW_DATA = "/deep/group/aihc-bootcamp-fall2021/lymphoma/raw"
PATH_TO_TRAIN_VAL_TEST_SPLIT = os.path.join(PATH_TO_RAW_DATA, "custom_train_test_split.csv")

# Output files
PATH_TO_OUTPUT = os.path.join(PATH_TO_PROCESSED_DATA, "data_splits/custom_splits/tma_patches")
PATH_TO_TRAIN_DATA = os.path.join(PATH_TO_OUTPUT, "train.hdf5")
PATH_TO_VAL_DATA = os.path.join(PATH_TO_OUTPUT, "val.hdf5")
PATH_TO_TEST_DATA = os.path.join(PATH_TO_OUTPUT, "test.hdf5")

## Read Train/Val/Test Splits

In [21]:
data_split_df = pd.read_csv(PATH_TO_TRAIN_VAL_TEST_SPLIT, delimiter=',')
data_split_map = data_split_df.set_index('patient_id')['split'].to_dict()

## Read TMA Data

In [22]:
tma_hdf5_filenames = glob.glob(PATH_TO_TMA_PATCHES + "/tma*.hdf5")
tma_hdf5_filenames

['/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma5.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma1.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma8.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma6b.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma4.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma2.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma6a.hdf5',
 '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma3.hdf5']

## Build Train/Val/Test HDF5 files

In [23]:
def build_data_splits_from_hdf5_files(tma_hdf5_filenames):
    included_patient_ids = set()
    excluded_patient_ids = set()
    train_f = h5py.File(PATH_TO_TRAIN_DATA, "w")
    val_f = h5py.File(PATH_TO_VAL_DATA, "w")
    test_f = h5py.File(PATH_TO_TEST_DATA, "w")

    patient_ids = set()
    patient_id_repeats = {}

    for filename in tma_hdf5_filenames:
        print(filename)
        
        with h5py.File(filename, "r") as f:
            for patient_id in f.keys():
                data = f[patient_id]
                patient_id_key = patient_id.split("_")[0].replace(" ", "")[:5]
                if patient_id_key not in data_split_map:
                    excluded_patient_ids.add(patient_id_key)
                    continue

                included_patient_ids.add(patient_id_key)
                data_split = data_split_map[patient_id_key]
                if data_split == "train":
                    out_f = train_f
                elif data_split == "val":
                    out_f = val_f
                else:
                    assert(data_split == "test")
                    out_f = test_f

                name = patient_id
                # Deal with duplicate patients
                if (patient_id not in patient_ids):
                    patient_id_repeats[patient_id] = 0
                patient_id_repeats[patient_id] += 1
                name += f"_v{patient_id_repeats[patient_id]}"

                dset = out_f.create_dataset(name, data=data, dtype='uint8', chunks=True)
                dset.attrs['tma_id'] = data.attrs["tma_id"]
                dset.attrs['patient_id'] = data.attrs["patient_id"]
                dset.attrs['who_diagnosis'] = data.attrs["who_diagnosis"]
                dset.attrs['clpa_diagnosis'] = data.attrs["clpa_diagnosis"]
                dset.attrs['label'] = data.attrs["label"]
                patient_ids.add(patient_id)
    train_f.close()
    val_f.close()
    test_f.close()
    return (included_patient_ids, excluded_patient_ids)

(included_patient_ids, excluded_patient_ids) = build_data_splits_from_hdf5_files(tma_hdf5_filenames)

/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma5.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma1.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma8.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma6b.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma4.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma2.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma6a.hdf5
/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/tma_patches/tma3.hdf5
