In [None]:
import pathlib
import json
import shutil

import numpy as np
import matplotlib.pyplot as plt

from IPython import display

import pydicom

In [None]:
# Makes it so that any changes in pymedphys are automatically
# propagated into the notebook without needing a kernel reset.
# NOTE(review): `reload` is imported here but is not called in any of
# the visible cells -- confirm whether this import is still required.
from IPython.lib.deepreload import reload
%load_ext autoreload
%autoreload 2

In [None]:
import pymedphys
from pymedphys.labs.autosegmentation import pipeline, filtering, indexing, mask, tfrecord

In [None]:
# All of the DICOM data lives in a directory called 'dicom' beneath the
# data root, split into 'training', 'validation', and 'testing'
# sub-directories.
data_path_root = pathlib.Path.home() / '.data/dicom-ct-and-structures'
dicom_directory = data_path_root / 'dicom'

training_directory, validation_directory, testing_directory = (
    dicom_directory / split_name
    for split_name in ('training', 'validation', 'testing')
)

# Of note, the DICOM files need no further organisation beyond being
# placed somewhere within one of the three 'training', 'validation', or
# 'testing' directories. They can be organised into directories by
# patient, but that is not a requirement.

In [None]:
# Location of the JSON file holding the structure name mappings,
# stored directly beneath the data root.
name_mappings_path = data_path_root / 'name_mappings.json'

In [None]:
# Fetch the example "auto-segmentation" data and copy each file into the
# directory layout configured above. Files already present at their
# destination are left untouched.
dicom_paths = pymedphys.zenodo_data_paths("auto-segmentation")

for path in dicom_paths:
    if path.suffix == '.dcm':
        # The name of the directory containing the DICOM file is parsed
        # as an integer dataset id, which decides the split:
        #   id < 4       -> testing
        #   4 <= id < 8  -> validation
        #   otherwise    -> training
        dataset_id = path.parent.name
        parent_and_file = path.parts[-2:]
        dataset_number = int(dataset_id)

        if dataset_number < 4:
            split_directory = testing_directory
        elif dataset_number < 8:
            split_directory = validation_directory
        else:
            split_directory = training_directory

        # Keep the '<dataset id>/<file name>' tail beneath the split
        # directory.
        new_path = split_directory.joinpath(*parent_and_file)

    elif path.name == 'name_mappings.json':
        new_path = name_mappings_path

    else:
        raise ValueError(f"Unexpected file found. {path}.")

    if new_path.exists():
        continue

    new_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(path, new_path)

In [None]:
# The following names_map is used to standardise the structure names.
# It is loaded from the JSON file located at name_mappings_path.
names_map = filtering.load_names_mapping(name_mappings_path)

In [None]:
# Masks are created for the following structures, in the following order
structures_to_learn = [
    'lens_left',
    'lens_right',
    'eye_left',
    'eye_right',
    'patient',
]

# These filters select the slices used for training, validation, and
# testing. Note that "study_set_must_have_all_of" refers to the very same
# list object as structures_to_learn.
filters = dict(
    study_set_must_have_all_of=structures_to_learn,
    slice_at_least_one_of=[
        'lens_left',
        'lens_right',
        'eye_left',
        'eye_right',
    ],
    slice_must_have=['patient'],
    slice_cannot_have=[],
)

In [None]:
# Build the datasets from the data beneath data_path_root, using the
# name mapping, structure list, and slice filters defined above.
datasets = pipeline.create_datasets(
    data_path_root,
    names_map,
    structures_to_learn,
    filters,
)

In [None]:
# Write one '<dataset type>.tfrecord' file per dataset, directly beneath
# the data root. The path is handed to tfrecord.dump as a plain string.
for dataset_type, dataset in datasets.items():
    tfrecord_file = data_path_root / f'{dataset_type}.tfrecord'
    tfrecord_path = str(tfrecord_file)
    tfrecord.dump(dataset, tfrecord_path)

In [None]:
# Record the ordered list of learnt structures as JSON alongside the
# other files beneath the data root.
structures_to_learn_path = data_path_root / "structures_to_learn.json"
with structures_to_learn_path.open("w") as json_file:
    json.dump(structures_to_learn, json_file)