# 3D lesion data processing into nnUNet format
Reference: nnU-Net dataset format documentation — https://github.com/MIC-DKFZ/nnUNet/blob/master/documentation/dataset_format.md

In [None]:
import os
from pathlib import Path
import nibabel as nib
import shutil
import glob
from tqdm import tqdm
import SimpleITK as sitk

from utils.plot import transparent_cmap

# Root of the nnU-Net raw dataset that this notebook fills.
data_folder = Path("/media/liushifeng/KINGSTON/nnUNet_raw/Dataset001_3dlesion")
# Training images and their segmentation labels sit in sibling folders.
train_images = data_folder.joinpath("imagesTr")
train_labels = data_folder.joinpath("labelsTr")

In [None]:
# ULS / DeepLesion 3D dataset: copy the images and labels into the training
# folders under a dataset prefix, then drop images that have no label file.
uls_folder = Path("/media/liushifeng/KINGSTON/ULS Jan 2025/ULS23/novel_data/ULS23_DeepLesion3D/")
image_folder = uls_folder / "images"
label_folder = uls_folder / "labels"

file_type = ".nii.gz"
prefix = "ULSDL3D"

# shutil.copy2 does not create missing parent folders, so make sure they exist
train_images.mkdir(parents=True, exist_ok=True)
train_labels.mkdir(parents=True, exist_ok=True)

# check image shapes are 3D + channel
# (only the files that are copied below; other directory entries are ignored)
for i in os.listdir(image_folder):
    if i.endswith(file_type) and len(nib.load(image_folder / i).shape) != 4:
        print(i, "need 4 dims")

# copy images, then labels, over and add prefix
for src_folder, dst_folder in ((image_folder, train_images), (label_folder, train_labels)):
    for f in os.listdir(src_folder):
        if f.endswith(file_type):
            shutil.copy2(src_folder / f, dst_folder / f"{prefix}_{f}")

# remove images that do not have labels
# NOTE: images and labels are matched by identical file name, so this has to
# run before the images receive their "_0000" channel suffix; suffixed images
# never equal a label name and would all be removed.
label_files = set(os.listdir(train_labels))  # set: O(1) membership tests
to_remove = [x for x in os.listdir(train_images) if x not in label_files]
print("removing:", to_remove)
for p in to_remove:
    os.remove(train_images / p)

In [None]:
# internal AutoPET dataset
# use lymphoma and melanoma dataset Bs as training data


def _import_autopet_cohort(ct_root, seg_root, prefix):
    """Bring one AutoPET cohort into the training folders.

    Every file named ``CT.nii.gz`` below ``ct_root`` is copied to
    ``train_images`` and every ``.nrrd`` file directly inside ``seg_root`` is
    re-written as ``.nii.gz`` into ``train_labels``.

    Args:
        ct_root: Folder searched recursively for ``CT.nii.gz`` files.
        seg_root: Folder containing the cohort's ``.nrrd`` segmentations.
        prefix: Dataset prefix placed in front of every output file name.
    """
    for ct_path in tqdm(glob.glob(str(ct_root) + "/**/CT.nii.gz", recursive=True)):
        ct_path = Path(ct_path)
        # the case name is taken from the folder two levels above the CT file
        name = ct_path.parents[1].stem
        shutil.copy2(ct_path, train_images / f"{prefix}_{name}_{ct_path.name}")

    for f in tqdm(os.listdir(seg_root)):
        if f.endswith(".nrrd"):
            # drop the first four characters of the stem and everything from
            # the first "." onwards; what remains identifies the case
            name = Path(f).stem[4:].split(".")[0]
            output_name = f"{prefix}_PETCT_{name}_CT.nii.gz"
            sitk.WriteImage(sitk.ReadImage(seg_root / f), train_labels / output_name)


# (CT folder, segmentation folder, output prefix) for each training cohort
autopet_cohorts = [
    (
        Path("/home/liushifeng/Desktop/AutoPET dataset/dataset B_lymphoma/"),
        Path("/home/liushifeng/Desktop/AutoPET dataset/SEG_Lymphoma_B_Rifki-selected/"),
        "AutoPET-Lymphoma-B",
    ),
    (
        Path("/home/liushifeng/Desktop/AutoPET dataset/dataset B_melanoma/"),
        Path("/home/liushifeng/Desktop/AutoPET dataset/SEG_Melanoma_B_Rifki/"),
        "AutoPET-Melanoma-B",
    ),
]

for ct_root, seg_root, prefix in autopet_cohorts:
    _import_autopet_cohort(ct_root, seg_root, prefix)

# # use lymphoma and melanoma dataset Cs as test data
# ap_folder = Path("/home/liushifeng/Desktop/AutoPET dataset/dataset C_lymphoma/")
# prefix = "AutoPET-Lymphoma-C"
# for ct_path in tqdm(glob.glob(str(ap_folder) + "/**/CT.nii.gz", recursive=True)):
#     name = Path(ct_path).parents[1].stem
#     shutil.copy2(ct_path, TEST_FOLDER_PATH / f"{prefix}_{name}_{Path(ct_path).name}")
#
# ap_folder = Path("/home/liushifeng/Desktop/AutoPET dataset/dataset C_melanoma/")
# prefix = "AutoPET-Melanoma-C"
# for ct_path in tqdm(glob.glob(str(ap_folder) + "/**/CT.nii.gz", recursive=True)):
#     name = Path(ct_path).parents[1].stem
#     shutil.copy2(ct_path, TEST_FOLDER_PATH / f"{prefix}_{name}_{Path(ct_path).name}")

In [None]:
# Cross-check the two training folders by file name.
def _label_name_for(image_name):
    """Return the label file name that corresponds to an image file name.

    Images that already carry the "_0000" channel suffix are mapped back to
    the un-suffixed name used by the labels; other names are returned as is.
    """
    channel_suffix = "_0000.nii.gz"
    if image_name.endswith(channel_suffix):
        return image_name[: -len(channel_suffix)] + ".nii.gz"
    return image_name


image_files = os.listdir(train_images)
label_files = os.listdir(train_labels)

# sets give O(1) membership tests; the result lists keep directory order
available_labels = set(label_files)
expected_labels = {_label_name_for(x) for x in image_files}

# Comparing through _label_name_for keeps this cell safe to re-run after the
# images have been renamed with the channel suffix; a plain name comparison
# would then report every image as unlabelled.
images_with_no_labels = [x for x in image_files if _label_name_for(x) not in available_labels]
labels_with_no_images = [x for x in label_files if x not in expected_labels]

In [None]:
# Delete every file that has no counterpart in the other training folder.
print("removing:", images_with_no_labels, labels_with_no_images)
orphans_by_folder = (
    (train_images, images_with_no_labels),
    (train_labels, labels_with_no_images),
)
for target_folder, orphan_names in orphans_by_folder:
    for orphan in orphan_names:
        os.remove(target_folder / orphan)

In [None]:
# Add _0000 to all training images to indicate channel.
# Only the trailing ".nii.gz" is rewritten, and files that already carry the
# suffix are skipped, so re-running this cell cannot produce "_0000_0000".
channel_suffix = "_0000.nii.gz"
for f in os.listdir(train_images):
    if not f.endswith(".nii.gz") or f.endswith(channel_suffix):
        continue
    os.rename(train_images / f, train_images / (f[: -len(".nii.gz")] + channel_suffix))

In [None]:
# Report every label whose size differs from the size of its image.
# Nothing is written here: the loop only reads both files and prints the
# mismatching pairs.
for f in tqdm(sorted(os.listdir(train_labels))):
    img_path = train_images / f.replace('.nii.gz', '_0000.nii.gz')
    label_path = train_labels / f
    img = sitk.ReadImage(img_path)
    seg = sitk.ReadImage(label_path)

    img_size = img.GetSize()
    seg_size = seg.GetSize()
    if img_size != seg_size:
        # the label is NOT resampled by this cell, so say what was found
        print(f"size mismatch {f}")
        print(img_size, seg_size)

    # To resample the label onto the image grid and overwrite it, enable:
    # seg_img = sitk.Resample(seg, img, sitk.Transform(), sitk.sitkNearestNeighbor, 0.0, img.GetPixelID())
    # sitk.WriteImage(seg_img, label_path)

## Visualize data
Verify that the images and their segmentations match.

In [None]:
import random

# Dataset location for the visualisation cells (the same paths that the
# processing cells write to).
data_folder = Path("/media/liushifeng/KINGSTON/nnUNet_raw/Dataset001_3dlesion")
train_images, train_labels = data_folder / "imagesTr", data_folder / "labelsTr"

In [None]:
import matplotlib.pyplot as plt

def plot(f, ct_data, seg_data):
    """Show one CT slice twice: plain, and with the segmentation overlaid.

    The slice is chosen along axis 0 as the index at which ``seg_data``,
    summed over axes 1 and 2, is largest.

    Args:
        f: Value printed before the figure is drawn.
        ct_data: CT array; indexed as ``[slice, :, :]``.
        seg_data: Segmentation array; indexed the same way as ``ct_data``.
    """
    per_slice_total = seg_data.sum(axis=(1, 2))
    best_slice = per_slice_total.argmax()
    ct_slice = ct_data[best_slice, :, :]
    seg_slice = seg_data[best_slice, :, :]

    print(f)
    _, (plain_ax, overlay_ax) = plt.subplots(1, 2, figsize=(8, 4))
    plain_ax.imshow(ct_slice, cmap='gray')
    # same CT slice again, with the segmentation drawn semi-transparently on top
    overlay_ax.imshow(ct_slice, cmap='gray')
    overlay_ax.imshow(seg_slice, cmap=transparent_cmap('r'), alpha=0.3)

    plain_ax.axis('off')
    overlay_ax.axis('off')
    plt.show()

In [22]:
# Collect, for every training scan, the mean of its label array
# (the labelled-voxel fraction if labels are 0/1 -- TODO confirm).
image_names = os.listdir(train_images)  # list the folder once for both groups
uls_img = [x for x in image_names if x.startswith("ULS")]
ap_img = [x for x in image_names if x.startswith("AutoPET")]

# filenames = random.sample(ap_img, 5) + random.sample(uls_img, 5)
filenames = uls_img + ap_img

# Optional visual spot check: set max_plots above 0 to draw the first scans
# whose label mean exceeds min_plot_fraction. With 0 nothing is plotted and
# no CT volume is read.
max_plots = 0
min_plot_fraction = 5e-4
n = 0  # number of scans plotted so far

volumes = []

for f in filenames:
    seg_path = train_labels / f.replace("_0000.nii.gz", ".nii.gz")

    seg_img = sitk.ReadImage(seg_path)
    seg_data = sitk.GetArrayFromImage(seg_img)

    fraction = seg_data.mean()
    volumes.append(fraction)

    if n < max_plots and fraction > min_plot_fraction:
        print(fraction)
        ct_img = sitk.ReadImage(train_images / f)
        ct_data = sitk.GetArrayFromImage(ct_img)
        plot(f, ct_data, seg_data)
        n += 1


In [None]:
import pandas as pd

# Histogram of the per-scan values in `volumes`; values are clipped to
# [0, 0.001] first, so larger ones all land in the last bin.
clipped_volumes = pd.Series(volumes).clip(0, 0.001)
clipped_volumes.hist(bins=100, figsize=(4, 2))
plt.title("Volume of lesion / scan")