In [1]:
import os
import pydicom
import SimpleITK as sitk
import pandas as pd
import numpy as np
from tqdm import tqdm

Set up the paths for the DICOM images, Ktrans volumes, and findings metadata of the ProstateX dataset, load the findings table, and define the keywords used to match series descriptions to modalities.

In [3]:

# Dataset locations. Each default is the original author's local path; set the
# matching environment variable to run the notebook on another machine without
# editing this cell.
DICOM_ROOT = os.environ.get("PROSTATEX_DICOM_ROOT", r"C:\Users\anude\Downloads\project")
KTRANS_DIR = os.environ.get("PROSTATEX_KTRANS_DIR", r"C:\Users\anude\Downloads\ProstateXKtrains-train-fixed")
# By default the findings table sits inside DICOM_ROOT, so overriding
# DICOM_ROOT alone is enough when the CSV is kept in the same place.
FINDINGS_CSV = os.environ.get(
    "PROSTATEX_FINDINGS_CSV",
    os.path.join(DICOM_ROOT, "ProstateX-Findings-Train.csv"),
)
OUTPUT_DIR = os.environ.get("PROSTATEX_OUTPUT_DIR", r"C:\Users\anude\Downloads\processed_data")
os.makedirs(OUTPUT_DIR, exist_ok=True)

findings_df = pd.read_csv(FINDINGS_CSV)

# ProxID is joined onto DICOM_ROOT and matched against Ktrans file names as a
# substring, so it has to be a string.
findings_df['ProxID'] = findings_df['ProxID'].astype(str)

# Keywords for modality matching: substrings looked up (case-insensitively) in
# each series' SeriesDescription.
modality_keywords = {
    "T2": ["t2", "t2w"],
    "ADC": ["adc"],
    "DWI": ["dwi", "diffusion", "ep2d_diff", "ep2d"]
}


In [4]:

# --- Functions ---
def match_series_description(description, keywords):
    """
    Checks if any keyword is present in the series description.

    The match is a case-insensitive substring test: both the description and
    every keyword are lower-cased before comparison.

    Args:
        description: Series description text. ``None`` is treated as
            "no description" and never matches; other non-string values are
            converted with ``str()``.
        keywords: Iterable of substrings to look for.

    Returns:
        True if at least one keyword occurs in the description, else False
        (including when ``keywords`` is empty).
    """
    if description is None:
        return False
    haystack = str(description).lower()
    # Lower-case the keywords as well so that e.g. "T2" matches "t2_tse_tra".
    return any(str(kw).lower() in haystack for kw in keywords)

def find_series_by_modality(patient_folder, keywords):
    """
    Walks through the patient folder to find a DICOM series matching specified keywords in SeriesDescription.

    Each directory that contains ``.dcm`` files is treated as one candidate
    series; the header of its first file (by name) is read and its
    SeriesDescription is tested against ``keywords``. The first directory that
    matches is loaded and returned, so when several series match, the one
    returned depends on the directory traversal order.
    NOTE(review): confirm that "first match" selects the intended series when
    a patient has more than one series containing the same keyword.

    Args:
        patient_folder: Root directory of one patient's DICOM data.
        keywords: Substrings to look for in SeriesDescription.

    Returns:
        The matching series as a SimpleITK image, or None if no directory
        yields a readable, matching series.
    """
    for root, _dirs, files in os.walk(patient_folder):
        # Sort so the sampled file (and the fallback slice order) is deterministic.
        dcm_files = sorted(f for f in files if f.lower().endswith('.dcm'))
        if not dcm_files:
            continue
        try:
            sample_dcm = pydicom.dcmread(os.path.join(root, dcm_files[0]), stop_before_pixels=True)
            description = getattr(sample_dcm, 'SeriesDescription', None)
            if description is None or not match_series_description(description, keywords):
                continue

            # Ask GDCM for the file list of this series: unlike a raw directory
            # listing, it is ordered for volume reconstruction and restricted
            # to the series the sampled file belongs to.
            series_uid = getattr(sample_dcm, 'SeriesInstanceUID', None)
            series_paths = ()
            if series_uid:
                series_paths = sitk.ImageSeriesReader.GetGDCMSeriesFileNames(root, str(series_uid))
            if not series_paths:
                # Fallback: name-sorted .dcm files in this directory.
                series_paths = [os.path.join(root, f) for f in dcm_files]

            reader = sitk.ImageSeriesReader()
            reader.SetFileNames(series_paths)
            return reader.Execute()
        except Exception as exc:
            # Best-effort scan: report the unreadable directory and keep looking
            # instead of hiding the failure.
            print(f"[WARN] Could not read DICOM series in {root}: {exc}")
            continue
    return None

def load_ktrans_volume(ktrans_dir, proxid):
    """
    Finds and reads the Ktrans ``.mhd`` header belonging to a ProxID.

    The directory tree under ``ktrans_dir`` is searched recursively; the first
    file whose name ends in ``.mhd`` and contains ``proxid`` as a substring is
    read with SimpleITK.

    Returns:
        The SimpleITK image, or None when no such file exists.
    """
    for folder, _subdirs, filenames in os.walk(ktrans_dir):
        # First qualifying file in this directory, if any.
        hit = next(
            (name for name in filenames if name.endswith(".mhd") and proxid in name),
            None,
        )
        if hit is not None:
            return sitk.ReadImage(os.path.join(folder, hit))
    return None


def normalize(vol):
    """
    Min-max rescales a SimpleITK volume and returns it as a NumPy array.

    The voxels are converted to float32, shifted so the minimum becomes 0 and
    divided by (max - min + 1e-5). The small constant avoids a division by
    zero for a constant volume (which maps to all zeros); it also means the
    largest output value is slightly below 1 rather than exactly 1.
    """
    voxels = sitk.GetArrayFromImage(vol).astype(np.float32)
    lowest = voxels.min()
    span = voxels.max() - lowest + 1e-5
    return (voxels - lowest) / span

<h2> Data Preprocessing: </h2>
Processes each patient in the ProstateX dataset by loading the T2, ADC, and Ktrans images, resampling the ADC and Ktrans volumes onto the T2 grid, normalising each volume, and stacking them into a single 4D array of shape [C, D, H, W]. The result is then saved as a .npy file for each patient.

In [11]:
# --- Data Preprocessing ---
# For every ProxID in the findings table: load the T2 and ADC series and the
# Ktrans volume, resample ADC and Ktrans onto the T2 grid, normalise each
# volume and save the stacked result as <OUTPUT_DIR>/<proxid>.npy.
# Patients with a missing folder or modality are skipped; any other failure is
# reported and the loop moves on to the next patient.
processed = 0
for proxid in tqdm(findings_df['ProxID'].unique()):
    patient_folder = os.path.join(DICOM_ROOT, proxid)
    if not os.path.isdir(patient_folder):
        # Report the path that was actually checked.
        print(f"[SKIP] Folder not found: {patient_folder}")
        continue

    try:
        t2 = find_series_by_modality(patient_folder, modality_keywords["T2"])
        adc = find_series_by_modality(patient_folder, modality_keywords["ADC"])
        ktrans = load_ktrans_volume(KTRANS_DIR, proxid)

        # Identity checks rather than `None in [...]`: list membership falls
        # back to `==`, which image objects may overload.
        if any(volume is None for volume in (t2, adc, ktrans)):
            print(f"[WARN] Missing one or more modalities for {proxid}")
            continue

        # Resample adc and ktrans to match t2. Only the reference image is
        # configured, so the filter's default interpolator and default pixel
        # value apply.
        resampler = sitk.ResampleImageFilter()
        resampler.SetReferenceImage(t2)
        adc = resampler.Execute(adc)
        ktrans = resampler.Execute(ktrans)

        t2_arr = normalize(t2)
        adc_arr = normalize(adc)
        ktrans_arr = normalize(ktrans)

        stacked = np.stack([t2_arr, adc_arr, ktrans_arr], axis=0)  # [C, D, H, W]
        np.save(os.path.join(OUTPUT_DIR, f"{proxid}.npy"), stacked)
        processed += 1

    except Exception as e:
        # Best-effort batch run: log the failure and continue with the next patient.
        print(f"[ERROR] {proxid}: {e}")

print(f"\n Finished. Processed {processed} patient volumes.")


100%|██████████| 204/204 [00:54<00:00,  3.75it/s]


 Finished. Processed 204 patient volumes.



