<a href="https://colab.research.google.com/github/sajidcsecu/radioGenomic/blob/main/Data_Preparation_(final).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# (1) Install Required Libraries

In [11]:
!pip install SimpleITK
!pip install pydicom===2.4.3
!pip install pydicom-seg
!pip install numpy==1.23.5



# (2) Import Required Libraries

In [12]:
import os
import numpy as np
import pandas as pd
import pydicom
import pydicom_seg
import SimpleITK as sitk
import torch
from torch.utils.data import Dataset
import h5py
from sklearn.model_selection import train_test_split
from datetime import datetime
from google.colab import drive
from multiprocessing import Pool

# (3) Mount Google Drive

In [13]:
drive.mount('/content/drive')

# Define dataset directory
DATASET_DIR = "/content/drive/MyDrive/PhDwork/datasets"
os.makedirs(DATASET_DIR, exist_ok=True)
print(f" Dataset Directory: {DATASET_DIR}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Dataset Directory: /content/drive/MyDrive/PhDwork/datasets


# (4) Define Dataset Class with Optimized Mask Storage

In [14]:
class PreparePatientDataset(Dataset):
    def __init__(self, patients, metadata, train=True):
        self.patients = patients
        self.metadata = metadata
        self.train = train
        self.slices = self._extract_slices()

    def get_path(self, subject, modality):
        subject_filtered = subject[subject['Modality'] == modality]
        if subject_filtered.empty:
            return None

        path = os.path.normpath(subject_filtered['File Location'].iloc[0])
        path = path.replace("\\", "/")
        path = path.strip()

        if not os.path.exists(path):
            print(f"❌ WARNING: Path does not exist: {path}")

        return path if os.path.exists(path) else None

    def get_num_slices(self, path, modality):
        path = os.path.normpath(path).replace("\\", "/")

        if modality == "CT":
            if not os.path.exists(path):
                print(f"❌ WARNING: CT folder not found: {path}")
                return 0
            if not os.path.isdir(path):
                print(f"❌ ERROR: Expected a folder but found a file: {path}")
                return 0
            return len([f for f in os.listdir(path) if f.endswith('.dcm')])

        elif modality == "SEG":
            seg_file = os.path.join(path, '1-1.dcm')
            if not os.path.exists(seg_file):
                print(f"❌ WARNING: SEG file not found: {seg_file}")
                return 0
            try:
                segmentation = pydicom.dcmread(seg_file)
                mask_data = pydicom_seg.SegmentReader().read(segmentation)
                return mask_data.segment_image(1).GetDepth()
            except Exception as e:
                print(f"❌ ERROR reading segmentation {seg_file}: {e}")
                return 0

    def has_tumor(self, seg_path):
        """
        Checks if a segmentation mask contains tumor regions.
        Args:
            seg_path (str): Path to the segmentation DICOM file.
        Returns:
            np.ndarray: Boolean array where True indicates a non-empty slice.
        """
        try:
            mask = self.read_seg_array(seg_path, slice_idx=None, seg_type="GTV-1")  # Load full mask
            return np.any(mask, axis=(1, 2))  # Check per-slice if any pixel is nonzero (tumor present)
        except Exception as e:
            print(f"❌ ERROR in has_tumor(): {e}")
            return np.zeros(1, dtype=bool)  # Default to no tumor if error occurs

    def _extract_slices(self):
        slices = []
        patient_paths = {p: self.metadata[self.metadata['Subject ID'] == p] for p in self.patients}

        for patient, subject in patient_paths.items():
            img_path = self.get_path(subject, "CT")
            msk_path = self.get_path(subject, "SEG")

            if img_path and msk_path:
                min_slices = min(self.get_num_slices(img_path, "CT"), self.get_num_slices(msk_path, "SEG"))
                slice_indices = np.arange(min_slices) if not self.train else np.where(self.has_tumor(msk_path))[0]
                slices.extend(zip([img_path] * len(slice_indices), [msk_path] * len(slice_indices), slice_indices))

        return slices

    def read_ct_array(self, path, slice_idx):
        reader = sitk.ImageSeriesReader()
        reader.SetImageIO("GDCMImageIO")
        path = os.path.normpath(path).replace("\\", "/")
        dicom_names = reader.GetGDCMSeriesFileNames(path)
        reader.SetFileNames(dicom_names)
        image = reader.Execute()
        return sitk.GetArrayFromImage(image)[slice_idx].astype(np.float32)

    def read_seg_array(self, path, slice_idx, seg_type="GTV-1"):
        path = os.path.normpath(path).replace("\\", "/")
        try:
            segmentation = pydicom.dcmread(os.path.join(path, '1-1.dcm'))
            seg_df = pd.DataFrame({f: [s[f].value for s in segmentation.SegmentSequence] for f in ['SegmentNumber', 'SegmentDescription']})
            seg_number = seg_df.loc[seg_df['SegmentDescription'] == seg_type, 'SegmentNumber'].iloc[0]
            mask = pydicom_seg.SegmentReader().read(segmentation).segment_image(seg_number)
            return sitk.GetArrayFromImage(mask)[slice_idx].astype(np.uint8)  # Use uint8 to save space
        except Exception as e:
            print(f"Error reading segmentation from {path}: {e}")
            return None

    def __getitem__(self, idx):
        img_path, mask_path, slice_idx = self.slices[idx]
        img = self.read_ct_array(img_path, slice_idx)
        msk = self.read_seg_array(mask_path, slice_idx, "GTV-1")

        if img is None or msk is None:
            img = torch.zeros((512, 512), dtype=torch.float16)
            msk = torch.zeros((512, 512), dtype=torch.uint8)
            return img.unsqueeze(0), msk.unsqueeze(0)

        img = (img - img.min()) / max(img.max(), 1e-6)
        img = torch.tensor(img, dtype=torch.float32 ).unsqueeze(0) # add channel dim
        msk = torch.tensor(msk, dtype=torch.uint8).unsqueeze(0) # add channel dim

        return img, msk
    def __len__(self):
            return len(self.slices)

    @staticmethod
    def process_slice(slice_data):
        img, msk = slice_data
        img_np = img.squeeze(0).numpy()
        msk_np = msk.squeeze(0).numpy().astype(np.uint8)
        return img_np, msk_np

    def save_hdf5(self, output_file):
        slice_data = [self[i] for i in range(len(self))]

        with Pool() as pool:
            results = pool.map(PreparePatientDataset.process_slice, slice_data) #changed to class.process_slice

        images, masks = zip(*results)
        images = np.stack(images)
        masks = np.stack(masks)

        with h5py.File(output_file, "w") as f:
            f.create_dataset("images", data=images, compression="gzip", compression_opts=9)
            f.create_dataset("masks", data=masks, compression="gzip", compression_opts=9)

        print(f"✅ HDF5 file saved at {output_file}")

# (5) PROCESS & SAVE TRAIN DATASET in HDF5 format

In [15]:
if __name__ == "__main__":
    # Define the target directory
    target_dir = "/content/drive/MyDrive/PhDwork"
    # Change to the target directory
    os.chdir(target_dir)
    # Verify the change
    print("Current Directory:", os.getcwd())

    # Metadata
    metadata_lung1 = pd.read_csv("/content/drive/MyDrive/PhDwork/metadata/metadata_lung1.csv")
    patient_list_lung1 = metadata_lung1["Subject ID"].unique().tolist()

    train_patient, valid_patient = train_test_split(patient_list_lung1, test_size=0.1, random_state=42)
    train_patient, test_patient = train_test_split(train_patient, test_size=0.1, random_state=42)
    print(f"Patients: Train={len(train_patient)}, Valid={len(valid_patient)}, Test={len(test_patient)}")

    # Define DATASET_DIR
    DATASET_DIR = os.path.join(target_dir, "datasets/allPatients")  # Create a datasets subfolder
    os.makedirs(DATASET_DIR, exist_ok=True)  # Create the directory if it doesn't exist

    # Create Train Dataset
    print("Loading Training Data...")
    start_time_train = datetime.now()
    train_dataset = PreparePatientDataset(train_patient, metadata_lung1, train=True)  # removed device argument
    train_path = os.path.join(DATASET_DIR, "train_dataset.hdf5")
    train_dataset.save_hdf5(train_path)
    end_time_train = datetime.now()
    duration_train = end_time_train - start_time_train
    print(f"Training dataset creation time: {duration_train}")

    # Create Valid Dataset
    print("Loading valid Data...")
    start_time_valid = datetime.now()
    valid_dataset = PreparePatientDataset(valid_patient, metadata_lung1, train=False) #removed device argument
    valid_path = os.path.join(DATASET_DIR, "valid_dataset.hdf5")
    valid_dataset.save_hdf5(valid_path)
    end_time_valid = datetime.now()
    duration_valid = end_time_valid - start_time_valid
    print(f"Validation dataset creation time: {duration_valid}")

    # Create Test Dataset
    print("Loading test Data...")
    start_time_test = datetime.now()
    test_dataset = PreparePatientDataset(test_patient, metadata_lung1, train=False) #removed device argument
    test_path = os.path.join(DATASET_DIR, "test_dataset.hdf5")
    test_dataset.save_hdf5(test_path)
    end_time_test = datetime.now()
    duration_test = end_time_test - start_time_test
    print(f"Test dataset creation time: {duration_test}")

Current Directory: /content/drive/MyDrive/PhDwork
Patients: Train=1, Valid=1, Test=1
Loading Training Data...
✅ HDF5 file saved at /content/drive/MyDrive/PhDwork/datasets/allPatients/train_dataset.hdf5
Training dataset creation time: 0:05:17.565594
Loading valid Data...
✅ HDF5 file saved at /content/drive/MyDrive/PhDwork/datasets/allPatients/valid_dataset.hdf5
Validation dataset creation time: 0:08:05.516331
Loading test Data...
✅ HDF5 file saved at /content/drive/MyDrive/PhDwork/datasets/allPatients/test_dataset.hdf5
Test dataset creation time: 0:05:30.574955


# (5) Load and Save Train Dataset

In [None]:
def load_dataset_hdf5(filename):
    """
    Load images and masks from an HDF5 file.

    Args:
        filename (str): Path to the HDF5 file.

    Returns:
        images (torch.Tensor): Tensor of images.
        masks (torch.Tensor): Tensor of masks.
    """
    load_path = os.path.join(DATASET_DIR, filename)

    with h5py.File(load_path, 'r') as f:
        images = f["images"][:]  # Load images
        masks = f["masks"][:]    # Load masks

    # Convert to PyTorch tensors
    images = torch.tensor(images, dtype=torch.float16)
    masks = torch.tensor(masks, dtype=torch.uint8)

    print(f"✅ Loaded {filename} from {load_path} - Shape: {images.shape}, {masks.shape}")

    return images, masks

images, masks = load_dataset_hdf5("train_dataset.hdf5")

# Check the shape
print("Images shape:", images.shape)  # (N, 1, H, W)
print("Masks shape:", masks.shape)    # (N, 1, H, W)

# (6) Display Sample Slices

In [None]:
def display_sample_shape(dataset):
    idx = np.random.randint(0, len(dataset))
    img, mask = dataset[idx]

    print(f"Image Shape: {img.shape}")  # Should be (1, H, W)
    print(f"Mask Shape: {mask.shape}")  # Should be (1, H, W)

display_sample_shape(train_dataset)
