<a href="https://colab.research.google.com/github/sajidcsecu/radioGenomic/blob/main/Data_Preparation_(final).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# (1) Mount Google Drive

In [7]:
from google.colab import drive
import os

drive.mount('/content/drive')

# Define dataset directory
DATASET_DIR = "/content/drive/MyDrive/PhDwork/datasets"
os.makedirs(DATASET_DIR, exist_ok=True)
print(f" Dataset Directory: {DATASET_DIR}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Dataset Directory: /content/drive/MyDrive/PhDwork/datasets


# (2) Import Required Libraries

In [2]:
!pip install SimpleITK
!pip install pydicom===2.4.3
!pip install pydicom-seg
!pip install numpy==1.23.5

Collecting SimpleITK
  Downloading SimpleITK-2.4.1-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Downloading SimpleITK-2.4.1-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (52.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SimpleITK
Successfully installed SimpleITK-2.4.1
Collecting pydicom===2.4.3
  Downloading pydicom-2.4.3-py3-none-any.whl.metadata (7.8 kB)
Downloading pydicom-2.4.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.4.3
Collecting pydicom-seg
  Downloading pydicom_seg-0.4.1-py3-none-any.whl.metadata (5.2 kB)
Collecting jsonschema<4.0.0,>=3.2.0 (from pydicom-seg)
  Downloading jsonschema-3.2.0-py2.py3-none-any.whl.metadata (7.8 kB)
Collecting numpy

In [1]:
import os
import torch
import numpy as np
import pandas as pd
import h5py
import pydicom
import pydicom_seg
import SimpleITK as sitk
from sklearn.model_selection import train_test_split

# (2) Define Dataset Class with Optimized Mask Storage

In [3]:
class PreparePatientDataset:
    def __init__(self, patients, metadata, train=True):
        """
        Args:
            patients (list): List of patient IDs.
            metadata (DataFrame): Metadata containing patient information.
            train (bool): If True, filters out empty slices.
        """
        self.patients = patients
        self.metadata = metadata
        self.train = train
        self.slices = self._extract_slices()  # Store (img_path, mask_path, slice_idx) tuples

    def get_path(self, subject, modality):
        subject_filtered = subject[subject['Modality'] == modality]
        if subject_filtered.empty:
            return None

        path = os.path.normpath(subject_filtered['File Location'].iloc[0])
        path = path.replace("\\", "/")  # ✅ Convert Windows backslashes to Unix forward slashes
        path = path.strip()  # ✅ Remove any accidental whitespace or newlines

        if not os.path.exists(path):
            print(f"❌ WARNING: Path does not exist: {path}")

        return path if os.path.exists(path) else None

    def get_num_slices(self, path, modality):
        path = os.path.normpath(path).replace("\\", "/")

        if modality == "CT":
            if not os.path.exists(path):
                print(f"❌ WARNING: CT folder not found: {path}")
                return 0
            if not os.path.isdir(path):  # ✅ Check if it's actually a directory
                print(f"❌ ERROR: Expected a folder but found a file: {path}")
                return 0
            return len([f for f in os.listdir(path) if f.endswith('.dcm')])

        elif modality == "SEG":
            seg_file = os.path.join(path, '1-1.dcm')
            if not os.path.exists(seg_file):
                print(f"❌ WARNING: SEG file not found: {seg_file}")
                return 0
            try:
                segmentation = pydicom.dcmread(seg_file)
                mask_data = pydicom_seg.SegmentReader().read(segmentation)
                return mask_data.segment_image(1).GetDepth()  # Number of slices
            except Exception as e:
                print(f"❌ ERROR reading segmentation {seg_file}: {e}")
                return 0

    def has_tumor(self, seg_path):
        """
        Checks if a segmentation mask contains tumor regions.
        Args:
            seg_path (str): Path to the segmentation DICOM file.
        Returns:
            np.ndarray: Boolean array where True indicates a non-empty slice.
        """
        try:
            mask = self.read_seg_array(seg_path, slice_idx=None, seg_type="GTV-1")  # Load full mask
            return np.any(mask, axis=(1, 2))  # Check per-slice if any pixel is nonzero (tumor present)
        except Exception as e:
            print(f"❌ ERROR in has_tumor(): {e}")
            return np.zeros(1, dtype=bool)  # Default to no tumor if error occurs

    def _extract_slices(self):
        slices = []
        patient_paths = {p: self.metadata[self.metadata['Subject ID'] == p] for p in self.patients}

        for patient, subject in patient_paths.items():
            img_path = self.get_path(subject, "CT")
            msk_path = self.get_path(subject, "SEG")

            if img_path and msk_path:
                min_slices = min(self.get_num_slices(img_path, "CT"), self.get_num_slices(msk_path, "SEG"))
                slice_indices = np.arange(min_slices) if not self.train else np.where(self.has_tumor(msk_path))[0]
                slices.extend(zip([img_path] * len(slice_indices), [msk_path] * len(slice_indices), slice_indices))

        return slices

    def read_ct_array(self, path, slice_idx):
        reader = sitk.ImageSeriesReader()
        reader.SetImageIO("GDCMImageIO")
        path = os.path.normpath(path).replace("\\", "/")
        dicom_names = reader.GetGDCMSeriesFileNames(path)
        reader.SetFileNames(dicom_names)
        image = reader.Execute()
        return sitk.GetArrayFromImage(image)[slice_idx].astype(np.float32)

    def read_seg_array(self, path, slice_idx, seg_type="GTV-1"):
        path = os.path.normpath(path).replace("\\", "/")
        try:
            segmentation = pydicom.dcmread(os.path.join(path, '1-1.dcm'))
            seg_df = pd.DataFrame({f: [s[f].value for s in segmentation.SegmentSequence] for f in ['SegmentNumber', 'SegmentDescription']})
            seg_number = seg_df.loc[seg_df['SegmentDescription'] == seg_type, 'SegmentNumber'].iloc[0]
            mask = pydicom_seg.SegmentReader().read(segmentation).segment_image(seg_number)
            return sitk.GetArrayFromImage(mask)[slice_idx].astype(np.uint8)  # Use uint8 to save space
        except Exception as e:
            print(f"Error reading segmentation from {path}: {e}")
            return None

    def __getitem__(self, idx):
        img_path, mask_path, slice_idx = self.slices[idx]

        # Read one slice at a time
        img = self.read_ct_array(img_path, slice_idx)
        msk = self.read_seg_array(mask_path, slice_idx, "GTV-1")

        if img is None or msk is None:
            return np.zeros((512, 512), dtype=np.float16), np.zeros((512, 512), dtype=np.uint8)

        # Normalize image
        img = (img - img.min()) / max(img.max(), 1e-6)  # Avoid divide-by-zero

         # Convert to tensors
        img = torch.tensor(img, dtype=torch.float16).unsqueeze(0)  # Convert to float16
        msk = torch.tensor(msk, dtype=torch.uint8).unsqueeze(0)  # Use uint8 for masks

        return img, msk


    def save_hdf5(self, output_file):
        """
        Saves dataset as HDF5.
        """
        images, masks = [], []

        for i in range(len(self)):
            img, msk = self[i]  # Uses __getitem__()
            images.append(img)
            masks.append(msk)

        images, masks = np.stack(images), np.stack(masks)

        with h5py.File(output_file, "w") as f:
            f.create_dataset("images", data=images, compression="gzip", compression_opts=9)
            f.create_dataset("masks", data=masks, compression="gzip", compression_opts=9)

        print(f"✅ HDF5 file saved at {output_file}")


    def __len__(self):
        return len(self.slices)

# (4) PROCESS & SAVE TRAIN DATASET (.npz)

In [4]:
if __name__ == "__main__":

  # Define the target directory
  target_dir = "/content/drive/MyDrive/PhDwork"
  # Change to the target directory
  os.chdir(target_dir)
  # Verify the change
  print("Current Directory:", os.getcwd())
  # Metadata
  metadata_lung1 = pd.read_csv("/content/drive/MyDrive/PhDwork/metadata/metadata_lung1.csv")
  patient_list_lung1 = metadata_lung1["Subject ID"].unique().tolist()[:3]

  train_patient, valid_patient = train_test_split(patient_list_lung1, test_size=0.1, random_state=42)
  train_patient, test_patient = train_test_split(train_patient, test_size=0.1, random_state=42)
  print(f"Patients: Train={len(train_patient)}, Valid={len(valid_patient)}, Test={len(test_patient)}")

  # Create Train Dataset
  print("Loading Training Data...")
  train_dataset = PreparePatientDataset(train_patient, metadata_lung1, train=True)
  train_path = os.path.join(DATASET_DIR, "train_dataset.hdf5")
  train_dataset.save_hdf5(train_path)

   # Create Valid Dataset
  print("Loading valid Data...")
  valid_dataset = PreparePatientDataset(valid_patient, metadata_lung1, train=False)
  valid_path = os.path.join(DATASET_DIR, "valid_dataset.hdf5")
  valid_dataset.save_hdf5(valid_path)

  # Create Test Dataset
  print("Loading test Data...")
  test_dataset = PreparePatientDataset(test_patient, metadata_lung1, train=False)
  test_path = os.path.join(DATASET_DIR, "test_dataset.hdf5")
  test_dataset.save_hdf5(test_path)


# (5) Load and Save Train Dataset

In [12]:
def load_dataset_hdf5(filename):
    """
    Load images and masks from an HDF5 file.

    Args:
        filename (str): Path to the HDF5 file.

    Returns:
        images (torch.Tensor): Tensor of images.
        masks (torch.Tensor): Tensor of masks.
    """
    load_path = os.path.join(DATASET_DIR, filename)

    with h5py.File(load_path, 'r') as f:
        images = f["images"][:]  # Load images
        masks = f["masks"][:]    # Load masks

    # Convert to PyTorch tensors
    images = torch.tensor(images, dtype=torch.float16).unsqueeze(1)
    masks = torch.tensor(masks, dtype=torch.uint8).unsqueeze(1)

    print(f"✅ Loaded {filename} from {load_path} - Shape: {images.shape}, {masks.shape}")

    return images, masks

images, masks = load_dataset_hdf5("train_dataset.hdf5")

# Check the shape
print("Images shape:", images.shape)  # (N, 1, H, W)
print("Masks shape:", masks.shape)    # (N, 1, H, W)

✅ Loaded train_dataset.hdf5 from /content/drive/MyDrive/PhDwork/datasets/train_dataset.hdf5 - Shape: torch.Size([100, 1, 1, 512, 512]), torch.Size([100, 1, 1, 512, 512])
Images shape: torch.Size([100, 1, 1, 512, 512])
Masks shape: torch.Size([100, 1, 1, 512, 512])


# (6) Display Sample Slices

In [13]:
def display_sample_shape(dataset):
    idx = np.random.randint(0, len(dataset))
    img, mask = dataset[idx]

    print(f"Image Shape: {img.shape}")  # Should be (1, H, W)
    print(f"Mask Shape: {mask.shape}")  # Should be (1, H, W)

display_sample_shape(train_dataset)


Image Shape: torch.Size([1, 512, 512])
Mask Shape: torch.Size([1, 512, 512])
