<a href="https://colab.research.google.com/github/sajidcsecu/radioGenomic/blob/main/Data_Preparation_(3D).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# (1) Install Required Libraries

In [None]:
!pip install SimpleITK
!pip install pydicom===2.4.3
!pip install pydicom-seg
!pip install numpy==1.23.5



# (2) Import Required Libraries

In [None]:
import logging
import os
from datetime import datetime

import h5py
import numpy as np
import pandas as pd
import pydicom
import pydicom_seg
import SimpleITK as sitk
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

# basicConfig does nothing when the root logger already has handlers, which is
# the case in hosted notebook runtimes; force=True replaces those handlers so
# the INFO-level progress messages emitted below are actually shown.
logging.basicConfig(level=logging.INFO, force=True)

# (3) Mount Google Drive

In [None]:
# google.colab only exists inside a Colab runtime, so it is imported here, next
# to its single use, instead of in the shared import cell.
from google.colab import drive

drive.mount('/content/drive')

# Define dataset directory
DATASET_DIR = "/content/drive/MyDrive/PhDwork/datasets"
os.makedirs(DATASET_DIR, exist_ok=True)
print(f" Dataset Directory: {DATASET_DIR}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Dataset Directory: /content/drive/MyDrive/PhDwork/datasets


# (4) Define Dataset Class with Optimized Mask Storage

In [None]:
class PreparePatientDataset(Dataset):
    """In-memory collection of per-patient CT volumes and segmentation masks.

    All patients listed in ``patient_ids`` are read from disk while the object
    is being constructed. Each stored item is a ``(ct, mask, patient_id)``
    tuple in which ``ct`` is a float32 array and ``mask`` is a uint8 array
    with the same number of slices along axis 0.
    """

    def __init__(self, patient_ids, metadata_df, train=True):
        """Load every requested patient immediately.

        Args:
            patient_ids: Iterable of values matched against the ``Subject ID``
                column of ``metadata_df``.
            metadata_df: DataFrame providing the ``Subject ID``, ``Modality``
                and ``File Location`` columns.
            train: When truthy, only slices containing mask voxels are kept
                and patients whose mask is empty are dropped.
        """
        self.patient_ids = patient_ids
        self.metadata_df = metadata_df
        self.train = train
        self.volume_data = self.load_volumes()

    def get_path(self, subject, modality):
        """Return the existing on-disk location for ``modality``, else ``None``.

        Args:
            subject: Metadata rows belonging to a single patient.
            modality: Value looked up in the ``Modality`` column (e.g. "CT").

        Returns:
            The first matching ``File Location`` with backslashes converted to
            forward slashes, or ``None`` when no row matches, the location
            cell is empty, or the path does not exist.
        """
        rows = subject[subject['Modality'] == modality]
        if rows.empty:
            return None
        location = rows['File Location'].iloc[0]
        if pd.isna(location):
            # An empty CSV cell is read as NaN; report it as "no path" instead
            # of letting os.path.normpath raise a TypeError.
            return None
        path = os.path.normpath(location).replace("\\", "/").strip()
        return path if os.path.exists(path) else None

    def read_ct_volume(self, path):
        """Read the DICOM series in directory ``path`` (GDCM image IO) as a NumPy array."""
        reader = sitk.ImageSeriesReader()
        reader.SetImageIO("GDCMImageIO")
        dicom_names = reader.GetGDCMSeriesFileNames(path)
        reader.SetFileNames(dicom_names)
        image = reader.Execute()
        return sitk.GetArrayFromImage(image)

    def read_seg_volume(self, path, seg_type="GTV-1"):
        """Read one segment from the DICOM SEG file ``1-1.dcm`` inside ``path``.

        Args:
            path: Directory that contains the ``1-1.dcm`` segmentation object.
            seg_type: ``SegmentDescription`` of the segment to extract.

        Returns:
            The selected segment as a NumPy array.

        Raises:
            ValueError: If no segment has a description equal to ``seg_type``.
        """
        seg_file = os.path.join(path, '1-1.dcm')
        segmentation = pydicom.dcmread(seg_file)
        seg_df = pd.DataFrame({
            f: [s[f].value for s in segmentation.SegmentSequence]
            for f in ['SegmentNumber', 'SegmentDescription']
        })
        match = seg_df.loc[seg_df['SegmentDescription'] == seg_type]
        if match.empty:
            raise ValueError(f"No segment of type {seg_type} found.")
        seg_number = match['SegmentNumber'].iloc[0]
        mask = pydicom_seg.SegmentReader().read(segmentation).segment_image(seg_number)
        return sitk.GetArrayFromImage(mask)

    def _load_patient(self, pid):
        """Read one patient; return ``(ct, mask, pid)`` or ``None`` if the patient is skipped."""
        subject = self.metadata_df[self.metadata_df['Subject ID'] == pid]
        ct_path = self.get_path(subject, "CT")
        seg_path = self.get_path(subject, "SEG")
        if ct_path is None or seg_path is None:
            logging.warning(f"⚠️ Skipping {pid} due to missing CT or SEG path")
            return None

        ct_volume = self.read_ct_volume(ct_path)
        seg_volume = self.read_seg_volume(seg_path)

        ct_slices, seg_slices = ct_volume.shape[0], seg_volume.shape[0]
        min_slices = min(ct_slices, seg_slices)
        if ct_slices != seg_slices:
            # NOTE(review): truncation assumes both volumes start at the same
            # slice and share slice order; nothing here verifies that, so make
            # the mismatch visible instead of truncating silently.
            logging.warning(
                f"⚠️ {pid}: CT has {ct_slices} slices but SEG has {seg_slices}; "
                f"truncating both to the first {min_slices}"
            )
        ct_volume, seg_volume = ct_volume[:min_slices], seg_volume[:min_slices]

        if self.train:
            # Keep only the slices in which the mask has at least one voxel set.
            tumor_slice_index = np.where(np.any(seg_volume, axis=(1, 2)))[0]
            if tumor_slice_index.size == 0:
                logging.info(f"🚫 Skipping {pid} (no tumor)")
                return None
            ct_volume = ct_volume[tumor_slice_index]
            seg_volume = seg_volume[tumor_slice_index]

        # No intensity normalisation is applied; raw CT values are stored.
        record = (ct_volume.astype(np.float32), seg_volume.astype(np.uint8), pid)
        logging.info(f"✅ Loaded {pid} | Volume shape: {ct_volume.shape}")
        return record

    def load_volumes(self):
        """Load every patient in ``self.patient_ids``.

        Returns:
            List of ``(ct, mask, pid)`` tuples for the patients that could be
            read. Patients that are skipped or fail to load are logged and
            left out.
        """
        all_data = []
        for pid in self.patient_ids:
            try:
                record = self._load_patient(pid)
            except Exception as e:
                # Best effort: one unreadable patient must not abort the run.
                logging.warning(f"⚠️ Failed to load {pid}: {e}")
                continue
            if record is not None:
                all_data.append(record)

        return all_data

    def __getitem__(self, idx):
        """Return the ``(ct, mask, pid)`` tuple stored at position ``idx``."""
        return self.volume_data[idx]

    def __len__(self):
        """Return the number of patients that were loaded successfully."""
        return len(self.volume_data)

    def save_hdf5(self, output_path):
        """Write all loaded patients to ``output_path``, replacing any existing file.

        Each patient becomes a group named after its id, holding gzip-compressed
        ``ct`` and ``mask`` datasets and a ``shape`` attribute with the CT shape.
        """
        with h5py.File(output_path, "w") as h5f:
            for ct, mask, pid in self.volume_data:
                group = h5f.create_group(f"{pid}")
                group.create_dataset("ct", data=ct, compression="gzip")
                group.create_dataset("mask", data=mask, compression="gzip")
                group.attrs["shape"] = ct.shape
                logging.info(f"💾 Saved patient {pid} | Shape: {ct.shape}")


# (5) Merge Multiple HDF5 Files

In [None]:
def merge_hdf5_files(input_files, output_file):
    """Combine several per-patient HDF5 files into one file.

    Every top-level group of every input is copied to ``output_file`` as a
    group of the same name holding gzip-compressed ``ct`` and ``mask``
    datasets and a ``shape`` attribute.

    Args:
        input_files: Iterable of paths to the HDF5 files to merge.
        output_file: Path of the merged file; an existing file is replaced.

    Raises:
        ValueError: If ``output_file`` is also one of ``input_files`` (opening
            it for writing would erase it before it is read), or if the same
            patient id appears in more than one input.
    """
    input_files = list(input_files)
    target = os.path.abspath(output_file)
    if any(os.path.abspath(path) == target for path in input_files):
        raise ValueError(f"Output file {output_file} is also listed as an input file")

    with h5py.File(output_file, "w") as f_out:
        for file in input_files:
            with h5py.File(file, "r") as f:
                for pid in f.keys():
                    if pid in f_out:
                        raise ValueError(
                            f"Patient {pid} from {file} is already present in {output_file}"
                        )
                    source = f[pid]
                    ct = source["ct"][:]
                    grp = f_out.create_group(pid)
                    grp.create_dataset("ct", data=ct, compression="gzip")
                    grp.create_dataset("mask", data=source["mask"][:], compression="gzip")
                    # Fall back to the CT shape when the input has no "shape" attribute.
                    grp.attrs["shape"] = source.attrs.get("shape", ct.shape)
                    logging.info(f"🧬 Merged patient {pid} | Shape: {ct.shape}")

    logging.info(f"\n✅ Merged HDF5 saved: {output_file}")

# (6) Process & Save Train, Validation, and Test Datasets in HDF5 Format

In [None]:
def _build_and_save_split(patient_ids, metadata_df, output_path, train):
    """Build one dataset split, write it to ``output_path`` and time the work.

    Args:
        patient_ids: Patients that belong to this split.
        metadata_df: Metadata table passed on to ``PreparePatientDataset``.
        output_path: Destination HDF5 file.
        train: Forwarded to ``PreparePatientDataset`` as its ``train`` flag.

    Returns:
        ``(dataset, duration)`` where ``duration`` is the elapsed time for
        loading and saving as a ``datetime.timedelta``.
    """
    started = datetime.now()
    dataset = PreparePatientDataset(patient_ids, metadata_df, train=train)
    dataset.save_hdf5(output_path)
    return dataset, datetime.now() - started


if __name__ == "__main__":
    # Define the target directory
    target_dir = "/content/drive/MyDrive/PhDwork"
    # Change to the target directory
    # NOTE(review): presumably the metadata "File Location" values are relative
    # to this directory — confirm against the CSV.
    os.chdir(target_dir)
    # Verify the change
    print("Current Directory:", os.getcwd())

    # Metadata
    metadata_lung1 = pd.read_csv(os.path.join(target_dir, "metadata", "metadata_lung1.csv"))
    patient_list_lung1 = metadata_lung1["Subject ID"].unique().tolist()

    # Hold out 10% for validation, then 10% of the remainder for testing.
    train_patient, valid_patient = train_test_split(patient_list_lung1, test_size=0.1, random_state=42)
    train_patient, test_patient = train_test_split(train_patient, test_size=0.1, random_state=42)
    print(f"Patients: Train={len(train_patient)}, Valid={len(valid_patient)}, Test={len(test_patient)}")

    # Define DATASET_DIR
    DATASET_DIR = os.path.join(target_dir, "datasets/allPatients")  # Create a datasets subfolder
    os.makedirs(DATASET_DIR, exist_ok=True)  # Create the directory if it doesn't exist

    # Create Train Dataset (train=True: tumour slices only)
    print("Loading Training Data...")
    train_path = os.path.join(DATASET_DIR, "train_dataset.hdf5")
    train_dataset, duration_train = _build_and_save_split(
        train_patient, metadata_lung1, train_path, train=True
    )
    print(f"Training dataset creation time: {duration_train}")

    # Create Valid Dataset (train=False: all slices kept)
    print("Loading valid Data...")
    valid_path = os.path.join(DATASET_DIR, "valid_dataset.hdf5")
    valid_dataset, duration_valid = _build_and_save_split(
        valid_patient, metadata_lung1, valid_path, train=False
    )
    print(f"Validation dataset creation time: {duration_valid}")

    # Create Test Dataset (train=False: all slices kept)
    print("Loading test Data...")
    test_path = os.path.join(DATASET_DIR, "test_dataset.hdf5")
    test_dataset, duration_test = _build_and_save_split(
        test_patient, metadata_lung1, test_path, train=False
    )
    print(f"Test dataset creation time: {duration_test}")

Current Directory: /content/drive/MyDrive/PhDwork
Patients: Train=1, Valid=1, Test=1
Loading Training Data...
✅ HDF5 file saved at /content/drive/MyDrive/PhDwork/datasets/allPatients/train_dataset.hdf5
Training dataset creation time: 0:05:17.565594
Loading valid Data...
✅ HDF5 file saved at /content/drive/MyDrive/PhDwork/datasets/allPatients/valid_dataset.hdf5
Validation dataset creation time: 0:08:05.516331
Loading test Data...
✅ HDF5 file saved at /content/drive/MyDrive/PhDwork/datasets/allPatients/test_dataset.hdf5
Test dataset creation time: 0:05:30.574955
