<a href="https://colab.research.google.com/github/sajidcsecu/radioGenomic/blob/main/Data_Preparation_(final).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# (1) Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# (2) Import Required Libraries

In [2]:
!pip install SimpleITK
!pip install pydicom===2.4.3
!pip install pydicom-seg
!pip install numpy==1.23.5

Collecting SimpleITK
  Downloading SimpleITK-2.4.1-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Downloading SimpleITK-2.4.1-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (52.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SimpleITK
Successfully installed SimpleITK-2.4.1
Collecting pydicom===2.4.3
  Downloading pydicom-2.4.3-py3-none-any.whl.metadata (7.8 kB)
Downloading pydicom-2.4.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.4.3
Collecting pydicom-seg
  Downloading pydicom_seg-0.4.1-py3-none-any.whl.metadata (5.2 kB)
Collecting jsonschema<4.0.0,>=3.2.0 (from pydicom-seg)
  Downloading jsonschema-3.2.0-py2.py3-none-any.whl.metadata (7.8 kB)
Collecting numpy

In [3]:
import torch
import numpy as np
import pandas as pd
import os
import SimpleITK as sitk
import pydicom
import pydicom_seg
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Define Paths

In [21]:
load_dir = "/content/drive/MyDrive/PhDwork"
save_dir = "/content/drive/MyDrive/PhDwork/datasets"
os.makedirs(save_dir, exist_ok=True)

# (3) Define Dataset Class with Optimized Mask Storage

In [27]:
class PatientDataset2DUNet(Dataset):
    def __init__(self, patients, metadata, train=True):
        """
        Args:
            patients (list): List of patient IDs.
            metadata (DataFrame): Metadata containing patient information.
            train (bool): If True, filters out empty slices.
        """
        self.patients = patients
        self.metadata = metadata
        self.train = train
        self.slices = self._extract_slices()  # Store (img_path, mask_path, slice_idx) tuples

    def get_path(self, subject, modality):
        subject_filtered = subject[subject['Modality'] == modality]
        return subject_filtered['File Location'].iloc[0] if not subject_filtered.empty else None

    def _extract_slices(self):
        slices = []
        patient_paths = {p: self.metadata[self.metadata['Subject ID'] == p] for p in self.patients}

        for patient, subject in patient_paths.items():
            img_path = self.get_path(subject, "CT")
            msk_path = self.get_path(subject, "SEG")

            if img_path and msk_path:
                min_slices = min(self.get_num_slices(img_path), self.get_num_slices(msk_path))
                slice_indices = np.arange(min_slices) if not self.train else np.where(self.has_tumor(msk_path))[0]
                slices.extend(zip([img_path] * len(slice_indices), [msk_path] * len(slice_indices), slice_indices))
        return slices

    def read_ct_array(self, path, slice_idx):
        reader = sitk.ImageSeriesReader()
        reader.SetImageIO("GDCMImageIO")
        path = os.path.normpath(path)
        path = path.replace("\\", "/")
        dicom_names = reader.GetGDCMSeriesFileNames(path)
        reader.SetFileNames(dicom_names)
        image = reader.Execute()
        return sitk.GetArrayFromImage(image)[slice_idx].astype(np.float32)

    def read_seg_array(self, path, slice_idx, seg_type="GTV-1"):
        path = os.path.normpath(path)
        path = path.replace("\\", "/")
        try:
            segmentation = pydicom.dcmread(os.path.join(path, '1-1.dcm'))
            seg_df = pd.DataFrame({f: [s[f].value for s in segmentation.SegmentSequence] for f in ['SegmentNumber', 'SegmentDescription']})
            seg_number = seg_df.loc[seg_df['SegmentDescription'] == seg_type, 'SegmentNumber'].iloc[0]
            mask = pydicom_seg.SegmentReader().read(segmentation).segment_image(seg_number)
            return sitk.GetArrayFromImage(mask)[slice_idx].astype(np.uint8)  # Use uint8 to save space
        except Exception as e:
            print(f"Error reading segmentation from {path}: {e}")
            return None

    def __getitem__(self, idx):
        img_path, mask_path, slice_idx = self.slices[idx]

        # Read one slice at a time
        img = self.read_ct_array(img_path, slice_idx)
        msk = self.read_seg_array(mask_path, slice_idx, "GTV-1")

        if img is None or msk is None:
            return torch.zeros(1, 512, 512), torch.zeros(1, 512, 512)

        # Normalize image
        img = (img - img.min()) / max(img.max(), 1e-6)  # Avoid divide-by-zero

        # Convert to tensors
        img = torch.tensor(img, dtype=torch.float16).unsqueeze(0)  # Convert to float16
        msk = torch.tensor(msk, dtype=torch.uint8).unsqueeze(0)  # Use uint8 for masks

        return img, msk

    def __len__(self):
        return len(self.slices)

/content/drive/MyDrive/PhDwork
Patients: Train=1, Valid=1, Test=1
Loading Training Data...
Processing Patient: LUNG1-002


# (4) PROCESS & SAVE TRAIN DATASET (.npz)

In [None]:
def save_dataset_npz(dataset, filename):
    save_path = os.path.join(DATASET_DIR, filename)
    images, masks = [], []

    for img, mask in dataset:
        images.append(img.numpy())  # Already float16
        masks.append(mask.numpy())  # Already uint8

    np.savez_compressed(save_path, images=np.stack(images), masks=np.stack(masks))
    print(f"✅ Saved {filename} at {save_path}")

# (5) Load and Save Train Dataset

In [26]:
metadata_lung1 = pd.read_csv("/content/drive/MyDrive/PhDwork/metadata/metadata_lung1.csv")

patient_list_lung1 = metadata_lung1["Subject ID"].unique().tolist()
train_patient, valid_patient = train_test_split(patient_list_lung1, test_size=0.1, random_state=42)
train_patient, test_patient = train_test_split(train_patient, test_size=0.1, random_state=42)

print(f"Patients: Train={len(train_patient)}, Valid={len(valid_patient)}, Test={len(test_patient)}")

# Create Train Dataset
print("Loading Training Data...")
train_dataset = PatientDataset2DUNet(train_patient, metadata_lung1, train=True)

# Save Train Dataset as .npz
save_dataset_npz(train_dataset, "train_dataset.npz")


Loading Training Data...
Processing Patient: LUNG1-002


AttributeError: module 'numpy' has no attribute 'float8'

# (6) Display Sample Slices

In [24]:
def display_sample_shape(dataset):
    idx = np.random.randint(0, len(dataset))
    img, mask = dataset[idx]

    print(f"Image Shape: {img.shape}")  # Should be (1, H, W)
    print(f"Mask Shape: {mask.shape}")  # Should be (1, H, W)

display_sample_shape(train_dataset)


Image Shape: torch.Size([1, 512, 512])
Mask Shape: torch.Size([1, 512, 512])
