## Steps

1. **Data Splitting**
   - Split the dataset into training, testing, and validation sets with a ratio of 60:20:20.

2. **Data Filtering**
   - Keep only the 2D slices whose lesion mask covers at least 0.1% of the slice's pixels, together with their corresponding image slices.

3. **Normalization**
   - Apply Z-score normalization to the data.

4. **Data Transformation**
   - Convert 3D data into 2D slices.

5. **Cropping**
   - Crop each slice to the region `[10:190, 40:220]` (180 x 180) and resize it to 192 x 192.

6. **Data Format Conversion**
   - Convert data from .nii.gz to .npy format.

In [8]:
# Importing necessary modules
import os
import re
import glob
import cv2
import shutil
import numpy as np
import nibabel as nib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [9]:
# Data path: root directory of the dataset. It is searched recursively for the
# *_T1w.nii.gz volumes and their lesion masks, and the train/, test/ and val/
# output folders are created underneath it.
# NOTE(review): "/path/" looks like a placeholder - set it to the real dataset root.
data_folder = "/path/"

In [10]:
# Image and mask files, collected recursively below data_folder and sorted.
# NOTE(review): assumes every T1w volume has exactly one lesion mask, so that
# entry i of image_files pairs with entry i of mask_files - confirm for your data.
image_files = sorted(glob.glob(os.path.join(data_folder, "**/*_T1w.nii.gz"), recursive=True))
mask_files = sorted(glob.glob(os.path.join(data_folder, "**/*_label-L_desc-T1lesion_mask.nii.gz"), recursive=True))

# Splitting into train, test and validation sets in the ratio of 60:20:20
# (40% held out first, then that part halved into test and validation).
# Images and masks are passed to the same train_test_split call so both lists
# are indexed with the same shuffled indices; a mismatch in their lengths
# raises ValueError instead of silently producing misaligned splits.
(image_train_files, image_test_val_files,
 mask_train_files, mask_test_val_files) = train_test_split(
    image_files, mask_files, test_size=0.4, random_state=42)
(image_test_files, image_val_files,
 mask_test_files, mask_val_files) = train_test_split(
    image_test_val_files, mask_test_val_files, test_size=0.5, random_state=42)

# Define the directories to save the train, test, and validation files
train_dir = os.path.join(data_folder, 'train')
test_dir = os.path.join(data_folder, 'test')
val_dir = os.path.join(data_folder, 'val')

# Accept slices with at least 0.1% nonzero mask pixels (threshold as a fraction)
mask_rejection_threshold = 0.001

def _export_lesion_slices(image_paths, split_name, split_dir, min_mask_fraction,
                          crop_rows=(10, 190), crop_cols=(40, 220), out_size=(192, 192)):
    """Export the lesion-bearing 2D slices of every volume in one split as .npy files.

    For each T1w volume the lesion mask with the matching file name is loaded
    from the same directory. Every slice along the third axis whose mask has a
    nonzero-pixel fraction of at least ``min_mask_fraction`` is cropped and
    resized; the image slice is then Z-score normalized and both slices are
    saved with ``np.save``.

    Args:
        image_paths: Paths of the ``*_T1w.nii.gz`` volumes belonging to the split.
        split_name: Split tag used in the output folder names
            ('train', 'test' or 'val').
        split_dir: Directory under which the per-volume slice folders are created.
        min_mask_fraction: Minimum fraction of nonzero mask pixels a slice
            needs in order to be exported.
        crop_rows: (start, stop) bounds of the crop along the first slice axis.
        crop_cols: (start, stop) bounds of the crop along the second slice axis.
        out_size: Target size handed to ``cv2.resize`` as ``dsize``.

    Raises:
        ValueError: If an image volume is not 3-dimensional (shape unpacking fails).
    """
    row_start, row_stop = crop_rows
    col_start, col_stop = crop_cols

    for image_path in image_paths:
        # Derive the mask name from the file name only and re-attach the
        # directory afterwards, so relative and absolute inputs behave the same.
        image_name = os.path.basename(image_path)
        mask_name = image_name.replace('_T1w.nii.gz', '_label-L_desc-T1lesion_mask.nii.gz')
        mask_path = os.path.join(os.path.dirname(image_path), mask_name)

        image_volume = nib.load(image_path).get_fdata()
        mask_volume = nib.load(mask_path).get_fdata()

        # Only the number of slices is needed; unpacking also rejects non-3D data.
        _, _, z_dim = image_volume.shape

        # splitext removes only ".gz", so the stems still end in ".nii"; this is
        # kept on purpose so the exported file names stay unchanged.
        image_stem = os.path.splitext(image_name)[0]
        mask_stem = os.path.splitext(mask_name)[0]

        # Folder names are built from base names: joining split_dir with an
        # absolute path would make os.path.join drop split_dir entirely and the
        # slices would end up next to the source volume instead.
        image_out_dir = os.path.join(split_dir, image_stem + f'_{split_name}_image_slices')
        mask_out_dir = os.path.join(split_dir, mask_stem + f'_{split_name}_mask_slices')
        os.makedirs(image_out_dir, exist_ok=True)
        os.makedirs(mask_out_dir, exist_ok=True)

        # Slice along the Z-axis
        for i in range(z_dim):
            image_slice = image_volume[:, :, i]
            mask_slice = mask_volume[:, :, i]

            # Skip slices whose mask has too few nonzero (lesion) pixels
            if np.count_nonzero(mask_slice) / mask_slice.size < min_mask_fraction:
                continue

            # Crop, then resize the image with linear interpolation
            image_cropped = image_slice[row_start:row_stop, col_start:col_stop]
            image_resized = cv2.resize(image_cropped, out_size, interpolation=cv2.INTER_LINEAR)

            # Z-score normalization per slice; a constant slice has zero spread,
            # so it is mapped to all zeros instead of dividing by zero.
            spread = np.std(image_resized, ddof=1)
            if spread == 0:
                image_norm = np.zeros_like(image_resized)
            else:
                image_norm = (image_resized - np.mean(image_resized)) / spread

            np.save(os.path.join(image_out_dir, image_stem + f'_slice_{i}.npy'), image_norm)

            # Same crop for the mask; nearest-neighbour resizing does not blend
            # neighbouring label values.
            mask_cropped = mask_slice[row_start:row_stop, col_start:col_stop]
            mask_resized = cv2.resize(mask_cropped, out_size, interpolation=cv2.INTER_NEAREST)

            np.save(os.path.join(mask_out_dir, mask_stem + f'_slice_{i}.npy'), mask_resized)


# TRAINING
_export_lesion_slices(image_train_files, 'train', train_dir, mask_rejection_threshold)

# TESTING
_export_lesion_slices(image_test_files, 'test', test_dir, mask_rejection_threshold)

# VALIDATION
_export_lesion_slices(image_val_files, 'val', val_dir, mask_rejection_threshold)

In [11]:
# Collect the exported .npy slice files of every split (sorted, recursive search)
def _sorted_npy(pattern):
    """Return the sorted paths below data_folder that match the glob pattern."""
    matches = glob.glob(os.path.join(data_folder, pattern), recursive=True)
    return sorted(matches)


image_npy_files_train = _sorted_npy("**/*_train_image_slices/*.npy")
mask_npy_files_train = _sorted_npy("**/*_train_mask_slices/*.npy")
image_npy_files_test = _sorted_npy("**/*_test_image_slices/*.npy")
mask_npy_files_test = _sorted_npy("**/*_test_mask_slices/*.npy")
image_npy_files_val = _sorted_npy("**/*_val_image_slices/*.npy")
mask_npy_files_val = _sorted_npy("**/*_val_mask_slices/*.npy")

# Report the image/mask counts in the order train, test, val
for found_images, found_masks in (
    (image_npy_files_train, mask_npy_files_train),
    (image_npy_files_test, mask_npy_files_test),
    (image_npy_files_val, mask_npy_files_val),
):
    print("Image .npy files:", len(found_images))
    print("Mask .npy files:", len(found_masks))

Image .npy files: 15394
Mask .npy files: 15394
Image .npy files: 4666
Mask .npy files: 4666
Image .npy files: 5452
Mask .npy files: 5452


In [12]:
def _copy_slice_pairs(image_paths, mask_paths, image_dir, mask_dir):
    """Copy positionally paired image/mask .npy files into two flat folders.

    The i-th mask is treated as the partner of the i-th image and is stored
    under the image's file name, so each pair shares one name across
    ``image_dir`` and ``mask_dir``.

    Args:
        image_paths: Paths of the image slice files.
        mask_paths: Paths of the mask slice files, in the same order.
        image_dir: Destination folder for the images (created if missing).
        mask_dir: Destination folder for the masks (created if missing).

    Raises:
        ValueError: If the two lists differ in length, because pairing by
            position would then match images with the wrong masks.
    """
    if len(image_paths) != len(mask_paths):
        raise ValueError(
            f"Cannot pair {len(image_paths)} image files with {len(mask_paths)} mask files"
        )

    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(mask_dir, exist_ok=True)

    for image_path, mask_path in zip(image_paths, mask_paths):
        file_name = os.path.basename(image_path)
        shutil.copy(image_path, os.path.join(image_dir, file_name))
        shutil.copy(mask_path, os.path.join(mask_dir, file_name))


# Train: save image and mask files to train/images and train/masks
train_image_folder = os.path.join(data_folder, "train", "images")
train_mask_folder = os.path.join(data_folder, "train", "masks")
_copy_slice_pairs(image_npy_files_train, mask_npy_files_train, train_image_folder, train_mask_folder)

# Test: save image and mask files to test/images and test/masks
test_image_folder = os.path.join(data_folder, "test", "images")
test_mask_folder = os.path.join(data_folder, "test", "masks")
_copy_slice_pairs(image_npy_files_test, mask_npy_files_test, test_image_folder, test_mask_folder)

# Validation: save image and mask files to val/images and val/masks
val_image_folder = os.path.join(data_folder, "val", "images")
val_mask_folder = os.path.join(data_folder, "val", "masks")
_copy_slice_pairs(image_npy_files_val, mask_npy_files_val, val_image_folder, val_mask_folder)

In [13]:
# Final image/mask folders of each split, derived from data_folder so they
# stay correct when the dataset root changes.
train_image_folder = os.path.join(data_folder, "train", "images")
train_mask_folder = os.path.join(data_folder, "train", "masks")
test_image_folder = os.path.join(data_folder, "test", "images")
test_mask_folder = os.path.join(data_folder, "test", "masks")
val_image_folder = os.path.join(data_folder, "val", "images")
val_mask_folder = os.path.join(data_folder, "val", "masks")

# Count the entries in every folder; images and masks of a split should match.
for split_name, image_dir, mask_dir in (
    ("train", train_image_folder, train_mask_folder),
    ("test", test_image_folder, test_mask_folder),
    ("val", val_image_folder, val_mask_folder),
):
    num_images = len(os.listdir(image_dir))
    num_masks = len(os.listdir(mask_dir))

    print(f"Number of images in {split_name}/images:", num_images)
    print(f"Number of masks in {split_name}/masks:", num_masks)

Number of images in train/images: 15394
Number of masks in train/masks: 15394
Number of images in test/images: 4666
Number of masks in test/masks: 4666
Number of images in val/images: 5452
Number of masks in val/masks: 5452
