In [None]:
import os
import numpy as np
import nibabel as nib
from skimage.transform import resize
from sklearn.model_selection import train_test_split

In [None]:
# Dataset root directory; every sub-directory is treated as one patient
DATA_PATH = "MICCAI_BraTS2020_TrainingData"
# (height, width) that every 2D slice is resized to before stacking
TARGET_SIZE = (64, 64)

# Accumulators filled by the processing loop:
#   all_images -> processed 3-channel image samples
#   all_labels -> matching binary labels (0 or 1)
all_images, all_labels = [], []

In [None]:
# Helper Function for Normalization
def normalize_slice(slice_data):
    """Z-score normalize a single 2D slice (zero mean, unit variance).

    Parameters
    ----------
    slice_data : array_like
        Intensity values of one slice.

    Returns
    -------
    numpy.ndarray
        ``(slice_data - mean) / (std + 1e-8)``. For a constant slice
        (standard deviation exactly zero) an all-zero float array of the same
        shape is returned, i.e. the centred values, so that constant slices
        stay on the same scale as normalized ones instead of keeping their
        raw intensity.
    """
    # Compute the standard deviation once and reuse it for both the guard
    # and the division.
    std = np.std(slice_data)
    if std == 0:
        # Every value equals the mean: the centred slice is identically zero.
        # A fresh array is returned so the caller never aliases the input.
        return np.zeros_like(slice_data, dtype=np.float64)
    mean = np.mean(slice_data)
    # The small epsilon is kept as a guard against division by a vanishingly
    # small standard deviation.
    return (slice_data - mean) / (std + 1e-8)

In [None]:
# Main Data Processing Loop
#
# For every patient directory under DATA_PATH this cell loads the T1, T1ce,
# FLAIR and segmentation volumes, walks the slices along the last axis, and for
# each slice whose T1ce channel contains a positive value builds a normalized,
# resized 3-channel sample plus a binary tumor label.

# Start from empty accumulators so that re-running this cell does not append a
# second copy of every sample to lists left over in the kernel.
all_images = []
all_labels = []

# Suffix of each volume file, keyed by the name used for it below.
# NOTE: File naming convention might vary. Adjust if your file names are different.
volume_suffixes = {
    'T1': '_t1.nii.gz',
    'T1ce': '_t1ce.nii.gz',
    'FLAIR': '_flair.nii.gz',
    'SEG': '_seg.nii.gz',  # Segmentation, used only for labels
}

# os.listdir() returns entries in arbitrary order; sorting makes the sample
# order (and therefore any seeded split computed from it) reproducible.
for patient_folder in sorted(os.listdir(DATA_PATH)):
    # Construct the full path to the patient's directory
    patient_dir = os.path.join(DATA_PATH, patient_folder)

    # Skip any entries that are not directories
    if not os.path.isdir(patient_dir):
        continue

    try:
        # --- Step 1: Load Raw 3D Volumes ---
        # Three modalities (T1, T1ce, FLAIR) become the 3 input channels.
        files = {
            key: nib.load(os.path.join(patient_dir, patient_folder + suffix)).get_fdata()
            for key, suffix in volume_suffixes.items()
        }

        # Verify that all volumes share one shape before indexing them with a
        # common slice index; a mismatch is reported via the handler below.
        reference_shape = files['T1'].shape
        for key, volume in files.items():
            if volume.shape != reference_shape:
                raise ValueError(
                    f"{key} volume shape {volume.shape} does not match T1 shape {reference_shape}"
                )
        num_slices = reference_shape[2]

        # Samples are staged per patient and committed only after the whole
        # patient has been processed, so a failure part-way through does not
        # leave a partial set of slices in the dataset.
        patient_images = []
        patient_labels = []

        # --- Steps 2-4: Slice Extraction, Normalization, Resizing, Stacking ---
        for z in range(num_slices):

            # Extract 2D slices for all 4 volumes
            slice_T1 = files['T1'][:, :, z]
            slice_T1ce = files['T1ce'][:, :, z]
            slice_FLAIR = files['FLAIR'][:, :, z]
            slice_SEG = files['SEG'][:, :, z]

            # --- Filter Blank Slices ---
            # A slice is kept only if its T1ce channel has a positive value
            if not np.max(slice_T1ce) > 0:
                continue

            # Normalize each channel independently, then resize it.
            # preserve_range=True keeps the normalized values unchanged in scale.
            resized_channels = [
                resize(normalize_slice(channel), TARGET_SIZE, anti_aliasing=True, preserve_range=True)
                for channel in (slice_T1, slice_T1ce, slice_FLAIR)
            ]

            # Stack along the last axis to create the channel dimension.
            # Casting to float32 here (instead of only in the final
            # conversion) halves the memory held by the accumulated list; the
            # resulting values are the same as casting at the end.
            three_channel_tensor = np.stack(resized_channels, axis=-1).astype(np.float32)

            # --- Labeling (Binary Classification) ---
            # Label is 1 if any segmentation value is > 0 (tumor present)
            label = 1 if np.max(slice_SEG) > 0 else 0

            patient_images.append(three_channel_tensor)
            patient_labels.append(label)

        # Commit this patient's samples to the master lists
        all_images.extend(patient_images)
        all_labels.extend(patient_labels)

    except Exception as e:
        # Best-effort processing: report the failing patient and move on
        print(f"Error processing patient {patient_folder}: {e}")
        continue

print(f"\nTotal number of image slices extracted: {len(all_images)}")

# Convert lists to final NumPy arrays for training
X = np.array(all_images, dtype=np.float32)
Y = np.array(all_labels, dtype=np.int32)

print(f"Final Input Data (X) Shape: {X.shape}") # Expected: (N, 64, 64, 3)
print(f"Final Label Data (Y) Shape: {Y.shape}") # Expected: (N,)

In [None]:

# --- Step 5: Train/Test Split (80:20 Ratio from the Paper) ---
# 20% of the samples are held out for testing. The fixed random_state makes the
# split repeatable, and stratify=Y keeps the tumor / no-tumor ratio equal in
# both sets.
# NOTE(review): the split is made over individual slices, so slices from one
# patient can land in both the training and the testing set - consider a
# patient-wise (grouped) split if that leakage matters for the evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42, stratify=Y)

# Report the resulting shapes between two divider lines
print(
    "-" * 50,
    "Data Split Complete.",
    f"Training Set Shape (X_train): {X_train.shape}",
    f"Testing Set Shape (X_test): {X_test.shape}",
    "-" * 50,
    sep="\n",
)