In [None]:
# Step 1: Import Libraries and Initialize Constants
import os
import shutil
import glob
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive so the dataset paths configured in this
# notebook (which live under /content/drive/MyDrive) can be read and written.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Constants and Configurations
# Root folder that is searched recursively for source images.
BASE_DIR = "/content/drive/MyDrive/V1.0 for BreakHis/BreakHis_dataset_augmented"
# Destination root; train/, validation/ and test/ sub-folders are created beneath it.
SAVE_BASE_DIR = "/content/drive/MyDrive/BreakHis_dataset_split"
# Fraction of ALL images held out of the training set (validation + test
# combined). split_dataset() then halves that pool by default, so 0.3 yields
# roughly 70% train / 15% validation / 15% test.
TEST_RATIO = 0.3
# Glob pattern (not a bare extension) used to select image files.
IMG_EXT = "*.png"

In [None]:
# Step 2: Utility Functions
def ensure_dir_exists(directory):
    """Create ``directory`` together with any missing parent folders.

    Calling this on a directory that already exists is a no-op.
    """
    os.makedirs(name=directory, exist_ok=True)

def split_dataset(image_paths, test_ratio=TEST_RATIO, validation_ratio=0.5, random_state=42):
    """Split image paths into stratified train / validation / test subsets.

    The split is done in two stages. First, ``test_ratio`` of all images is
    held out of training. Second, that held-out pool is divided between
    validation and test. Both stages stratify on the same benign/malignant
    label, so every subset keeps the class balance of the full dataset.

    Args:
        image_paths: Sequence of image file paths (strings). A path is
            labelled 'benign' when the substring 'benign' occurs anywhere in
            it (case-insensitive) and 'malignant' otherwise.
        test_ratio: Fraction of all images held out of the training set, i.e.
            validation and test combined (despite the name).
        validation_ratio: Forwarded as ``test_size`` of the second split, so
            it is the fraction of the held-out pool that lands in the *test*
            subset; the remainder becomes validation. With the default 0.5
            both subsets have the same size.
        random_state: Seed used for both splits.

    Returns:
        Tuple ``(train_paths, val_paths, test_paths)``.

    Raises:
        ValueError: If ``image_paths`` is empty, or (raised by
            ``train_test_split``) if a class has too few members to be
            stratified.
    """
    if len(image_paths) == 0:
        raise ValueError(
            "split_dataset received no image paths; "
            "check the dataset directory and the file pattern."
        )

    def class_of(path):
        # Same rule organize_dataset uses to choose the destination folder,
        # so stratification and on-disk labels always agree.
        return 'benign' if 'benign' in path.lower() else 'malignant'

    train_paths, holdout_paths = train_test_split(
        image_paths,
        test_size=test_ratio,
        stratify=[class_of(p) for p in image_paths],
        random_state=random_state,
    )
    # Stratify the second stage on the same label as the first. Using the
    # immediate folder name instead is only correct when images sit directly
    # inside class-named folders; with deeper layouts it stratifies on
    # something other than the class and can fail on singleton groups.
    val_paths, test_paths = train_test_split(
        holdout_paths,
        test_size=validation_ratio,
        stratify=[class_of(p) for p in holdout_paths],
        random_state=random_state,
    )
    return train_paths, val_paths, test_paths

def organize_dataset(paths, target_dir):
    """Copy images into ``target_dir/benign`` or ``target_dir/malignant``.

    An image is filed under 'benign' when the substring 'benign' occurs
    anywhere in its path (case-insensitive), otherwise under 'malignant'.
    Copying is best-effort: a failure on one file is printed and the loop
    continues with the next file.

    Two different source files that share a file name within the same class
    would otherwise overwrite each other; the later one is instead stored
    with a numeric suffix (``name_1.png``, ``name_2.png``, ...).

    Args:
        paths: Iterable of source image paths (``str`` or ``os.PathLike``).
        target_dir: Directory of the split (e.g. ``.../train``) under which
            the class sub-folders are created.
    """
    ready_dirs = set()  # class folders already created during this call
    claimed = {}        # destination path -> source path that owns it
    for path in tqdm(paths, desc=f"Saving to {os.path.basename(target_dir)}"):
        # Reset per file so the error message never shows a stale folder (or
        # hits an unbound name) when the failure happens before assignment.
        class_dir = None
        try:
            src = os.fspath(path)
            # Determine class label based on file path
            class_label = 'benign' if 'benign' in src.lower() else 'malignant'
            class_dir = os.path.join(target_dir, class_label)
            if class_dir not in ready_dirs:
                # Create each class folder once rather than once per image.
                ensure_dir_exists(class_dir)
                ready_dirs.add(class_dir)

            file_name = os.path.basename(src)
            dest = os.path.join(class_dir, file_name)
            if claimed.get(dest, src) != src:
                # Name already taken by a different source file in this call:
                # pick the first free suffixed name instead of overwriting.
                stem, ext = os.path.splitext(file_name)
                suffix = 1
                while True:
                    dest = os.path.join(class_dir, f"{stem}_{suffix}{ext}")
                    if claimed.get(dest, src) == src:
                        break
                    suffix += 1
            claimed[dest] = src

            shutil.copy(src, dest)
        except Exception as e:
            # Deliberately broad: one bad file must not abort a long copy run.
            print(f"Error copying {path} to {class_dir}: {e}")

def prepare_dataset(base_dir, save_base_dir):
    """Collect images, split them, and copy each split to its own folder.

    Every file under ``base_dir`` that matches ``IMG_EXT`` is found
    recursively, split with ``split_dataset`` and copied with
    ``organize_dataset`` into ``save_base_dir/train``,
    ``save_base_dir/validation`` and ``save_base_dir/test``.

    Args:
        base_dir: Root directory that is searched recursively for images.
        save_base_dir: Root directory that receives the three split folders.

    Returns:
        Dict mapping ``"train"``, ``"validation"`` and ``"test"`` to the
        corresponding output directory.

    Raises:
        ValueError: If no matching image is found under ``base_dir``.
    """
    print("Preparing dataset...")
    # Recursively find all images matching IMG_EXT. base_dir is escaped so
    # that characters such as '[' in a folder name are not read as glob
    # syntax. The result is sorted because glob returns matches in arbitrary
    # order; a fixed order is what makes the seeded split reproducible, so a
    # re-run places every image in the same split as before.
    pattern = os.path.join(glob.escape(base_dir), '**', IMG_EXT)
    image_paths = sorted(glob.glob(pattern, recursive=True))
    print(f"Total images found: {len(image_paths)}")
    if not image_paths:
        raise ValueError(
            f"No files matching {IMG_EXT!r} found under {base_dir!r}; "
            "check that the drive is mounted and the path is correct."
        )

    # Split dataset
    train_paths, val_paths, test_paths = split_dataset(image_paths)

    # Define directories for each split
    splits = {
        name: os.path.join(save_base_dir, name)
        for name in ("train", "validation", "test")
    }
    for directory in splits.values():
        ensure_dir_exists(directory)

    # Organize images into respective folders
    organize_dataset(train_paths, splits["train"])
    organize_dataset(val_paths, splits["validation"])
    organize_dataset(test_paths, splits["test"])

    return splits

In [None]:
# Step 3: Prepare Dataset
# Run the whole pipeline, then report where each split was written.
print("Starting dataset preparation...")
splits = prepare_dataset(BASE_DIR, SAVE_BASE_DIR)
print(
    f"Train directory: {splits['train']}",
    f"Validation directory: {splits['validation']}",
    f"Test directory: {splits['test']}",
    sep="\n",
)

Starting dataset preparation...
Preparing dataset...
Total images found: 10858


Saving to train: 100%|██████████| 7600/7600 [24:27<00:00,  5.18it/s]
Saving to validation: 100%|██████████| 1629/1629 [03:56<00:00,  6.90it/s]
Saving to test: 100%|██████████| 1629/1629 [04:04<00:00,  6.67it/s]

Train directory: /content/drive/MyDrive/BreakHis_dataset_split/train
Validation directory: /content/drive/MyDrive/BreakHis_dataset_split/validation
Test directory: /content/drive/MyDrive/BreakHis_dataset_split/test



