In [None]:
# Mount Google Drive
# Colab-only step (uses the google.colab package): exposes the user's Drive
# under /content/drive. The dataset paths defined in the following cells all
# live under /content/drive/MyDrive, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import cv2
import numpy as np
import random
import shutil

# Dataset roots: the untouched source tree and the class-balanced copy we build.
original_dataset = "/content/drive/MyDrive/BreakHis_dataset"
new_dataset = "/content/drive/MyDrive/V1.0 for BreakHis/BreakHis_dataset_augmented"

# Each root holds one sub-folder per class label, in the order (benign, malignant).
benign_dir, malignant_dir = (
    os.path.join(original_dataset, label) for label in ("benign", "malignant")
)
aug_benign_dir, aug_malignant_dir = (
    os.path.join(new_dataset, label) for label in ("benign", "malignant")
)

In [None]:
# Make sure both output class folders exist; exist_ok=True makes the cell
# safe to re-run when the folders are already there.
for target_dir in (aug_benign_dir, aug_malignant_dir):
    os.makedirs(target_dir, exist_ok=True)

In [None]:
# Get list of image files
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def list_images(directory):
    """Return the image file names found directly inside `directory`, sorted.

    Only names ending (case-insensitively) in one of IMAGE_EXTENSIONS are kept.
    os.listdir returns entries in arbitrary order, and the order of
    `benign_files` later decides which images receive one extra augmentation,
    so the result is sorted to make the run reproducible.
    """
    return sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(IMAGE_EXTENSIONS)
    )


benign_files = list_images(benign_dir)
malignant_files = list_images(malignant_dir)

benign_count = len(benign_files)
malignant_count = len(malignant_files)

print(f"Original dataset counts - Benign: {benign_count}, Malignant: {malignant_count}")

Original dataset counts - Benign: 2480, Malignant: 5429


In [None]:
# Compute how many extra benign images are needed to match the malignant count.
# A negative difference means benign is already the larger class, so it is
# clamped to 0 (no augmentation).
diff = max(malignant_count - benign_count, 0)
if diff == 0:
    print("No augmentation needed as benign count is equal or higher than malignant.")

# Determine number of augmentations per benign image:
#   every image gets `base_aug` augmented copies, and the first `remainder`
#   images get one more, so the copies add up to exactly `diff`.
if benign_count > 0:
    base_aug, remainder = divmod(diff, benign_count)
else:
    # With no benign source images there is nothing to augment. Guarding here
    # also avoids the ZeroDivisionError that `diff % 0` would raise.
    base_aug, remainder = 0, 0
    if diff > 0:
        print("No benign images found; cannot augment.")

In [None]:
def sensitive_augment(image, max_angle=2.0, max_shift=2.0,
                      brightness_range=(0.98, 1.02), noise_std=1.0):
    """
    Return a very subtly perturbed uint8 copy of `image`.

    With the default arguments the perturbations are:
      - Rotation by ±2 degrees about the image centre
      - Translation by ±2 pixels along each axis
      - Brightness scaling by a factor within 0.98 to 1.02
      - Additive Gaussian noise (mean 0, std 1)

    Args:
        image: array whose first two dimensions are (height, width), as
            returned by cv2.imread. The input array is not modified.
        max_angle: rotation is drawn uniformly from [-max_angle, max_angle] degrees.
        max_shift: x and y shifts are each drawn uniformly from
            [-max_shift, max_shift] pixels.
        brightness_range: (low, high) bounds of the uniform brightness factor.
        noise_std: standard deviation of the additive Gaussian noise.

    Returns:
        A uint8 array with the same shape as `image`.

    Randomness comes from the global `random` and `np.random` generators, so
    seed both for reproducible output.
    """
    h, w = image.shape[:2]

    # Draw the scalar parameters (angle, tx, ty, brightness) in a fixed order
    # so a seeded `random` generator always yields the same sequence.
    angle = random.uniform(-max_angle, max_angle)
    tx = random.uniform(-max_shift, max_shift)
    ty = random.uniform(-max_shift, max_shift)
    brightness_factor = random.uniform(*brightness_range)

    # Rotation followed by translation is itself one affine map: adding the
    # shift to the translation column of the rotation matrix gives the combined
    # transform. A single warp resamples the pixels once instead of twice,
    # which avoids the extra interpolation blur.
    M = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
    M[:, 2] += (tx, ty)
    warped = cv2.warpAffine(image, M, (w, h), borderMode=cv2.BORDER_REFLECT)

    # Stay in float32 until the very end. Casting float -> uint8 truncates, so
    # quantising after every step would darken the image by an amount
    # comparable to the perturbations being applied.
    pixels = np.clip(warped.astype(np.float32) * brightness_factor, 0, 255)
    pixels += np.random.normal(0, noise_std, image.shape).astype(np.float32)

    # Round to nearest (not truncate) exactly once, then clamp to the uint8 range.
    return np.clip(np.rint(pixels), 0, 255).astype(np.uint8)

In [None]:
# Process benign images: copy originals and create augmented versions

# sensitive_augment draws from both the `random` and `np.random` global
# generators; seed them so re-running this cell regenerates the same images.
random.seed(42)
np.random.seed(42)

for idx, filename in enumerate(benign_files):
    src_path = os.path.join(benign_dir, filename)
    img = cv2.imread(src_path)
    if img is None:
        print(f"Could not read {src_path}. Skipping.")
        continue

    # Save original benign image in new dataset.
    # Copy the bytes rather than re-encoding the decoded array, so the
    # original is preserved exactly (no JPEG recompression, no loss of
    # alpha/bit depth from the default imread conversion).
    dst_path = os.path.join(aug_benign_dir, filename)
    shutil.copyfile(src_path, dst_path)

    # Determine how many augmented versions to generate for this image:
    # the first `remainder` images get one more than `base_aug`.
    num_aug = base_aug + (1 if idx < remainder else 0)

    # Generate and save augmented images as <name>_aug_<j><ext>
    base_name, ext = os.path.splitext(filename)
    for j in range(num_aug):
        aug_img = sensitive_augment(img)
        new_filename = f"{base_name}_aug_{j}{ext}"
        aug_path = os.path.join(aug_benign_dir, new_filename)
        # cv2.imwrite reports failure through its return value instead of
        # raising, so surface it rather than silently ending up short.
        if not cv2.imwrite(aug_path, aug_img):
            print(f"Could not write {aug_path}.")

# Process malignant images: simply copy them over
for filename in malignant_files:
    src_path = os.path.join(malignant_dir, filename)
    # Decode only to check that the file is a readable image; unreadable
    # files are skipped so they never reach the new dataset.
    if cv2.imread(src_path) is None:
        print(f"Could not read {src_path}. Skipping.")
        continue
    # Byte-for-byte copy: unlike re-encoding through cv2.imwrite, this keeps
    # the original file exactly as it is.
    dst_path = os.path.join(aug_malignant_dir, filename)
    shutil.copyfile(src_path, dst_path)

print("Augmentation complete. The new dataset is saved at:")
print(new_dataset)

Augmentation complete. The new dataset is saved at:
/content/drive/MyDrive/V1.0 for BreakHis/BreakHis_dataset_augmented


In [None]:
# Count and print new dataset totals
def _count_images(folder):
    """Count entries in `folder` whose name ends in .png, .jpg or .jpeg (any case)."""
    return sum(
        1 for entry in os.listdir(folder)
        if entry.lower().endswith(('.png', '.jpg', '.jpeg'))
    )


aug_benign_count = _count_images(aug_benign_dir)
aug_malignant_count = _count_images(aug_malignant_dir)
print(f"New dataset counts - Benign: {aug_benign_count}, Malignant: {aug_malignant_count}")

New dataset counts - Benign: 5429, Malignant: 5429
