In [None]:
# Attach the user's Google Drive to the Colab filesystem so later cells can write to it.
from google.colab import drive

# Mount point inside the Colab VM.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Step 1: Upload kaggle.json file
from google.colab import files

# files.upload() returns a dict mapping each uploaded filename to its raw bytes.
# Bind it to a name instead of leaving it as the cell's last expression: otherwise
# the notebook echoes the full contents of kaggle.json (username and API key) into
# the saved cell output, where it leaks to anyone the notebook is shared with.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Step 2: Create the ~/.kaggle directory, copy the uploaded JSON file into it, and restrict its permissions.
# mkdir -p succeeds even if the directory already exists, so the cell can be re-run.
!mkdir -p ~/.kaggle
# cp leaves the original kaggle.json in the working directory (this is a copy, not a move).
!cp kaggle.json ~/.kaggle/
# Mode 600 = read/write for the owner only, so the API key is not readable by other users.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Step 3: Download the BreakHis dataset (Kaggle slug: ambarish/breakhis) with the Kaggle CLI.
# The archive is saved into the current working directory; per the recorded output below
# that is /content/breakhis.zip, roughly 4 GB.
!kaggle datasets download -d ambarish/breakhis

Dataset URL: https://www.kaggle.com/datasets/ambarish/breakhis
License(s): unknown
Downloading breakhis.zip to /content
100% 3.97G/3.99G [00:37<00:00, 201MB/s]
100% 3.99G/3.99G [00:37<00:00, 114MB/s]


In [None]:
# Step 4: Unzip the downloaded file.
!mkdir -p /content/dataset
!unzip -q *.zip -d /content/dataset

In [None]:
# Step 5: Locate the extracted class folders and prepare the flattened output layout.
import os
import shutil

# Root of the extracted breast-histology tree and its two class sub-folders.
src_breast_folder = "/content/dataset/BreaKHis_v1/BreaKHis_v1/histology_slides/breast"
benign_src, malignant_src = (
    os.path.join(src_breast_folder, label) for label in ("benign", "malignant")
)

# Flattened destination: one folder per class under BreakHis_dataset
# (a relative path, so it is created inside the current working directory).
dest_dataset = "BreakHis_dataset"
dest_benign, dest_malignant = (
    os.path.join(dest_dataset, label) for label in ("benign", "malignant")
)

# exist_ok=True lets this cell be re-run without raising on already-created folders.
for class_dir in (dest_benign, dest_malignant):
    os.makedirs(class_dir, exist_ok=True)

In [None]:
# Step 6: Define a function to recursively copy image files from a source folder to a destination folder
# while renaming them to ensure uniqueness.
def copy_images_rename(src_dir, dst_dir):
    """Copy every image under `src_dir` (recursively) into the flat folder `dst_dir`.

    Each copy is named ``<relative_subpath>_<counter>_<original_name>``: the
    sub-path (relative to `src_dir`) has its directory separators replaced by
    underscores, and the counter increases by one per copied file, so files that
    share a name in different sub-folders cannot overwrite each other. Images
    located directly in `src_dir` get no sub-path prefix
    (``<counter>_<original_name>``).

    Directories and files are visited in sorted order, so the same source tree
    always produces the same destination filenames.

    Args:
        src_dir: Root folder searched for ``.png``/``.jpg``/``.jpeg`` files
            (the extension match is case-insensitive).
        dst_dir: Folder that receives the copies; created if it does not exist.

    Returns:
        None.
    """
    image_exts = ('.png', '.jpg', '.jpeg')
    os.makedirs(dst_dir, exist_ok=True)

    counter = 0
    for root, dirs, filenames in os.walk(src_dir):
        # Sorting `dirs` in place makes os.walk descend in a fixed order. Without it
        # the order is filesystem-dependent, the counter lands on different files
        # between runs, and a re-run can leave duplicate copies under new names.
        dirs.sort()

        rel_path = os.path.relpath(root, src_dir)
        # relpath() is "." for src_dir itself; using it as a prefix would create
        # hidden "._<n>_<name>" files, so root-level images get no prefix instead.
        if rel_path == os.curdir:
            prefix = ""
        else:
            prefix = rel_path.replace(os.sep, '_') + "_"

        for file in sorted(filenames):
            if not file.lower().endswith(image_exts):
                continue
            new_filename = f"{prefix}{counter}_{file}"
            shutil.copy(os.path.join(root, file), os.path.join(dst_dir, new_filename))
            counter += 1

In [None]:
# Step 7: Flatten both classes into their destination folders via copy_images_rename.
# Each entry pairs the progress message with the source and destination of one class.
copy_jobs = (
    ("Copying benign images...", benign_src, dest_benign),
    ("Copying malignant images...", malignant_src, dest_malignant),
)
for message, class_src, class_dest in copy_jobs:
    print(message)
    copy_images_rename(class_src, class_dest)

Copying benign images...
Copying malignant images...


In [None]:
# Step 8: Tally the image files now present in each destination folder so the totals
# can be checked against the expected size of each class.
def _count_images(folder):
    """Return how many entries of `folder` have an image extension (case-insensitive)."""
    return sum(
        1 for name in os.listdir(folder)
        if name.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

benign_count = _count_images(dest_benign)
malignant_count = _count_images(dest_malignant)
print("Total benign images copied:", benign_count)
print("Total malignant images copied:", malignant_count)

Total benign images copied: 2480
Total malignant images copied: 5429


In [None]:
# Step 9: Optionally, copy the BreakHis_dataset folder to your Google Drive.
!cp -r BreakHis_dataset /content/drive/MyDrive/BreakHis_dataset
