In [None]:
import os
import shutil

import numpy as np

from imutils import paths
from sklearn.model_selection import train_test_split

In [None]:
# Collect every image path found under the raw archive folder.
# The list is sorted so its order is fixed: the seeded sampling and splits
# later in the notebook then select the same files on every run and machine
# (directory traversal order is presumably filesystem-dependent).
images = sorted(paths.list_images("../data/archive"))

In [None]:
# Peek at the first collected path to see how the files are named.
images[0]

In [None]:
# Take the file name of the first path, drop ".png", and show its last
# character -- the character that encodes the class of the image.
first_file_name = images[0].split("/")[-1]
first_file_name.replace(".png", "")[-1]

In [None]:
# Invasive Ductal Carcinoma (IDC) is the most common subtype of all breast cancers.
# The class label is the last character of the file name (before ".png"): 0 is non-IDC and 1 is IDC.

In [None]:
# Partition the image paths by class: label "1" -> idc, label "0" -> non_idc.
idc = []
non_idc = []

for image in images:
    # The label is the last character of the file name without its extension.
    # os.path strips the directory and whatever extension the file has (so
    # files that are not lower-case ".png" are labelled too). It is computed
    # once and tested with if/elif because the two cases are exclusive.
    # The [-1:] slice yields "" instead of raising on an empty name.
    label = os.path.splitext(os.path.basename(image))[0][-1:]
    if label == "0":
        non_idc.append(image)
    elif label == "1":
        idc.append(image)

In [None]:
# Sanity check: the two class lists together should account for every path.
len(images) == len(idc) + len(non_idc)

In [None]:
# Number of images in each class.
print(f"IDC: {len(idc)}", f"Non-IDC: {len(non_idc)}", sep="\n")

In [None]:
# Share of each class among all images, rounded to two decimals.
idc_share = round(len(idc) / len(images), 2)
non_idc_share = round(len(non_idc) / len(images), 2)
print(f"IDC: {idc_share}")
print(f"Non-IDC: {non_idc_share}")

In [None]:
# The dataset is imbalanced; to deal with this, the non-IDC images are
# undersampled (without replacement) down to the number of IDC images.
# A seeded generator makes the draw repeatable: given the same input order,
# a fresh run selects the same subset instead of a different one each time.
# Raises ValueError if there are fewer non-IDC than IDC images.
non_idc = np.random.default_rng(42).choice(non_idc, size=len(idc), replace=False)

In [None]:
# Build the train, validation and test splits separately for each class.
# Step 1 keeps 80% for training and holds out 20%; step 2 cuts the held-out
# part in half, the first half becoming the test set and the second the
# validation set. Every call uses the same fixed random_state.
holdout_kwargs = dict(test_size=0.2, random_state=42)
halving_kwargs = dict(test_size=0.5, random_state=42)

train_idc, tmp_idc = train_test_split(idc, **holdout_kwargs)
train_non_idc, tmp_non_idc = train_test_split(non_idc, **holdout_kwargs)

test_idc, val_idc = train_test_split(tmp_idc, **halving_kwargs)
test_non_idc, val_non_idc = train_test_split(tmp_non_idc, **halving_kwargs)

In [None]:
# Report the size of every split; the trailing "\n" entries leave a blank
# line between the train, test and val groups.
split_report = [
    f"Train IDC: {len(train_idc)}",
    f"Train Non-IDC: {len(train_non_idc)}\n",
    f"Test IDC: {len(test_idc)}",
    f"Test Non-IDC: {len(test_non_idc)}\n",
    f"Val IDC: {len(val_idc)}",
    f"Val Non-IDC: {len(val_non_idc)}",
]
print("\n".join(split_report))

In [None]:
# Create the output tree ../data/dataset/<split>/<class>.
# The message is still printed when the root already exists, but every
# sub-folder is ensured regardless: makedirs(exist_ok=True) creates what is
# missing and leaves existing folders alone, so a partially created tree
# (e.g. after an interrupted run) no longer breaks the copy cells.
dataset_root = "../data/dataset"

if os.path.exists(dataset_root):
    print("Dataset already exists")

for split_name in ("train", "test", "val"):
    for class_name in ("idc", "non_idc"):
        os.makedirs(os.path.join(dataset_root, split_name, class_name), exist_ok=True)

In [None]:
# Copy the IDC training images into the train/idc folder.
# os.path.basename isolates the file name whichever separator the platform
# uses in the collected paths (splitting on "/" only works for "/" paths);
# shutil.copyfile copies the file contents and closes both files itself.
for image in train_idc:
    shutil.copyfile(image, f"../data/dataset/train/idc/{os.path.basename(image)}")

In [None]:
# Copy the non-IDC training images into the train/non_idc folder.
# os.path.basename isolates the file name whichever separator the platform
# uses in the collected paths (splitting on "/" only works for "/" paths);
# shutil.copyfile copies the file contents and closes both files itself.
for image in train_non_idc:
    shutil.copyfile(image, f"../data/dataset/train/non_idc/{os.path.basename(image)}")

In [None]:
# Copy the IDC test images into the test/idc folder.
# os.path.basename isolates the file name whichever separator the platform
# uses in the collected paths (splitting on "/" only works for "/" paths);
# shutil.copyfile copies the file contents and closes both files itself.
for image in test_idc:
    shutil.copyfile(image, f"../data/dataset/test/idc/{os.path.basename(image)}")

In [None]:
# Copy the non-IDC test images into the test/non_idc folder.
# os.path.basename isolates the file name whichever separator the platform
# uses in the collected paths (splitting on "/" only works for "/" paths);
# shutil.copyfile copies the file contents and closes both files itself.
for image in test_non_idc:
    shutil.copyfile(image, f"../data/dataset/test/non_idc/{os.path.basename(image)}")

In [None]:
# Copy the IDC validation images into the val/idc folder.
# os.path.basename isolates the file name whichever separator the platform
# uses in the collected paths (splitting on "/" only works for "/" paths);
# shutil.copyfile copies the file contents and closes both files itself.
for image in val_idc:
    shutil.copyfile(image, f"../data/dataset/val/idc/{os.path.basename(image)}")

In [None]:
# Copy the non-IDC validation images into the val/non_idc folder.
# os.path.basename isolates the file name whichever separator the platform
# uses in the collected paths (splitting on "/" only works for "/" paths);
# shutil.copyfile copies the file contents and closes both files itself.
for image in val_non_idc:
    shutil.copyfile(image, f"../data/dataset/val/non_idc/{os.path.basename(image)}")

In [None]:
# Count the image files now present in each output folder, to compare
# against the split sizes reported earlier.
train_idc_copied = len(list(paths.list_images('../data/dataset/train/idc')))
train_non_idc_copied = len(list(paths.list_images('../data/dataset/train/non_idc')))
test_idc_copied = len(list(paths.list_images('../data/dataset/test/idc')))
test_non_idc_copied = len(list(paths.list_images('../data/dataset/test/non_idc')))
val_idc_copied = len(list(paths.list_images('../data/dataset/val/idc')))
val_non_idc_copied = len(list(paths.list_images('../data/dataset/val/non_idc')))

print(f"train IDC : {train_idc_copied}")
print(f"train Non-IDC : {train_non_idc_copied}\n")
print(f"test IDC : {test_idc_copied}")
print(f"test Non-IDC : {test_non_idc_copied}\n")
print(f"val IDC : {val_idc_copied}")
print(f"val Non-IDC : {val_non_idc_copied}")