In [None]:
import pandas
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the two ISIC 2019 training tables from Drive: the per-image metadata
# and the ground-truth label table (read later to build X and y).
metadata_csv_path = '/content/drive/MyDrive/Dissertation/skin_lesion_data/ISIC_2019_Training_Metadata.csv'
groundtruth_csv_path = '/content/drive/MyDrive/Dissertation/skin_lesion_data/ISIC_2019_Training_GroundTruth.csv'

isic_metadata = pandas.read_csv(metadata_csv_path)
isic_groundtruth = pandas.read_csv(groundtruth_csv_path)

In [None]:
# Image identifiers, one per ground-truth row, kept in table order.
X = isic_groundtruth["image"].tolist()

In [None]:
# Derive one class name per image from the label columns (every column
# except the "image" identifier). A cell equal to 1 marks the image's class.
label_columns = isic_groundtruth.columns.drop("image")
is_positive = isic_groundtruth[label_columns].eq(1)

# Every row must flag exactly one class. A row with zero or several positives
# would make y shorter/longer than X and silently pair images with the wrong
# labels in the stratified split, so fail loudly instead.
positives_per_row = is_positive.sum(axis=1)
if not positives_per_row.eq(1).all():
    bad_rows = int(positives_per_row.ne(1).sum())
    raise ValueError(
        f"{bad_rows} ground-truth row(s) do not have exactly one positive class"
    )

# idxmax over a boolean row returns the name of the (single) True column.
y = is_positive.idxmax(axis=1).tolist()

In [None]:
import os
import shutil
from tqdm import tqdm
import multiprocessing.pool as mpp
from multiprocessing import Pool

# Fixed seed so the train/val/test membership is identical on every run.
# Without it, re-running this cell reshuffles the split while previously
# copied files stay on disk, so the same image can end up in more than one
# split folder. If the split folder was populated by an earlier unseeded
# run, delete it before copying again.
RANDOM_SEED = 42

# 80% in train, 20% remaining
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_SEED
)
# 50% of the remainder in val, 50% of the remainder in test
X_val, X_test, y_val, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, stratify=y_rem, random_state=RANDOM_SEED
)
# np.unique(y_train, return_counts=True)
# np.unique(y_test, return_counts=True)

# train_dataset = Dataset(X_train, y_train, ...)
# train_loader = DataLoader(train_dataset, ...)

# Folder holding the original, unsorted training images.
original_image_folder = "/content/drive/MyDrive/Dissertation/skin_lesion_data/ISIC_2019_Training_Input"

# Root under which the <split>/<class>/ folders are created.
new_root_folder = "/content/drive/MyDrive/Dissertation/skin_lesion_data/ISIC_2019_Split_val"

def istarmap(self, func, iterable, chunksize=1):
    """starmap-version of imap.

    Lazily yields ``func(*args)`` for each ``args`` in ``iterable``, in input
    order, so progress can be reported as results arrive.

    Args:
        self: The pool the tasks are submitted to.
        func: Callable invoked as ``func(*args)`` in the workers.
        iterable: Iterable of argument tuples/lists.
        chunksize: Number of tasks sent to a worker at a time; must be >= 1.

    Returns:
        A generator over the individual results.

    Raises:
        ValueError: If the pool is not running or ``chunksize`` is below 1.
    """
    # Local import keeps this patch self-contained wherever the cell is run.
    import sys

    if self._state != mpp.RUN:
        raise ValueError("Pool not running")

    if chunksize < 1:
        raise ValueError(
            "Chunksize must be 1+, not {0:n}".format(
                chunksize))

    task_batches = mpp.Pool._get_tasks(func, iterable, chunksize)
    # Python 3.8 changed IMapIterator to take the pool itself instead of the
    # pool's cache dict; passing the cache there raises AttributeError.
    if sys.version_info >= (3, 8):
        result = mpp.IMapIterator(self)
    else:
        result = mpp.IMapIterator(self._cache)
    self._taskqueue.put(
        (
            self._guarded_task_generation(result._job,
                                          mpp.starmapstar,
                                          task_batches),
            result._set_length
        ))
    # Results come back one chunk at a time; flatten them into single items.
    return (item for chunk in result for item in chunk)


# Expose the helper as a method on every Pool (and its subclasses).
mpp.Pool.istarmap = istarmap

def copy(src, dest):
    """Copy the file at ``src`` to ``dest`` using ``shutil.copy``.

    This is the per-file task handed to the worker pool in ``organise_data``.
    The path returned by ``shutil.copy`` is discarded, so the result is None.
    """
    shutil.copy(src=src, dst=dest)

# Copy the images of one split (train/val/test) into <split>/<class>/ folders
def organise_data(split_name, images, labels, n_workers=4):
    """Copy one split's images into ``<new_root_folder>/<split_name>/<label>/``.

    Images whose destination file already exists are skipped, so the function
    can be re-run to resume an interrupted copy. It never removes files.

    Args:
        split_name: Name of the split sub-folder, e.g. "train".
        images: Image identifiers (file names without the ".jpg" extension).
        labels: Class name for each image, in the same order as ``images``.
        n_workers: Number of worker processes used for copying.

    Raises:
        ValueError: If ``images`` and ``labels`` have different lengths.
    """
    images = list(images)
    labels = list(labels)
    # zip() would silently drop the surplus entries and mislabel nothing
    # visibly, so reject mismatched inputs up front.
    if len(images) != len(labels):
        raise ValueError(
            f"Got {len(images)} images but {len(labels)} labels for split {split_name!r}"
        )

    split_folder = os.path.join(new_root_folder, split_name)
    os.makedirs(split_folder, exist_ok=True)

    print(f"Organising {split_name} data")

    # Create each class folder once instead of checking it for every image.
    class_folders = {
        image_label: os.path.join(split_folder, image_label)
        for image_label in dict.fromkeys(labels)
    }
    for class_folder in class_folders.values():
        os.makedirs(class_folder, exist_ok=True)

    # Collect (source, destination) pairs for images not copied yet.
    params = []
    for image_id, image_label in zip(images, labels):
        source = os.path.join(original_image_folder, f"{image_id}.jpg")
        destination = os.path.join(class_folders[image_label], f"{image_id}.jpg")
        if not os.path.exists(destination):
            params.append([source, destination])

    print("Copying...")
    if not params:
        # Nothing left to do; avoid starting worker processes for no work.
        print("Nothing to copy.")
        return

    with Pool(n_workers) as p:
        # istarmap yields as each copy finishes, which drives the progress bar.
        for _ in tqdm(p.istarmap(copy, params),
                      total=len(params)):
            pass

# Materialise the three splits on disk, in train -> val -> test order.
for split_name, split_images, split_labels in (
    ("train", X_train, y_train),
    ("val", X_val, y_val),
    ("test", X_test, y_test),
):
    organise_data(split_name, split_images, split_labels)

Organising train data
Copying...


100%|██████████| 20264/20264 [1:07:09<00:00,  5.03it/s]


Organising val data
Copying...


100%|██████████| 2533/2533 [12:46<00:00,  3.30it/s]


Organising test data
Copying...


100%|██████████| 2534/2534 [03:58<00:00, 10.64it/s]
