In [None]:
import pandas
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Read the ISIC 2019 training metadata and ground-truth CSV files into DataFrames.
isic_metadata, isic_groundtruth = (
    pandas.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Dissertation/skin_lesion_data/ISIC_2019_Training_Metadata.csv',
        '/content/drive/MyDrive/Dissertation/skin_lesion_data/ISIC_2019_Training_GroundTruth.csv',
    )
)

In [None]:
# Show the metadata table; pandas prints only the first and last rows plus the overall shape.
print(isic_metadata)

              image  age_approx anatom_site_general    lesion_id     sex
0      ISIC_0000000        55.0      anterior torso          NaN  female
1      ISIC_0000001        30.0      anterior torso          NaN  female
2      ISIC_0000002        60.0     upper extremity          NaN  female
3      ISIC_0000003        30.0     upper extremity          NaN    male
4      ISIC_0000004        80.0     posterior torso          NaN    male
...             ...         ...                 ...          ...     ...
25326  ISIC_0073247        85.0           head/neck  BCN_0003925  female
25327  ISIC_0073248        65.0      anterior torso  BCN_0001819    male
25328  ISIC_0073249        70.0     lower extremity  BCN_0001085    male
25329  ISIC_0073251        55.0         palms/soles  BCN_0002083  female
25330  ISIC_0073254        50.0     upper extremity  BCN_0001079    male

[25331 rows x 5 columns]


In [None]:
# Show the ground-truth table: an `image` id column followed by one indicator column per class.
print(isic_groundtruth)

              image  MEL   NV  BCC   AK  BKL   DF  VASC  SCC  UNK
0      ISIC_0000000  0.0  1.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
1      ISIC_0000001  0.0  1.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
2      ISIC_0000002  1.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
3      ISIC_0000003  0.0  1.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
4      ISIC_0000004  1.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
...             ...  ...  ...  ...  ...  ...  ...   ...  ...  ...
25326  ISIC_0073247  0.0  0.0  1.0  0.0  0.0  0.0   0.0  0.0  0.0
25327  ISIC_0073248  0.0  0.0  0.0  0.0  1.0  0.0   0.0  0.0  0.0
25328  ISIC_0073249  1.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
25329  ISIC_0073251  0.0  1.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0
25330  ISIC_0073254  0.0  0.0  0.0  0.0  1.0  0.0   0.0  0.0  0.0

[25331 rows x 10 columns]


In [None]:
# Image ids, in ground-truth row order; these are the samples that get split below.
X = isic_groundtruth["image"].tolist()

In [None]:
# Derive one class label per image from the indicator columns.
# A cell-by-cell comparison over iterrows() is slow and, for any row that does not
# contain exactly one 1, produces zero or several labels, which silently shifts `y`
# out of alignment with `X`. Check the one-hot invariant up front, then pick the
# column holding the 1 for every row in a single vectorised call.
_is_labelled = isic_groundtruth.drop(columns="image").eq(1)
assert (_is_labelled.sum(axis=1) == 1).all(), (
    "every ground-truth row must have exactly one class column equal to 1"
)
y = _is_labelled.idxmax(axis=1).tolist()

In [None]:
import os
import shutil
from tqdm import tqdm
import multiprocessing.pool as mpp
from multiprocessing import Pool


# Stratified 80/20 split with a fixed seed so that re-running the notebook reproduces
# exactly the same partition. This matters beyond reproducibility: the copy step below
# skips files that already exist, so a different random split on a later run would leave
# the same image in both the train and the test folder (train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# np.unique(y_train, return_counts=True)
# np.unique(y_test, return_counts=True)

# train_dataset = Dataset(X_train, y_train, ...)
# train_loader = DataLoader(train_dataset, ...)

# Folder holding the original, unsorted training images (read as "<image id>.jpg").
original_image_folder = "/content/drive/MyDrive/Dissertation/skin_lesion_data/ISIC_2019_Training_Input"

# Root folder under which the "<split>/<class>/<image id>.jpg" copies are written.
new_root_folder = "/content/drive/MyDrive/Dissertation/skin_lesion_data/ISIC_2019_Split"

def istarmap(self, func, iterable, chunksize=1):
    """starmap-version of imap.

    Meant to be attached to ``multiprocessing.pool.Pool`` as a method. Like
    ``Pool.starmap`` it calls ``func(*args)`` for every ``args`` in ``iterable``,
    but like ``Pool.imap`` it returns an iterator that yields results as they
    become available, which lets a progress bar advance while work is running.

    Args:
        self: The pool the tasks are submitted to.
        func: Callable invoked as ``func(*args)`` in the workers.
        iterable: Iterable of argument sequences, one per call.
        chunksize: Number of calls sent to a worker per task batch (>= 1).

    Returns:
        A generator over the individual call results.

    Raises:
        ValueError: If the pool is not running or ``chunksize`` is below 1.
    """
    import sys  # local import: only needed for the version switch below

    if self._state != mpp.RUN:
        raise ValueError("Pool not running")

    if chunksize < 1:
        raise ValueError(
            "Chunksize must be 1+, not {0:n}".format(
                chunksize))

    task_batches = mpp.Pool._get_tasks(func, iterable, chunksize)

    # These are private CPython APIs: from Python 3.8 on, IMapIterator takes the
    # pool itself, whereas older versions take the pool's result cache. Passing
    # the cache on 3.8+ fails with AttributeError inside IMapIterator.__init__.
    if sys.version_info >= (3, 8):
        result = mpp.IMapIterator(self)
    else:
        result = mpp.IMapIterator(self._cache)

    self._taskqueue.put(
        (
            self._guarded_task_generation(result._job,
                                          mpp.starmapstar,
                                          task_batches),
            result._set_length
        ))
    # Each task returns the list of results for one batch; flatten the batches.
    return (item for chunk in result for item in chunk)


# Attach istarmap to Pool so that pool instances can call it as a method: pool.istarmap(...).
mpp.Pool.istarmap = istarmap

def copy(src, dest):
    """Copy the file ``src`` to ``dest`` without ever exposing a half-written ``dest``.

    The data is first written to ``<dest>.part`` and then renamed onto ``dest``.
    If the copy is interrupted, ``dest`` therefore either does not exist or is
    complete, so a later "skip if the destination exists" check cannot mistake a
    truncated file for a finished one.

    Args:
        src: Path of the file to copy.
        dest: Target file path, or an existing directory (the file is then
            placed inside it under the source's base name, as ``shutil.copy`` does).

    Raises:
        OSError: Whatever ``shutil.copy`` / ``os.replace`` raise, e.g.
            ``FileNotFoundError`` when ``src`` does not exist.
    """
    if os.path.isdir(dest):
        dest = os.path.join(dest, os.path.basename(src))

    partial = f"{dest}.part"
    try:
        shutil.copy(src, partial)
        # Rename within the same directory, so the final name appears in one step.
        os.replace(partial, dest)
    except BaseException:
        # Do not leave the temporary file behind when the copy fails.
        if os.path.exists(partial):
            os.remove(partial)
        raise

# Sort one split's images into per-class folders (used for both the train and the test split)
def organise_data(split_name, images, labels):
    """Copy one split's images into ``<new_root_folder>/<split_name>/<label>/``.

    Images whose destination file already exists are skipped, so calling the
    function again only copies what is still missing. The copies are made by a
    pool of 4 worker processes and progress is shown with tqdm.

    Args:
        split_name: Name of the split sub-folder, e.g. "train" or "test".
        images: Image ids; the source file is "<image id>.jpg" inside
            ``original_image_folder``.
        labels: Class label of each image, paired with ``images`` by position.
    """
    split_folder = os.path.join(new_root_folder, split_name)
    os.makedirs(split_folder, exist_ok=True)

    print(f"Organising {split_name} data")
    pairs = list(zip(images, labels))

    # Create every class folder once up front instead of testing for it per image.
    for image_label in {label for _, label in pairs}:
        os.makedirs(os.path.join(split_folder, image_label), exist_ok=True)

    # Collect (source, destination) pairs for the images that still need copying.
    params = []
    for image_id, image_label in pairs:
        file_name = f"{image_id}.jpg"
        source = os.path.join(original_image_folder, file_name)
        destination = os.path.join(split_folder, image_label, file_name)
        if not os.path.exists(destination):
            params.append((source, destination))

    print("Copying...")
    with Pool(4) as p:
        # istarmap yields one result per finished copy, which drives the progress bar.
        for _ in tqdm(p.istarmap(copy, params),
                      total=len(params)):
            pass

# Build the class-folder layout for the training split first, then for the test split.
for _split_name, _split_images, _split_labels in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    organise_data(_split_name, _split_images, _split_labels)

Organising train data
Copying...


100%|██████████| 18598/18598 [46:12<00:00,  6.71it/s]


Organising test data
Copying...


 78%|███████▊  | 3927/5067 [13:35<01:14, 15.27it/s]