In [1]:
import os
import shutil
import random
import pandas as pd
from sklearn.model_selection import train_test_split


# Define paths
# Root of the HAM10000 data: the metadata file and an images/ folder are read
# from here, and the train/test split CSVs are written back here.
ham10000_path = "./data/HAM10000"
# Root of the DermNet data: train/ and test/ folders, each with one sub-folder per class.
dermnet_path = "./data/dermnet"
# Destination of the merged dataset, laid out as <split>/<numeric class index>/<image>.
output_path = "./data/training_data"

In [4]:
def files_in_directory(directory_path):
    """Return the names of the regular files located directly in `directory_path`.

    Sub-directories are left out; only bare entry names (not full paths) are
    returned, in whatever order `os.listdir` yields them.
    """
    file_names = []
    for entry_name in os.listdir(directory_path):
        if os.path.isfile(os.path.join(directory_path, entry_name)):
            file_names.append(entry_name)
    return file_names

# Dataset partitions processed throughout the notebook.
dsets = ["train", "test"]

# DermNet class names are the sub-directory names of the train split.
# os.listdir returns entries in arbitrary order and also reports plain files,
# so keep directories only and sort them: the class -> index mapping below must
# be identical on every run and every machine, because the numeric index is
# what names the output folders.
dermnet_train_dir = os.path.join(dermnet_path, "train")
train_classes = sorted(
    entry_name
    for entry_name in os.listdir(dermnet_train_dir)
    if os.path.isdir(os.path.join(dermnet_train_dir, entry_name))
)

# DermNet class name -> numeric class index (0 .. len(train_classes) - 1).
class_dict = {class_name: class_index for class_index, class_name in enumerate(train_classes)}


# for dset in dsets:
#     os.makedirs(output_path + '/' + dset, exist_ok=True)
#     for i in range(len(train_classes)):
#         og_pth = dermnet_path + '/' + dset + '/' + train_classes[i] + '/';
#         files = files_in_directory(og_pth)
#         os.makedirs(output_path + '/' + dset + '/' + str(i), exist_ok=True)
#         for file in files:
#             shutil.copy(og_pth + file, output_path + '/' + dset + '/' + str(i) + '/' + file)

In [5]:
# Get train/test split from DermNet
# split_ratios maps each DermNet class name to the fraction of its entries that
# live in the train split: train_count / (train_count + test_count).
split_ratios = {}
dermnet_train_dir = os.path.join(dermnet_path, "train")
dermnet_test_dir = os.path.join(dermnet_path, "test")
for class_name in sorted(os.listdir(dermnet_train_dir)):
    class_train_dir = os.path.join(dermnet_train_dir, class_name)
    if not os.path.isdir(class_train_dir):
        # A stray file next to the class folders is not a class.
        continue
    class_test_dir = os.path.join(dermnet_test_dir, class_name)
    train_count = len(os.listdir(class_train_dir))
    # A class without a test folder simply has zero test entries.
    test_count = len(os.listdir(class_test_dir)) if os.path.isdir(class_test_dir) else 0
    total_count = train_count + test_count
    if total_count == 0:
        # The ratio is undefined for an empty class; skip it instead of dividing by zero.
        continue
    split_ratios[class_name] = train_count / total_count

In [6]:
# Display the per-class train fraction (train_count / (train_count + test_count)).
split_ratios

{'Acne and Rosacea Photos': 0.7291666666666666,
 'Actinic Keratosis Basal Cell Carcinoma and other Malignant Lesions': 0.7995824634655533,
 'Atopic Dermatitis Photos': 0.7990196078431373,
 'Bullous Disease Photos': 0.7985739750445633,
 'Cellulitis Impetigo and other Bacterial Infections': 0.7977839335180056,
 'Eczema Photos': 0.7998704663212435,
 'Exanthems and Drug Eruptions': 0.8,
 'Hair Loss Photos Alopecia and other Hair Diseases': 0.7993311036789298,
 'Herpes HPV and other STDs Photos': 0.7988165680473372,
 'Light Diseases and Disorders of Pigmentation': 0.7988748241912799,
 'Lupus and other Connective Tissue diseases': 0.8,
 'Melanoma Skin Cancer Nevi and Moles': 0.7996545768566494,
 'Nail Fungus and other Nail Disease': 0.7993850883935434,
 'Poison Ivy Photos and other Contact Dermatitis': 0.8,
 'Psoriasis pictures Lichen Planus and related diseases': 0.7996585088218554,
 'Scabies Lyme Disease and other Infestations and Bites': 0.7996289424860853,
 'Seborrheic Keratoses and othe

In [2]:
# Load the HAM10000 metadata table (one row per image) into a DataFrame.
df = pd.read_csv(f"{ham10000_path}/HAM10000_metadata")

In [4]:
# Preview the first rows of the HAM10000 metadata.
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern


In [13]:
# Perform stratified split at the lesion level.
# HAM10000 holds several images of the same lesion (rows sharing a lesion_id).
# Splitting individual image rows would let one lesion appear in both train and
# test, leaking near-duplicate images and inflating test metrics. So the split
# is done over unique lesions (stratified by diagnosis) and then mapped back to
# image rows. The test share is therefore ~20% of lesions, not exactly 20% of images.
# Assumes every image of a lesion carries the same dx, so the first one is representative.
lesion_labels = df.groupby("lesion_id")["dx"].first()
train_lesions, test_lesions = train_test_split(
    lesion_labels.index.to_numpy(),
    test_size=0.2,
    stratify=lesion_labels.to_numpy(),
    random_state=42,
)
train_df = df[df["lesion_id"].isin(train_lesions)]
test_df = df[df["lesion_id"].isin(test_lesions)]

# Sanity checks: no lesion is shared between the splits and no row was lost.
assert set(train_df["lesion_id"]).isdisjoint(test_df["lesion_id"]), "lesion leaked across splits"
assert len(train_df) + len(test_df) == len(df), "rows lost during split (missing lesion_id?)"

# Save the results
train_df.to_csv(ham10000_path + "/train_dataset.csv", index=False)
test_df.to_csv(ham10000_path + "/test_dataset.csv", index=False)

In [16]:
# Distinct HAM10000 diagnosis codes, in order of first appearance in the metadata.
ham_classes = df["dx"].drop_duplicates().tolist()

In [23]:
# Human-readable diagnosis name for every HAM10000 `dx` code.
meaning_map = dict(
    akiec="Actinic keratoses and intraepithelial carcinoma / Bowen's disease",
    bcc="Basal cell carcinoma",
    bkl="Benign keratosis-like lesions (solar lentigines / seborrheic keratoses and lichen-planus like keratoses)",
    df="Dermatofibroma",
    mel="Melanoma",
    nv="Melanocytic nevi",
    vasc="Vascular lesions (angiomas, angiokeratomas, pyogenic granulomas and hemorrhage)",
)

# HAM10000 classes are numbered right after the DermNet classes, in this fixed order.
ham_codes_in_folder_order = ["bkl", "nv", "df", "mel", "vasc", "bcc", "akiec"]
first_ham_index = len(train_classes)

# dx code -> numeric class index (also used as the output folder name).
ham_folder_dict = {
    dx_code: first_ham_index + offset
    for offset, dx_code in enumerate(ham_codes_in_folder_order)
}

# Human-readable diagnosis name -> numeric class index.
ham_class_dict = {
    meaning_map[dx_code]: folder_index
    for dx_code, folder_index in ham_folder_dict.items()
}

In [29]:
# Copy every HAM10000 image into output_path/<split>/<class index>/, where the
# split comes from train_df / test_df and the class index from ham_folder_dict.
ham_images_dir = os.path.join(ham10000_path, "images")

# Explicit split-name -> DataFrame lookup; replaces eval(dset + "_df"), which
# executed a dynamically built string just to pick a variable.
split_frames = {"train": train_df, "test": test_df}

for dset in dsets:
    os.makedirs(os.path.join(output_path, dset), exist_ok=True)
    selected_df = split_frames[dset]
    for ham_class, folder_index in ham_folder_dict.items():
        class_dir = os.path.join(output_path, dset, str(folder_index))
        os.makedirs(class_dir, exist_ok=True)
        # image_id values are file stems; the image files carry a .jpg extension.
        image_ids = selected_df.loc[selected_df["dx"] == ham_class, "image_id"]
        for image_id in image_ids:
            file_name = image_id + ".jpg"
            shutil.copy(
                os.path.join(ham_images_dir, file_name),
                os.path.join(class_dir, file_name),
            )

In [30]:
# Merge the HAM10000 name -> index entries into the DermNet mapping (in place).
for class_name, class_index in ham_class_dict.items():
    class_dict[class_name] = class_index

In [32]:
import json

# Persist the combined class-name -> index mapping as pretty-printed JSON
# (4-space indentation) in the current working directory.
with open("class_mapping.json", "w") as mapping_file:
    mapping_file.write(json.dumps(class_dict, indent=4))