In [1]:
# Import required libraries
import os
import re
import cv2
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Reading and Processing Functions

In [2]:
# Function to get all image files belonging to image_list in a given directory
def get_folder_files(folder_path, image_list):  

    files = [os.path.join(folder_path, f"{image}.jpg") for image in image_list]
    return files
                   
# Function to create and write images for each file path in given directories.
def create_and_write_img(file_paths, file_ids, file_targets, save_dir_0, save_dir_1, desc=None):
    
    # Iterate over each file_path and file_id pair using zip(file_paths, file_ids), while also displaying a progress bar using tqdm.
    for file_path, file_id, file_target, in tqdm(zip(file_paths, file_ids, file_targets), ascii=True, total=len(file_ids), desc=desc, leave=True):
        
        # Build the new file name
        new_name = file_id + ".jpg"

        # Build the image path
        if file_target == 0:
            dst_img_path = os.path.join(save_dir_0, new_name)
        else:
            dst_img_path = os.path.join(save_dir_1, new_name)
            
        # Copy the file from the original location to the destination directory
        shutil.copy(file_path, dst_img_path)
        
    return

def rsync_images(src_dirs, dest_dir):

    # Execute rsync for each source directory
    for src_dir in src_dirs:
        command = ['rsync', '-a', f'{src_dir}/', dest_dir]
        result = subprocess.run(command, capture_output=True, text=True)

        # Print output and errors for debugging
        print(result.stdout)
        if result.stderr:
            print(f"Error: {result.stderr}")
    return

# Function to count files in a directory
def count_files(target_dir):
    return len([name for name in os.listdir(directory) if os.path.isfile(os.path.join(directory, name))])

# Prepare Images for CNN Classification

In [3]:
# Read the dataset
ROOT_DATASET_DIR = "./"
file_name = os.path.join(ROOT_DATASET_DIR,"train-metadata-eda-fe-v3-kaggle.csv")
df = pd.read_csv(file_name)

In [4]:
# Define paths for training and test image directories
ORIG_IMG_DIR = os.path.join("..","isic-2024-challenge","train-image","image")
CASE_FOLDERS = os.listdir(ORIG_IMG_DIR)
ROOT_IMAGE_DIR =  os.path.join("..","images")
ROOT_TRAIN_DIR_0 = os.path.join(ROOT_IMAGE_DIR, "train", "0")
ROOT_TRAIN_DIR_1 = os.path.join(ROOT_IMAGE_DIR, "train", "1")
ROOT_TEST_DIR_0 = os.path.join(ROOT_IMAGE_DIR, "test", "0")
ROOT_TEST_DIR_1 = os.path.join(ROOT_IMAGE_DIR, "test", "1")

ROOT_CROSS_DIR_0 = os.path.join(ROOT_IMAGE_DIR, "crossval", "0")
ROOT_CROSS_DIR_1 = os.path.join(ROOT_IMAGE_DIR, "crossval", "1")

# Create directories if not already present
os.makedirs(ROOT_TRAIN_DIR_0, exist_ok=True)
os.makedirs(ROOT_TRAIN_DIR_1, exist_ok=True)
os.makedirs(ROOT_TEST_DIR_0, exist_ok=True)
os.makedirs(ROOT_TEST_DIR_1, exist_ok=True)
os.makedirs(ROOT_CROSS_DIR_0, exist_ok=True)
os.makedirs(ROOT_CROSS_DIR_1, exist_ok=True)

TARGET_SIZE = (128, 128) # assumed 128x128
TRAIN_TEST_SPLIT = 0.15

In [5]:
# Train-test split
X = df.drop(['target'], axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TRAIN_TEST_SPLIT, random_state=42, stratify=y)

In [6]:
# Process train images
ids_train = X_train.isic_id.to_list()
files_train = get_folder_files(folder_path=ORIG_IMG_DIR, image_list=ids_train)

create_and_write_img(file_paths=files_train,
                     file_ids=ids_train,
                     file_targets=y_train,                     
                     save_dir_0=ROOT_TRAIN_DIR_0,
                     save_dir_1=ROOT_TRAIN_DIR_1,
                     desc=f"Train :: {ORIG_IMG_DIR}")

Train :: ..\isic-2024-challenge\train-image\image: 100%|######################| 336007/336007 [08:50<00:00, 633.96it/s]


In [7]:
# Process test images
ids_test = X_test.isic_id.to_list()
files_test = get_folder_files(folder_path=ORIG_IMG_DIR, image_list=ids_test)

create_and_write_img(file_paths=files_test,
                     file_ids=ids_test,
                     file_targets=y_test,                     
                     save_dir_0=ROOT_TEST_DIR_0,
                     save_dir_1=ROOT_TEST_DIR_1,
                     desc=f"Train :: {ORIG_IMG_DIR}")

Train :: ..\isic-2024-challenge\train-image\image: 100%|########################| 59296/59296 [01:02<00:00, 954.09it/s]


In [6]:
# Process cross-validation images
ids_cross = X.isic_id.to_list()
files_cross = get_folder_files(folder_path=ORIG_IMG_DIR, image_list=ids_cross)

create_and_write_img(file_paths=files_cross,
                     file_ids=ids_cross,
                     file_targets=y,                     
                     save_dir_0=ROOT_CROSS_DIR_0,
                     save_dir_1=ROOT_CROSS_DIR_1,
                     desc=f"Crossval :: {ORIG_IMG_DIR}")

Crossval :: ..\isic-2024-challenge\train-image\image: 100%|##################| 395303/395303 [05:57<00:00, 1106.83it/s]


In [6]:
# Verification
src_dirs = [ROOT_TRAIN_DIR_0, ROOT_TRAIN_DIR_1, ROOT_TEST_DIR_0, ROOT_TEST_DIR_1, ROOT_CROSS_DIR_0, ROOT_CROSS_DIR_1]
#dest_dir = ROOT_ALL_DIR
#rsync_images(src_dirs, dest_dir)

# And check out that the copies to directores are successfull 
all_dirs = src_dirs.copy()
#all_dirs.append(dest_dir)
total = 0
subtotal = 0
for directory in all_dirs:
    file_count = count_files(directory)
    print(f"{directory} contains {file_count} files")
    if directory in src_dirs:
        subtotal = subtotal + file_count
    total = total + file_count
print(f"Total files in the train and test directories: {subtotal}")
print(f"Total files in {ROOT_IMAGE_DIR}: {total}")
print(f"Total cases in the dataframe: {df.shape[0]}")

..\images\train\0 contains 335673 files
..\images\train\1 contains 334 files
..\images\test\0 contains 59237 files
..\images\test\1 contains 59 files
..\images\crossval\0 contains 394910 files
..\images\crossval\1 contains 393 files
Total files in the train and test directories: 790606
Total files in ..\images: 790606
Total cases in the dataframe: 395303
