In [1]:
import pandas as pd

In [2]:
# Read the ground-truth metadata CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='HAM10000_metadata.csv')

In [3]:
# Print the leading five rows as a quick sanity check of the loaded columns
print(df.head(n=5))

     lesion_id      image_id   dx dx_type   age   sex localization
0  HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp
1  HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0  male        scalp
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0  male        scalp
4  HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear


In [4]:
import os
import shutil
#import imgaug.augmenters as iaa
from glob import glob
import cv2
import albumentations as A
from albumentations.core.composition import OneOf

In [5]:
from sklearn.model_selection import train_test_split
import json

In [6]:
# Source folders (raw images / segmentation masks) followed by the roots of
# the per-class output tree used by the cells below
image_src_dir, mask_src_dir = 'HAM10000_combined', 'HAM10000_segmentations'
image_dest_base_dir, mask_dest_base_dir = 'data/images', 'data/masks'

In [7]:
# Diagnosis code -> numeric class label; labels are 1-based and follow the
# order in which the codes are listed here
class_mapping = {
    dx_code: label
    for label, dx_code in enumerate(
        ('mel', 'nv', 'bcc', 'akiec', 'bkl', 'df', 'vasc'), start=1
    )
}

In [8]:
# Ensure every numeric class has a sub-folder under both output roots
# (image root first, then mask root); existing folders are left untouched
for label in class_mapping.values():
    for base_dir in (image_dest_base_dir, mask_dest_base_dir):
        os.makedirs(os.path.join(base_dir, str(label)), exist_ok=True)

In [9]:
# Copy every image and its segmentation mask into the folder of its class.
#
# The copy is guarded instead of commented out, so the cell is safe under
# Restart & Run All: pairs that are already in place are skipped, and pairs
# whose source image or mask cannot be found are skipped and counted rather
# than raising. Only complete image/mask pairs are copied.
copied_pairs = 0
already_present = 0
missing_source = 0

for img_name, dx_code in zip(df['image_id'], df['dx']):
    # A KeyError here means the metadata contains a diagnosis code that is
    # not listed in class_mapping.
    img_class = class_mapping[dx_code]

    img_src_path = os.path.join(image_src_dir, img_name + '.jpg')
    mask_src_path = os.path.join(mask_src_dir, img_name + '_segmentation.png')

    img_dest_path = os.path.join(image_dest_base_dir, str(img_class), img_name + '.jpg')
    mask_dest_path = os.path.join(mask_dest_base_dir, str(img_class), img_name + '_segmentation.png')

    src_dest_pairs = (
        (img_src_path, img_dest_path),
        (mask_src_path, mask_dest_path),
    )

    if all(os.path.exists(dest_path) for _src, dest_path in src_dest_pairs):
        already_present += 1
        continue

    if not all(os.path.isfile(src_path) for src_path, _dest in src_dest_pairs):
        missing_source += 1
        continue

    for src_path, dest_path in src_dest_pairs:
        if not os.path.exists(dest_path):
            shutil.copy(src_path, dest_path)
    copied_pairs += 1

print(
    f'{copied_pairs} pair(s) copied, {already_present} already in place, '
    f'{missing_source} skipped (source image or mask not found)'
)

In [10]:
def augment_images_and_masks(image_paths, mask_paths, target_count):
    """Write augmented image/mask pairs to disk until a class reaches ``target_count``.

    Source images are cycled through in sorted order. Each one is read
    together with its mask, passed through one random albumentations
    transform via ``transform(image=..., mask=...)``, and written next to
    its source as ``<id>_aug<count>.jpg`` and
    ``<id>_segmentation_aug<count>.png``.

    Args:
        image_paths: Paths of the ``.jpg`` images currently in the class
            folder. ``len(image_paths)`` is the starting value of the
            counter, so it decides how many new files are produced.
        mask_paths: Paths of the masks, named ``<id>_segmentation.png``.
            Order does not matter; masks are matched to images by ``<id>``.
        target_count: Counter value at which generation stops.

    Returns:
        None. The function is used for its side effect of writing files.

    Raises:
        OSError: If an image or mask cannot be read, or an augmented file
            cannot be written.
    """
    import warnings

    mask_suffix = '_segmentation.png'

    # Pair by image id rather than by list position: the two lists usually
    # come from independent glob() calls whose ordering is not guaranteed to
    # agree, and a positional pairing would attach the wrong mask.
    mask_by_id = {
        os.path.basename(path)[:-len(mask_suffix)]: path
        for path in mask_paths
        if path.endswith(mask_suffix)
    }
    pairs = []
    for img_path in sorted(image_paths):
        image_id = os.path.splitext(os.path.basename(img_path))[0]
        if image_id in mask_by_id:
            pairs.append((img_path, mask_by_id[image_id]))

    count = len(image_paths)
    if count >= target_count:
        return  # class is already at (or above) its target size
    if not pairs:
        # Previously this case ended in ZeroDivisionError / IndexError.
        warnings.warn(
            'augment_images_and_masks: no image/mask pairs to augment; nothing written'
        )
        return

    transform = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.Rotate(limit=20, p=0.5),
        A.GaussianBlur(blur_limit=(3, 7), p=0.5),
        A.OneOf([
            A.GaussNoise(var_limit=(10.0, 50.0), p=0.5),
            A.MultiplicativeNoise(multiplier=(0.9, 1.1), p=0.5),
        ], p=0.5)
    ])

    while count < target_count:
        img_path, mask_path = pairs[count % len(pairs)]

        # cv2.imread signals failure by returning None instead of raising.
        image = cv2.imread(img_path)
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            raise OSError(f'could not read image {img_path!r}')
        if mask is None:
            raise OSError(f'could not read mask {mask_path!r}')

        augmented = transform(image=image, mask=mask)

        # The running counter is embedded in both names so that an augmented
        # image and its mask can be matched up again later.
        aug_img_path = os.path.splitext(img_path)[0] + f'_aug{count}.jpg'
        aug_mask_path = mask_path[:-len(mask_suffix)] + f'_segmentation_aug{count}.png'

        # cv2.imwrite reports failure through its boolean return value.
        if not cv2.imwrite(aug_img_path, augmented['image']):
            raise OSError(f'could not write {aug_img_path!r}')
        if not cv2.imwrite(aug_mask_path, augmented['mask']):
            raise OSError(f'could not write {aug_mask_path!r}')

        count += 1

In [11]:
# Desired number of samples per numeric class label after augmentation
class_counts = dict([
    (1, 3000),    # mel
    (2, 10000),   # nv
    (3, 2000),    # bcc
    (4, 1500),    # akiec
    (5, 3000),    # bkl
    (6, 1000),    # df
    (7, 1000),    # vasc
])

In [12]:
# Top up every class to its target size with augmented samples.
for cls, target_count in class_counts.items():
    # glob() returns paths in arbitrary, filesystem-dependent order.
    # augment_images_and_masks receives two parallel lists, so both are
    # sorted to keep them in a matching, deterministic order.
    image_paths = sorted(glob(os.path.join(image_dest_base_dir, str(cls), '*.jpg')))
    mask_paths = sorted(glob(os.path.join(mask_dest_base_dir, str(cls), '*_segmentation.png')))

    augment_images_and_masks(image_paths, mask_paths, target_count)

In [13]:
# Start from an empty list of annotation records
annotations = list()

In [14]:
# Build one annotation record per mask on disk, augmented masks included.
#
# Original masks are named '<id>_segmentation.png' and augmented ones
# '<id>_segmentation_aug<N>.png'; the matching images are '<id>.jpg' and
# '<id>_aug<N>.jpg', i.e. the mask stem with the '_segmentation' marker
# removed. The previous pattern '*_segmentation.png' only matched the
# originals, so augmented samples never reached the annotations.
#
# The list is rebuilt from scratch so that re-running this cell does not
# append duplicate records.
annotations = []
for cls in range(1, 8):
    # sorted() makes the record order deterministic (glob order is arbitrary).
    mask_paths = sorted(glob(os.path.join(mask_dest_base_dir, str(cls), '*_segmentation*.png')))
    for mask_path in mask_paths:
        mask_stem = os.path.splitext(os.path.basename(mask_path))[0]
        img_name = mask_stem.replace('_segmentation', '', 1) + '.jpg'
        annotations.append({
            'image': img_name,
            'class': cls,
            'mask': mask_path
        })

In [15]:
# Persist the annotation records as a single JSON document
with open('annotations.json', 'w') as out_file:
    out_file.write(json.dumps(annotations))

In [16]:
# Read the annotation records back from disk
with open('annotations.json', 'r') as in_file:
    annotations = json.loads(in_file.read())

In [18]:
# Stratified train/val/test split: 20% is held out for test, then 20% of the
# remainder for validation (64% / 16% / 20% overall).
# A fixed seed makes the saved split files reproducible across runs; without
# it every execution produced a different partition.
# NOTE(review): records are split individually. The metadata preview shows
# several images sharing one lesion_id, and any augmented copies in
# `annotations` share a source image, so related samples can land in
# different splits — consider a group-aware split if that matters.
SPLIT_SEED = 42

train_val, test = train_test_split(
    annotations,
    test_size=0.2,
    stratify=[a['class'] for a in annotations],
    random_state=SPLIT_SEED,
)
train, val = train_test_split(
    train_val,
    test_size=0.2,
    stratify=[a['class'] for a in train_val],
    random_state=SPLIT_SEED,
)

In [19]:
# Write each split to its own JSON file (train, then val, then test)
for split_file, split_records in (
    ('train_annotations.json', train),
    ('val_annotations.json', val),
    ('test_annotations.json', test),
):
    with open(split_file, 'w') as f:
        json.dump(split_records, f)