In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import shutil

In [2]:
# Read the HAM10000 metadata CSV from the working directory into a DataFrame
metadata = pd.read_csv(filepath_or_buffer='HAM10000_metadata.csv')

In [3]:
# Build the data/<subset>/<kind> folder tree for the train and validation sets.
# exist_ok=True keeps the cell re-runnable when the folders are already there.
for _subset in ('train', 'val'):
    for _kind in ('images', 'labels'):
        os.makedirs(f'data/{_subset}/{_kind}', exist_ok=True)

In [4]:
# Split the dataset into train and validation sets (80/20, stratified on 'dx').
# A fixed random_state makes the split reproducible: without it every re-run
# draws a different partition, and because the copy step below never clears
# data/train or data/val, a re-run could leave the same image in both folders.
# NOTE(review): if several rows can belong to the same lesion, a per-row split
# may still place one lesion in both sets - confirm whether a grouped split is needed.
train_df, val_df = train_test_split(
    metadata,
    test_size=0.2,
    stratify=metadata['dx'],
    random_state=42,
)

In [15]:
# Function to copy images and masks to respective directories
def copy_files(df, subset):
    """Copy each row's image and segmentation mask into ``data/<subset>/``.

    For every ``image_id`` in ``df`` the image is read from
    ``HAM10000_combined/<image_id>.jpg`` and the mask from
    ``HAM10000_segmentations/<image_id>_segmentation.png``. A pair is copied
    only when both files exist; otherwise the row is skipped.

    Args:
        df: DataFrame with an ``image_id`` column.
        subset: Name of the target folder under ``data/`` (e.g. ``'train'``).

    Returns:
        None. Files are written to ``data/<subset>/images`` and
        ``data/<subset>/labels``; a one-line summary is printed if any rows
        were skipped.
    """
    images_dir = f'data/{subset}/images'
    labels_dir = f'data/{subset}/labels'
    # Create the destinations here so the function does not depend on another
    # cell having been run first.
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)

    skipped = []
    # Only the image_id column is needed, so iterate it directly rather than
    # materialising a Series per row with iterrows().
    for image_id in df['image_id']:
        image_file = f'HAM10000_combined/{image_id}.jpg'
        mask_file = f'HAM10000_segmentations/{image_id}_segmentation.png'
        if not (os.path.exists(image_file) and os.path.exists(mask_file)):
            skipped.append(image_id)
            continue
        shutil.copy(image_file, f'{images_dir}/{image_id}.jpg')
        shutil.copy(mask_file, f'{labels_dir}/{image_id}_segmentation.png')

    # Surface incomplete pairs instead of dropping them silently.
    if skipped:
        print(
            f"copy_files({subset!r}): skipped {len(skipped)} of {len(df)} "
            f"rows with a missing image or mask"
        )

In [16]:
# Populate the train and validation folders from their respective splits
for _subset, _split in (('train', train_df), ('val', val_df)):
    copy_files(_split, _subset)

In [17]:
import albumentations as A
from albumentations.pytorch import ToTensorV2
from PIL import Image
import numpy as np

In [18]:
# Define augmentations
def _build_transform():
    """Assemble the albumentations pipeline used for image/mask pairs.

    Stages are composed in the order they are listed in the returned
    ``A.Compose``; each ``A.OneOf`` group carries its own probability.
    """
    # Flips and 90-degree rotations, each with the library's default probability.
    orientation = [A.RandomRotate90(), A.Flip(), A.Transpose()]

    noise = A.OneOf([A.GaussNoise()], p=0.2)

    blur = A.OneOf(
        [
            A.MotionBlur(p=0.2),
            A.MedianBlur(blur_limit=3, p=0.1),
            A.Blur(blur_limit=3, p=0.1),
        ],
        p=0.2,
    )

    shift_scale_rotate = A.ShiftScaleRotate(
        shift_limit=0.0625, scale_limit=0.2, rotate_limit=20, p=0.2
    )

    distortion = A.OneOf(
        [
            A.OpticalDistortion(p=0.3),
            A.GridDistortion(p=0.1),
            A.PiecewiseAffine(p=0.3),
        ],
        p=0.2,
    )

    contrast = A.OneOf(
        [
            A.CLAHE(clip_limit=2),
            A.Sharpen(),
            A.Emboss(),
            A.RandomBrightnessContrast(),
        ],
        p=0.3,
    )

    colour = A.HueSaturationValue(p=0.3)

    # NOTE(review): ToTensorV2 presumably makes the pipeline return torch
    # tensors rather than NumPy arrays - confirm that every consumer of
    # `transform` expects that.
    return A.Compose(
        [
            *orientation,
            noise,
            blur,
            shift_scale_rotate,
            distortion,
            contrast,
            colour,
            ToTensorV2(),
        ]
    )


transform = _build_transform()

  original_init(self, **validated_kwargs)


In [23]:
# Apply augmentations and save augmented images
def augment_and_save(image_path, mask_path, save_dir, transform):
    """Augment one image/mask pair and save the result under ``save_dir``.

    The augmented image is written to ``<save_dir>/images/<image name>`` and
    the augmented mask to ``<save_dir>/labels/<mask name>``, reusing the base
    names of the inputs. When ``save_dir`` is the directory that already holds
    the inputs, the outputs therefore replace the source files.

    Args:
        image_path: Path of the image to augment.
        mask_path: Path of the matching segmentation mask.
        save_dir: Directory containing the ``images`` and ``labels`` folders.
        transform: Callable invoked as ``transform(image=..., mask=...)`` that
            returns a mapping with ``'image'`` and ``'mask'`` entries.

    Returns:
        None. Prints a message and does nothing if either input is missing.
    """
    if not (os.path.exists(image_path) and os.path.exists(mask_path)):
        print(f"File not found: {image_path} or {mask_path}")
        return

    def _pil_ready(data, channels_first):
        # Image.fromarray only accepts array-like H x W (x C) input. If the
        # pipeline ends in a tensor conversion, bring the result back to NumPy.
        if isinstance(data, np.ndarray):
            return data
        if hasattr(data, 'detach'):
            data = data.detach().cpu()
        data = np.asarray(data)
        # Assumes tensor images are channels-first (C, H, W) and tensor masks
        # keep their original layout - TODO confirm for custom transforms.
        if channels_first and data.ndim == 3:
            data = np.transpose(data, (1, 2, 0))
        if data.ndim == 3 and data.shape[-1] == 1:
            data = data[..., 0]  # PIL cannot build an image from (H, W, 1)
        return data

    # Context managers release the file handles once the pixels are loaded.
    with Image.open(image_path) as source_image, Image.open(mask_path) as source_mask:
        image = np.array(source_image)
        mask = np.array(source_mask)

    augmented = transform(image=image, mask=mask)
    aug_image = Image.fromarray(_pil_ready(augmented['image'], channels_first=True))
    aug_mask = Image.fromarray(_pil_ready(augmented['mask'], channels_first=False))
    aug_image.save(os.path.join(save_dir, 'images', os.path.basename(image_path)))
    aug_mask.save(os.path.join(save_dir, 'labels', os.path.basename(mask_path)))

In [None]:
# Run the augmentation pipeline over every image/mask pair of the train split
for _image_id in train_df['image_id']:
    augment_and_save(
        f'data/train/images/{_image_id}.jpg',
        f'data/train/labels/{_image_id}_segmentation.png',
        'data/train',
        transform,
    )

In [25]:
def augment_and_save(image_path, mask_path, save_dir, transform):
    """Augment one image/mask pair and save both as ``augmented_<name>``.

    Both outputs are written directly into ``save_dir`` with the prefix
    ``augmented_`` in front of the input base names, so the source files are
    left untouched.

    Args:
        image_path: Path of the image to augment.
        mask_path: Path of the matching segmentation mask.
        save_dir: Existing directory that receives both output files.
        transform: Callable invoked as ``transform(image=..., mask=...)``. It
            may return a mapping with ``'image'`` and ``'mask'`` entries or an
            ``(image, mask)`` pair.

    Returns:
        None. Prints a message and does nothing if either input is missing.
    """
    # Skip incomplete pairs with a message so one missing file does not abort
    # a loop over the whole split.
    if not (os.path.exists(image_path) and os.path.exists(mask_path)):
        print(f"File not found: {image_path} or {mask_path}")
        return

    def _pil_ready(data, channels_first):
        # Image.fromarray only accepts array-like H x W (x C) input. If the
        # pipeline ends in a tensor conversion, bring the result back to NumPy.
        if isinstance(data, np.ndarray):
            return data
        if hasattr(data, 'detach'):
            data = data.detach().cpu()
        data = np.asarray(data)
        # Assumes tensor images are channels-first (C, H, W) and tensor masks
        # keep their original layout - TODO confirm for custom transforms.
        if channels_first and data.ndim == 3:
            data = np.transpose(data, (1, 2, 0))
        if data.ndim == 3 and data.shape[-1] == 1:
            data = data[..., 0]  # PIL cannot build an image from (H, W, 1)
        return data

    # Load image and mask as NumPy arrays; the context managers release the
    # file handles once the pixels are read.
    with Image.open(image_path) as image, Image.open(mask_path) as mask:
        image_np = np.array(image)
        mask_np = np.array(mask)

    # Pass the targets by name: a positional call is rejected by albumentations
    # pipelines, and their result is a mapping rather than a 2-tuple.
    augmented = transform(image=image_np, mask=mask_np)
    if isinstance(augmented, dict):
        augmented_image, augmented_mask = augmented['image'], augmented['mask']
    else:
        augmented_image, augmented_mask = augmented

    # Convert back to PIL images
    augmented_image = Image.fromarray(_pil_ready(augmented_image, channels_first=True))
    augmented_mask = Image.fromarray(_pil_ready(augmented_mask, channels_first=False))

    # Save augmented images and masks
    augmented_image.save(f'{save_dir}/augmented_{os.path.basename(image_path)}')
    augmented_mask.save(f'{save_dir}/augmented_{os.path.basename(mask_path)}')