In [1]:
import os
import random
import pandas as pd
import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

#### Let's try some experimentation here; I'll experiment with some class structures after reading Symposium

In [12]:
class ISIC2020Dataset(Dataset):
    """Triplet dataset built from a ground-truth CSV and a directory of JPEGs.

    The CSV is read with pandas and must provide an ``image_name`` column
    (file stem, without the ``.jpg`` extension) and a binary ``target``
    column (0 = benign, 1 = malignant).

    Every instance performs the same stratified, fixed-seed train/test split,
    so a ``mode='train'`` instance and a ``mode='test'`` instance created from
    the same CSV and ``split_ratio`` see complementary partitions.

    Args:
        csv_file: Path (or file-like object) accepted by ``pd.read_csv``.
        img_dir: Directory containing ``<image_name>.jpg`` files.
        transform: Optional callable applied to each loaded PIL image.
        mode: ``'train'`` selects the training partition and oversamples the
            malignant class; any other value selects the held-out partition.
        split_ratio: Fraction of rows assigned to the training partition.

    Each item is a tuple ``(anchor, positive, negative, anchor_label)`` where
    ``positive`` shares the anchor's label and ``negative`` has the other one.
    """

    def __init__(self, csv_file, img_dir, transform=None, mode='train', split_ratio=0.8):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform
        self.mode = mode

        # Split data into train and test
        self.train_data, self.test_data = self.train_test_split(split_ratio)

        if mode == 'train':
            # Benign samples vastly outnumber malignant ones, so replicate the
            # malignant rows until both classes are roughly the same size.
            self.data = self._oversample_malignant(self.train_data)
        else:
            self.data = self.test_data

        # Per-class pools used to draw positives / negatives for triplets.
        self.benign = self.data[self.data['target'] == 0]
        self.malignant = self.data[self.data['target'] == 1]

    @staticmethod
    def _oversample_malignant(data):
        """Return ``data`` with malignant rows replicated to approach the benign count.

        The malignant rows are appended ``benign // malignant - 1`` extra
        times. When there are no malignant rows, or the classes are already
        balanced (factor <= 0), ``data`` is returned unchanged instead of
        raising.
        """
        benign_count = int((data['target'] == 0).sum())
        malignant_samples = data[data['target'] == 1]
        if malignant_samples.empty:
            # Nothing to replicate; also avoids dividing by zero below.
            return data

        augment_factor = benign_count // len(malignant_samples) - 1
        if augment_factor <= 0:
            # pd.concat refuses an empty list, so skip replication entirely.
            return data

        replicated = pd.concat([malignant_samples] * augment_factor, ignore_index=True)
        return pd.concat([data, replicated], ignore_index=True)

    def __len__(self):
        """Number of rows (anchors) in the active partition."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build one triplet for the row at position ``idx``.

        The positive and negative are drawn at random from the per-class
        pools on every call (no fixed seed), so repeated calls with the same
        ``idx`` generally return different triplets.

        Returns:
            ``(anchor_img, positive, negative, anchor_label)``.

        Raises:
            ValueError: If the pool needed for the positive or the negative
                contains no rows.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        anchor_row = self.data.iloc[idx]
        anchor_img = self.get_image(anchor_row)
        anchor_label = anchor_row['target']

        if anchor_label == 0:  # benign
            positive = self._sample_image(self.benign, 'benign')
            negative = self._sample_image(self.malignant, 'malignant')
        else:  # malignant
            positive = self._sample_image(self.malignant, 'malignant')
            negative = self._sample_image(self.benign, 'benign')

        return anchor_img, positive, negative, anchor_label

    def _sample_image(self, pool, label_name):
        """Load the image of one random row from ``pool``; fail clearly if it is empty."""
        if pool.empty:
            raise ValueError(
                f"Cannot build a triplet: no {label_name} samples in the '{self.mode}' split"
            )
        return self.get_image(pool.sample().iloc[0])

    def get_image(self, row):
        """Load, optionally augment, and transform the image described by ``row``.

        Args:
            row: A row with ``image_name`` and ``target`` entries.

        Returns:
            The output of ``self.transform`` if one was given, otherwise the
            RGB PIL image.
        """
        img_name = row['image_name']
        img_path = os.path.join(self.img_dir, img_name + '.jpg')
        # Image.open is lazy and keeps the file open; convert() yields an
        # independent in-memory copy, so the handle can be released right away.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        # Apply additional random augmentation for malignant samples in training mode.
        # This runs BEFORE self.transform: torchvision's colour ops on float
        # tensors assume values in [0, 1], which no longer holds once a
        # Normalize step has been applied.
        if self.mode == 'train' and row['target'] == 1:
            image = self.random_augment(image)

        if self.transform:
            image = self.transform(image)

        return image

    def random_augment(self, image):
        """Apply a random subset of flip / rotation / colour-jitter transforms.

        Each candidate is independently selected with probability 0.5; the
        flips additionally carry their own ``p=0.5``.
        """
        # Define a set of possible augmentations
        augmentations = [
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomVerticalFlip(p=0.5),
            transforms.RandomRotation(20),
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        ]
        # Randomly apply some of these augmentations
        for aug in augmentations:
            if random.random() > 0.5:
                image = aug(image)
        return image

    def train_test_split(self, split_ratio):
        """Stratified split of ``self.data`` into ``(train_data, test_data)``.

        The seed is fixed so that separately constructed train and test
        instances agree on which rows belong to which partition.
        """
        train_data, test_data = train_test_split(
            self.data,
            test_size=1 - split_ratio,
            stratify=self.data['target'],
            random_state=42
        )
        return train_data, test_data

#### There is a problem with class imbalance; maybe I'll try some transforms and replicating images

In [14]:
# Preprocessing shared by both splits: resize to a standard 224x224 size,
# convert to a tensor, then normalise each channel with ImageNet statistics.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

_preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
]
base_transform = transforms.Compose(_preprocessing_steps)

In [16]:
# Ground-truth labels and image folder for the ISIC 2020 training release.
csv_file = 'ISIC_2020_Training_GroundTruth_v2.csv'
img_dir = 'data/ISIC_2020_Training_JPEG/train/'

# Both datasets read the same CSV; the class selects its partition from `mode`.
train_dataset, test_dataset = (
    ISIC2020Dataset(csv_file, img_dir, transform=base_transform, mode=split_name)
    for split_name in ('train', 'test')
)

In [17]:
# Initial batch size; revisit once we see how training behaves.
BATCH_SIZE = 32

_loader_kwargs = {'batch_size': BATCH_SIZE}
# Shuffle only the training data; keep the evaluation order fixed.
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

In [18]:
# Sanity check: overall and per-class sizes of both splits, one per line.
print(
    f"Number of training samples: {len(train_dataset)}",
    f"Number of testing samples: {len(test_dataset)}",
    f"Number of benign samples in training: {len(train_dataset.benign)}",
    f"Number of malignant samples in training: {len(train_dataset.malignant)}",
    f"Number of benign samples in testing: {len(test_dataset.benign)}",
    f"Number of malignant samples in testing: {len(test_dataset.malignant)}",
    sep="\n",
)

Number of training samples: 51718
Number of testing samples: 6626
Number of benign samples in training: 26033
Number of malignant samples in training: 25685
Number of benign samples in testing: 6509
Number of malignant samples in testing: 117


### Still class imbalance in testing set
A possible method to rectify this is to use the 2019 training set, which is labelled with malignant cases; however, there are still under 1000 malignant samples in the 2019 dataset. Let's experiment with what I have now and see the results.