## Step 1: Preprocess data <br>
For consistency reasons and to be able to fuse our work together we should use the same test/training data for our different models


a) Import <br>
Labels are in a separate file, and the folder structure is confusing.


In [2]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from sklearn.model_selection import train_test_split

# Read the combined biomarker / clinical label table into a DataFrame
BiomarkerLabel_df = pd.read_csv('OLIVES_Dataset_Labels/Biomarker_Clinical_Data_Images.csv')

## Slice the table into the pieces used further down
# Biomarker column names sit at header positions 2..17
labels = BiomarkerLabel_df.columns[2:18]

# Features: the image path of every scan
df_features = BiomarkerLabel_df.loc[:, 'Path (Trial/Arm/Folder/Visit/Eye/Image Name)']
# Scan number column
df_scan = BiomarkerLabel_df.loc[:, 'Scan (n/49)']
# Targets: one column per biomarker
df_labels = BiomarkerLabel_df.loc[:, labels]
# Clinical data that accompanies every scan
df_clinicalData = BiomarkerLabel_df.loc[:, ['Eye_ID', 'BCVA', 'CST', 'Patient_ID']]



b) Train/Test split <br>
We want an equal split. Also split the clinical data together.

There are a lot of considerations that can be made here:
- Do we want to keep sets of scans for one eye together?
- What do we use to stratify? All biomarkers together create too many different combinations of biomarkers (one combination only exists once -> error)
    - Use the oversampled biomarkers? (Uncommon split won't be controlled, however we don't expect good results for these anyways)
    - Use the undersampled ones? (And let the common ones just be split randomly?) Create artificial data points to multiply the appearance of these
- Do we make a difference for the two initial datasets which were put together here?

<br>
How do we manage the imbalance?

- Overrepresented Classes (Large sample sizes):
    - [4] IR HRF (6341)
    - [6] Fully attached vitreous face (5222)
    - [11] Fluid (IRF) (4088)
    - [10] DRT/ME (3003)
    - [5] Partially attached vitreous face (2984)
    - [8] Vitreous debris (2836)
- Moderately Represented Classes:
    - [7] Preretinal tissue/hemorrhage (807)
    - [1] Disruption of EZ (604)
    - [3] IR hemorrhages (373)
- Rare Classes:
    - [12] Fluid (SRF) (233)
    - [0] Atrophy / thinning of retinal layers (166)
- Underrepresented Classes:
    - [15] SHRM (76)
    - [2] DRIL (32)
    - [9] VMT (10)
    - [13] Disruption of RPE (10)
    - [14] PED (serous) (10)




In [15]:
# Report how many entries are missing (NaN) in each biomarker column
nan_per_biomarker = df_labels.isna().sum()
print("NaN values: \n", nan_per_biomarker, "\n")
# Only a few values are missing, so they are simply replaced with 0
df_labels = df_labels.fillna(0)

# Distribution of the biomarker labels: column-wise sum, shown as integers
label_counts = df_labels.sum(axis=0).astype(int)
print("Label counts: \n", label_counts, "\n")


NaN values: 
 Atrophy / thinning of retinal layers    0
Disruption of EZ                        0
DRIL                                    0
IR hemorrhages                          0
IR HRF                                  0
Partially attached vitreous face        0
Fully attached vitreous face            0
Preretinal tissue/hemorrhage            0
Vitreous debris                         0
VMT                                     0
DRT/ME                                  0
Fluid (IRF)                             0
Fluid (SRF)                             0
Disruption of RPE                       0
PED (serous)                            0
SHRM                                    0
dtype: int64 

Label counts: 
 Atrophy / thinning of retinal layers     166
Disruption of EZ                         604
DRIL                                      32
IR hemorrhages                           373
IR HRF                                  6341
Partially attached vitreous face        2984
Fully attache

In [None]:
# OPTION 1:
# split by underrepresented classes

# Combine the data we want to split (image path + biomarkers + clinical data)
df_combined = pd.concat([df_features, df_labels, df_clinicalData], axis=1)

# Biomarkers used as stratifiers; in this example the underrepresented classes
underrepresented_classes = ['SHRM', 'DRIL', 'VMT', 'Disruption of RPE', 'PED (serous)']
# True for every scan that shows at least one of the rare biomarkers.
# Kept as a separate Series instead of a new df_labels column so the helper
# flag does not end up inside train_labels / test_label as a 17th "label".
has_underrepresented = df_labels[underrepresented_classes].sum(axis=1) > 0

## Split the data
# 80/20 Train/Test split
# Stratifying on the rare-class flag keeps the share of scans with a rare
# biomarker equal in both splits; it does not balance each biomarker on its own.
# train_test_split returns (train, test) per input array, in input order.
train_data, test_data, train_labels, test_label = train_test_split(
    df_combined, df_labels,
    test_size=0.2,
    random_state=0,
    stratify=has_underrepresented
)

In [17]:
# OPTION 2:
# Split on eye level by biomarker presence, so that all scans of one eye
# (both visits) end up in the same split.

# Collapse every Eye_ID group into one row using the column-wise maximum:
# if any scan of an eye has a biomarker, the eye is marked as having it
eye_groups = BiomarkerLabel_df.groupby('Eye_ID')
BiomarkerPresence_FullEye = eye_groups[labels].max()

# Summary column used for stratification: does the eye show any biomarker at all?
any_biomarker = BiomarkerPresence_FullEye.any(axis=1)
BiomarkerPresence_FullEye['Biomarker Presence'] = any_biomarker.astype(int)

# Number of eyes in which each biomarker occurs
per_eye_labels = BiomarkerPresence_FullEye[labels]
biomarker_counts = per_eye_labels.sum(axis=0).astype(int)
print("Biomarker presence for the full eye scan (from 96 eyes):\n", biomarker_counts)

# Persist the per-eye summary as its own CSV (Eye_ID index included)
BiomarkerPresence_FullEye.to_csv(
    'OLIVES_Dataset_Labels/BiomarkerPresence_FullEye.csv',
    index=True,
    float_format='%.0f',
)

# Stratified 80/20 split of the eyes on the summary column
train_eyes, test_eyes = train_test_split(
    BiomarkerPresence_FullEye,
    test_size=0.2,
    random_state=0,
    stratify=BiomarkerPresence_FullEye['Biomarker Presence'],
)

# Map the eye-level split back to scans: keep the rows whose Eye_ID is in each split
scan_eye_ids = BiomarkerLabel_df['Eye_ID']
train_data = BiomarkerLabel_df[scan_eye_ids.isin(train_eyes.index)]
test_data = BiomarkerLabel_df[scan_eye_ids.isin(test_eyes.index)]


Biomarker presence for the full eye scan (from 96 eyes):
 Atrophy / thinning of retinal layers    16
Disruption of EZ                        46
DRIL                                     4
IR hemorrhages                          51
IR HRF                                  96
Partially attached vitreous face        67
Fully attached vitreous face            75
Preretinal tissue/hemorrhage            34
Vitreous debris                         91
VMT                                      3
DRT/ME                                  72
Fluid (IRF)                             93
Fluid (SRF)                             27
Disruption of RPE                        5
PED (serous)                             1
SHRM                                    14
dtype: int64


Only run this to store the split in a file

In [12]:
# Write both halves of the split to .csv files (row index is not stored)
for split_frame, csv_path in (
    (train_data, 'OLIVES_Dataset_Labels/TEMP_BiomarkerLabel_train_data.csv'),
    (test_data, 'OLIVES_Dataset_Labels/TEMP_BiomarkerLabel_test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)


c) Create dataset & transformation <br>
Different transformations for testing and training

In [7]:
# Custom Dataset
class BiomarkerDataset(Dataset):
    """Multi-label biomarker dataset backed by a label CSV.

    The CSV is addressed by column position: column 0 holds the image path,
    columns 2..17 the 16 biomarker labels and columns 18..21 the clinical
    values Eye_ID, BCVA, CST and Patient_ID.

    Each item is a tuple ``(image, labels, clinical_data)``.
    """

    def __init__(self, label_file, transform=None, root_dir=None):
        """
        Args:
            label_file (str): Path to the CSV file.
            transform (callable, optional): Transform to be applied on a sample.
            root_dir (str, optional): Base directory the path column is relative
                to (e.g. the shared OLIVES dataset folder). With the default
                ``None`` the path column is used exactly as stored in the CSV.
        """
        self.data = pd.read_csv(label_file)
        self.transform = transform
        self.root_dir = root_dir

    def __len__(self):
        """Return the number of rows (scans) in the label file."""
        return len(self.data)

    def _resolve_path(self, idx):
        """Return the image path for row ``idx``, prefixed with ``root_dir`` if set."""
        stored_path = self.data.iloc[idx, 0]  # Path column
        if not self.root_dir:
            return stored_path
        # Leading separators are dropped because os.path.join would otherwise
        # discard root_dir when the stored path looks absolute.
        return os.path.join(self.root_dir, str(stored_path).lstrip("/\\"))

    def _load_image(self, img_path):
        """Open the image at ``img_path`` and convert it to 3-channel RGB."""
        return Image.open(img_path).convert("RGB")

    def __getitem__(self, idx):
        """Return ``(image, labels, clinical_data)`` for row ``idx``.

        Returns:
            image: The RGB image, passed through ``transform`` when one is set.
            labels (torch.Tensor): float32 tensor with the 16 biomarker labels.
            clinical_data (dict): Eye_ID, BCVA, CST and Patient_ID of the row.
        """
        # Load image
        image = self._load_image(self._resolve_path(idx))

        if self.transform is not None:
            # apply data transformations
            image = self.transform(image)

        # Get labels (multi-label). A row slice of a mixed-type frame can be of
        # object dtype, which torch cannot convert, so cast to float32 first.
        label_values = self.data.iloc[idx, 2:18].to_numpy(dtype="float32")  # Biomarker columns
        labels = torch.tensor(label_values, dtype=torch.float32)

        # Get extra clinical data
        clinical_data = {
            "Eye_ID": self.data.iloc[idx, 18],
            "BCVA": self.data.iloc[idx, 19],
            "CST": self.data.iloc[idx, 20],
            "Patient_ID": self.data.iloc[idx, 21],
        }

        return image, labels, clinical_data
    
    
# Define transformers

# Values for normalization taken from paper
# TODO: recompute mean/std on our own training split
mean = 0.1706
std = 0.2112

# train with data augmentation
trainEVA02_transformer = transforms.Compose([
    # Crop a random region covering 70% to 100% of the image area and resize it to
    # 448x448. RandomCrop expects a pixel size, so fractions need RandomResizedCrop
    # (which also varies the aspect ratio within its default 3/4..4/3 range).
    transforms.RandomResizedCrop(448, scale=(0.7, 1.0)),
    # transforms.RandomPerspective(distortion_scale=0.2, p=0.5, fill=0),  # Add perspective shift
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # Adjust color properties
    transforms.RandomRotation(degrees=10, fill=0),  # Rotates randomly between + and - degree and fills new pixels with black
    transforms.RandomHorizontalFlip(p=0.5),  # Random horizontal flip
    # Eva02 works with 448x448 pixels. An (h, w) tuple forces a square output;
    # Resize(448) would only fix the shorter edge and break batching.
    transforms.Resize((448, 448)),
    transforms.ToTensor(),  # Convert image to tensor
    transforms.Normalize([mean] * 3, [std] * 3)  # same statistics for each of the 3 RGB channels
])
# test without data augmentation
testEVA02_transformer = transforms.Compose([
    transforms.Resize(448),  # shorter edge -> 448, aspect ratio kept
    transforms.CenterCrop(448),  # trims the longer edge to 448; no-op for square images
    transforms.ToTensor(),
    transforms.Normalize([mean] * 3, [std] * 3)
])


# set up train loader for EVA02
trainEVA02_dataset = BiomarkerDataset(label_file='OLIVES_Dataset_Labels/BiomarkerLabel_train_data.csv', transform=trainEVA02_transformer)
trainloader_EVA02 = DataLoader(trainEVA02_dataset, batch_size=64, shuffle=True, num_workers=4, drop_last=True, pin_memory=True)

# set up test loader for EVA02
# Must read the held-out test split; loading the train CSV here would evaluate on training data.
testEVA02_dataset = BiomarkerDataset(label_file='OLIVES_Dataset_Labels/BiomarkerLabel_test_data.csv', transform=testEVA02_transformer)
testloader_EVA02 = DataLoader(testEVA02_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)
