In [1]:
import io
import time

import h5py
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.functional as F
import torch.nn as nn
import torch.optim as optim
import torchvision
from einops import rearrange
from PIL import Image
from pytorch_lightning.callbacks import (Callback, EarlyStopping,
                                         LearningRateMonitor, ModelCheckpoint,
                                         ProgressBar)
from torchmetrics import Precision, Recall, Specificity
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.decomposition import KernelPCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from torch.amp import GradScaler, autocast
from torch.utils.data import DataLoader, Dataset, Sampler, Subset
from torchmetrics import Metric
from torchvision.transforms import transforms

torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

pl.seed_everything(42, workers=True)

Seed set to 42


42

In [2]:
"""
2024 ISIC Challenge primary prize scoring metric

Given a list of binary labels, an associated list of prediction 
scores ranging from [0,1], this function produces, as a single value, 
the partial area under the receiver operating characteristic (pAUC) 
above a given true positive rate (TPR).
https://en.wikipedia.org/wiki/Partial_Area_Under_the_ROC_Curve.

(c) 2024 Nicholas R Kurtansky, MSKCC
"""

from collections import Counter

import numpy as np
import pandas as pd
import pandas.api.types
from sklearn.metrics import auc, roc_auc_score, roc_curve


class PartialAUROC(Metric):
    def __init__(
        self,
        min_tpr: float = 0.80,
        dist_sync_on_step: bool = False,
    ):
        super().__init__(dist_sync_on_step=dist_sync_on_step)
        self.min_tpr = min_tpr
        self.add_state("preds", default=[], dist_reduce_fx="cat")
        self.add_state("target", default=[], dist_reduce_fx="cat")

    def update(self, preds: torch.Tensor, target: torch.Tensor):
        self.preds.append(preds)
        self.target.append(target)

    def compute(self):
        preds = torch.cat(self.preds)
        target = torch.cat(self.target)
        return self._partial_auroc(target, preds, self.min_tpr)

    def _partial_auroc(
        self, y_true: torch.Tensor, y_score: torch.Tensor, min_tpr: float
    ) -> float:
        y_true = torch.abs(y_true - 1)
        y_score = -y_score

        fpr, tpr, _ = self._roc_curve(y_true, y_score)
        max_fpr = 1.0 - min_tpr

        # print(f"Computed FPR: {fpr}")
        # print(f"Computed TPR: {tpr}")

        if max_fpr == 1:
            return self._auc(fpr, tpr)
        if max_fpr <= 0 or max_fpr > 1:
            raise ValueError(f"Expected min_tpr in range [0, 1), got: {min_tpr}")

        stop = torch.searchsorted(fpr, torch.tensor(max_fpr), right=True)
        x_interp = fpr[stop - 1 : stop + 1]
        y_interp = tpr[stop - 1 : stop + 1]

        # print(f"x_interp: {x_interp}")
        # print(f"y_interp: {y_interp}")

        if len(x_interp) == 1:
            interp_tpr = y_interp[0]
        else:
            interp_tpr = y_interp[0] + (max_fpr - x_interp[0]) * (
                y_interp[1] - y_interp[0]
            ) / (x_interp[1] - x_interp[0])

        tpr = torch.cat([tpr[:stop], torch.tensor([interp_tpr])])
        fpr = torch.cat([fpr[:stop], torch.tensor([max_fpr])])

        partial_auc = self._auc(fpr, tpr)
        return partial_auc

    def _roc_curve(self, y_true: torch.Tensor, y_score: torch.Tensor):
        desc_score_indices = torch.argsort(y_score, descending=True)
        y_score = y_score[desc_score_indices]
        y_true = y_true[desc_score_indices]

        distinct_value_indices = torch.where(torch.diff(y_score))[0]
        threshold_idxs = torch.cat(
            [distinct_value_indices, torch.tensor([y_true.numel() - 1])]
        )

        tps = torch.cumsum(y_true, dim=0)[threshold_idxs]
        fps = 1 + threshold_idxs - tps
        
        # Handle the case where there are no positive samples
        if tps[-1] == 0:
            tpr = torch.zeros_like(tps)
        else:
            tpr = tps / tps[-1]
        
        fpr = fps / fps[-1]
        thresholds = y_score[threshold_idxs]

        # print(f"tps: {tps}")
        # print(f"fps: {fps}")
        # print(f"tpr: {tpr}")
        # print(f"fpr: {fpr}")
        # print(f"thresholds: {thresholds}")

        return fpr, tpr, thresholds

    def _auc(self, x: torch.Tensor, y: torch.Tensor) -> float:
        if torch.all(y == 0):
            print("Warning: All TPR values are zero. AUC is undefined.")
            return 0.0

        direction = 1
        dx = torch.diff(x)
        if torch.any(dx < 0):
            if torch.all(dx <= 0):
                direction = -1
            else:
                raise ValueError("x is neither increasing nor decreasing")
        auc_value = direction * torch.trapz(y, x).item()
        # print(f"Computed AUC: {auc_value}")
        return auc_value

In [3]:
print(torch.__version__)
print(torchvision.__version__)
print(pl.__version__)

2.4.0+cu121
0.19.0+cu121
2.4.0


In [4]:
class InvertedResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, expand_ratio, stride):
        super(InvertedResidualBlock, self).__init__()
        hidden_dim = in_channels * expand_ratio
        self.use_res_connect = stride == 1 and in_channels == out_channels

        layers = []
        if expand_ratio != 1:
            layers.append(ConvBNActivation(in_channels, hidden_dim, kernel_size=1))
        layers.extend(
            [
                ConvBNActivation(
                    hidden_dim, hidden_dim, stride=stride, groups=hidden_dim
                ),
                nn.Conv2d(hidden_dim, out_channels, 1, bias=True),
                nn.BatchNorm2d(out_channels),
            ]
        )
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)


class ConvBNActivation(nn.Sequential):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, groups=1):
        padding = (kernel_size - 1) // 2
        super(ConvBNActivation, self).__init__(
            nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size,
                stride,
                padding,
                groups=groups,
                bias=True,
            ),
            nn.BatchNorm2d(out_channels),
            nn.Mish(),
        )


class DenseBlock(nn.Module):
    def __init__(self, in_channels, num_layers, growth_rate, dropout_rate=0.2):
        super(DenseBlock, self).__init__()
        self.layers = nn.ModuleList(
            [
                DenseLayer(in_channels + i * growth_rate, growth_rate, dropout_rate)
                for i in range(num_layers)
            ]
        )

    def forward(self, x):
        for layer in self.layers:
            x = torch.cat([x, layer(x)], 1)
        return x


class DenseLayer(nn.Sequential):
    def __init__(self, in_channels, growth_rate, dropout_rate):
        super(DenseLayer, self).__init__(
            nn.BatchNorm2d(in_channels),
            nn.Mish(),
            nn.Conv2d(in_channels, 4 * growth_rate, 1, bias=True),
            nn.BatchNorm2d(4 * growth_rate),
            nn.Mish(),
            nn.Conv2d(4 * growth_rate, growth_rate, 3, padding=1, bias=True),
            nn.Dropout2d(dropout_rate),
        )


class TransitionLayer(nn.Sequential):
    def __init__(self, in_channels, compression_factor=0.5):
        out_channels = int(in_channels * compression_factor)
        super(TransitionLayer, self).__init__(
            nn.BatchNorm2d(in_channels),
            nn.Mish(),
            nn.Conv2d(in_channels, out_channels, 1, bias=True),
            nn.AvgPool2d(2, stride=2),
        )


class AttentionBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(AttentionBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, 1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(in_channels, out_channels, 1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, 1, 1)
        self.bn3 = nn.BatchNorm2d(1)

    def forward(self, x):
        g = self.bn1(self.conv1(x))
        x = self.bn2(self.conv2(x))
        att = nn.Hardswish()(g + x)
        att = nn.Sigmoid()(self.bn3(self.conv3(att)))
        return x * att


class InceptionBlock(nn.Module):
    def __init__(self, in_channels, filters):
        super(InceptionBlock, self).__init__()
        f1, f2, f3 = filters
        self.branch1 = ConvBNActivation(in_channels, f1, kernel_size=1)
        self.branch2 = nn.Sequential(
            ConvBNActivation(in_channels, f2[0], kernel_size=1),
            ConvBNActivation(f2[0], f2[1], kernel_size=3),
        )
        self.branch3 = nn.Sequential(
            ConvBNActivation(in_channels, f3[0], kernel_size=1),
            ConvBNActivation(f3[0], f3[1], kernel_size=5),
        )
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            ConvBNActivation(in_channels, f1, kernel_size=1),
        )

    def forward(self, x):
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        branch3 = self.branch3(x)
        branch4 = self.branch4(x)
        return torch.cat([branch1, branch2, branch3, branch4], 1)


class GatedResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, strides):
        super(GatedResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=strides,
            padding=kernel_size // 2,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels, 1)
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.activation = nn.Mish()

        # Add a shortcut connection if input and output dimensions don't match
        self.shortcut = nn.Sequential()
        if strides != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(
                    in_channels, out_channels, kernel_size=1, stride=strides, bias=True
                ),
                nn.BatchNorm2d(out_channels),
            )

    def forward(self, x):
        residual = self.shortcut(x)

        x = self.activation(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        gate = nn.Sigmoid()(self.bn3(self.conv3(x)))
        x = x * gate
        x += residual
        return self.activation(x)


class GuruNet(pl.LightningModule):
    def __init__(
        self,
        input_shape=(139, 139, 3),
        metadata_shape=None,
        classes=2,
    ):
        super(GuruNet, self).__init__()
        self.input_shape = input_shape
        self.metadata_shape = metadata_shape

        # Initial convolutional layer
        self.conv1 = nn.Conv2d(3, 256, kernel_size=5, stride=2, padding=1)
        self.bn1 = nn.BatchNorm2d(256)
        self.activation = nn.Hardswish()

        # Inverted Residual Blocks
        self.inv_res_blocks = nn.ModuleList()
        block_params = [
            # expand_ratio, filters, strides, repeats
            (6, 16, 1, 1),
            (6, 24, 2, 2),
            (6, 40, 2, 2),
            (6, 80, 2, 3),
            (6, 112, 1, 3),
            (6, 128, 2, 4),
            (6, 196, 1, 1),
        ]

        in_channels = 256
        for i, (expand_ratio, filters, strides, repeats) in enumerate(block_params):
            for j in range(repeats):
                if j > 0:
                    strides = 1
                self.inv_res_blocks.append(
                    InvertedResidualBlock(in_channels, filters, expand_ratio, strides)
                )
                in_channels = filters

        # Dense Block
        self.dense_block = DenseBlock(in_channels, num_layers=32, growth_rate=64)
        in_channels += 64 * 32  # Update in_channels after dense block

        # Transition Layer
        self.transition = TransitionLayer(in_channels, compression_factor=0.5)
        in_channels = int(in_channels * 0.5)

        # Attention Block
        self.attention = AttentionBlock(in_channels, 1024)
        in_channels = 1024

        # Average Pooling
        self.avg_pool = nn.AvgPool2d(kernel_size=3, stride=1, padding=1)

        # Inception Block
        self.inception = InceptionBlock(in_channels, [128, (128, 192), (32, 96)])
        in_channels = 128 + 192 + 96 + 128

        self.inception2 = InceptionBlock(in_channels, [128, (128, 192), (32, 96)])
        in_channels = 128 + 192 + 96 + 128

        # Attention Block
        self.attention2 = AttentionBlock(in_channels, 512)
        in_channels = 512

        # Gated Residual Block
        self.gated_res = GatedResidualBlock(in_channels, 512, kernel_size=3, strides=2)
        in_channels = 512

        # Attention Block
        self.attention3 = AttentionBlock(in_channels, 256)
        in_channels = 256

        # Global Average Pooling
        self.global_avg_pool = nn.AdaptiveAvgPool2d(1)
        self.flatten = nn.Flatten()

        # Fully connected layers
        self.fc1 = nn.Linear(in_channels, 4096)
        self.bn_fc1 = nn.BatchNorm1d(4096)
        self.fc2 = nn.Linear(4096, 2048)
        self.bn_fc2 = nn.BatchNorm1d(2048)
        self.fc3 = nn.Linear(2048, 1024)
        self.bn_fc3 = nn.BatchNorm1d(1024)
        self.fc4 = nn.Linear(1024, 256)
        self.bn_fc4 = nn.BatchNorm1d(256)
        self.dropout = nn.Dropout(0.5)

        self.metadata_fc1 = nn.Linear(41, 4096)
        self.metadata_bn1 = nn.BatchNorm1d(4096)
        self.metadata_fc2 = nn.Linear(4096, 1024)
        self.metadata_bn2 = nn.BatchNorm1d(1024)
        self.metadata_fc3 = nn.Linear(1024, 512)
        self.metadata_bn3 = nn.BatchNorm1d(512)
        self.metadata_fc4 = nn.Linear(512, 256)
        self.metadata_bn4 = nn.BatchNorm1d(256)
        self.final_fc = nn.Linear(256 + 256, classes)
        self.final_activation = nn.Sigmoid()
        self.scaler = GradScaler()
        self.loss = self.loss = nn.CrossEntropyLoss()
        self.auroc = PartialAUROC(min_tpr=0.8)

    def forward(self, x, metadata):
        x = self.activation(self.bn1(self.conv1(x)))

        # Inverted Residual Blocks
        for block in self.inv_res_blocks:
            x = block(x)

        # Dense Block
        x = self.dense_block(x)

        # Transition Layer
        x = self.transition(x)

        # Attention Block
        x = self.attention(x)

        # Average Pooling
        x = self.avg_pool(x)

        # Inception Block
        x = self.inception(x)
        x = self.inception2(x)

        # Attention Block
        x = self.attention2(x)

        # Gated Residual Block
        x = self.gated_res(x)

        # Attention Block
        x = self.attention3(x)

        x = self.global_avg_pool(x)
        x = self.flatten(x)
        x = self.activation(self.bn_fc1(self.fc1(x)))
        x = self.dropout(x)
        x = self.activation(self.bn_fc2(self.fc2(x)))
        x = self.dropout(x)
        x = self.activation(self.bn_fc3(self.fc3(x)))
        x = self.dropout(x)
        x = self.activation(self.bn_fc4(self.fc4(x)))

        metadata = self.activation(self.metadata_bn1(self.metadata_fc1(metadata)))
        metadata = self.dropout(metadata)
        metadata = self.activation(self.metadata_bn2(self.metadata_fc2(metadata)))
        metadata = self.dropout(metadata)
        metadata = self.activation(self.metadata_bn3(self.metadata_fc3(metadata)))
        metadata = self.dropout(metadata)
        metadata = self.activation(self.metadata_bn4(self.metadata_fc4(metadata)))

        x = torch.cat([x, metadata], dim=1)

        x = self.final_fc(x)
        # Apply sigmoid to ensure output is between 0 and 1
        x = self.final_activation(x)

        return x

    def training_step(self, batch, batch_idx):
        (images, metadata), targets = batch
        outputs = self(images, metadata)
        loss = self.loss(outputs, targets)  # targets is already one-hot encoded
        # Get the probability of the positive class
        pos_probs = outputs[:, 1].float().cpu()

        # Convert one-hot encoded targets to binary labels
        targets_binary = targets[:, 1].int().cpu()
        rocauc = self.auroc(pos_probs, targets_binary)  # Use class 1 probability

        self.log(
            "train_loss",
            loss,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
        )
        self.log(
            "train_pAUC",
            rocauc,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
        )
        return loss

    def validation_step(self, batch, batch_idx):
        (images, metadata), targets = batch
        outputs = self(images, metadata)
        loss = self.loss(outputs, targets)  # targets is already one-hot encoded
        # Get the probability of the positive class
        pos_probs = outputs[:, 1].float().cpu()

        # Convert one-hot encoded targets to binary labels
        targets_binary = targets[:, 1].int().cpu()
        rocauc = self.auroc(pos_probs, targets_binary)

        # Use class 1 probability

        self.log(
            "val_loss",
            loss,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
        )
        self.log(
            "val_pAUC",
            rocauc,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
        )

        return loss

    def test_step(self, batch, batch_idx):
        (images, metadata), targets = batch
        outputs = self(images, metadata)
        loss = self.loss(outputs, targets)  # targets is already one-hot encoded

        # Get the probability of the positive class
        pos_probs = outputs[:, 1].float().cpu()

        # Convert one-hot encoded targets to binary labels
        targets_binary = targets[:, 1].int().cpu()
        rocauc = self.auroc(pos_probs, targets_binary)

        self.log(
            "test_loss",
            loss,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
        )
        self.log(
            "test_pAUC",
            rocauc,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
        )

        return loss

    def configure_optimizers(self):
        optimizer = optim.NAdam(
            self.parameters(), lr=0.001, momentum_decay=0.5, weight_decay=1e-5
        )
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", factor=0.1, patience=2, verbose=True
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "train_loss",
            },
        }

In [5]:
from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed
import multiprocessing

import os
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import numpy as np
import pandas as pd
import torch
import joblib
import hashlib


def prepare_df(
    df: pd.DataFrame,
    is_training=True,
):
    print("Preparing DataFrame...")
    df_hash = hashlib.md5(pd.util.hash_pandas_object(df).values).hexdigest()
    cache_dir = "./cache"
    param_string = f"{is_training}"
    cache_file = os.path.join(cache_dir, f"prepared_df_{df_hash}_{param_string}.joblib")

    # Check if cached version exists
    if os.path.exists(cache_file):
        print("Loading cached prepared DataFrame...")
        return joblib.load(cache_file)
    start_time = time.time()

    drop_columns_train = [
        "lesion_id",
        "iddx_full",
        "iddx_1",
        "iddx_2",
        "iddx_3",
        "iddx_4",
        "iddx_5",
        "mel_mitotic_index",
        "mel_thick_mm",
        "tbp_lv_dnn_lesion_confidence",
    ]
    drop_columns_test = ["attribution", "copyright_license"]

    if is_training:
        df.drop(drop_columns_train, axis=1, inplace=True)
    df.drop(drop_columns_test, axis=1, inplace=True)
    target_columns = ["target"] if is_training else []
    X = df.drop(target_columns + ["isic_id"], axis=1)
    y = torch.tensor(df["target"].values, dtype=torch.int8) if is_training else None

    # Separate features by type
    integer_features = X.select_dtypes(include=["int64", "int32", "int16"]).columns
    float_features = X.select_dtypes(include=["float64", "float32", "float16"]).columns
    categorical_features = X.select_dtypes(include=["object"]).columns

    # Handle NaN values and type conversions
    for feature in float_features:
        X[feature] = X[feature].fillna(X[feature].mean()).astype("float32")

    for feature in integer_features:
        X[feature] = X[feature].fillna(X[feature].median()).astype("int32")

    for feature in categorical_features:
        X[feature] = X[feature].astype(str).fillna("Unknown")
        X[feature] = pd.Categorical(X[feature]).codes

    # Standardize all numeric features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

    X_final = X_scaled

    # Final check for any remaining NaN values
    assert (
        not X_final.isnull().any().any()
    ), "There are still NaN values in the processed data"

    print("Data shape after preprocessing:", X_final.shape)
    print("Number of NaN values after preprocessing:", X_final.isnull().sum().sum())

    if is_training:
        print("Class distribution:")
        print(df["target"].value_counts(normalize=True))

    print(f"DataFrame prepared in {time.time() - start_time:.2f} seconds")
    print(f"Metadata Shape: {X_final.shape}")

    # Cache the results
    os.makedirs(cache_dir, exist_ok=True)
    joblib.dump((X_final, y, df["isic_id"]), cache_file)
    return X_final, y, df["isic_id"]


class ISICDataset(Dataset):
    def __init__(self, hdf5_path, metadata_df, is_training=True, transform=None):
        self.hdf5_path = hdf5_path
        self.metadata_df = metadata_df
        self.is_training = is_training
        self.transform = transform
        self.X, self.y, self.image_names = prepare_df(metadata_df, is_training)
        self.metadata_shape = self.X.shape
        self.train_transform = get_transforms(is_training=True)
        self.test_transform = get_transforms(is_training=False)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if isinstance(idx, tuple):
            idx, augment = idx
        else:
            augment = False

        isic_id = self.image_names[idx]
        metadata = torch.tensor(self.X.iloc[idx].values, dtype=torch.float32)

        with h5py.File(self.hdf5_path, "r") as hdf:
            image_data = hdf[str(isic_id)][()]
            image = Image.open(io.BytesIO(image_data))

        if self.is_training and augment:
            image = self.train_transform(image)
        elif self.transform:
            image = self.transform(image)

        if self.is_training:
            target = self.y[idx]
            target_long = target.long()
            del target
            target_one_hot = nn.functional.one_hot(target_long, num_classes=2).float()
            return (image, metadata), target_one_hot
        else:
            return (image, metadata), isic_id


# Create separate transforms for training and validation
def get_transforms(is_training=True):
    # Define augmentation parameters
    ROTATION_RANGE = 180
    BRIGHTNESS_RANGE = (0.99, 1.01)
    CONTRAST_RANGE = (0.99, 1.01)
    SATURATION_RANGE = (0.9, 1.1)
    HUE_RANGE = (-0.001, 0.001)
    base_transforms = [
        transforms.ToTensor(),
        transforms.Resize((139, 139), antialias=True),
    ]

    if is_training:
        train_transforms = [
            transforms.RandomRotation(
                degrees=ROTATION_RANGE,
                interpolation=transforms.InterpolationMode.BILINEAR,
            ),
            transforms.ColorJitter(
                brightness=BRIGHTNESS_RANGE,
                contrast=CONTRAST_RANGE,
                saturation=SATURATION_RANGE,
                hue=HUE_RANGE,
            ),
        ]
        return transforms.Compose(train_transforms + base_transforms)
    else:
        return transforms.Compose(base_transforms)


class ISICDataModule(pl.LightningDataModule):

    def __init__(
        self,
        train_hdf5_path: str,
        test_hdf5_path: str,
        train_metadata_df: pd.DataFrame,
        test_metadata_df: pd.DataFrame,
        batch_size: int = 32,
    ):
        super().__init__()
        self.train_hdf5_path = train_hdf5_path
        self.test_hdf5_path = test_hdf5_path
        self.batch_size = batch_size

        self.train_metadata_df = train_metadata_df
        self.test_metadata_df = test_metadata_df

    def setup(self, stage=None):
        full_dataset = ISICDataset(
            self.train_hdf5_path,
            self.train_metadata_df,
            True,
            transform=get_transforms(is_training=True),
        )
        self.metadata_shape = full_dataset.metadata_shape
        
        # Get targets for stratification
        targets = self.train_metadata_df["target"].values
        balanced_indices = self.balance_dataset(np.arange(len(full_dataset)), targets)
        
        # Extract actual indices and augmentation flags
        balanced_indices, augmentation_flags = zip(*balanced_indices)
        balanced_indices = np.array(balanced_indices)
        
        augmentation_flags = np.array(augmentation_flags)
        balanced_targets = targets[balanced_indices]

        print(f"Unique indices: {np.unique(balanced_indices)}")
        print(f"Unique targets: {np.unique(balanced_targets)}")
        
        print(len(balanced_indices))
        print(len(balanced_targets))
        
        unique, counts = np.unique(balanced_targets, return_counts=True)
        print(f"Setup Count: {dict(zip(unique, counts))}")
        # Perform stratified split
        train_indices, temp_indices, train_targets, temp_targets = train_test_split(
            balanced_indices,
            balanced_targets,
            test_size=0.2,
            # stratify=balanced_targets,
            random_state=42,
        )

        val_indices, test_indices, val_targets, test_targets = train_test_split(
            temp_indices,
            temp_targets,
            test_size=0.5,
            # stratify=temp_targets,
            random_state=42,
        )

        # Create subset datasets
        if stage in ["fit", "validate", "test"]:
            self.train_dataset = Subset(full_dataset, train_indices)
            self.val_dataset = Subset(full_dataset, val_indices)
            self.test_dataset = Subset(full_dataset, test_indices)

        # Check for class balance
        self._check_class_balance(train_targets.flatten(), "Train")
        self._check_class_balance(val_targets.flatten(), "Validation")
        self._check_class_balance(test_targets.flatten(), "Test")

        print(f"Length of full_dataset: {len(full_dataset)}")
        print(
            f"Length of train_indices: {len(train_indices)}, max index: {max(train_indices)}"
        )
        print(
            f"Length of val_indices: {len(val_indices)}, max index: {max(val_indices)}"
        )
        print(
            f"Length of test_indices: {len(test_indices)}, max index: {max(test_indices)}"
        )

    def balance_dataset(self, indices, targets):
        np.random.seed(42)
        positive_indices = indices[targets == 1]
        negative_indices = indices[targets == 0]

        num_positive_samples = len(positive_indices)
        num_negative_samples = len(negative_indices)

        print(f"Number of Positive Samples: {num_positive_samples}")
        print(f"Number of Negative Samples: {num_negative_samples}")

        # Determine the number of samples for each class
        num_samples = int(np.mean([num_positive_samples, num_negative_samples]))//50

        # Upsample positive indices
        upsampled_positive_indices = np.random.choice(
            positive_indices, size=num_samples // 2, replace=True
        )

        # Downsample negative indices
        downsampled_negative_indices = np.random.choice(
            negative_indices, size=num_samples // 2, replace=False
        )

        # Add augmentation flag
        balanced_indices = [(idx, True) for idx in upsampled_positive_indices] + [
            (idx, False) for idx in downsampled_negative_indices
        ]

        np.random.shuffle(balanced_indices)

        print(f"Length of Balanced Positive Indices: {len(upsampled_positive_indices)}")
        print(
            f"Length of Balanced Negative Indices: {len(downsampled_negative_indices)}"
        )

        return balanced_indices

    def _check_class_balance(self, targets, split_name):
        class_counts = np.bincount(targets)
        print(
            f"{split_name} class distribution: {class_counts / len(targets)}, {len(targets)}"
        )
        if len(class_counts) < 2 or min(class_counts) == 0:
            raise ValueError(f"Imbalanced classes in {split_name} split")

    def train_dataloader(self):
        data_loader = DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=16,
            pin_memory=True,
        )
        print(f"Number of batches in train_loader: {len(data_loader)}")
        return data_loader

    def val_dataloader(self):
        data_loader = DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=4,
            pin_memory=True,
        )
        print(f"Number of batches in val_loader: {len(data_loader)}")
        return data_loader

    def test_dataloader(self):

        data_loader = DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=4,
            pin_memory=True,
        )
        print(f"Number of batches in test_loader: {len(data_loader)}")
        return data_loader

In [6]:
# Define parameters
img_height, img_width = 139, 139

# Load metadata
train_metadata_df = pd.read_csv("train-metadata.csv")
test_metadata_df = pd.read_csv("test-metadata.csv")

  train_metadata_df = pd.read_csv("train-metadata.csv")


In [7]:
import os

os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")
torch.cuda.memory.empty_cache()
torch.set_float32_matmul_precision("medium")
batch_size = 64
epochs = 200

logger = TensorBoardLogger("tb_logs", name="gurunet_model")

checkpoint_callback = ModelCheckpoint(
    dirpath=f"checkpoints/version_{logger.version}",
    filename="gurunet-{epoch:02d}-{val_pAUC:.5f}",
    save_top_k=3,
    monitor="val_pAUC",
    mode="max",
    verbose=True
)

early_stop_callback = EarlyStopping(monitor="val_pAUC", patience=15, mode="min")

lr_monitor = LearningRateMonitor(logging_interval="epoch", log_momentum=True)

# Initialize your data module
data_module = ISICDataModule(
    "train-image.hdf5",
    "test-image.hdf5",
    train_metadata_df,
    test_metadata_df,
    batch_size=batch_size,
)


# Initialize your model
model = GuruNet(
    input_shape=(139, 139, 3),
    metadata_shape=(None, 37),
    classes=2,
)
# Initialize a trainer
trainer = pl.Trainer(
    max_epochs=epochs,
    accelerator="gpu",
    devices=1,
    callbacks=[
        checkpoint_callback,
        early_stop_callback,
        lr_monitor,
    ],
    logger=logger,
    precision="16",
    enable_progress_bar=True,
    enable_checkpointing=True,
    accumulate_grad_batches=6,
    profiler="simple",
    deterministic=True,
    min_epochs=100
)
skip_training = False
if not skip_training:
    # Train the model
    trainer.fit(model, data_module)


/home/pupperemeritus/miniconda3/envs/isic/lib/python3.12/site-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Preparing DataFrame...
Loading cached prepared DataFrame...


  return torch.load(io.BytesIO(b))
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Number of Positive Samples: 393
Number of Negative Samples: 400666
Length of Balanced Positive Indices: 2005
Length of Balanced Negative Indices: 2005
Unique indices: [   302    305    387 ... 400775 400922 400991]
Unique targets: [0 1]
4010
4010
Setup Count: {np.int64(0): np.int64(2005), np.int64(1): np.int64(2005)}
Train class distribution: [0.50187032 0.49812968], 3208
Validation class distribution: [0.52119701 0.47880299], 401
Test class distribution: [0.4638404 0.5361596], 401
Length of full_dataset: 401059
Length of train_indices: 3208, max index: 400991
Length of val_indices: 401, max index: 400922
Length of test_indices: 401, max index: 400922



   | Name             | Type               | Params | Mode 
-----------------------------------------------------------------
0  | conv1            | Conv2d             | 19.5 K | train
1  | bn1              | BatchNorm2d        | 512    | train
2  | activation       | Hardswish          | 0      | train
3  | inv_res_blocks   | ModuleList         | 2.2 M  | train
4  | dense_block      | DenseBlock         | 14.6 M | train
5  | transition       | TransitionLayer    | 2.5 M  | train
6  | attention        | AttentionBlock     | 2.3 M  | train
7  | avg_pool         | AvgPool2d          | 0      | train
8  | inception        | InceptionBlock     | 726 K  | train
9  | inception2       | InceptionBlock     | 526 K  | train
10 | attention2       | AttentionBlock     | 560 K  | train
11 | gated_res        | GatedResidualBlock | 3.2 M  | train
12 | attention3       | AttentionBlock     | 263 K  | train
13 | global_avg_pool  | AdaptiveAvgPool2d  | 0      | train
14 | flatten          | Flatten  

Sanity Checking: |          | 0/? [00:00<?, ?it/s]Number of batches in val_loader: 7
Number of batches in train_loader: 51                                      
Epoch 0: 100%|██████████| 51/51 [00:29<00:00,  1.73it/s, v_num=145, train_loss_step=0.537, train_pAUC_step=0.150, val_loss=0.476, val_pAUC=0.149, train_loss_epoch=0.554, train_pAUC_epoch=0.110]

Epoch 0, global step 9: 'val_pAUC' reached 0.14868 (best 0.14868), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=00-val_pAUC=0.14868.ckpt' as top 3


Epoch 1: 100%|██████████| 51/51 [00:26<00:00,  1.90it/s, v_num=145, train_loss_step=0.343, train_pAUC_step=0.200, val_loss=0.450, val_pAUC=0.154, train_loss_epoch=0.470, train_pAUC_epoch=0.149] 

Epoch 1, global step 18: 'val_pAUC' reached 0.15366 (best 0.15366), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=01-val_pAUC=0.15366.ckpt' as top 3


Epoch 2: 100%|██████████| 51/51 [00:27<00:00,  1.87it/s, v_num=145, train_loss_step=0.331, train_pAUC_step=0.200, val_loss=0.442, val_pAUC=0.162, train_loss_epoch=0.444, train_pAUC_epoch=0.161]

Epoch 2, global step 27: 'val_pAUC' reached 0.16240 (best 0.16240), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=02-val_pAUC=0.16240.ckpt' as top 3


Epoch 3: 100%|██████████| 51/51 [00:27<00:00,  1.84it/s, v_num=145, train_loss_step=0.615, train_pAUC_step=0.200, val_loss=0.422, val_pAUC=0.170, train_loss_epoch=0.425, train_pAUC_epoch=0.167]

Epoch 3, global step 36: 'val_pAUC' reached 0.17036 (best 0.17036), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=03-val_pAUC=0.17036.ckpt' as top 3


Epoch 4: 100%|██████████| 51/51 [00:24<00:00,  2.05it/s, v_num=145, train_loss_step=0.570, train_pAUC_step=0.200, val_loss=0.417, val_pAUC=0.172, train_loss_epoch=0.423, train_pAUC_epoch=0.168]

Epoch 4, global step 45: 'val_pAUC' reached 0.17171 (best 0.17171), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=04-val_pAUC=0.17171.ckpt' as top 3


Epoch 5: 100%|██████████| 51/51 [00:19<00:00,  2.55it/s, v_num=145, train_loss_step=0.414, train_pAUC_step=0.200, val_loss=0.421, val_pAUC=0.165, train_loss_epoch=0.419, train_pAUC_epoch=0.168]

Epoch 5, global step 54: 'val_pAUC' reached 0.16500 (best 0.17171), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=05-val_pAUC=0.16500.ckpt' as top 3


Epoch 6: 100%|██████████| 51/51 [00:19<00:00,  2.58it/s, v_num=145, train_loss_step=0.326, train_pAUC_step=0.200, val_loss=0.419, val_pAUC=0.165, train_loss_epoch=0.409, train_pAUC_epoch=0.172]

Epoch 6, global step 63: 'val_pAUC' reached 0.16507 (best 0.17171), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=06-val_pAUC=0.16507.ckpt' as top 3


Epoch 7: 100%|██████████| 51/51 [00:32<00:00,  1.55it/s, v_num=145, train_loss_step=0.606, train_pAUC_step=0.100, val_loss=0.412, val_pAUC=0.164, train_loss_epoch=0.413, train_pAUC_epoch=0.171]

Epoch 7, global step 72: 'val_pAUC' was not in top 3


Epoch 8: 100%|██████████| 51/51 [00:30<00:00,  1.66it/s, v_num=145, train_loss_step=0.318, train_pAUC_step=0.200, val_loss=0.412, val_pAUC=0.171, train_loss_epoch=0.405, train_pAUC_epoch=0.171] 

Epoch 8, global step 81: 'val_pAUC' reached 0.17070 (best 0.17171), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=08-val_pAUC=0.17070.ckpt' as top 3


Epoch 9: 100%|██████████| 51/51 [00:27<00:00,  1.85it/s, v_num=145, train_loss_step=0.417, train_pAUC_step=0.200, val_loss=0.400, val_pAUC=0.171, train_loss_epoch=0.403, train_pAUC_epoch=0.174]

Epoch 9, global step 90: 'val_pAUC' reached 0.17094 (best 0.17171), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=09-val_pAUC=0.17094.ckpt' as top 3


Epoch 10: 100%|██████████| 51/51 [00:27<00:00,  1.82it/s, v_num=145, train_loss_step=0.429, train_pAUC_step=0.200, val_loss=0.398, val_pAUC=0.172, train_loss_epoch=0.401, train_pAUC_epoch=0.174]

Epoch 10, global step 99: 'val_pAUC' reached 0.17245 (best 0.17245), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=10-val_pAUC=0.17245.ckpt' as top 3


Epoch 11: 100%|██████████| 51/51 [00:25<00:00,  2.01it/s, v_num=145, train_loss_step=0.581, train_pAUC_step=0.100, val_loss=0.399, val_pAUC=0.174, train_loss_epoch=0.398, train_pAUC_epoch=0.175]

Epoch 11, global step 108: 'val_pAUC' reached 0.17398 (best 0.17398), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=11-val_pAUC=0.17398.ckpt' as top 3


Epoch 12: 100%|██████████| 51/51 [00:20<00:00,  2.45it/s, v_num=145, train_loss_step=0.563, train_pAUC_step=0.133, val_loss=0.394, val_pAUC=0.177, train_loss_epoch=0.400, train_pAUC_epoch=0.175]

Epoch 12, global step 117: 'val_pAUC' reached 0.17693 (best 0.17693), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=12-val_pAUC=0.17693.ckpt' as top 3


Epoch 13: 100%|██████████| 51/51 [00:20<00:00,  2.48it/s, v_num=145, train_loss_step=0.709, train_pAUC_step=0.100, val_loss=0.394, val_pAUC=0.172, train_loss_epoch=0.394, train_pAUC_epoch=0.178]

Epoch 13, global step 126: 'val_pAUC' was not in top 3


Epoch 14: 100%|██████████| 51/51 [00:20<00:00,  2.50it/s, v_num=145, train_loss_step=0.324, train_pAUC_step=0.200, val_loss=0.400, val_pAUC=0.175, train_loss_epoch=0.399, train_pAUC_epoch=0.175]

Epoch 14, global step 135: 'val_pAUC' reached 0.17477 (best 0.17693), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=14-val_pAUC=0.17477.ckpt' as top 3


Epoch 15: 100%|██████████| 51/51 [00:20<00:00,  2.44it/s, v_num=145, train_loss_step=0.440, train_pAUC_step=0.200, val_loss=0.391, val_pAUC=0.175, train_loss_epoch=0.390, train_pAUC_epoch=0.179]

Epoch 15, global step 144: 'val_pAUC' reached 0.17460 (best 0.17693), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=15-val_pAUC=0.17460.ckpt' as top 3


Epoch 16:   0%|          | 0/51 [00:00<?, ?it/s, v_num=145, train_loss_step=0.440, train_pAUC_step=0.200, val_loss=0.391, val_pAUC=0.175, train_loss_epoch=0.390, train_pAUC_epoch=0.179]         

Trainer was signaled to stop but the required `min_epochs=100` or `min_steps=None` has not been met. Training will continue...


Epoch 16: 100%|██████████| 51/51 [00:20<00:00,  2.49it/s, v_num=145, train_loss_step=0.890, train_pAUC_step=0.000, val_loss=0.396, val_pAUC=0.177, train_loss_epoch=0.389, train_pAUC_epoch=0.180]

Epoch 16, global step 153: 'val_pAUC' reached 0.17688 (best 0.17693), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=16-val_pAUC=0.17688.ckpt' as top 3


Epoch 17: 100%|██████████| 51/51 [00:21<00:00,  2.42it/s, v_num=145, train_loss_step=0.396, train_pAUC_step=0.200, val_loss=0.396, val_pAUC=0.172, train_loss_epoch=0.399, train_pAUC_epoch=0.175]

Epoch 17, global step 162: 'val_pAUC' was not in top 3


Epoch 18: 100%|██████████| 51/51 [00:21<00:00,  2.39it/s, v_num=145, train_loss_step=0.317, train_pAUC_step=0.200, val_loss=0.392, val_pAUC=0.175, train_loss_epoch=0.384, train_pAUC_epoch=0.179]

Epoch 18, global step 171: 'val_pAUC' was not in top 3


Epoch 19: 100%|██████████| 51/51 [00:20<00:00,  2.47it/s, v_num=145, train_loss_step=0.413, train_pAUC_step=0.200, val_loss=0.386, val_pAUC=0.180, train_loss_epoch=0.393, train_pAUC_epoch=0.177]

Epoch 19, global step 180: 'val_pAUC' reached 0.17989 (best 0.17989), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=19-val_pAUC=0.17989.ckpt' as top 3


Epoch 20: 100%|██████████| 51/51 [00:20<00:00,  2.44it/s, v_num=145, train_loss_step=0.569, train_pAUC_step=0.120, val_loss=0.398, val_pAUC=0.175, train_loss_epoch=0.387, train_pAUC_epoch=0.180]

Epoch 20, global step 189: 'val_pAUC' was not in top 3


Epoch 21: 100%|██████████| 51/51 [00:20<00:00,  2.44it/s, v_num=145, train_loss_step=0.567, train_pAUC_step=0.0167, val_loss=0.380, val_pAUC=0.178, train_loss_epoch=0.390, train_pAUC_epoch=0.178]

Epoch 21, global step 198: 'val_pAUC' reached 0.17782 (best 0.17989), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=21-val_pAUC=0.17782.ckpt' as top 3


Epoch 22: 100%|██████████| 51/51 [00:20<00:00,  2.43it/s, v_num=145, train_loss_step=0.676, train_pAUC_step=0.000, val_loss=0.377, val_pAUC=0.180, train_loss_epoch=0.381, train_pAUC_epoch=0.181]

Epoch 22, global step 207: 'val_pAUC' reached 0.17968 (best 0.17989), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=22-val_pAUC=0.17968.ckpt' as top 3


Epoch 23: 100%|██████████| 51/51 [00:21<00:00,  2.41it/s, v_num=145, train_loss_step=0.507, train_pAUC_step=0.133, val_loss=0.385, val_pAUC=0.180, train_loss_epoch=0.377, train_pAUC_epoch=0.182]

Epoch 23, global step 216: 'val_pAUC' reached 0.18023 (best 0.18023), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=23-val_pAUC=0.18023.ckpt' as top 3


Epoch 24: 100%|██████████| 51/51 [00:21<00:00,  2.42it/s, v_num=145, train_loss_step=0.438, train_pAUC_step=0.133, val_loss=0.379, val_pAUC=0.182, train_loss_epoch=0.378, train_pAUC_epoch=0.182]

Epoch 24, global step 225: 'val_pAUC' reached 0.18247 (best 0.18247), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=24-val_pAUC=0.18247.ckpt' as top 3


Epoch 25: 100%|██████████| 51/51 [00:21<00:00,  2.41it/s, v_num=145, train_loss_step=0.554, train_pAUC_step=0.200, val_loss=0.384, val_pAUC=0.181, train_loss_epoch=0.376, train_pAUC_epoch=0.182]

Epoch 25, global step 234: 'val_pAUC' reached 0.18085 (best 0.18247), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=25-val_pAUC=0.18085.ckpt' as top 3


Epoch 26: 100%|██████████| 51/51 [00:21<00:00,  2.41it/s, v_num=145, train_loss_step=0.418, train_pAUC_step=0.200, val_loss=0.381, val_pAUC=0.181, train_loss_epoch=0.374, train_pAUC_epoch=0.183]

Epoch 26, global step 243: 'val_pAUC' reached 0.18106 (best 0.18247), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=26-val_pAUC=0.18106.ckpt' as top 3


Epoch 27: 100%|██████████| 51/51 [00:21<00:00,  2.38it/s, v_num=145, train_loss_step=0.318, train_pAUC_step=0.200, val_loss=0.379, val_pAUC=0.180, train_loss_epoch=0.378, train_pAUC_epoch=0.182]

Epoch 27, global step 252: 'val_pAUC' was not in top 3


Epoch 28: 100%|██████████| 51/51 [00:21<00:00,  2.40it/s, v_num=145, train_loss_step=0.314, train_pAUC_step=0.200, val_loss=0.382, val_pAUC=0.180, train_loss_epoch=0.379, train_pAUC_epoch=0.182]

Epoch 28, global step 261: 'val_pAUC' was not in top 3


Epoch 29: 100%|██████████| 51/51 [00:21<00:00,  2.40it/s, v_num=145, train_loss_step=0.542, train_pAUC_step=0.200, val_loss=0.383, val_pAUC=0.181, train_loss_epoch=0.374, train_pAUC_epoch=0.183]

Epoch 29, global step 270: 'val_pAUC' was not in top 3


Epoch 30: 100%|██████████| 51/51 [00:21<00:00,  2.41it/s, v_num=145, train_loss_step=0.365, train_pAUC_step=0.200, val_loss=0.382, val_pAUC=0.181, train_loss_epoch=0.377, train_pAUC_epoch=0.182]

Epoch 30, global step 279: 'val_pAUC' reached 0.18117 (best 0.18247), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=30-val_pAUC=0.18117.ckpt' as top 3


Epoch 31: 100%|██████████| 51/51 [00:21<00:00,  2.40it/s, v_num=145, train_loss_step=0.317, train_pAUC_step=0.200, val_loss=0.386, val_pAUC=0.179, train_loss_epoch=0.376, train_pAUC_epoch=0.182]

Epoch 31, global step 288: 'val_pAUC' was not in top 3


Epoch 32: 100%|██████████| 51/51 [00:21<00:00,  2.40it/s, v_num=145, train_loss_step=0.328, train_pAUC_step=0.200, val_loss=0.383, val_pAUC=0.179, train_loss_epoch=0.373, train_pAUC_epoch=0.185]

Epoch 32, global step 297: 'val_pAUC' was not in top 3


Epoch 33: 100%|██████████| 51/51 [00:21<00:00,  2.36it/s, v_num=145, train_loss_step=0.316, train_pAUC_step=0.200, val_loss=0.382, val_pAUC=0.181, train_loss_epoch=0.369, train_pAUC_epoch=0.183]

Epoch 33, global step 306: 'val_pAUC' reached 0.18117 (best 0.18247), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=33-val_pAUC=0.18117.ckpt' as top 3


Epoch 34: 100%|██████████| 51/51 [00:21<00:00,  2.36it/s, v_num=145, train_loss_step=0.449, train_pAUC_step=0.200, val_loss=0.383, val_pAUC=0.181, train_loss_epoch=0.380, train_pAUC_epoch=0.182]

Epoch 34, global step 315: 'val_pAUC' was not in top 3


Epoch 35: 100%|██████████| 51/51 [00:21<00:00,  2.38it/s, v_num=145, train_loss_step=0.433, train_pAUC_step=0.200, val_loss=0.382, val_pAUC=0.181, train_loss_epoch=0.374, train_pAUC_epoch=0.183]

Epoch 35, global step 324: 'val_pAUC' was not in top 3


Epoch 36: 100%|██████████| 51/51 [00:21<00:00,  2.40it/s, v_num=145, train_loss_step=0.601, train_pAUC_step=0.100, val_loss=0.382, val_pAUC=0.181, train_loss_epoch=0.373, train_pAUC_epoch=0.183]

Epoch 36, global step 333: 'val_pAUC' was not in top 3


Epoch 37: 100%|██████████| 51/51 [00:21<00:00,  2.36it/s, v_num=145, train_loss_step=0.497, train_pAUC_step=0.200, val_loss=0.381, val_pAUC=0.181, train_loss_epoch=0.377, train_pAUC_epoch=0.183]

Epoch 37, global step 342: 'val_pAUC' was not in top 3


Epoch 38: 100%|██████████| 51/51 [00:21<00:00,  2.37it/s, v_num=145, train_loss_step=0.673, train_pAUC_step=0.120, val_loss=0.380, val_pAUC=0.181, train_loss_epoch=0.373, train_pAUC_epoch=0.185]

Epoch 38, global step 351: 'val_pAUC' reached 0.18132 (best 0.18247), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=38-val_pAUC=0.18132.ckpt' as top 3


Epoch 39: 100%|██████████| 51/51 [00:21<00:00,  2.36it/s, v_num=145, train_loss_step=0.410, train_pAUC_step=0.200, val_loss=0.379, val_pAUC=0.181, train_loss_epoch=0.375, train_pAUC_epoch=0.184]

Epoch 39, global step 360: 'val_pAUC' reached 0.18120 (best 0.18247), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=39-val_pAUC=0.18120.ckpt' as top 3


Epoch 40: 100%|██████████| 51/51 [00:21<00:00,  2.36it/s, v_num=145, train_loss_step=0.334, train_pAUC_step=0.200, val_loss=0.381, val_pAUC=0.181, train_loss_epoch=0.372, train_pAUC_epoch=0.183]

Epoch 40, global step 369: 'val_pAUC' was not in top 3


Epoch 41: 100%|██████████| 51/51 [00:21<00:00,  2.34it/s, v_num=145, train_loss_step=0.558, train_pAUC_step=0.200, val_loss=0.381, val_pAUC=0.181, train_loss_epoch=0.373, train_pAUC_epoch=0.183]

Epoch 41, global step 378: 'val_pAUC' reached 0.18138 (best 0.18247), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=41-val_pAUC=0.18138.ckpt' as top 3


Epoch 42: 100%|██████████| 51/51 [00:21<00:00,  2.35it/s, v_num=145, train_loss_step=0.474, train_pAUC_step=0.200, val_loss=0.379, val_pAUC=0.181, train_loss_epoch=0.374, train_pAUC_epoch=0.183]

Epoch 42, global step 387: 'val_pAUC' was not in top 3


Epoch 43: 100%|██████████| 51/51 [00:22<00:00,  2.28it/s, v_num=145, train_loss_step=0.316, train_pAUC_step=0.200, val_loss=0.382, val_pAUC=0.180, train_loss_epoch=0.373, train_pAUC_epoch=0.182]

Epoch 43, global step 396: 'val_pAUC' was not in top 3


Epoch 44: 100%|██████████| 51/51 [00:21<00:00,  2.38it/s, v_num=145, train_loss_step=0.323, train_pAUC_step=0.200, val_loss=0.378, val_pAUC=0.182, train_loss_epoch=0.370, train_pAUC_epoch=0.184]

Epoch 44, global step 405: 'val_pAUC' reached 0.18183 (best 0.18247), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=44-val_pAUC=0.18183.ckpt' as top 3


Epoch 45: 100%|██████████| 51/51 [00:21<00:00,  2.35it/s, v_num=145, train_loss_step=0.469, train_pAUC_step=0.200, val_loss=0.382, val_pAUC=0.181, train_loss_epoch=0.371, train_pAUC_epoch=0.183]

Epoch 45, global step 414: 'val_pAUC' was not in top 3


Epoch 46: 100%|██████████| 51/51 [00:21<00:00,  2.35it/s, v_num=145, train_loss_step=0.348, train_pAUC_step=0.200, val_loss=0.379, val_pAUC=0.181, train_loss_epoch=0.371, train_pAUC_epoch=0.184]

Epoch 46, global step 423: 'val_pAUC' was not in top 3


Epoch 47: 100%|██████████| 51/51 [00:21<00:00,  2.37it/s, v_num=145, train_loss_step=0.318, train_pAUC_step=0.200, val_loss=0.380, val_pAUC=0.181, train_loss_epoch=0.375, train_pAUC_epoch=0.184]

Epoch 47, global step 432: 'val_pAUC' was not in top 3


Epoch 48: 100%|██████████| 51/51 [00:21<00:00,  2.33it/s, v_num=145, train_loss_step=0.584, train_pAUC_step=0.100, val_loss=0.378, val_pAUC=0.180, train_loss_epoch=0.373, train_pAUC_epoch=0.183]

Epoch 48, global step 441: 'val_pAUC' was not in top 3


Epoch 49: 100%|██████████| 51/51 [00:22<00:00,  2.32it/s, v_num=145, train_loss_step=0.358, train_pAUC_step=0.200, val_loss=0.383, val_pAUC=0.180, train_loss_epoch=0.373, train_pAUC_epoch=0.184]

Epoch 49, global step 450: 'val_pAUC' was not in top 3


Epoch 50: 100%|██████████| 51/51 [00:22<00:00,  2.27it/s, v_num=145, train_loss_step=0.434, train_pAUC_step=0.200, val_loss=0.382, val_pAUC=0.180, train_loss_epoch=0.370, train_pAUC_epoch=0.184]

Epoch 50, global step 459: 'val_pAUC' was not in top 3


Epoch 51: 100%|██████████| 51/51 [00:21<00:00,  2.34it/s, v_num=145, train_loss_step=0.458, train_pAUC_step=0.150, val_loss=0.379, val_pAUC=0.182, train_loss_epoch=0.379, train_pAUC_epoch=0.181]

Epoch 51, global step 468: 'val_pAUC' reached 0.18150 (best 0.18247), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=51-val_pAUC=0.18150.ckpt' as top 3


Epoch 52: 100%|██████████| 51/51 [00:21<00:00,  2.34it/s, v_num=145, train_loss_step=0.335, train_pAUC_step=0.200, val_loss=0.380, val_pAUC=0.181, train_loss_epoch=0.374, train_pAUC_epoch=0.183]

Epoch 52, global step 477: 'val_pAUC' was not in top 3


Epoch 53: 100%|██████████| 51/51 [00:22<00:00,  2.23it/s, v_num=145, train_loss_step=0.634, train_pAUC_step=0.133, val_loss=0.383, val_pAUC=0.181, train_loss_epoch=0.373, train_pAUC_epoch=0.184]

Epoch 53, global step 486: 'val_pAUC' was not in top 3


Epoch 54: 100%|██████████| 51/51 [00:21<00:00,  2.36it/s, v_num=145, train_loss_step=0.422, train_pAUC_step=0.200, val_loss=0.380, val_pAUC=0.181, train_loss_epoch=0.377, train_pAUC_epoch=0.183]

Epoch 54, global step 495: 'val_pAUC' was not in top 3


Epoch 55: 100%|██████████| 51/51 [00:22<00:00,  2.30it/s, v_num=145, train_loss_step=0.439, train_pAUC_step=0.133, val_loss=0.380, val_pAUC=0.181, train_loss_epoch=0.372, train_pAUC_epoch=0.184]

Epoch 55, global step 504: 'val_pAUC' was not in top 3


Epoch 56: 100%|██████████| 51/51 [00:21<00:00,  2.34it/s, v_num=145, train_loss_step=0.491, train_pAUC_step=0.200, val_loss=0.381, val_pAUC=0.181, train_loss_epoch=0.377, train_pAUC_epoch=0.182]

Epoch 56, global step 513: 'val_pAUC' was not in top 3


Epoch 57: 100%|██████████| 51/51 [00:22<00:00,  2.30it/s, v_num=145, train_loss_step=0.806, train_pAUC_step=0.100, val_loss=0.378, val_pAUC=0.181, train_loss_epoch=0.376, train_pAUC_epoch=0.182]

Epoch 57, global step 522: 'val_pAUC' was not in top 3


Epoch 58: 100%|██████████| 51/51 [00:22<00:00,  2.28it/s, v_num=145, train_loss_step=0.332, train_pAUC_step=0.200, val_loss=0.382, val_pAUC=0.180, train_loss_epoch=0.376, train_pAUC_epoch=0.181]

Epoch 58, global step 531: 'val_pAUC' was not in top 3


Epoch 59: 100%|██████████| 51/51 [00:22<00:00,  2.29it/s, v_num=145, train_loss_step=0.443, train_pAUC_step=0.0667, val_loss=0.381, val_pAUC=0.180, train_loss_epoch=0.374, train_pAUC_epoch=0.184]

Epoch 59, global step 540: 'val_pAUC' was not in top 3


Epoch 60: 100%|██████████| 51/51 [00:22<00:00,  2.30it/s, v_num=145, train_loss_step=0.335, train_pAUC_step=0.200, val_loss=0.380, val_pAUC=0.182, train_loss_epoch=0.373, train_pAUC_epoch=0.183] 

Epoch 60, global step 549: 'val_pAUC' reached 0.18152 (best 0.18247), saving model to '/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_145/gurunet-epoch=60-val_pAUC=0.18152.ckpt' as top 3


Epoch 61: 100%|██████████| 51/51 [00:22<00:00,  2.27it/s, v_num=145, train_loss_step=0.324, train_pAUC_step=0.200, val_loss=0.381, val_pAUC=0.181, train_loss_epoch=0.374, train_pAUC_epoch=0.183]

Epoch 61, global step 558: 'val_pAUC' was not in top 3


Epoch 62: 100%|██████████| 51/51 [00:22<00:00,  2.27it/s, v_num=145, train_loss_step=0.451, train_pAUC_step=0.200, val_loss=0.381, val_pAUC=0.181, train_loss_epoch=0.376, train_pAUC_epoch=0.183]

Epoch 62, global step 567: 'val_pAUC' was not in top 3


Epoch 63: 100%|██████████| 51/51 [00:22<00:00,  2.27it/s, v_num=145, train_loss_step=0.356, train_pAUC_step=0.200, val_loss=0.381, val_pAUC=0.181, train_loss_epoch=0.374, train_pAUC_epoch=0.183]

Epoch 63, global step 576: 'val_pAUC' was not in top 3


Epoch 64: 100%|██████████| 51/51 [00:22<00:00,  2.29it/s, v_num=145, train_loss_step=0.369, train_pAUC_step=0.200, val_loss=0.380, val_pAUC=0.181, train_loss_epoch=0.373, train_pAUC_epoch=0.183]

Epoch 64, global step 585: 'val_pAUC' was not in top 3


Epoch 65:  98%|█████████▊| 50/51 [00:20<00:00,  2.43it/s, v_num=145, train_loss_step=0.377, train_pAUC_step=0.187, val_loss=0.380, val_pAUC=0.181, train_loss_epoch=0.373, train_pAUC_epoch=0.183]

IndexError: index 0 is out of bounds for dimension 0 with size 0

In [None]:
# trainer.test(
#     model=model,
#     ckpt_path="/home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_144/gurunet-epoch=44-val_pAUC=0.17860.ckpt",
#     datamodule=data_module,
# )

Preparing DataFrame...
Loading cached prepared DataFrame...


  return torch.load(io.BytesIO(b))
Restoring states from the checkpoint path at /home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_144/gurunet-epoch=44-val_pAUC=0.17860.ckpt


Number of Positive Samples: 393
Number of Negative Samples: 400666
Length of Balanced Positive Indices: 2005
Length of Balanced Negative Indices: 2005
Unique indices: [   302    305    387 ... 400775 400922 400991]
Unique targets: [0 1]
4010
4010
Setup Count: {np.int64(0): np.int64(2005), np.int64(1): np.int64(2005)}
Train class distribution: [0.50187032 0.49812968], 3208
Validation class distribution: [0.52119701 0.47880299], 401
Test class distribution: [0.4638404 0.5361596], 401
Length of full_dataset: 401059
Length of train_indices: 3208, max index: 400991
Length of val_indices: 401, max index: 400922
Length of test_indices: 401, max index: 400922


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/pupperemeritus/DL/isic-2024-challenge/checkpoints/version_144/gurunet-epoch=44-val_pAUC=0.17860.ckpt


Number of batches in test_loader: 7
Testing DataLoader 0: 100%|██████████| 7/7 [00:02<00:00,  3.33it/s]


TEST Profiler Report

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|  Action                                                                                                                                                               	|  Mean duration (s)	|  Num calls      	|  Total time (s) 	|  Percentage %   	|
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|  Total                                                                                                                                                                	|  -      

[{'test_loss_epoch': 0.3789622485637665,
  'test_pAUC_epoch': 0.18669907748699188}]