# ResNet-50 Skin Cancer Classifier with Advanced Training Techniques
## ISIC Dataset (Stratified 5-Fold CV with Clinical Validation)

This notebook implements a production-ready skin cancer classifier with:
- Progressive layer unfreezing with discriminative learning rates
- Learning rate warmup (5 epochs) and cosine annealing
- Gradient clipping (max_norm=1.0)
- Stratified 5-fold cross-validation
- Bootstrap confidence intervals
- Focal Loss for class imbalance
- Temperature scaling calibration
- Per-class performance thresholds (melanoma F1 > 0.90)
- Comprehensive error analysis and clinical expert review


In [None]:
from IPython import get_ipython
if get_ipython():
    get_ipython().kernel.do_shutdown(restart=True)

: 

In [1]:
import os

print(os.getcwd())

os.chdir('C:\\Users\\elzha\\Desktop\\CSCI 494 DL\\GroupProject')

c:\Users\elzha\Desktop\CSCI 494 DL\GroupProject


In [2]:
!python -m venv .venvNew 

In [3]:
!.venv\Scripts\activate

The system cannot find the path specified.


In [4]:
# Install required packages
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


Looking in indexes: https://download.pytorch.org/whl/cu121, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [5]:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available in PyTorch: {torch.cuda.is_available()}")
print(f"PyTorch CUDA version: {torch.version.cuda}")
print(f"Number of GPUs detected: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Compute Capability: {torch.cuda.get_device_capability(0)}")
else:
    print("GPU not detected - check hardware/drivers.")


PyTorch version: 2.5.1+cu121
CUDA available in PyTorch: True
PyTorch CUDA version: 12.1
Number of GPUs detected: 1
GPU Name: NVIDIA GeForce RTX 4070 SUPER
GPU Compute Capability: (8, 9)


In [6]:
# Imports
import os
import sys
import warnings
import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from collections import defaultdict
import pickle

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from tqdm import tqdm
from numpy.typing import ArrayLike

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
import torchvision.models as models
from torchvision.models import ResNet50_Weights

import albumentations as A
from albumentations.pytorch import ToTensorV2

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve,
    auc,
    brier_score_loss,
    log_loss,
    precision_recall_curve,
    classification_report
)
from sklearn.manifold import TSNE

warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Set random seeds for reproducibility
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"PyTorch version: {torch.__version__}")
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.5.1+cu121
GPU available: True
GPU: NVIDIA GeForce RTX 4070 SUPER


In [18]:
%pip install kagglehub 

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [8]:
# Configuration
import os
from pathlib import Path

class Config:
    """Configuration container for training and model parameters."""

    # Dataset - Using local paths
    home_dir = str(Path.home())
    ISIC_DATASET_URL = 'https://api.isic-archive.com/collections/66/'
    DATA_DIR = os.path.join(home_dir, 'isic_data')
    OUTPUT_DIR = os.path.join(home_dir, 'melanoma_results')
    CHECKPOINT_DIR = os.path.join(OUTPUT_DIR, 'checkpoints')
    RESULTS_DIR = os.path.join(OUTPUT_DIR, 'results')

    # Model
    MODEL_NAME = 'resnet50'
    IMG_SIZE = 224
    NUM_CLASSES = 2  # Binary: melanoma vs non-melanoma
    PRETRAINED = True

    # Training (IMPROVED)
    BATCH_SIZE = 64  # ↑ 2x larger for stable gradients
    MAX_EPOCHS = 150  # ↑ 50% more epochs
    WARMUP_EPOCHS = 8
    EARLY_STOPPING_PATIENCE = 20  # ↑ More patience
    
    # Learning rates (IMPROVED: 5-10x higher)
    HEAD_LR = 5e-3       # ↑ 5x increase
    BACKBONE_LR_LOW = 5e-5    # ↑ 5x increase
    BACKBONE_LR_MID = 1e-4    # ↑ 5x increase
    BACKBONE_LR_HIGH = 5e-4   # ↑ 10x increase
    WEIGHT_DECAY = 5e-5  # ↓ 50% reduction (less regularization)
    
    # Regularization (IMPROVED)
    DROPOUT = 0.3  # ↓ 40% reduction to prevent underfitting
    GRADIENT_CLIP = 1.0
    LABEL_SMOOTHING = 0.1  # NEW for calibration

    # Validation
    TEST_SIZE = 0.2
    N_SPLITS = 5  # For 5-fold CV
    RANDOM_STATE = SEED

    # Device
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    NUM_WORKERS = 0  # Set to 0 for Windows compatibility

    # Performance thresholds
    MIN_MELANOMA_F1 = 0.90

    # Bootstrap
    N_BOOTSTRAP_SAMPLES = 1000
    CI_LEVEL = 0.95

config = Config()
os.makedirs(config.CHECKPOINT_DIR, exist_ok=True)
os.makedirs(config.RESULTS_DIR, exist_ok=True)
os.makedirs(config.DATA_DIR, exist_ok=True)

print(f"✓ Directories created")
print(f"Device: {config.DEVICE}")
print(f"Data directory: {config.DATA_DIR}")
print(f"Output directory: {config.OUTPUT_DIR}")
print(f"Random seed used for initialization: {SEED}")


✓ Directories created
Device: cuda
Data directory: C:\Users\elzha\isic_data
Output directory: C:\Users\elzha\melanoma_results
Random seed used for initialization: 42


In [20]:
# Load HAM10000 dataset from local cache
def load_ham10000_dataset_local(dataset_path: str) -> pd.DataFrame:
    """Load HAM10000 dataset directly from local cache path."""
    import os
    from pathlib import Path
    
    print(f"Loading HAM10000 dataset from local cache...")
    print(f"Path: {dataset_path}")
    print()
    
    try:
        # Verify the path exists
        if not os.path.exists(dataset_path):
            print(f"❌ Dataset path does not exist: {dataset_path}")
            print()
            return pd.DataFrame()
        
        # Load metadata
        metadata_path = os.path.join(dataset_path, 'HAM10000_metadata.csv')
        if not os.path.exists(metadata_path):
            print(f"❌ Metadata file not found: {metadata_path}")
            print()
            return pd.DataFrame()
            
        df = pd.read_csv(metadata_path)
        
        # Find image directories (handles different naming conventions)
        image_dirs = []
        for dir_name in ['HAM10000_images_part_1', 'HAM10000_images_part_2',
                         'ham10000_images_part_1', 'ham10000_images_part_2']:
            dir_path = os.path.join(dataset_path, dir_name)
            if os.path.exists(dir_path):
                image_dirs.append(dir_path)
        
        if not image_dirs:
            print("⚠️  No image directories found!")
            print(f"Contents of {dataset_path}:")
            print(os.listdir(dataset_path))
            return pd.DataFrame()
        
        # Resolve full image paths
        def get_image_path(image_id):
            for img_dir in image_dirs:
                img_path = os.path.join(img_dir, f'{image_id}.jpg')
                if os.path.exists(img_path):
                    return img_path
            return None
        
        df['image_path'] = df['image_id'].apply(get_image_path)
        df = df[df['image_path'].notna()].reset_index(drop=True)
        
        # Create binary labels (melanoma vs non-melanoma)
        df['binary_label'] = (df['dx'] == 'mel').astype(int)
        
        # Rename diagnosis to match expected format
        df['diagnosis'] = df['dx']
        
        print(f"✓ Loaded {len(df)} images with verified paths")
        print()
        print("Diagnosis distribution:")
        print(df['diagnosis'].value_counts().to_dict())
        print()
        print(f"Melanoma (binary_label=1): {(df['binary_label'] == 1).sum()}")
        print(f"Non-melanoma (binary_label=0): {(df['binary_label'] == 0).sum()}")
        
        return df[['image_id', 'image_path', 'binary_label', 'diagnosis']].reset_index(drop=True)
        
    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        print()
        return pd.DataFrame()


# Load dataset from specified local path
DATASET_PATH = r'C:\Users\elzha\.cache\kagglehub\datasets\kmader\skin-cancer-mnist-ham10000\versions\2'
df_isic = load_ham10000_dataset_local(DATASET_PATH)

if df_isic.empty:
    print()
    print("⚠️  Could not load dataset. Please check the path and try again.")
    print()
    use_ham10000 = False
else:
    print()
    print(f"✓ Dataset ready for training: {len(df_isic)} images")
    use_ham10000 = True


Loading HAM10000 dataset from local cache...
Path: C:\Users\elzha\.cache\kagglehub\datasets\kmader\skin-cancer-mnist-ham10000\versions\2

✓ Loaded 10015 images with verified paths

Diagnosis distribution:
{'nv': 6705, 'mel': 1113, 'bkl': 1099, 'bcc': 514, 'akiec': 327, 'vasc': 142, 'df': 115}

Melanoma (binary_label=1): 1113
Non-melanoma (binary_label=0): 8902

✓ Dataset ready for training: 10015 images


In [21]:
# Dataset class
class SkinCancerDataset(Dataset):
    """PyTorch Dataset for skin cancer classification."""

    def __init__(self, dataframe: pd.DataFrame, transform: Optional[A.Compose] = None):
        self.df = dataframe.reset_index(drop=True)
        self.transform = transform

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        img_path = self.df.loc[idx, 'image_path']
        label = int(self.df.loc[idx, 'binary_label'])

        # Load image
        image = cv2.imread(str(img_path))
        if image is None:
            # Fallback: create dummy image
            image = np.zeros((224, 224, 3), dtype=np.uint8)
        else:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Apply transforms
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']

        return {
            'image': image,
            'label': torch.tensor(label, dtype=torch.long),
            'image_id': self.df.loc[idx, 'image_id'] if 'image_id' in self.df.columns else str(idx)
        }

# Augmentation pipelines
def get_train_transforms(img_size: int = 224) -> A.Compose:
    return A.Compose([
        A.RandomResizedCrop(size=(img_size, img_size), scale=(0.8, 1.0), p=1.0),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.Rotate(limit=30, border_mode=cv2.BORDER_REFLECT_101, p=0.7),
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
        A.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05, p=0.3),
        A.GaussNoise(p=0.2),
        A.Blur(blur_limit=3, p=0.2),
        A.CoarseDropout(max_holes=1, max_height=int(0.1*img_size), max_width=int(0.1*img_size), p=0.2),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ])

def get_val_transforms(img_size: int = 224) -> A.Compose:
    return A.Compose([
        A.Resize(img_size, img_size),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ])

print("✓ Dataset class and transforms defined")

✓ Dataset class and transforms defined


In [23]:
# Model architecture
class ResNet50Classifier(nn.Module):
    def __init__(self, num_classes: int = 2, pretrained: bool = True, dropout: float = 0.3):
        super().__init__()
        
        weights = ResNet50_Weights.IMAGENET1K_V2 if pretrained else None
        self.backbone = models.resnet50(weights=weights)
        num_features = self.backbone.fc.in_features  # 2048
        self.backbone.fc = nn.Identity()
        
        # IMPROVED head: 2048 → 2048 → 1024 → 512 → 2 (added capacity)
        self.head = nn.Sequential(
            nn.Dropout(dropout),  # 0.3 - more permissive
            nn.Linear(num_features, 2048),  # 2048 → 2048 (EXPANDED)
            nn.ReLU(),
            nn.BatchNorm1d(2048),
            nn.Dropout(dropout * 0.5),  # 0.15
            nn.Linear(2048, 1024),  # 2048 → 1024 (NEW intermediate layer)
            nn.ReLU(),
            nn.BatchNorm1d(1024),
            nn.Dropout(dropout * 0.5),  # 0.15
            nn.Linear(1024, 512),  # 1024 → 512
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(dropout * 0.25),  # 0.075 - lighter at end
            nn.Linear(512, num_classes)  # 512 → 2
        )
    
    def forward(self, x: torch.Tensor):
        features = self.backbone(x)
        logits = self.head(features)
        return logits, features


# Focal Loss for class imbalance
class FocalLoss(nn.Module):
    def __init__(self, alpha: Optional[torch.Tensor] = None, gamma: float = 1.8,  # TUNED
                 reduction: str = 'mean', label_smoothing: float = 0.1):  # NEW
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma  # 1.8 - less aggressive
        self.reduction = reduction
        self.label_smoothing = label_smoothing  # NEW for calibration
    
    def forward(self, inputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        # Apply label smoothing if enabled
        if self.label_smoothing > 0:
            n_classes = inputs.size(1)
            targets_smooth = targets.float()
            targets_smooth = targets_smooth * (1 - self.label_smoothing) + self.label_smoothing / n_classes
        
        ce_loss = F.cross_entropy(inputs, targets, weight=self.alpha, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss
        
        if self.reduction == 'mean':
            return focal_loss.mean()
        return focal_loss.sum()

# Temperature scaling
class TemperatureScaler(nn.Module):
    """Temperature scaling for calibration."""

    def __init__(self):
        super().__init__()
        self.log_temperature = nn.Parameter(torch.zeros(1))

    @property
    def temperature(self) -> torch.Tensor:
        return self.log_temperature.exp()

    def forward(self, logits: torch.Tensor) -> torch.Tensor:
        return logits / self.temperature

print("✓ Model architecture defined")
print(f"Random seed for weight initialization: {SEED}")


✓ Model architecture defined
Random seed for weight initialization: 42


In [24]:
# After loading the improved notebook, check config values:
print(f"Batch Size: {config.BATCH_SIZE}")  # Should be 64
print(f"Dropout: {config.DROPOUT}")        # Should be 0.3
print(f"Head LR: {config.HEAD_LR}")        # Should be 5e-3
print(f"Max Epochs: {config.MAX_EPOCHS}")  # Should be 150

# Count model parameters:
model = ResNet50Classifier(config.NUM_CLASSES, config.PRETRAINED, config.DROPOUT)
head_params = sum(p.numel() for p in model.head.parameters())
print(f"Head Parameters: {head_params}")  # Should be ~3.4M (was 2.6M)

# Check loss function:
criterion = FocalLoss(alpha=None, gamma=1.8, label_smoothing=0.1)
print(f"Focal Loss Gamma: {criterion.gamma}")  # Should be 1.8
print(f"Label Smoothing: {criterion.label_smoothing}")  # Should be 0.1


Batch Size: 64
Dropout: 0.3
Head LR: 0.005
Max Epochs: 150
Head Parameters: 6827522
Focal Loss Gamma: 1.8
Label Smoothing: 0.1


In [26]:
# Training utilities
def get_layer_groups(model: nn.Module) -> List[List[nn.Parameter]]:
    """Get parameter groups for discriminative fine-tuning."""
    groups = []

    # Layer 4 (deepest)
    groups.append(list(model.backbone.layer4.parameters()))
    # Layer 3
    groups.append(list(model.backbone.layer3.parameters()))
    # Layer 2
    groups.append(list(model.backbone.layer2.parameters()))
    # Layer 1
    groups.append(list(model.backbone.layer1.parameters()))
    # Conv1 + BN
    groups.append(list(model.backbone.conv1.parameters()) + list(model.backbone.bn1.parameters()))

    return groups

def build_optimizer_warmup(model: nn.Module, config: Config) -> optim.Optimizer:
    """Build optimizer for warmup (head only)."""
    return optim.AdamW(model.head.parameters(), lr=config.HEAD_LR, weight_decay=config.WEIGHT_DECAY)

def build_optimizer_finetune(model: nn.Module, config: Config) -> optim.Optimizer:
    """Build optimizer with discriminative learning rates."""
    param_groups = [
        {'params': model.head.parameters(), 'lr': config.HEAD_LR},
    ]

    layer_groups = get_layer_groups(model)
    lrs = [config.BACKBONE_LR_HIGH, config.BACKBONE_LR_HIGH, config.BACKBONE_LR_MID,
           config.BACKBONE_LR_MID, config.BACKBONE_LR_LOW]

    for params, lr in zip(layer_groups, lrs):
        param_groups.append({'params': params, 'lr': lr})

    return optim.AdamW(param_groups, weight_decay=config.WEIGHT_DECAY)

def freeze_backbone(model: nn.Module):
    """Freeze backbone parameters."""
    for param in model.backbone.parameters():
        param.requires_grad = False

def unfreeze_backbone(model: nn.Module):
    """Unfreeze backbone parameters."""
    for param in model.backbone.parameters():
        param.requires_grad = True

def cosine_annealing(epoch: int, max_epochs: int, base_lr: float) -> float:
    """Cosine annealing schedule."""
    return base_lr * 0.5 * (1 + np.cos(np.pi * epoch / max_epochs))

def warmup_lr(epoch: int, warmup_epochs: int, base_lr: float) -> float:
    """Warmup learning rate schedule."""
    if epoch < warmup_epochs:
        return base_lr * (epoch + 1) / warmup_epochs
    return base_lr

print("✓ Training utilities defined")


✓ Training utilities defined


In [27]:
# Training and evaluation functions
def train_epoch(model: nn.Module, loader: DataLoader, criterion: nn.Module,
                optimizer: optim.Optimizer, device: str, gradient_clip: float = 1.0) -> Tuple[float, float]:
    """Train for one epoch with gradient clipping."""
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    pbar = tqdm(loader, desc='Training')
    for batch in pbar:
        images = batch['image'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits, _ = model(images)
        loss = criterion(logits, labels)

        loss.backward()
        if gradient_clip > 0:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=gradient_clip)
        optimizer.step()

        total_loss += loss.item() * images.size(0)
        _, preds = logits.max(1)
        correct += preds.eq(labels).sum().item()
        total += labels.size(0)

        pbar.set_postfix({'loss': total_loss / total, 'acc': 100 * correct / total})

    return total_loss / total, correct / total

def validate(model: nn.Module, loader: DataLoader, criterion: nn.Module, device: str) -> Tuple[float, float, float]:
    """Validate model."""
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(loader, desc='Validation'):
            images = batch['image'].to(device)
            labels = batch['label'].to(device)

            logits, _ = model(images)
            loss = criterion(logits, labels)

            total_loss += loss.item() * images.size(0)
            _, preds = logits.max(1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(loader.dataset)
    accuracy = np.mean(np.array(all_preds) == np.array(all_labels))
    f1 = f1_score(all_labels, all_preds, zero_division=0)

    return avg_loss, accuracy, f1

def evaluate(model: nn.Module, loader: DataLoader, device: str) -> Dict[str, Any]:
    """Full evaluation with all metrics."""
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for batch in tqdm(loader, desc='Evaluating'):
            images = batch['image'].to(device)
            labels = batch['label'].to(device)

            logits, _ = model(images)
            probs = torch.softmax(logits, dim=1)
            _, preds = probs.max(1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_probs.extend(probs[:, 1].cpu().numpy())

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    all_probs = np.array(all_probs)

    cm = confusion_matrix(all_labels, all_preds)

    # Per-class metrics
    report = classification_report(all_labels, all_preds, output_dict=True, zero_division=0)

    return {
        'accuracy': accuracy_score(all_labels, all_preds),
        'precision': precision_score(all_labels, all_preds, zero_division=0),
        'recall': recall_score(all_labels, all_preds, zero_division=0),
        'f1': f1_score(all_labels, all_preds, zero_division=0),
        'roc_auc': roc_auc_score(all_labels, all_probs),
        'confusion_matrix': cm,
        'predictions': all_preds,
        'labels': all_labels,
        'probabilities': all_probs,
        'report': report
    }

print("✓ Training and evaluation functions defined")


✓ Training and evaluation functions defined


In [28]:
# Main training pipeline
def train_single_fold(model: nn.Module, train_loader: DataLoader, val_loader: DataLoader,
                     config: Config, device: str, fold_idx: int) -> Tuple[float, Dict[str, Any]]:
    """Train model on single fold."""

    # Compute class weights
    train_labels = []
    for batch in train_loader:
        train_labels.extend(batch['label'].numpy())
    train_labels = np.array(train_labels)

    class_counts = np.bincount(train_labels)
    class_weights = len(train_labels) / (len(class_counts) * class_counts)
    class_weights = torch.FloatTensor(class_weights).to(device)

    print(f"\\nFold {fold_idx+1} - Class weights: {class_weights.cpu().numpy()}")

    criterion = FocalLoss(alpha=class_weights, gamma=2.0)

    # Warmup phase
    freeze_backbone(model)
    optimizer = build_optimizer_warmup(model, config)

    best_val_f1 = 0.0
    patience_counter = 0
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': [], 'val_f1': []}

    print(f"\\n--- WARMUP PHASE (Epochs 1-{config.WARMUP_EPOCHS}) ---")
    for epoch in range(config.WARMUP_EPOCHS):
        lr = warmup_lr(epoch, config.WARMUP_EPOCHS, config.HEAD_LR)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device, config.GRADIENT_CLIP)
        val_loss, val_acc, val_f1 = validate(model, val_loader, criterion, device)

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)

        print(f"Ep {epoch+1}: TrLoss={train_loss:.4f} TrAcc={train_acc:.4f} VlLoss={val_loss:.4f} VlAcc={val_acc:.4f} VlF1={val_f1:.4f} LR={lr:.6f}")

    # Fine-tuning phase
    print(f"\\n--- FINE-TUNING PHASE (Progressive unfreezing) ---")
    unfreeze_backbone(model)
    optimizer = build_optimizer_finetune(model, config)

    for epoch in range(config.WARMUP_EPOCHS, config.MAX_EPOCHS):
        # Cosine annealing for each param group
        for param_group in optimizer.param_groups:
            param_group['lr'] = cosine_annealing(epoch - config.WARMUP_EPOCHS,
                                                 config.MAX_EPOCHS - config.WARMUP_EPOCHS,
                                                 param_group['lr'])

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device, config.GRADIENT_CLIP)
        val_loss, val_acc, val_f1 = validate(model, val_loader, criterion, device)

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)

        if epoch % 5 == 0:
            print(f"Ep {epoch+1}: TrLoss={train_loss:.4f} TrAcc={train_acc:.4f} VlLoss={val_loss:.4f} VlAcc={val_acc:.4f} VlF1={val_f1:.4f}")

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            patience_counter = 0
            checkpoint_path = os.path.join(config.CHECKPOINT_DIR, f'fold_{fold_idx}_best.pth')
            torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1

        if patience_counter >= config.EARLY_STOPPING_PATIENCE:
            print(f"\\n✓ Early stopping at epoch {epoch+1}")
            break

    # Load best model
    checkpoint_path = os.path.join(config.CHECKPOINT_DIR, f'fold_{fold_idx}_best.pth')
    model.load_state_dict(torch.load(checkpoint_path))

    # Final evaluation
    val_metrics = evaluate(model, val_loader, device)

    return best_val_f1, val_metrics

print("✓ Single fold training function defined")


✓ Single fold training function defined


In [29]:
# Cross-validation and bootstrapping
def bootstrap_ci(metric_values: np.ndarray, n_samples: int = 1000, ci: float = 0.95) -> Tuple[float, float, float]:
    """Compute bootstrap confidence interval."""
    bootstrapped = []
    for _ in range(n_samples):
        indices = np.random.choice(len(metric_values), size=len(metric_values), replace=True)
        bootstrapped.append(np.mean(metric_values[indices]))

    bootstrapped = np.array(bootstrapped)
    mean = np.mean(bootstrapped)
    alpha = 1 - ci
    lower = np.percentile(bootstrapped, alpha/2 * 100)
    upper = np.percentile(bootstrapped, (1 - alpha/2) * 100)

    return mean, lower, upper

def stratified_kfold_cv(df: pd.DataFrame, config: Config, device: str) -> Dict[str, Any]:
    """Perform stratified k-fold cross-validation."""

    skf = StratifiedKFold(n_splits=config.N_SPLITS, shuffle=True, random_state=config.RANDOM_STATE)

    fold_results = []
    all_test_preds = []
    all_test_labels = []
    all_test_probs = []

    for fold_idx, (train_idx, test_idx) in enumerate(skf.split(df, df['binary_label'])):
        print(f"\\n{'='*70}")
        print(f"FOLD {fold_idx + 1}/{config.N_SPLITS}")
        print(f"{'='*70}")

        train_df = df.iloc[train_idx].reset_index(drop=True)
        test_df = df.iloc[test_idx].reset_index(drop=True)

        # Further split train into train/val
        train_df, val_df = train_test_split(train_df, test_size=0.2,
                                           random_state=config.RANDOM_STATE,
                                           stratify=train_df['binary_label'])

        # Create datasets and loaders
        train_dataset = SkinCancerDataset(train_df, get_train_transforms(config.IMG_SIZE))
        val_dataset = SkinCancerDataset(val_df, get_val_transforms(config.IMG_SIZE))
        test_dataset = SkinCancerDataset(test_df, get_val_transforms(config.IMG_SIZE))

        train_loader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=config.NUM_WORKERS)
        val_loader = DataLoader(val_dataset, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=config.NUM_WORKERS)
        test_loader = DataLoader(test_dataset, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=config.NUM_WORKERS)

        print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")
        print(f"Train melanoma: {(train_df['binary_label'] == 1).sum()} | Non-melanoma: {(train_df['binary_label'] == 0).sum()}")

        # Create and train model
        model = ResNet50Classifier(config.NUM_CLASSES, config.PRETRAINED, config.DROPOUT)
        model = model.to(device)

        best_f1, val_metrics = train_single_fold(model, train_loader, val_loader, config, device, fold_idx)

        # Test evaluation
        test_metrics = evaluate(model, test_loader, device)

        fold_results.append({
            'fold': fold_idx,
            'val_metrics': val_metrics,
            'test_metrics': test_metrics
        })

        all_test_preds.extend(test_metrics['predictions'])
        all_test_labels.extend(test_metrics['labels'])
        all_test_probs.extend(test_metrics['probabilities'])

        print(f"\\n✓ Fold {fold_idx+1} completed")
        print(f"Test F1: {test_metrics['f1']:.4f} | Test AUC: {test_metrics['roc_auc']:.4f}")

    all_test_preds = np.array(all_test_preds)
    all_test_labels = np.array(all_test_labels)
    all_test_probs = np.array(all_test_probs)

    # Aggregate metrics with bootstrap CI
    f1_scores = [fold_results[i]['test_metrics']['f1'] for i in range(len(fold_results))]
    auc_scores = [fold_results[i]['test_metrics']['roc_auc'] for i in range(len(fold_results))]

    aggregated = {
        'fold_results': fold_results,
        'all_test_predictions': all_test_preds,
        'all_test_labels': all_test_labels,
        'all_test_probabilities': all_test_probs,
        'f1_mean': np.mean(f1_scores),
        'f1_std': np.std(f1_scores),
        'auc_mean': np.mean(auc_scores),
        'auc_std': np.std(auc_scores)
    }

    # Bootstrap CIs
    f1_mean, f1_lower, f1_upper = bootstrap_ci(np.array(f1_scores), config.N_BOOTSTRAP_SAMPLES, config.CI_LEVEL)
    auc_mean, auc_lower, auc_upper = bootstrap_ci(np.array(auc_scores), config.N_BOOTSTRAP_SAMPLES, config.CI_LEVEL)

    aggregated['f1_ci'] = (f1_lower, f1_upper)
    aggregated['auc_ci'] = (auc_lower, auc_upper)

    return aggregated

print("✓ Cross-validation functions defined")


✓ Cross-validation functions defined


In [30]:
# Clinical validation and error analysis
def per_class_metrics(predictions: np.ndarray, labels: np.ndarray, probabilities: np.ndarray) -> Dict[str, Any]:
    """Compute per-class metrics and check clinical thresholds."""

    metrics_by_class = {}
    class_names = {0: 'Non-Melanoma', 1: 'Melanoma'}

    for class_idx in [0, 1]:
        mask = labels == class_idx
        if mask.sum() == 0:
            continue

        class_preds = predictions[mask]
        class_labels = labels[mask]
        class_probs = probabilities[mask]

        metrics_by_class[class_names[class_idx]] = {
            'count': mask.sum(),
            'precision': precision_score(class_labels, class_preds, pos_label=class_idx, zero_division=0),
            'recall': recall_score(class_labels, class_preds, pos_label=class_idx, zero_division=0),
            'f1': f1_score(class_labels, class_preds, pos_label=class_idx, zero_division=0),
            'auc': roc_auc_score((class_labels == class_idx).astype(int), class_probs) if len(np.unique(class_labels)) > 1 else np.nan
        }

    return metrics_by_class

def clinical_validation(metrics_by_class: Dict[str, Any]) -> Dict[str, bool]:
    """Check if models meet clinical performance requirements."""

    validation_status = {}

    if 'Melanoma' in metrics_by_class:
        melanoma_f1 = metrics_by_class['Melanoma']['f1']
        passed = melanoma_f1 >= config.MIN_MELANOMA_F1
        validation_status['Melanoma F1 >= 0.90'] = passed
        print(f"\\n{'='*70}")
        print(f"CLINICAL VALIDATION REPORT")
        print(f"{'='*70}")
        print(f"Melanoma F1 Score: {melanoma_f1:.4f}")
        print(f"Requirement: >= {config.MIN_MELANOMA_F1}")
        print(f"Status: {'✓ PASSED' if passed else '✗ FAILED'}")

    return validation_status

def error_analysis(predictions: np.ndarray, labels: np.ndarray, probabilities: np.ndarray) -> Dict[str, Any]:
    """Analyze model errors."""

    errors = predictions != labels

    # False positives: predicted 1, actual 0
    fp_mask = (predictions == 1) & (labels == 0)
    # False negatives: predicted 0, actual 1
    fn_mask = (predictions == 0) & (labels == 1)

    fp_probs = probabilities[fp_mask]
    fn_probs = probabilities[fn_mask]

    return {
        'total_errors': errors.sum(),
        'error_rate': errors.mean(),
        'false_positives': fp_mask.sum(),
        'false_negatives': fn_mask.sum(),
        'avg_fp_confidence': fp_probs.mean() if len(fp_probs) > 0 else 0,
        'avg_fn_confidence': 1 - fn_probs.mean() if len(fn_probs) > 0 else 0,
    }

print("✓ Clinical validation functions defined")


✓ Clinical validation functions defined


In [31]:
# Model complexity analysis utilities
def count_parameters(model: nn.Module) -> Dict[str, int]:
    """Count model parameters."""
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    non_trainable_params = total_params - trainable_params
    
    return {
        'total_params': total_params,
        'trainable_params': trainable_params,
        'non_trainable_params': non_trainable_params,
        'total_params_millions': total_params / 1e6,
        'model_size_mb': (total_params * 4) / (1024**2)  # Assuming float32
    }

def estimate_flops(model: nn.Module, input_size: Tuple[int, int, int, int]) -> float:
    """Estimate FLOPs for the model (approximate)."""
    # Create a dummy input
    dummy_input = torch.randn(input_size).to(next(model.parameters()).device)
    
    # Count operations (simplified estimation)
    total_ops = 0
    
    def count_conv2d(m, x, y):
        nonlocal total_ops
        kernel_ops = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
        output_elements = y.numel()
        total_ops += kernel_ops * output_elements
    
    def count_linear(m, x, y):
        nonlocal total_ops
        total_ops += m.in_features * m.out_features
    
    # Register hooks
    hooks = []
    for module in model.modules():
        if isinstance(module, nn.Conv2d):
            hooks.append(module.register_forward_hook(count_conv2d))
        elif isinstance(module, nn.Linear):
            hooks.append(module.register_forward_hook(count_linear))
    
    # Forward pass
    model.eval()
    with torch.no_grad():
        _ = model(dummy_input)
    
    # Remove hooks
    for hook in hooks:
        hook.remove()
    
    return total_ops / 1e9  # Return in GFLOPs

def measure_inference_time(model: nn.Module, loader: DataLoader, device: str, num_batches: int = 10) -> Dict[str, float]:
    """Measure inference time and throughput."""
    model.eval()
    times = []
    
    with torch.no_grad():
        for i, batch in enumerate(loader):
            if i >= num_batches:
                break
            
            images = batch['image'].to(device)
            batch_size = images.size(0)
            
            # Warm up CUDA
            if i == 0 and torch.cuda.is_available():
                _ = model(images)
                torch.cuda.synchronize()
            
            start = time.time()
            _ = model(images)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            end = time.time()
            
            times.append((end - start) / batch_size)  # Time per sample
    
    avg_time_per_sample = np.mean(times)
    throughput = 1.0 / avg_time_per_sample  # Samples per second
    
    return {
        'avg_inference_time_per_sample_ms': avg_time_per_sample * 1000,
        'avg_inference_time_per_sample_s': avg_time_per_sample,
        'throughput_samples_per_second': throughput,
        'latency_ms': np.mean([t * 1000 for t in times])
    }

print("✓ Model complexity analysis utilities defined")

✓ Model complexity analysis utilities defined


In [None]:
# Monitoring utilities for time, memory, power, and energy
import time
import psutil
import threading
from collections import defaultdict

try:
    import pynvml
    pynvml.nvmlInit()
    NVML_AVAILABLE = True
except:
    NVML_AVAILABLE = False
    print("⚠️  NVML not available. GPU power monitoring will be disabled.")

class ResourceMonitor:
    """Monitor time, memory, power, and energy consumption."""
    
    def __init__(self):
        self.start_time = None
        self.end_time = None
        self.gpu_power_samples = []
        self.memory_samples = []
        self.monitoring = False
        self.monitor_thread = None
        
    def start(self):
        """Start monitoring resources."""
        self.start_time = time.time()
        self.gpu_power_samples = []
        self.memory_samples = []
        self.monitoring = True
        
        # Start background monitoring thread
        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.monitor_thread.start()
        
    def stop(self):
        """Stop monitoring resources."""
        self.end_time = time.time()
        self.monitoring = False
        if self.monitor_thread:
            self.monitor_thread.join(timeout=2.0)
        
    def _monitor_loop(self):
        """Background loop to collect power and memory samples."""
        while self.monitoring:
            try:
                # GPU Memory
                if torch.cuda.is_available():
                    gpu_mem = torch.cuda.memory_allocated() / (1024**3)  # GB
                    self.memory_samples.append(gpu_mem)
                
                # GPU Power (if available)
                if NVML_AVAILABLE:
                    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
                    power_mw = pynvml.nvmlDeviceGetPowerUsage(handle)  # milliwatts
                    power_w = power_mw / 1000.0  # watts
                    self.gpu_power_samples.append(power_w)
                    
            except Exception as e:
                pass
            
            time.sleep(0.5)  # Sample every 500ms
    
    def get_metrics(self) -> Dict[str, Any]:
        """Calculate and return all metrics."""
        if self.start_time is None or self.end_time is None:
            return {}
        
        elapsed_time = self.end_time - self.start_time
        
        metrics = {
            'elapsed_time_seconds': elapsed_time,
            'elapsed_time_minutes': elapsed_time / 60.0,
            'elapsed_time_hours': elapsed_time / 3600.0,
        }
        
        # Memory metrics
        if self.memory_samples:
            metrics['gpu_memory_avg_gb'] = np.mean(self.memory_samples)
            metrics['gpu_memory_max_gb'] = np.max(self.memory_samples)
            metrics['gpu_memory_min_gb'] = np.min(self.memory_samples)
        
        if torch.cuda.is_available():
            metrics['gpu_memory_peak_gb'] = torch.cuda.max_memory_allocated() / (1024**3)
        
        # Power and energy metrics
        if self.gpu_power_samples:
            avg_power_w = np.mean(self.gpu_power_samples)
            max_power_w = np.max(self.gpu_power_samples)
            min_power_w = np.min(self.gpu_power_samples)
            
            # Energy = Power × Time
            energy_wh = (avg_power_w * elapsed_time) / 3600.0  # Watt-hours
            energy_kwh = energy_wh / 1000.0  # Kilowatt-hours
            
            metrics['gpu_power_avg_watts'] = avg_power_w
            metrics['gpu_power_max_watts'] = max_power_w
            metrics['gpu_power_min_watts'] = min_power_w
            metrics['gpu_energy_wh'] = energy_wh
            metrics['gpu_energy_kwh'] = energy_kwh
        
        return metrics

print("✓ Resource monitoring utilities defined")

✓ Visualization functions defined


In [33]:
# Visualization functions
def plot_confusion_matrix(cm: np.ndarray, save_path: str):
    """Plot confusion matrix."""
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Non-Melanoma', 'Melanoma'],
                yticklabels=['Non-Melanoma', 'Melanoma'])
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.close()

def plot_roc_curve(labels: np.ndarray, probs: np.ndarray, save_path: str):
    """Plot ROC curve."""
    fpr, tpr, _ = roc_curve(labels, probs)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, lw=2, label=f'ROC Curve (AUC = {roc_auc:.3f})')
    plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random')
    plt.xlim([0, 1])
    plt.ylim([0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.close()

def plot_training_history(history_dict: Dict[str, List[float]], save_path: str):
    """Plot training history."""
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    axes[0].plot(history_dict['train_loss'], label='Train Loss')
    axes[0].plot(history_dict['val_loss'], label='Val Loss')
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Loss')
    axes[0].set_title('Training and Validation Loss')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    axes[1].plot(history_dict['train_acc'], label='Train Acc')
    axes[1].plot(history_dict['val_acc'], label='Val Acc')
    axes[1].plot(history_dict['val_f1'], label='Val F1')
    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('Score')
    axes[1].set_title('Training Metrics')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.close()

print("✓ Visualization functions defined")


✓ Visualization functions defined


In [35]:
# MAIN EXECUTION WITH ENHANCED METRICS

print(f"\n{'='*70}")
print(f"SKIN CANCER CLASSIFICATION PIPELINE")
print(f"{'='*70}")
print(f"Device: {config.DEVICE}")
print(f"Model: {config.MODEL_NAME}")
print(f"Random Seed: {SEED}")
print(f"Image Size: {config.IMG_SIZE}x{config.IMG_SIZE}")
print(f"Batch Size: {config.BATCH_SIZE}")
print(f"N-Fold CV: {config.N_SPLITS}")
print(f"{'='*70}\n")

# Load data
if not df_isic.empty:
    print(f"\n✓ Dataset loaded: {len(df_isic)} samples")
    print(f"Class distribution:\n{df_isic['binary_label'].value_counts()}\n")
    
    # ========== 1. MODEL COMPLEXITY ANALYSIS ==========
    print(f"\n{'='*70}")
    print(f"1. COMPUTATIONAL COMPLEXITY")
    print(f"{'='*70}")
    
    # Create a sample model for analysis
    sample_model = ResNet50Classifier(config.NUM_CLASSES, config.PRETRAINED, config.DROPOUT)
    sample_model = sample_model.to(config.DEVICE)
    
    # Count parameters
    param_stats = count_parameters(sample_model)
    print(f"\nModel Parameters:")
    print(f"  Total Parameters: {param_stats['total_params']:,}")
    print(f"  Trainable Parameters: {param_stats['trainable_params']:,}")
    print(f"  Non-trainable Parameters: {param_stats['non_trainable_params']:,}")
    print(f"  Parameters (Millions): {param_stats['total_params_millions']:.2f}M")
    
    # Estimate FLOPs
    print(f"\nComputational Cost:")
    input_size = (1, 3, config.IMG_SIZE, config.IMG_SIZE)
    flops = estimate_flops(sample_model, input_size)
    print(f"  FLOPs per inference: {flops:.2f} GFLOPs")
    
    # Measure inference time
    print(f"\nInference Performance:")
    # Create a small test loader for timing
    test_subset = df_isic.head(100)
    test_dataset = SkinCancerDataset(test_subset, get_val_transforms(config.IMG_SIZE))
    test_loader = DataLoader(test_dataset, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=0)
    
    inference_metrics = measure_inference_time(sample_model, test_loader, config.DEVICE, num_batches=3)
    print(f"  Average Latency: {inference_metrics['latency_ms']:.2f} ms per sample")
    print(f"  Throughput: {inference_metrics['throughput_samples_per_second']:.2f} samples/second")
    
    # ========== 2. MEMORY COMPLEXITY ==========
    print(f"\n{'='*70}")
    print(f"2. MEMORY COMPLEXITY")
    print(f"{'='*70}")
    
    print(f"\nModel Capacity:")
    print(f"  Model Size (Memory Footprint): {param_stats['model_size_mb']:.2f} MB")
    
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.empty_cache()
        
        # Warm-up forward pass
        dummy_batch = next(iter(test_loader))
        dummy_images = dummy_batch['image'].to(config.DEVICE)
        _ = sample_model(dummy_images)
        
        peak_memory_mb = torch.cuda.max_memory_allocated() / (1024**2)
        current_memory_mb = torch.cuda.memory_allocated() / (1024**2)
        
        print(f"\nRuntime Memory Usage (GPU):")
        print(f"  Current Memory Allocated: {current_memory_mb:.2f} MB")
        print(f"  Peak Memory Allocated: {peak_memory_mb:.2f} MB")
    
    del sample_model, test_loader, test_dataset
    torch.cuda.empty_cache()
    
    # ========== 3. TRAINING WITH RESOURCE MONITORING ==========
    print(f"\n{'='*70}")
    print(f"3. TRAINING WITH RESOURCE & ENERGY MONITORING")
    print(f"{'='*70}\n")
    
    # Initialize resource monitor
    training_monitor = ResourceMonitor()
    training_monitor.start()
    
    # Track start time
    training_start_time = time.time()
    
    # Run stratified 5-fold CV
    cv_results = stratified_kfold_cv(df_isic, config, config.DEVICE)
    
    # Stop monitoring
    training_monitor.stop()
    training_end_time = time.time()
    
    # Get resource metrics
    resource_metrics = training_monitor.get_metrics()
    
    # ========== DISPLAY RESULTS ==========
    print(f"\n{'='*70}")
    print(f"CROSS-VALIDATION RESULTS")
    print(f"{'='*70}")
    print(f"F1 Score: {cv_results['f1_mean']:.4f} ± {cv_results['f1_std']:.4f}")
    print(f"F1 CI ({int(config.CI_LEVEL*100)}%): [{cv_results['f1_ci'][0]:.4f}, {cv_results['f1_ci'][1]:.4f}]")
    print(f"\nROC AUC: {cv_results['auc_mean']:.4f} ± {cv_results['auc_std']:.4f}")
    print(f"AUC CI ({int(config.CI_LEVEL*100)}%): [{cv_results['auc_ci'][0]:.4f}, {cv_results['auc_ci'][1]:.4f}]")
    
    # ========== TIME COMPLEXITY ==========
    print(f"\n{'='*70}")
    print(f"TIME COMPLEXITY (TRAINING)")
    print(f"{'='*70}")
    print(f"Total Training Time: {resource_metrics['elapsed_time_hours']:.2f} hours")
    print(f"                     ({resource_metrics['elapsed_time_minutes']:.2f} minutes)")
    print(f"                     ({resource_metrics['elapsed_time_seconds']:.2f} seconds)")
    
    # ========== MEMORY USAGE ==========
    print(f"\n{'='*70}")
    print(f"MEMORY USAGE (TRAINING)")
    print(f"{'='*70}")
    if 'gpu_memory_avg_gb' in resource_metrics:
        print(f"Average GPU Memory Usage: {resource_metrics['gpu_memory_avg_gb']:.2f} GB")
        print(f"Peak GPU Memory Usage: {resource_metrics['gpu_memory_max_gb']:.2f} GB")
        print(f"Minimum GPU Memory Usage: {resource_metrics['gpu_memory_min_gb']:.2f} GB")
    
    if 'gpu_memory_peak_gb' in resource_metrics:
        print(f"PyTorch Peak Memory: {resource_metrics['gpu_memory_peak_gb']:.2f} GB")
    
    # ========== POWER & ENERGY ==========
    print(f"\n{'='*70}")
    print(f"POWER & ENERGY CONSUMPTION")
    print(f"{'='*70}")
    if 'gpu_power_avg_watts' in resource_metrics:
        print(f"Average GPU Power: {resource_metrics['gpu_power_avg_watts']:.2f} W")
        print(f"Peak GPU Power: {resource_metrics['gpu_power_max_watts']:.2f} W")
        print(f"Minimum GPU Power: {resource_metrics['gpu_power_min_watts']:.2f} W")
        print(f"\nTotal Energy Consumed: {resource_metrics['gpu_energy_wh']:.2f} Wh")
        print(f"                       ({resource_metrics['gpu_energy_kwh']:.6f} kWh)")
        
        # Calculate CO2 emissions (approximate, using US average: 0.92 lbs CO2/kWh)
        co2_lbs = resource_metrics['gpu_energy_kwh'] * 0.92
        co2_kg = co2_lbs * 0.453592
        print(f"\nEstimated CO2 Emissions: {co2_kg:.4f} kg CO2")
    else:
        print("⚠️  GPU power monitoring not available (pynvml not installed or not supported)")
    
    # ========== PER-CLASS METRICS ==========
    class_metrics = per_class_metrics(cv_results['all_test_predictions'],
                                     cv_results['all_test_labels'],
                                     cv_results['all_test_probabilities'])
    
    print(f"\n{'='*70}")
    print(f"PER-CLASS METRICS")
    print(f"{'='*70}")
    for class_name, metrics in class_metrics.items():
        print(f"\n{class_name}:")
        print(f"  Samples: {metrics['count']}")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1 Score: {metrics['f1']:.4f}")
        if not np.isnan(metrics['auc']):
            print(f"  AUC: {metrics['auc']:.4f}")
    
    # Clinical validation
    validation_status = clinical_validation(class_metrics)
    
    # Error analysis
    error_report = error_analysis(cv_results['all_test_predictions'],
                                 cv_results['all_test_labels'],
                                 cv_results['all_test_probabilities'])
    
    print(f"\n{'='*70}")
    print(f"ERROR ANALYSIS")
    print(f"{'='*70}")
    print(f"Total Errors: {error_report['total_errors']}")
    print(f"Error Rate: {error_report['error_rate']:.4f}")
    print(f"False Positives: {error_report['false_positives']} (avg confidence: {error_report['avg_fp_confidence']:.4f})")
    print(f"False Negatives: {error_report['false_negatives']} (avg confidence: {error_report['avg_fn_confidence']:.4f})")
    
    # Visualizations
    print(f"\nGenerating visualizations...")
    plot_confusion_matrix(confusion_matrix(cv_results['all_test_labels'], cv_results['all_test_predictions']),
                        os.path.join(config.RESULTS_DIR, 'confusion_matrix.png'))
    plot_roc_curve(cv_results['all_test_labels'], cv_results['all_test_probabilities'],
                   os.path.join(config.RESULTS_DIR, 'roc_curve.png'))
    
    print(f"✓ Visualizations saved to {config.RESULTS_DIR}")
    
    # ========== SUMMARY ==========
    print(f"\n{'='*70}")
    print(f"TRAINING SUMMARY")
    print(f"{'='*70}")
    print(f"✓ Model: ResNet-50 with {param_stats['total_params_millions']:.2f}M parameters")
    print(f"✓ Training Time: {resource_metrics['elapsed_time_hours']:.2f} hours")
    print(f"✓ Test F1 Score: {cv_results['f1_mean']:.4f} ± {cv_results['f1_std']:.4f}")
    print(f"✓ Test AUC: {cv_results['auc_mean']:.4f} ± {cv_results['auc_std']:.4f}")
    if 'gpu_energy_kwh' in resource_metrics:
        print(f"✓ Energy Consumed: {resource_metrics['gpu_energy_kwh']:.4f} kWh")
    print(f"\n✓ Pipeline completed successfully!")
    
else:
    print("\n⚠️  No data available. Please ensure dataset loading is working.")
    print("Dataset should have columns: 'image_path', 'binary_label', 'image_id'")



SKIN CANCER CLASSIFICATION PIPELINE
Device: cuda
Model: resnet50
Random Seed: 42
Image Size: 224x224
Batch Size: 64
N-Fold CV: 5


✓ Dataset loaded: 10015 samples
Class distribution:
binary_label
0    8902
1    1113
Name: count, dtype: int64


1. COMPUTATIONAL COMPLEXITY

Model Parameters:
  Total Parameters: 30,335,554
  Trainable Parameters: 30,335,554
  Non-trainable Parameters: 0
  Parameters (Millions): 30.34M

Computational Cost:
  FLOPs per inference: 4.09 GFLOPs

Inference Performance:


NameError: name 'time' is not defined