In [None]:
# STEP 1: Install Required Libraries
# Shell-level installs for the notebook runtime; -q keeps pip's output quiet.
# NOTE(review): kaggle and albumentations are not referenced in the visible
# cells -- confirm they are still needed. Versions are not pinned.
!pip install -q kaggle timm scikit-learn albumentations tqdm
!pip install -q torch torchvision

In [None]:
# STEP 2: Mount Google Drive
# Mounts Drive at /content/drive; the cells below read and write files under
# /content/drive/MyDrive/PW2.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# STEP 3: Import Libraries
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image, UnidentifiedImageError

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, f1_score, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim  # <-- THIS WAS MISSING
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import timm  # For ResNeSt-101

In [None]:
# STEP 4: Load ISIC 2019 Malignant Metadata
gt_2019 = pd.read_csv('/content/drive/MyDrive/PW2/ISIC_2019_Training_GroundTruth.csv')
meta_2019 = pd.read_csv('/content/drive/MyDrive/PW2/ISIC_2019_Training_Metadata.csv')

# The MEL indicator column is reused as the binary target.
gt_2019['target'] = gt_2019['MEL']

# Keep only rows with target == 1 and attach their metadata via the image id.
is_melanoma_2019 = gt_2019['target'] == 1
malignant_2019 = gt_2019[is_melanoma_2019].merge(meta_2019, on='image')

# Each image is looked up as <image id>.jpg inside the 2019 malignant folder.
malignant_2019['image_path'] = [
    f"/content/drive/MyDrive/PW2/ISIC_2019_Malignant_Images/{image_id}.jpg"
    for image_id in malignant_2019['image']
]

In [None]:
# STEP 5: Load ISIC 2020 Metadata
meta_2020 = pd.read_csv('/content/drive/MyDrive/PW2/metadata.csv')

# Build the on-disk location of every 2020 training image from its name.
meta_2020['image_path'] = [
    f"/content/drive/MyDrive/PW2/dataset/train/{image_name}.jpg"
    for image_name in meta_2020['image_name']
]

# Align the 2020 column names with the ones used by the 2019 tables.
column_aliases_2020 = {
    'image_name': 'image',
    'anatom_site_general_challenge': 'anatom_site_general',
}
meta_2020 = meta_2020.rename(columns=column_aliases_2020)

In [None]:
# STEP 6: Handle Missing Values
# Fill gaps in both metadata tables: missing ages get that table's mean age,
# missing sex / anatomical site get the literal category 'unknown'.
#
# The filled column is assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a column selection,
# which emits a FutureWarning on pandas 2.x and does not update the frame at
# all once Copy-on-Write is active, silently leaving the NaNs in place.
for df in [malignant_2019, meta_2020]:
    df['age_approx'] = df['age_approx'].fillna(df['age_approx'].mean())
    df['sex'] = df['sex'].fillna('unknown')
    df['anatom_site_general'] = df['anatom_site_general'].fillna('unknown')

In [None]:
# STEP 7: Combine Malignant Images from 2019 and 2020
is_malignant_2020 = meta_2020['target'] == 1
malignant_2020 = meta_2020.loc[is_malignant_2020]
all_malignant = pd.concat(
    [malignant_2019, malignant_2020],
    ignore_index=True,
)

# Undersample benign 2020 cases to the malignant count (fixed seed).
# Note: the next cell reassigns benign_2020 to *all* benign rows before it is
# used, so this sample only matters if that cell is skipped.
benign_2020 = (
    meta_2020.loc[meta_2020['target'] == 0]
    .sample(n=len(all_malignant), random_state=42)
)

# Previously the balanced set was built here from all_malignant + the sample
# above (concat -> shuffle with random_state=42 -> reset_index) and its size
# printed; that step now lives in the following cell.


In [None]:
# ✅ Use every benign 2020 image (no undersampling) for a larger dataset
benign_2020 = meta_2020.loc[meta_2020['target'] == 0]

# Stack malignant + benign rows, shuffle with a fixed seed, renumber the index.
balanced_data = (
    pd.concat([all_malignant, benign_2020])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"📊 Total dataset size (all malignant + all benign): {len(balanced_data)}")


In [None]:
# STEP 8: Encode Categorical Features & Scale Age
# NOTE(review): the encoders and the scaler are fitted on the whole table,
# i.e. before the train/val/test split performed in the following cell.
label_encoders = {}
for column in ('sex', 'anatom_site_general'):
    as_text = balanced_data[column].astype(str)
    encoder = LabelEncoder().fit(as_text)
    balanced_data[column] = encoder.transform(as_text)
    label_encoders[column] = encoder

# Standardise age; the fitted scaler is kept for later reuse.
scaler = StandardScaler()
balanced_data[['age_approx']] = scaler.fit_transform(balanced_data[['age_approx']])

import numpy as np

# ✅ Rows without a patient_id get a unique placeholder built from the row
# index, so each of them forms its own group in the patient-level split.
patient_id_missing = balanced_data['patient_id'].isna()
placeholder_ids = 'unknown_' + balanced_data.index.astype(str)
balanced_data['patient_id'] = np.where(
    patient_id_missing,
    placeholder_ids,
    balanced_data['patient_id'],
)


In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Patient-level hold-out: 15% of the patient groups go to the test set.
gss = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
test_split = gss.split(balanced_data, groups=balanced_data['patient_id'])
train_idx, test_idx = next(test_split)

train_val_data = balanced_data.iloc[train_idx]
test = balanced_data.iloc[test_idx]

# Second patient-level split: 15% of the remaining groups become validation.
gss_val = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
val_split = gss_val.split(train_val_data, groups=train_val_data['patient_id'])
train_idx, val_idx = next(val_split)

train = train_val_data.iloc[train_idx]
val = train_val_data.iloc[val_idx]

# ✅ Report how many patient ids are shared between each pair of splits
train_patients, val_patients, test_patients = (
    set(part['patient_id']) for part in (train, val, test)
)
print(f"Train-Test Patient Overlap: {len(train_patients & test_patients)}")
print(f"Train-Val Patient Overlap: {len(train_patients & val_patients)}")
print(f"Val-Test Patient Overlap: {len(val_patients & test_patients)}")


In [None]:
import joblib

# STEP 9: Persist the train / val / test frames to Drive
split_files = {
    '/content/drive/MyDrive/PW2/train_dataset.pkl': train,
    '/content/drive/MyDrive/PW2/val_dataset.pkl': val,
    '/content/drive/MyDrive/PW2/test_dataset.pkl': test,
}
for split_path, split_frame in split_files.items():
    joblib.dump(split_frame, split_path)
print("✅ Datasets saved successfully!")

In [None]:
# STEP 10: Dataset Class (Handles OSError and Missing Files)
class ISICDatasetWithMeta(Dataset):
    """Image + tabular-metadata dataset backed by a pandas DataFrame.

    Each row must provide the columns ``image_path``, ``sex``, ``age_approx``,
    ``anatom_site_general`` and ``target``.

    Args:
        dataframe: Table holding one sample per row (accessed positionally).
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, dataframe, transform=None):
        self.data = dataframe
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, meta_data, label)`` for the row at position ``idx``.

        If the image of that row cannot be opened, a warning is printed and
        the next row (wrapping around at the end) is returned instead.

        Returns:
            image: The RGB image, passed through ``transform`` when one is set.
            meta_data: float32 tensor ``[sex, age_approx, anatom_site_general]``.
            label: float32 scalar tensor holding ``target``.

        Raises:
            IndexError: If ``idx`` is outside the DataFrame.
            RuntimeError: If every row was tried and no image could be loaded
                (previously this case looped forever).
        """
        total = len(self.data)
        failures = 0
        while True:
            # Single positional lookup per attempt instead of one per field.
            row = self.data.iloc[idx]
            img_path = row['image_path']
            try:
                # The context manager closes the file handle; convert() has
                # already produced an independent in-memory copy by then.
                with Image.open(img_path) as img:
                    image = img.convert("RGB")
            except (FileNotFoundError, UnidentifiedImageError, OSError) as e:
                print(f"⚠️ Error loading file {img_path}: {e}. Skipping...")
                failures += 1
                if failures >= total:
                    # Every row has been attempted once: stop instead of spinning.
                    raise RuntimeError(
                        f"None of the {total} images in the dataset could be loaded"
                    ) from e
                idx = (idx + 1) % total
                continue

            if self.transform:
                image = self.transform(image)
            meta_data = torch.tensor(
                [
                    float(row['sex']),
                    float(row['age_approx']),
                    float(row['anatom_site_general']),
                ],
                dtype=torch.float32,
            )
            label = torch.tensor(row['target'], dtype=torch.float32)
            return image, meta_data, label

In [None]:
# STEP 11: Data Transformations
# Per-channel ImageNet statistics. The backbone is created with
# pretrained=True, and its pretrained weights expect inputs normalised with
# these values; feeding raw [0, 1] tensors shifts the input distribution.
# NOTE: checkpoints trained before this normalisation was added expect
# un-normalised inputs and must be retrained.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Training pipeline: resize, light geometric/colour augmentation, normalise.
train_transforms = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
# Validation/test pipeline: deterministic resize plus the same normalisation.
val_test_transforms = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])


In [None]:
# STEP 12: DataLoaders
# Settings shared by all three loaders; lower batch_size if GPU memory is tight.
_loader_options = dict(batch_size=64, num_workers=4, pin_memory=True)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(
    ISICDatasetWithMeta(train, train_transforms), shuffle=True, **_loader_options
)
val_loader = DataLoader(
    ISICDatasetWithMeta(val, val_test_transforms), shuffle=False, **_loader_options
)
test_loader = DataLoader(
    ISICDatasetWithMeta(test, val_test_transforms), shuffle=False, **_loader_options
)


In [None]:
# STEP 13: ResNeSt-101 Model
class ResNeSt101Model(nn.Module):
    """Binary classifier fusing image features with three metadata values.

    A timm ``resnest101e`` backbone (classifier removed, average pooling)
    produces image features; a small MLP embeds the 3-element metadata vector
    into 32 dimensions; both are concatenated and mapped to a single logit.

    Note: ``meta_fc`` contains ``BatchNorm1d``, which cannot compute batch
    statistics from a single sample in training mode.
    """

    def __init__(self):
        super().__init__()
        # num_classes=0 drops the classification layer so the backbone
        # returns pooled feature vectors of size `num_features`.
        self.base_model = timm.create_model('resnest101e', pretrained=True, num_classes=0, global_pool='avg')
        self.meta_fc = nn.Sequential(
            nn.Linear(3, 32), nn.BatchNorm1d(32), nn.ReLU(), nn.Dropout(0.3)
        )
        self.head = nn.Sequential(
            nn.Linear(self.base_model.num_features + 32, 256),
            nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(256, 1)
        )

    def forward(self, x, meta):
        """Return one raw logit per sample.

        Args:
            x: Image batch passed to the backbone.
            meta: Metadata batch with 3 features per sample.

        Returns:
            Tensor of shape ``(batch,)`` holding un-activated logits.
        """
        x = self.base_model(x)
        meta = self.meta_fc(meta)
        x = torch.cat([x, meta], dim=1)
        # Squeeze only the trailing logit dimension. A bare squeeze() would
        # also drop the batch dimension for a batch of one, yielding a 0-d
        # tensor that breaks the loss shape check and list.extend() downstream.
        return self.head(x).squeeze(1)

In [None]:
# STEP 14: Training Setup
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = ResNeSt101Model().to(device)

# Weight the positive class by the negative/positive ratio of the train split.
num_neg = int((train['target'] == 0).sum())
num_pos = int((train['target'] == 1).sum())
pos_weight = torch.tensor([num_neg / num_pos]).to(device)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [None]:
# STEP 15: Training Loop (Best Model Saved to Drive)
drive_model_path = '/content/drive/MyDrive/PW2/models/ResNeSt101_BestModel_combined_improved.pth'
os.makedirs(os.path.dirname(drive_model_path), exist_ok=True)

def train_model(model, train_loader, val_loader, epochs=10):
    """Train ``model`` and checkpoint the weights with the best validation F1.

    Uses the module-level ``device``, ``optimizer`` and ``criterion`` and
    writes the checkpoint to ``drive_model_path``.

    Args:
        model: Network called as ``model(images, meta)`` and returning logits.
        train_loader: Yields ``(images, meta, labels)`` training batches.
        val_loader: Yields ``(images, meta, labels)`` validation batches.
        epochs: Number of passes over ``train_loader``.
    """
    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint. With the former start value of 0 and a strict '>', a run
    # whose F1 stayed at 0 saved nothing, and the evaluation cell then failed
    # or silently loaded a stale file from an earlier run.
    best_f1 = -1.0
    for epoch in range(epochs):
        model.train()
        for images, meta, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            images, meta, labels = images.to(device), meta.to(device), labels.to(device)
            optimizer.zero_grad()
            # Flatten to 1-D so logits and labels agree in shape for any batch size.
            outputs = model(images, meta).reshape(-1)
            loss = criterion(outputs, labels.reshape(-1))
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for images, meta, labels in val_loader:
                images, meta = images.to(device), meta.to(device)
                # reshape(-1) keeps the result iterable even for a batch of one.
                outputs = torch.sigmoid(model(images, meta)).reshape(-1)
                val_preds.extend(outputs.cpu().numpy())
                val_labels.extend(labels.reshape(-1).numpy())

        # F1 uses a fixed 0.5 threshold; ROC AUC uses the raw probabilities.
        val_preds_binary = (np.array(val_preds) > 0.5).astype(int)
        val_f1 = f1_score(val_labels, val_preds_binary)
        val_auc = roc_auc_score(val_labels, val_preds)
        print(f"📈 Epoch {epoch+1} Validation F1 Score: {val_f1:.4f}, Validation ROC AUC: {val_auc:.4f}")

        # Save the model if F1 score improves
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), drive_model_path)
            print(f"✅ New best model saved with F1 score: {best_f1:.4f}")

    # max() hides the -1 sentinel when no epoch was run (epochs=0).
    print(f"🏆 Best Validation F1 Score achieved: {max(best_f1, 0.0):.4f}")

train_model(model, train_loader, val_loader, epochs=10)

In [None]:
# STEP 16: Evaluation
# Restore the best checkpoint. map_location lets weights saved on a GPU be
# loaded when this cell runs on a CPU-only runtime.
model.load_state_dict(torch.load(drive_model_path, map_location=device))
model.eval()

# Collect the predicted probability and the true label of every test sample.
test_probs, test_labels = [], []
with torch.no_grad():
    for images, meta, labels in test_loader:
        images, meta = images.to(device), meta.to(device)
        # reshape(-1) keeps the result iterable even for a batch of one.
        outputs = torch.sigmoid(model(images, meta)).reshape(-1)
        test_probs.extend(outputs.cpu().numpy())
        test_labels.extend(labels.reshape(-1).numpy())

test_probs = np.array(test_probs)
# Hard 0/1 decisions at the 0.5 threshold feed the threshold-based metrics.
test_preds = (test_probs > 0.5).astype(int)

# ROC AUC is a ranking metric and must be computed from the continuous
# probabilities; computing it from the thresholded predictions (as before)
# collapses the curve to a single operating point and misreports the score.
auc = roc_auc_score(test_labels, test_probs)
acc = accuracy_score(test_labels, test_preds)
recall = recall_score(test_labels, test_preds)
f1 = f1_score(test_labels, test_preds)
cm = confusion_matrix(test_labels, test_preds)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)

print(f"\n📊 **Final Evaluation on Test Set**:")
print(f"AUC-ROC: {auc:.4f}")
print(f"Accuracy: {acc:.4f}")
print(f"Sensitivity (Recall): {recall:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"F1-Score: {f1:.4f}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, RocCurveDisplay, confusion_matrix, ConfusionMatrixDisplay

# STEP 1: Run the model over the test set and gather probabilities + labels
model.eval()
test_preds_proba, test_labels = [], []
with torch.no_grad():
    for images, meta, labels in test_loader:
        images, meta = images.to(device), meta.to(device)
        outputs = torch.sigmoid(model(images, meta))
        test_preds_proba.extend(outputs.cpu().numpy())
        test_labels.extend(labels.numpy())

# STEP 2: ROC curve from the raw probabilities
fpr, tpr, thresholds = roc_curve(test_labels, test_preds_proba)
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
roc_display.ax_.set_title('📈 ROC Curve')
plt.show()

# STEP 3: Confusion matrix at the 0.5 decision threshold
test_preds = (np.asarray(test_preds_proba) > 0.5).astype(int)
cm = confusion_matrix(test_labels, test_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Benign', 'Malignant'])
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title('📊 Confusion Matrix')
plt.show()

# Optional: spell out the four cells of the confusion matrix
print(f"Confusion Matrix:\n{cm}")
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Positives (TP): {tp}")
