# SkinTag: Robust Skin Lesion Classification

**Problem**: Medical images vary by camera, lighting, and quality — models fail on out-of-distribution images.

**Solution**: MedSigLIP embeddings + augmentations for robustness to real-world imaging conditions.

In [None]:
# Setup (run once)
!pip install -q transformers albumentations scikit-learn kaggle

# Set Kaggle credentials (get from kaggle.com/settings)
import os
os.environ['KAGGLE_API_TOKEN'] = ''  # Paste your token here

!mkdir -p data
!kaggle datasets download -d farjanakabirsamanta/skin-cancer-dataset -p data/ --unzip -q

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from transformers import AutoModel, AutoProcessor
import albumentations as A

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

## 1. Load Dataset

In [None]:
# Load HAM10000 metadata
data_dir = Path("data")
# sorted() makes the choice deterministic if the archive ships several copies;
# the explicit check replaces an opaque IndexError when nothing was downloaded.
metadata_matches = sorted(data_dir.glob("**/HAM10000_metadata.csv"))
if not metadata_matches:
    raise FileNotFoundError(
        f"HAM10000_metadata.csv not found under {data_dir.resolve()}; "
        "run the setup cell to download the dataset first."
    )
metadata_path = metadata_matches[0]
df = pd.read_csv(metadata_path)

# Find images: map each file stem (the image id) to its path on disk
image_lookup = {p.stem: p for p in data_dir.glob("**/*.jpg")}

# Binary labels: benign (0) vs malignant (1)
MALIGNANT = ["akiec", "bcc", "mel"]
df["label"] = df["dx"].isin(MALIGNANT).astype(int)
df["image_path"] = df["image_id"].map(image_lookup)
# Drop metadata rows whose image file was not found on disk.
df = df.dropna(subset=["image_path"])
if df.empty:
    raise FileNotFoundError(
        f"No .jpg files under {data_dir.resolve()} match the image ids in {metadata_path}."
    )

print(f"Total images: {len(df)}")
print(f"Class distribution: {df['label'].value_counts().to_dict()}")

## 2. Pre-trained Model: MedSigLIP

In [None]:
# Load the pre-trained image encoder used as a frozen feature extractor.
# NOTE(review): this checkpoint id is the general-purpose SigLIP SO400M release,
# not a medical-specific MedSigLIP checkpoint — confirm which model is intended.
model_name = "google/siglip-so400m-patch14-384"
processor = AutoProcessor.from_pretrained(model_name)
# Half precision halves GPU memory use; on CPU stay in float32, where float16
# kernels are slow or unavailable depending on the PyTorch build.
model_dtype = torch.float16 if device == "cuda" else torch.float32
model = AutoModel.from_pretrained(model_name, torch_dtype=model_dtype).to(device).eval()
# Report the checkpoint that was actually loaded rather than a fixed model name.
print(f"Loaded {model_name} ({model_dtype})")

In [None]:
# Extract embeddings (transfer learning: use pre-trained features)
def _load_rgb(path):
    """Open *path*, return an in-memory RGB copy, and close the file handle."""
    with Image.open(path) as img:
        return img.convert("RGB")


@torch.no_grad()
def extract_embeddings(image_paths, batch_size=16):
    """Embed images with the frozen pre-trained encoder.

    Args:
        image_paths: Sequence of image file paths; must support ``len()`` and slicing.
        batch_size: Number of images sent through the model per forward pass.

    Returns:
        A float32 CPU tensor with one row per input path, in input order.
    """
    chunks = []
    for start in tqdm(range(0, len(image_paths), batch_size)):
        images = [_load_rgb(p) for p in image_paths[start:start + batch_size]]
        inputs = processor(images=images, return_tensors="pt")
        # Match the model's device and precision explicitly (it may be float16).
        pixel_values = inputs["pixel_values"].to(device=device, dtype=model.dtype)
        emb = model.get_image_features(pixel_values=pixel_values)
        # Upcast before leaving torch so downstream scikit-learn code does not
        # do its arithmetic in half precision.
        chunks.append(emb.float().cpu())
    return torch.cat(chunks)

# Sample for speed (use full dataset for final results)
SAMPLE_SIZE = 2000
# Cap at the number of available rows: DataFrame.sample raises when asked for
# more rows than exist (without replacement), e.g. on a partial download or when
# SAMPLE_SIZE is raised to "use everything".
df_sample = df.sample(min(SAMPLE_SIZE, len(df)), random_state=42)

# Row i of `embeddings` and entry i of `labels` both refer to df_sample.iloc[i].
embeddings = extract_embeddings(df_sample["image_path"].tolist())
labels = df_sample["label"].values
print(f"Embeddings: {embeddings.shape}")

## 3. Train Classifier

In [None]:
# Hold out 20% of the sampled embeddings, keeping the class ratio equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings.numpy(),
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)

# Linear probe on the frozen features: standardise each dimension, then fit
# a logistic-regression classifier.
clf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)
clf.fit(X_train, y_train)

# Mean accuracy on the data the probe was fit on vs. the held-out split.
train_acc, test_acc = clf.score(X_train, y_train), clf.score(X_test, y_test)
print(f"Train accuracy: {train_acc:.3f}")
print(f"Test accuracy: {test_acc:.3f}")

## 4. Data Augmentation for Robustness

In [None]:
# Define augmentations simulating real-world imaging conditions.
# Each entry maps a display name to an albumentations pipeline (None = image left
# untouched); p=1 makes every transform fire each time it is called.
# NOTE(review): `var_limit` and `quality_lower`/`quality_upper` are older argument
# names; recent albumentations releases appear to use `std_range` and
# `quality_range` instead — verify against the installed version.
augmentations = {
    "Original": None,
    "Lighting": A.Compose([A.RandomBrightnessContrast(0.3, 0.3, p=1), A.RandomGamma((70,130), p=1)]),
    "Noise": A.Compose([A.GaussNoise(var_limit=(20, 80), p=1)]),
    "Compression": A.Compose([A.ImageCompression(quality_lower=30, quality_upper=50, p=1)]),
}

# Visualize every condition on one sample image.
# convert("RGB") matches how images are loaded for embedding and evaluation and
# guarantees a 3-channel array regardless of the file's stored mode.
sample_img = np.array(Image.open(df_sample.iloc[0]["image_path"]).convert("RGB"))
# One panel per condition, so adding an entry above is not silently dropped by zip().
n_panels = len(augmentations)
fig, axes = plt.subplots(1, n_panels, figsize=(3 * n_panels, 3))
# atleast_1d: subplots() returns a bare Axes (not an array) when n_panels == 1.
for ax, (name, aug) in zip(np.atleast_1d(axes), augmentations.items()):
    img = sample_img if aug is None else aug(image=sample_img)["image"]
    ax.imshow(img)
    ax.set_title(name)
    ax.axis("off")
plt.tight_layout()
plt.savefig("augmentations.png", dpi=150)
plt.show()

## 5. Robustness Evaluation

In [None]:
# Test model on degraded images
# Recover which df_sample rows form the held-out split. train_test_split is
# deterministic for a fixed random_state/stratify, so repeating the call on row
# positions reproduces the partition behind X_test / y_test. (Slicing the tail of
# df_sample instead would pair images with labels of *different* samples and
# could include training images.)
_, test_idx = train_test_split(
    np.arange(len(labels)), test_size=0.2, random_state=42, stratify=labels
)
if not np.array_equal(labels[test_idx], y_test):
    raise RuntimeError(
        "Recomputed test split does not match y_test; keep the split "
        "parameters here in sync with the training cell."
    )

eval_idx = test_idx[:100]  # 100 test images
test_paths = df_sample["image_path"].iloc[eval_idx].tolist()
test_labels = labels[eval_idx]

EVAL_BATCH_SIZE = 16  # bound peak memory instead of one 100-image forward pass

results = {}
for name, aug in augmentations.items():
    batch_embeddings = []
    with torch.no_grad():
        for start in range(0, len(test_paths), EVAL_BATCH_SIZE):
            images = []
            for p in test_paths[start:start + EVAL_BATCH_SIZE]:
                img = np.array(Image.open(p).convert("RGB"))
                if aug is not None:
                    # Degrade the image before it reaches the encoder.
                    img = aug(image=img)["image"]
                images.append(Image.fromarray(img))
            inputs = processor(images=images, return_tensors="pt").to(device)
            batch_embeddings.append(model.get_image_features(**inputs).cpu().numpy())
    emb = np.concatenate(batch_embeddings)

    # Accuracy of the already-fitted classifier on this imaging condition.
    acc = clf.score(emb, test_labels)
    results[name] = acc
    print(f"{name}: {acc:.3f}")

In [None]:
# Plot results: one bar per imaging condition, annotated with its accuracy.
condition_names = list(results)
accuracies = [results[cond] for cond in condition_names]

# The clean baseline is drawn in green, every degraded condition in blue.
colors = []
for cond in condition_names:
    colors.append("#2ecc71" if cond == "Original" else "#3498db")

plt.figure(figsize=(8, 5))
bars = plt.bar(condition_names, accuracies, color=colors)
plt.ylabel("Accuracy")
plt.title("Model Robustness Across Imaging Conditions")
plt.ylim(0, 1)

# Write each accuracy just above the top of its bar, centred horizontally.
for bar, acc in zip(bars, accuracies):
    x_center = bar.get_x() + bar.get_width()/2
    y_label = bar.get_height() + 0.02
    plt.text(x_center, y_label, f"{acc:.2f}", ha="center")

plt.tight_layout()
plt.savefig("robustness.png", dpi=150)
plt.show()

## Summary

| Requirement | What We Did |
|-------------|-------------|
| **Problem** | Medical images vary by camera/lighting/quality |
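| **Pre-trained Model** | MedSigLIP-style image encoder — the code loads `google/siglip-so400m-patch14-384` (SigLIP SO400M, ~400M-parameter vision encoder), a general-purpose checkpoint rather than a medical-specific one |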
| **Transfer Learning** | Extract embeddings → train logistic regression |
| **Augmentations** | Lighting, noise, compression (simulate real-world) |
| **Results** | Accuracy under clean vs degraded conditions |