In [1]:
# Complete Skin Cancer Detection with Deep CNN
# Dataset: HAM10000 from Kaggle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import os
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import EfficientNetB3, ResNet50
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import albumentations as A
from albumentations.pytorch import ToTensorV2
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)


2025-08-21 02:50:34.931104: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755744635.313336      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755744635.415900      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
DATASET_PATH = "/kaggle/input/skin-cancer-mnist-ham10000"

# Load metadata
df = pd.read_csv(os.path.join(DATASET_PATH, "HAM10000_metadata.csv"))
print(df.head())

     lesion_id      image_id   dx dx_type   age   sex localization
0  HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp
1  HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0  male        scalp
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0  male        scalp
4  HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear


In [3]:
# 1. DATASET CONFIGURATION
# =============================================================================

DATASET_PATH = "/kaggle/input/skin-cancer-mnist-ham10000"
BATCH_SIZE = 16  # Reduced for Kaggle memory limits
IMG_SIZE = 224
EPOCHS = 30      # Reduced for quick testing

In [4]:
# =============================================================================
# 2. LOAD AND EXPLORE DATA
# =============================================================================

print("📊 Loading dataset...")
df = pd.read_csv(os.path.join(DATASET_PATH, "HAM10000_metadata.csv"))

print(f"Dataset shape: {df.shape}")
print(f"Classes: {df['dx'].value_counts()}")

📊 Loading dataset...
Dataset shape: (10015, 7)
Classes: dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64


In [5]:
# Map class names
class_names = {
    'akiec': 'Actinic Keratoses',
    'bcc': 'Basal Cell Carcinoma', 
    'bkl': 'Benign Keratosis',
    'df': 'Dermatofibroma',
    'mel': 'Melanoma',
    'nv': 'Melanocytic Nevi',
    'vasc': 'Vascular Lesions'
}

In [6]:
# =============================================================================
# 3. FIND IMAGES
# =============================================================================

def find_image_files(dataset_path):
    """Find all image files in the dataset"""
    image_files = {}
    
    for root, dirs, files in os.walk(dataset_path):
        for file in files:
            if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                image_id = file.split('.')[0]
                image_files[image_id] = os.path.join(root, file)
    
    return image_files

print("🔍 Finding image files...")
image_files = find_image_files(DATASET_PATH)
print(f"Found {len(image_files)} image files")

🔍 Finding image files...
Found 10015 image files


In [7]:
# Add image paths to dataframe
df['image_path'] = df['image_id'].map(image_files)
df = df.dropna(subset=['image_path'])  # Remove rows without images
print(f"Dataset shape after matching with images: {df.shape}")

Dataset shape after matching with images: (10015, 8)


In [8]:
# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['dx'])
num_classes = len(label_encoder.classes_)

print(f"Number of classes: {num_classes}")
print(f"Label mapping: {dict(zip(label_encoder.classes_, range(num_classes)))}")



Number of classes: 7
Label mapping: {'akiec': 0, 'bcc': 1, 'bkl': 2, 'df': 3, 'mel': 4, 'nv': 5, 'vasc': 6}


In [9]:
# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['dx'])
num_classes = len(label_encoder.classes_)

print(f"Number of classes: {num_classes}")
print(f"Label mapping: {dict(zip(label_encoder.classes_, range(num_classes)))}")



Number of classes: 7
Label mapping: {'akiec': 0, 'bcc': 1, 'bkl': 2, 'df': 3, 'mel': 4, 'nv': 5, 'vasc': 6}


In [10]:
# 5. SIMPLE PREPROCESSING
# =============================================================================

def preprocess_image(image_path, target_size=(224, 224)):
    """Simple but effective preprocessing"""
    try:
        # Load image
        img = cv2.imread(image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        # Resize
        img = cv2.resize(img, target_size)
        
        # Normalize
        img = img.astype(np.float32) / 255.0
        
        return img
    except:
        # Return black image if loading fails
        return np.zeros((*target_size, 3), dtype=np.float32)


In [11]:
# 6. DATA GENERATOR
# =============================================================================

class QuickDataGenerator(keras.utils.Sequence):
    def __init__(self, dataframe, batch_size=32, shuffle=True, augment=False):
        self.df = dataframe.reset_index(drop=True)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.augment = augment
        self.indices = np.arange(len(self.df))
        self.on_epoch_end()
        
    def __len__(self):
        return int(np.floor(len(self.df) / self.batch_size))
    
    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        X, y = self._generate_batch(batch_indices)
        return X, y
    
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
    
    def _generate_batch(self, batch_indices):
        X = np.empty((self.batch_size, IMG_SIZE, IMG_SIZE, 3), dtype=np.float32)
        y = np.empty((self.batch_size,), dtype=int)
        
        for i, idx in enumerate(batch_indices):
            # Load and preprocess image
            image_path = self.df.iloc[idx]['image_path']
            image = preprocess_image(image_path, (IMG_SIZE, IMG_SIZE))
            
            # Simple augmentation for training
            if self.augment and np.random.random() > 0.5:
                # Random flip
                if np.random.random() > 0.5:
                    image = np.fliplr(image)
                # Random rotation
                if np.random.random() > 0.5:
                    k = np.random.randint(1, 4)
                    image = np.rot90(image, k)
            
            X[i] = image
            y[i] = self.df.iloc[idx]['label']
        
        # Convert to categorical
        y = keras.utils.to_categorical(y, num_classes=num_classes)
        return X, y

In [12]:
# 7. SPLIT DATA
# =============================================================================

print("🔄 Splitting dataset...")
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)

print(f"Train samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")


🔄 Splitting dataset...
Train samples: 8012
Validation samples: 2003


In [13]:
# Create generators
train_gen = QuickDataGenerator(train_df, BATCH_SIZE, shuffle=True, augment=True)
val_gen = QuickDataGenerator(val_df, BATCH_SIZE, shuffle=False, augment=False)

In [14]:
# 8. CREATE MODEL
# =============================================================================

print("🏗️ Building model...")

def create_model():
    # Load pre-trained EfficientNet
    base_model = EfficientNetB3(
        weights='imagenet',
        include_top=False,
        input_shape=(IMG_SIZE, IMG_SIZE, 3)
    )
    
    # Freeze base model
    base_model.trainable = False
    
    # Add classification head
    model = keras.Sequential([
        base_model,
        layers.GlobalAveragePooling2D(),
        layers.Dropout(0.3),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(num_classes, activation='softmax')
    ])
    
    return model

model = create_model()

🏗️ Building model...


I0000 00:00:1755744685.142090      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1755744685.142809      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb3_notop.h5
[1m43941136/43941136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [15]:
# Compile model
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print(f"Model parameters: {model.count_params():,}")

Model parameters: 10,981,174


In [None]:
# 9. TRAIN MODEL
# =============================================================================

print("🎯 Starting training...")

# Callbacks
callbacks = [
    EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7),
    ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True)
]

# Train
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1
)


🎯 Starting training...
Epoch 1/30


I0000 00:00:1755744713.876973      98 service.cc:148] XLA service 0x7e9ddc220590 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1755744713.878482      98 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1755744713.878502      98 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1755744716.982379      98 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  2/500[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m27s[0m 55ms/step - accuracy: 0.5156 - loss: 1.7210   

I0000 00:00:1755744732.495458      98 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 385ms/step - accuracy: 0.6679 - loss: 1.2119 - val_accuracy: 0.6700 - val_loss: 1.1338 - learning_rate: 1.0000e-04
Epoch 2/30
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 187ms/step - accuracy: 0.6659 - loss: 1.1650 - val_accuracy: 0.6700 - val_loss: 1.1289 - learning_rate: 1.0000e-04
Epoch 3/30
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 187ms/step - accuracy: 0.6717 - loss: 1.1459 - val_accuracy: 0.6700 - val_loss: 1.1303 - learning_rate: 1.0000e-04
Epoch 4/30
[1m 33/500[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:10[0m 151ms/step - accuracy: 0.6360 - loss: 1.2716

In [None]:
# 10. VISUALIZE RESULTS
# =============================================================================

print("📊 Training completed! Visualizing results...")

# Plot training history
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# 11. QUICK EVALUATION
# =============================================================================

print("🔍 Quick evaluation...")

# Get predictions on validation set
val_predictions = model.predict(val_gen)
val_pred_classes = np.argmax(val_predictions, axis=1)

# Get true labels
val_true_classes = []
for i in range(len(val_gen)):
    _, batch_y = val_gen[i]
    val_true_classes.extend(np.argmax(batch_y, axis=1))
val_true_classes = np.array(val_true_classes[:len(val_pred_classes)])

# Classification report
print("Classification Report:")
target_names = [class_names[cls] for cls in label_encoder.classes_]
print(classification_report(val_true_classes, val_pred_classes, target_names=target_names))

# Confusion matrix
cm = confusion_matrix(val_true_classes, val_pred_classes)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Final accuracy
final_acc = max(history.history['val_accuracy'])
print(f"\n🎉 Best Validation Accuracy: {final_acc:.4f} ({final_acc*100:.2f}%)")

print("\n✅ Quick training completed successfully!")
print("💡 For better results, run the full pipeline with more epochs and advanced preprocessing!")


In [None]:
# 12. SAMPLE PREDICTIONS
# =============================================================================

print("🔮 Sample predictions on validation data...")

# Get a few samples for prediction
sample_indices = np.random.choice(len(val_df), 6, replace=False)
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for i, idx in enumerate(sample_indices):
    # Get sample
    sample_row = val_df.iloc[idx]
    image_path = sample_row['image_path']
    true_label = sample_row['dx']
    
    # Load and preprocess image
    img = preprocess_image(image_path)
    img_batch = np.expand_dims(img, axis=0)
    
    # Predict
    pred = model.predict(img_batch, verbose=0)[0]
    pred_class_idx = np.argmax(pred)
    pred_class = label_encoder.classes_[pred_class_idx]
    confidence = pred[pred_class_idx]
    
    # Display
    axes[i].imshow(img)
    axes[i].set_title(f"True: {true_label}\nPred: {pred_class}\nConf: {confidence:.3f}")
    axes[i].axis('off')

plt.tight_layout()
plt.show()

In [None]:
model.save("model.h5")


In [None]:
import os
print("Saved files:", os.listdir("./"))

In [None]:
from IPython.display import FileLink
import shutil

# Save and zip (smaller, avoids browser issues with large files)
model.save("/kaggle/working/model.h5")
shutil.make_archive("/kaggle/working/model", 'zip', "/kaggle/working", "model.h5")

# Link for the zip
FileLink("/kaggle/working/model.zip")

