In [1]:
!pip install tensorflow imbalanced-learn opencv-python matplotlib keras-tuner seaborn



In [3]:
import os
import io
import numpy as np
import pandas as pd
import tensorflow as tf
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import ipywidgets as widgets
from IPython.display import display

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB7
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D, Conv2D, MaxPooling2D, Flatten, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import keras_tuner as kt  # Keras Tuner for hyperparameter tuning

In [15]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight

# Update the CSV file path as needed
train_csv = pd.read_csv("/Users/samrudhsalas/Downloads/SkinGuard/Skin_Cancer/ISIC_2020_Training_GroundTruth.csv")

# Ensure the target column is integer type
train_csv['target'] = train_csv['target'].astype(int)

# Append ".jpg" only to filenames that do not already end in an image extension,
# so names that are already complete are not turned into "name.jpg.jpg".
if 'image_name' in train_csv.columns:
    image_names = train_csv['image_name'].astype(str)
    has_extension = image_names.str.contains(r"\.(?:jpe?g|png)$", case=False, regex=True)
    train_csv['image_name'] = image_names.where(has_extension, image_names + ".jpg")

# Select only numeric columns for SMOTE
numeric_columns = train_csv.select_dtypes(include=[np.number]).columns.tolist()
X = train_csv[numeric_columns].drop(columns=["target"])  # Exclude the target column
y = train_csv["target"]

# Handle NaN values by filling them with the mean of the respective columns
X = X.fillna(X.mean())

# NOTE(review): the SMOTE output below is only used for the class-count report
# in a later cell; the image generators are built from train_csv, not from
# X_resampled / y_resampled.
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Reset index of resampled data
X_resampled = pd.DataFrame(X_resampled, columns=X.columns).reset_index(drop=True)
y_resampled = pd.Series(y_resampled).reset_index(drop=True)

# # Create a new balanced DataFrame and save it (optional)
# resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
# resampled_data["target"] = y_resampled
# resampled_data["image_name"] = train_csv.loc[X_resampled.index, "image_name"].values  # Add back the image_name column
# resampled_data.to_csv("balanced_skin_cancer_dataset.csv", index=False)

# # Reload the balanced dataset
# train_csv = pd.read_csv("balanced_skin_cancer_dataset.csv")

# Compute class weights for training (from the original, un-resampled targets)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_csv["target"]),
    y=train_csv["target"]
)
# Map class index -> weight, the form expected by the class_weight argument of fit()
class_weights = dict(enumerate(class_weights))
print("Class Weights:", class_weights)

Class Weights: {0: 0.5089730194825149, 1: 28.361301369863014}


In [18]:
# Define directories for your images (update paths accordingly)
train_dir = "/Users/samrudhsalas/Downloads/SkinGuard/Skin_Cancer/train"  # Folder containing training images
test_dir  = "/Users/samrudhsalas/Downloads/SkinGuard/Skin_Cancer/test"   # Folder containing test images (if needed)

# Convert target column to string
train_csv['target'] = train_csv['target'].astype(str)

# Fraction of the dataframe held out for validation; shared by both generators
# below so that they partition the dataframe at the same boundary.
VALIDATION_SPLIT = 0.2

# Data augmentation for the training subset only
data_gen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    validation_split=VALIDATION_SPLIT
)

# Validation images are only rescaled: random augmentation would make the
# validation metrics noisy and not comparable between epochs.
val_data_gen = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT
)

train_generator = data_gen.flow_from_dataframe(
    dataframe=train_csv,
    directory=train_dir,
    x_col='image_name',
    y_col='target',
    target_size=(224, 224),
    class_mode='sparse',  # labels are emitted as class indices
    batch_size=32,
    subset='training'
)

val_generator = val_data_gen.flow_from_dataframe(
    dataframe=train_csv,
    directory=train_dir,
    x_col='image_name',
    y_col='target',
    target_size=(224, 224),
    class_mode='sparse',
    batch_size=32,
    subset='validation',
    shuffle=False  # fixed order, so generator.classes lines up with predictions
)

Found 26501 validated image filenames belonging to 2 classes.
Found 6625 validated image filenames belonging to 2 classes.


In [19]:
# CNN branch for low-level feature extraction
def build_cnn_base(input_shape):
    """Build the small convolutional branch of the hybrid model.

    Three Conv2D(3x3, relu, same padding) + MaxPooling2D(2x2) stages with
    32, 64 and 128 filters, followed by Flatten.

    Args:
        input_shape: shape of a single input image, passed to `Input`.

    Returns:
        A Keras `Model` named "CNN_Base" mapping the image to the flattened
        feature vector.
    """
    image_in = Input(shape=input_shape)
    features = image_in
    # One conv + pool stage per filter count, in increasing width.
    for n_filters in (32, 64, 128):
        features = Conv2D(n_filters, (3,3), activation="relu", padding="same")(features)
        features = MaxPooling2D(pool_size=(2,2))(features)
    flattened = Flatten()(features)
    return Model(image_in, flattened, name="CNN_Base")

# EfficientNetB7 branch for high-level features (transfer learning)
def build_efficientnet(input_shape, inputs_in_unit_range=True):
    """Build the frozen EfficientNetB7 feature-extraction branch.

    Keras' EfficientNet models include their own input normalisation and
    expect pixel values in [0, 255]. Every image pipeline in this notebook
    rescales images by 1/255, so by default the inputs are scaled back to
    [0, 255] before they reach the backbone.

    Args:
        input_shape: shape of a single input image, e.g. (224, 224, 3).
        inputs_in_unit_range: set to False if the images fed to this branch
            are already in [0, 255]; the rescaling layer is then omitted.

    Returns:
        A Keras `Model` named "EfficientNetB7_Base" mapping an image to the
        globally average-pooled EfficientNetB7 features.
    """
    inputs = Input(shape=input_shape)
    x = inputs
    if inputs_in_unit_range:
        x = tf.keras.layers.Rescaling(255.0)(x)

    base_model = EfficientNetB7(weights="imagenet", include_top=False, input_shape=input_shape)
    base_model.trainable = False  # Freeze the backbone for feature extraction
    # training=False keeps the backbone's BatchNormalization layers in inference mode.
    x = base_model(x, training=False)
    x = GlobalAveragePooling2D()(x)
    return Model(inputs, x, name="EfficientNetB7_Base")

# Focal Loss implementation (helps focus on hard-to-classify samples)
alpha = 0.25
gamma = 2.0
def focal_loss(y_true, y_pred):
    """Focal loss averaged over every element of the prediction tensor.

    Uses the module-level `alpha` (scaling factor) and `gamma` (focusing
    exponent). `y_true` is multiplied element-wise with terms shaped like
    `y_pred`, so it is presumably one-hot encoded with the same shape as
    `y_pred` -- confirm against the data pipeline.

    Args:
        y_true: target tensor, cast to float32.
        y_pred: predicted probabilities; clipped away from 0 and 1 before the log.

    Returns:
        Scalar tensor: mean of the per-element focal terms.
    """
    eps = tf.keras.backend.epsilon()
    targets = tf.cast(y_true, tf.float32)
    probs = tf.clip_by_value(y_pred, eps, 1.0 - eps)
    focusing = tf.pow(1 - probs, gamma)
    log_probs = tf.math.log(probs)
    per_element = -alpha * focusing * targets * log_probs
    return tf.reduce_mean(per_element)

In [20]:
def build_hybrid_model(input_shape=(224, 224, 3), num_classes=3):
    """Assemble the two-branch (CNN + EfficientNetB7) classifier.

    Both branches have their own input; callers feed the same image batch to
    each. The branch outputs are concatenated and passed through
    Dense(512) -> Dropout(0.3) -> Dense(256) -> Dropout(0.3) -> softmax.

    Args:
        input_shape: shape of a single input image for both branches.
        num_classes: number of softmax output units.

    Returns:
        An uncompiled Keras `Model` with inputs [cnn_input, efficientnet_input].
    """
    cnn = build_cnn_base(input_shape)
    effnet = build_efficientnet(input_shape)

    # Fuse the feature vectors of the two branches.
    hidden = tf.keras.layers.concatenate([cnn.output, effnet.output])

    # Classification head: two Dense + Dropout stages.
    for units in (512, 256):
        hidden = Dense(units, activation="relu")(hidden)
        hidden = Dropout(0.3)(hidden)
    class_probs = Dense(num_classes, activation="softmax")(hidden)

    return Model(inputs=[cnn.input, effnet.input], outputs=class_probs)

# Instantiate the hybrid model with its defaults, then compile it with Adam
# (learning rate 1e-4), the custom focal loss, and accuracy as the metric.
hybrid_model = build_hybrid_model()
hybrid_model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss=focal_loss,
    metrics=["accuracy"],
)
hybrid_model.summary()

2025-02-03 09:59:40.507121: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2025-02-03 09:59:40.507191: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-02-03 09:59:40.507197: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-02-03 09:59:40.507224: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-02-03 09:59:40.507478: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [47]:
# Report the per-class sample counts before and after SMOTE oversampling.
print("✅ SMOTE applied successfully!")
counts_before = np.bincount(y)
print("Before SMOTE:", counts_before)
counts_after = np.bincount(y_resampled)
print("After SMOTE:", counts_after)

✅ SMOTE applied successfully!
Before SMOTE: [32542   584]
After SMOTE: [32542 32542]


In [48]:
import tensorflow as tf
import keras_tuner as kt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define a HyperModel class for tuning
class HybridHyperModel(kt.HyperModel):
    """Keras Tuner hypermodel for the two-branch (CNN + EfficientNetB7) classifier.

    Tuned hyperparameters: the width and dropout rate of the two dense head
    stages, and the Adam learning rate.
    """

    def build(self, hp):
        """Build and compile one candidate model from the hyperparameters `hp`.

        Returns:
            A compiled Keras `Model` with inputs [cnn_input, efficientnet_input]
            and a 3-unit softmax output, using the focal loss.
        """
        input_shape = (224, 224, 3)
        num_classes = 3

        # Feature-extraction branches
        cnn = build_cnn_base(input_shape)
        effnet = build_efficientnet(input_shape)

        # Fuse the branch outputs
        hidden = tf.keras.layers.concatenate([cnn.output, effnet.output])

        # (units name, min, max, step, dropout name) for each dense head stage
        head_stages = (
            ('dense_units1', 256, 512, 128, 'dropout_rate1'),
            ('dense_units2', 128, 256, 64, 'dropout_rate2'),
        )
        for units_name, units_min, units_max, units_step, dropout_name in head_stages:
            units = hp.Int(units_name, min_value=units_min, max_value=units_max, step=units_step)
            hidden = Dense(units, activation="relu")(hidden)
            rate = hp.Choice(dropout_name, values=[0.2, 0.3, 0.4])
            hidden = Dropout(rate)(hidden)
        class_probs = Dense(num_classes, activation="softmax")(hidden)

        candidate = Model(inputs=[cnn.input, effnet.input], outputs=class_probs)

        # Learning rate is tuned as well
        learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5])
        candidate.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss=focal_loss,
            metrics=["accuracy"],
        )
        return candidate

# Random-search tuner over HybridHyperModel: 10 trials, one execution each,
# ranked by validation accuracy; state is stored under
# hyperparam_tuning/hybrid_model_tuning.
# NOTE(review): the class counts printed above are 32542 vs 584, so plain
# accuracy is dominated by the majority class -- confirm it is the intended objective.
hybrid_hypermodel = HybridHyperModel()
tuner = kt.RandomSearch(
    hypermodel=hybrid_hypermodel,
    objective="val_accuracy",
    max_trials=10,
    executions_per_trial=1,
    directory="hyperparam_tuning",
    project_name="hybrid_model_tuning",
)

# ---------------------------------------
# ✅ Custom Generator for Dual Input Model
# ---------------------------------------
def dual_input_generator(generator):
    """Adapt an (images, labels) batch iterator to the two-input hybrid model.

    Args:
        generator: iterable yielding (images, labels) batches.

    Yields:
        ((images, images), labels), with the images and labels converted to
        float32 tensors; the same image tensor is supplied for both inputs.
    """
    for batch_images, batch_labels in generator:
        image_tensor = tf.convert_to_tensor(batch_images, dtype=tf.float32)
        label_tensor = tf.convert_to_tensor(batch_labels, dtype=tf.float32)
        # The same batch feeds the CNN branch and the EfficientNet branch.
        model_inputs = (image_tensor, image_tensor)
        yield (model_inputs, label_tensor)

# Element structure produced by dual_input_generator: two image batches
# (one per model input) and a vector of float labels.
_image_batch_spec = tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32)
output_signature = (
    (_image_batch_spec,   # CNN branch input
     _image_batch_spec),  # EfficientNet branch input
    tf.TensorSpec(shape=(None,), dtype=tf.float32)  # Labels (class indices stored as floats)
)

def _one_hot_labels(inputs, labels):
    """Replace the class-index labels with 3-way one-hot vectors."""
    return inputs, tf.one_hot(tf.cast(labels, tf.int32), depth=3)

def _as_dual_dataset(generator):
    """Wrap an image generator as a tf.data.Dataset of ((img, img), one_hot) elements."""
    dataset = tf.data.Dataset.from_generator(
        lambda: dual_input_generator(generator),
        output_signature=output_signature
    )
    return dataset.map(_one_hot_labels)

# Wrap the original data generators with tf.data.Dataset
train_gen_dual = _as_dual_dataset(train_generator)
val_gen_dual = _as_dual_dataset(val_generator)

# ---------------------------------------
# ✅ Run Hyperparameter Tuning
# ---------------------------------------
# The dual-input datasets wrap generators that are iterated without an explicit
# end, so the number of steps per epoch is given explicitly.
search_callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]
train_steps = len(train_generator)
val_steps = len(val_generator)

tuner.search(
    train_gen_dual,
    validation_data=val_gen_dual,
    epochs=5,
    steps_per_epoch=train_steps,
    validation_steps=val_steps,
    callbacks=search_callbacks,
)

# Retrieve the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best Hyperparameters:", best_hps.values)


Reloading Tuner from hyperparam_tuning/hybrid_model_tuning/tuner0.json

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
512               |512               |dense_units1
0.4               |0.4               |dropout_rate1
192               |128               |dense_units2
0.2               |0.4               |dropout_rate2
0.0001            |0.001             |learning_rate

Epoch 1/5


2025-02-03 21:06:03.328 Python[8115:180381] Error creating directory 
 The volume “Macintosh HD” is out of space. You can’t save the file “mpsgraph-8115-2025-02-03_21_06_02-2272931890” because the volume “Macintosh HD” is out of space.
2025-02-03 21:06:03.346 Python[8115:180381] Error creating directory 
 The volume “Macintosh HD” is out of space. You can’t save the file “mpsgraph-8115-2025-02-03_21_06_03-3810724709” because the volume “Macintosh HD” is out of space.
2025-02-03 21:06:03.353 Python[8115:180381] Error creating directory 
 The volume “Macintosh HD” is out of space. You can’t save the file “mpsgraph-8115-2025-02-03_21_06_03-3692765593” because the volume “Macintosh HD” is out of space.
2025-02-03 21:06:03.360 Python[8115:180381] Error creating directory 
 The volume “Macintosh HD” is out of space. You can’t save the file “mpsgraph-8115-2025-02-03_21_06_03-4245721646” because the volume “Macintosh HD” is out of space.


 19/829 ━━━━━━━━━━━━━━━━━━━━ 1:14:59 6s/step - accuracy: 0.8548 - loss: 0.0152

KeyboardInterrupt: 

In [None]:
# Build the best model using the tuned hyperparameters
best_model = tuner.hypermodel.build(best_hps)

# Set up callbacks for learning rate reduction and early stopping
callbacks = [
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, verbose=1),
    EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
]

# class_weights only covers the classes present in the training CSV; give any
# remaining output unit of the model a neutral weight so every class index
# from 0 to num_outputs - 1 has an entry.
num_outputs = best_model.output_shape[-1]
fit_class_weights = {idx: class_weights.get(idx, 1.0) for idx in range(num_outputs)}

# Train the tuned model.
# The model has two inputs and a loss that needs one-hot labels, so it is fed
# the dual-input datasets (not the raw image generators). Those datasets already
# deliver batches, so batch_size is not passed; because they wrap generators
# that are iterated without an explicit end, the step counts are given explicitly.
history = best_model.fit(
    train_gen_dual,
    validation_data=val_gen_dual,
    epochs=10,
    steps_per_epoch=len(train_generator),
    validation_steps=len(val_generator),
    class_weight=fit_class_weights,
    callbacks=callbacks
)

# Save the optimized model
best_model.save("optimized_hybrid_skin_cancer_model.h5")

In [None]:
# Define class labels (adjust if needed)
class_names = {0: "Benign", 1: "Malignant", 2: "Other"}

# Load the saved optimized model for inference.
# compile=False skips restoring the loss/optimizer, so the custom focal loss
# does not have to be re-declared here (a second copy of it would also depend
# on the alpha/gamma globals defined in the training cell). Compile the model
# again before calling evaluate() or fit() on it.
model = tf.keras.models.load_model("optimized_hybrid_skin_cancer_model.h5",
                                   compile=False)

def preprocess_image(image_bytes):
    """Decode raw image bytes into a model-ready batch of one image.

    The image is converted to RGB, resized to 224x224, scaled to [0, 1] as
    float32, and given a leading batch axis.

    Args:
        image_bytes: encoded image file content.

    Returns:
        numpy float32 array of shape (1, 224, 224, 3).
    """
    buffer = io.BytesIO(image_bytes)
    rgb_image = Image.open(buffer).convert("RGB")
    resized = rgb_image.resize((224, 224))
    pixels = np.array(resized).astype("float32") / 255.0
    # Add the batch dimension expected by the model.
    return pixels[np.newaxis, ...]

def predict_image(image_bytes):
    """Classify one uploaded image and print the result.

    The preprocessed image is passed twice because the model has one input
    per branch.

    Args:
        image_bytes: encoded image file content.

    Returns:
        (predicted_class, confidence): index of the highest-probability class
        and that probability.
    """
    batch = preprocess_image(image_bytes)
    preds = model.predict([batch, batch])
    confidence = np.max(preds)
    predicted_class = np.argmax(preds)

    print(f"Predicted Class: {class_names[predicted_class]}, Confidence: {confidence:.2f}")
    return predicted_class, confidence

# Create an interactive file uploader widget
upload_widget = widgets.FileUpload(
    accept='image/*',  # Accept image files only
    multiple=False     # Single file upload
)
display(upload_widget)

def _iter_uploaded_files(value):
    """Yield (filename, content_bytes) for each file held by a FileUpload value.

    Handles both layouts of `FileUpload.value`: a dict keyed by filename
    (ipywidgets 7) and a sequence of per-file dicts carrying a 'name' entry
    (ipywidgets 8). The content is normalised to `bytes`, since it may be
    delivered as a memoryview.
    """
    if isinstance(value, dict):
        entries = value.items()
    else:
        entries = ((file_info['name'], file_info) for file_info in value)
    for fname, file_info in entries:
        yield fname, bytes(file_info['content'])

def on_upload_change(change):
    """Widget observer: run a prediction for every file currently uploaded.

    Args:
        change: the traitlets change notification (unused; the current widget
            value is read directly).
    """
    if not upload_widget.value:
        return
    for fname, image_bytes in _iter_uploaded_files(upload_widget.value):
        print("Uploaded file:", fname)
        predict_image(image_bytes)

upload_widget.observe(on_upload_change, names='value')

In [None]:
def plot_training_history(history):
    """Plot training vs. validation accuracy and loss side by side.

    Args:
        history: object with a `.history` dict containing the keys
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'
            (as returned by `model.fit`).
    """
    logs = history.history
    # (subplot position, title, y-axis label, ((history key, legend label), ...))
    panels = (
        (1, 'Model Accuracy', 'Accuracy',
         (('accuracy', 'Train Accuracy'), ('val_accuracy', 'Validation Accuracy'))),
        (2, 'Model Loss', 'Loss',
         (('loss', 'Train Loss'), ('val_loss', 'Validation Loss'))),
    )

    plt.figure(figsize=(14, 5))
    for position, title, y_label, series in panels:
        plt.subplot(1, 2, position)
        for key, legend_label in series:
            values = logs[key]
            # Epochs are numbered from 1 on the x-axis.
            sns.lineplot(x=range(1, len(values)+1), y=values, label=legend_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.show()

# Example usage (uncomment after training):
# plot_training_history(history)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

def plot_confusion_matrix(model, generator):
    """Plot a confusion matrix and print a classification report.

    Predictions are made batch by batch and the true labels are taken from the
    same batches, so the result is correct even when the generator shuffles.

    Args:
        model: Keras model. If it has several inputs, each image batch is fed
            to every input (the hybrid model takes the same image on both branches).
        generator: indexable Keras image iterator whose items are
            (images, labels) batches; labels may be class indices or one-hot.

    Raises:
        ValueError: if the generator contains no batches.
    """
    class_labels = ["Benign", "Malignant", "Other"]
    label_ids = list(range(len(class_labels)))
    n_inputs = len(model.inputs)

    y_true_parts, y_pred_parts = [], []
    for batch_idx in range(len(generator)):
        images, labels = generator[batch_idx]
        batch_inputs = [images] * n_inputs if n_inputs > 1 else images
        probs = model.predict_on_batch(batch_inputs)

        labels = np.asarray(labels)
        if labels.ndim > 1:  # one-hot labels -> class indices
            labels = np.argmax(labels, axis=1)
        y_true_parts.append(labels.astype(int))
        y_pred_parts.append(np.argmax(probs, axis=1))

    if not y_true_parts:
        raise ValueError("generator contains no batches; nothing to evaluate")
    y_true = np.concatenate(y_true_parts)
    y_pred = np.concatenate(y_pred_parts)

    # Passing `labels` keeps the matrix 3x3 and the report valid even when a
    # class (e.g. "Other") never occurs in the data or in the predictions.
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
                xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()

    # Print classification report
    print("Classification Report:\n", classification_report(
        y_true, y_pred, labels=label_ids, target_names=class_labels, zero_division=0))

# Example usage (uncomment after model training):
# plot_confusion_matrix(best_model, val_generator)

In [None]:
import os
import pandas as pd

# Define the path to your unseen test images directory
test_images_dir = "/path/to/unseen_test_images/"

# Define label mapping
label_mapping = {
    "benign": 0,
    "malignant": 1,
    "other": 2  # Any non-skin lesion image
}

# Only files with these extensions are treated as images.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff")

# Automatically create a CSV file
test_data = []

# Loop through images in the directory (sorted, so the CSV is reproducible)
for filename in sorted(os.listdir(test_images_dir)):
    lowered = filename.lower()
    # Skip anything that is not an image file (hidden files such as .DS_Store,
    # sub-directories, ...) instead of labelling it "other".
    if not lowered.endswith(IMAGE_EXTENSIONS):
        continue
    if not os.path.isfile(os.path.join(test_images_dir, filename)):
        continue

    # The label is derived from the filename prefix.
    if lowered.startswith("benign"):
        label = label_mapping["benign"]
    elif lowered.startswith("malignant"):
        label = label_mapping["malignant"]
    else:
        label = label_mapping["other"]  # Any image that doesn't match skin lesion types

    test_data.append([filename, label])

# Convert to a DataFrame
df_test = pd.DataFrame(test_data, columns=["image_name", "target"])

# Save to CSV
df_test.to_csv("/path/to/unseen_test.csv", index=False)

print("✅ Unseen test CSV created successfully!")

In [None]:
# Update the CSV file path to your unseen test CSV
test_csv = pd.read_csv("/path/to/unseen_test.csv")

# Append ".jpg" only to filenames that do not already end in an image
# extension, so complete names are not turned into "name.jpg.jpg".
if 'image_name' in test_csv.columns:
    image_names = test_csv['image_name'].astype(str)
    has_extension = image_names.str.contains(r"\.(?:jpe?g|png|bmp|tiff?)$", case=False, regex=True)
    test_csv['image_name'] = image_names.where(has_extension, image_names + ".jpg")

# As for the training dataframe, labels are passed to flow_from_dataframe as
# strings; going through int first avoids values such as "1.0".
test_csv['target'] = test_csv['target'].astype(int).astype(str)

# Create a data generator for the unseen test set (only rescaling, no augmentation)
test_data_gen = ImageDataGenerator(rescale=1./255)

test_generator = test_data_gen.flow_from_dataframe(
    dataframe=test_csv,
    directory="/path/to/test",  # Update to your test images directory
    x_col='image_name',
    y_col='target',  # The true labels
    classes=['0', '1', '2'],  # fixed label -> index mapping, even if a class is absent from the CSV
    target_size=(224, 224),
    class_mode='sparse',  # labels are emitted as class indices
    batch_size=32,
    shuffle=False
)

In [None]:
# Assuming you have already trained and saved your optimized model as "optimized_hybrid_skin_cancer_model.h5"
# And then loaded it (best_model) in a previous cell.
# If not, load the model using:
# model = tf.keras.models.load_model("optimized_hybrid_skin_cancer_model.h5", custom_objects={'focal_loss': focal_loss})

# Evaluate the model on the unseen test set.
# The hybrid model has two inputs and a loss that needs one-hot labels, so the
# test generator is wrapped in the same way as the training/validation data:
# each batch becomes ((images, images), one_hot_labels).
test_generator.reset()  # start from the first batch
test_gen_dual = tf.data.Dataset.from_generator(
    lambda: dual_input_generator(test_generator),
    output_signature=output_signature
).map(lambda x, y: (x, tf.one_hot(tf.cast(y, tf.int32), depth=3)))

# The wrapped generator is iterated without an explicit end, so the number of
# evaluation steps is given explicitly (one pass over the test set).
test_loss, test_acc = best_model.evaluate(test_gen_dual, steps=len(test_generator))
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

def plot_confusion_matrix(model, generator):
    """Plot a confusion matrix and print a classification report.

    The generator is read batch by batch: each image batch is fed to every
    input of the model, and the true labels are taken from the same batch, so
    labels and predictions stay aligned regardless of shuffling.

    Args:
        model: Keras model with one or more image inputs.
        generator: indexable Keras image iterator whose items are
            (images, labels) batches; labels may be class indices or one-hot.

    Raises:
        ValueError: if the generator contains no batches.
    """
    class_labels = ["Benign", "Malignant", "Other"]
    label_ids = list(range(len(class_labels)))
    n_inputs = len(model.inputs)

    y_true_parts, y_pred_parts = [], []
    for batch_idx in range(len(generator)):
        images, labels = generator[batch_idx]
        # Feed the same images to both branches of the hybrid model.
        batch_inputs = [images] * n_inputs if n_inputs > 1 else images
        probs = model.predict_on_batch(batch_inputs)

        labels = np.asarray(labels)
        if labels.ndim > 1:  # one-hot labels -> class indices
            labels = np.argmax(labels, axis=1)
        y_true_parts.append(labels.astype(int))
        y_pred_parts.append(np.argmax(probs, axis=1))

    if not y_true_parts:
        raise ValueError("generator contains no batches; nothing to evaluate")
    y_true = np.concatenate(y_true_parts)
    y_pred = np.concatenate(y_pred_parts)

    # Compute confusion matrix; explicit `labels` keeps it 3x3 even if a class
    # is missing from the data or the predictions.
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)

    # Plot confusion matrix using Seaborn
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
                xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()

    # Print classification report for further details
    print("Classification Report:\n", classification_report(
        y_true, y_pred, labels=label_ids, target_names=class_labels, zero_division=0))

# Plot confusion matrix and print classification report for the unseen test set
plot_confusion_matrix(best_model, test_generator)