In [31]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.preprocessing import label_binarize
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [32]:
base = '/home/rob/'
csv_file = os.path.join(base, 'ham10000_data/HAM10000_metadata.csv')
img_dir = os.path.join(base, 'ham10000_data/images')
file_ext = '.jpg'

df = pd.read_csv(csv_file)
df['image_path'] = df['image_id'].apply(lambda x: os.path.join(img_dir, x + file_ext))
class_names = df['dx'].unique()
num_classes = len(class_names)

print(df.head())

     lesion_id      image_id   dx dx_type   age   sex localization  \
0  HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp   
1  HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp   
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0  male        scalp   
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0  male        scalp   
4  HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear   

                                        image_path  
0  /home/rob/ham10000_data/images/ISIC_0027419.jpg  
1  /home/rob/ham10000_data/images/ISIC_0025030.jpg  
2  /home/rob/ham10000_data/images/ISIC_0026769.jpg  
3  /home/rob/ham10000_data/images/ISIC_0025661.jpg  
4  /home/rob/ham10000_data/images/ISIC_0031633.jpg  


In [33]:
IMG_SIZE = 224
BATCH_SIZE = 32

datagen_train = ImageDataGenerator(
    rescale=1.0 / 255.0,
    rotation_range=5,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    brightness_range=[0.9, 1.1],
    horizontal_flip=True,
    fill_mode='nearest'
)

datagen_test = ImageDataGenerator(rescale=1.0 / 255.0)

In [34]:
def build_model(input_shape=(224, 224, 3), num_classes=7, base_trainable=False, learning_rate=0.001, dropout_rate=0.5):
    """
    Builds and compiles a ResNet50-based model.

    Args:
        input_shape (tuple): Shape of the input images (default: (224, 224, 3)).
        num_classes (int): Number of output classes (default: 7).
        base_trainable (bool): Whether to make the base ResNet50 model trainable (default: False).
        learning_rate (float): Learning rate for the optimizer (default: 0.001).
        dropout_rate (float): Dropout rate after the GlobalAveragePooling2D layer (default: 0.5).

    Returns:
        Model: A compiled Keras model.
    """
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
    base_model.trainable = base_trainable

    x = GlobalAveragePooling2D()(base_model.output)
    x = Dropout(dropout_rate)(x)
    output = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=base_model.input, outputs=output)

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model


In [35]:
model_path = 'models/best_model_resnet50.keras'

In [36]:
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
best_val_accuracy = 0.0
best_model_path = 'models/best_model_overall_resnet50.keras'

fold_results = []
for fold_no, (train_idx, val_idx) in enumerate(skf.split(df, df['dx']), 1):
    print(f"Training Fold {fold_no}...")

    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    train_generator = datagen_train.flow_from_dataframe(
        dataframe=train_df,
        x_col='image_path',
        y_col='dx',
        target_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
        class_mode='categorical',
        shuffle=True
    )

    val_generator = datagen_test.flow_from_dataframe(
        dataframe=val_df,
        x_col='image_path',
        y_col='dx',
        target_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
        class_mode='categorical',
        shuffle=False,
    )

    train_dataset = tf.data.Dataset.from_generator(
        lambda: train_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=([None, IMG_SIZE, IMG_SIZE, 3], [None, len(class_names)])
    ).prefetch(tf.data.AUTOTUNE)

    val_dataset = tf.data.Dataset.from_generator(
        lambda: val_generator,
        output_types=(tf.float32, tf.float32),
        output_shapes=([None, IMG_SIZE, IMG_SIZE, 3], [None, len(class_names)])
    ).prefetch(tf.data.AUTOTUNE)


    model = build_model(input_shape=(IMG_SIZE, IMG_SIZE, 3), num_classes=num_classes, base_trainable=False)

    callbacks = [
        ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, patience=3, verbose=1),
        EarlyStopping(monitor='val_accuracy', patience=7, verbose=1, restore_best_weights=True)
    ]

    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=20,
        steps_per_epoch=len(train_generator),
        validation_steps=len(val_generator),
        callbacks=callbacks
    )

    val_loss, val_accuracy = model.evaluate(val_generator, steps=len(val_generator))
    print(f"Fold {fold_no} - Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        model.save(best_model_path)
        print(f"New Best Model Saved with Validation Accuracy: {best_val_accuracy:.4f}")

    fold_results.append({'fold': fold_no, 'val_loss': val_loss, 'val_accuracy': val_accuracy})

Training Fold 1...
Found 8012 validated image filenames belonging to 7 classes.
Found 2003 validated image filenames belonging to 7 classes.


KeyboardInterrupt: 

In [None]:
avg_val_accuracy = np.mean([result['val_accuracy'] for result in fold_results])
std_val_accuracy = np.std([result['val_accuracy'] for result in fold_results])
print(f"Cross-Validation Mean Accuracy: {avg_val_accuracy:.4f} Â± {std_val_accuracy:.4f}")

In [None]:
print("\nTraining Final Model on Full Dataset...")
final_train_generator = datagen_train.flow_from_dataframe(
    dataframe=df,
    x_col='image_path',
    y_col='dx',
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=True
)

In [None]:
final_model_path = 'best_models/resnet50.keras'

final_model = build_model(input_shape=(IMG_SIZE, IMG_SIZE, 3), num_classes=num_classes, base_trainable=True, learning_rate=1e-5)
final_model.fit(
    final_train_generator,
    epochs=20,
    steps_per_epoch=len(final_train_generator),
    callbacks=[
        ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, patience=3, verbose=1),
        EarlyStopping(monitor='val_accuracy', patience=7, verbose=1, restore_best_weights=True),
        ModelCheckpoint(filepath=final_model_path, save_best_only=True, monitor='val_accuracy', verbose=1)
    ]
)

In [None]:
print("\nEvaluating Best Model on Test Set...")
best_model = load_model(best_model_path)

loss, accuracy = best_model.evaluate(val_generator, steps=len(val_generator))
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

In [None]:
y_pred = np.argmax(best_model.predict(val_generator), axis=-1)
y_true = val_generator.classes
print("Classification Report:\n", classification_report(y_true, y_pred, target_names=class_names))


In [None]:
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

In [None]:
y_true_one_hot = label_binarize(y_true, classes=np.arange(num_classes))
y_pred_prob = best_model.predict(val_generator)

fpr, tpr, _ = roc_curve(y_true_one_hot.ravel(), y_pred_prob.ravel())
auc_score = roc_auc_score(y_true_one_hot, y_pred_prob, average="micro")

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f"Micro-Average ROC Curve (AUC = {auc_score:.4f})")
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', label="Random Guess")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.grid()
plt.show()
