In [8]:
# imports
import pandas as pd
import numpy as np
import h5py
import io
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, Input

In [2]:
# loading Metadata
metadata_path = 'train-metadata.csv'
df = pd.read_csv(metadata_path, low_memory=False)
df = df[['isic_id', 'target']].dropna()  # Keep only needed columns

In [3]:
# sample 10000 images for training
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['target'], random_state=42)
train_df = train_df.sample(n=10000, random_state=42).reset_index(drop=True)
val_df = val_df.sample(n=2000, random_state=42).reset_index(drop=True)

In [4]:
# loading images
def load_images_from_hdf5(hdf5_path, df, target_size=(224, 224)):
    images = []
    labels = []
    with h5py.File(hdf5_path, 'r') as f:
        for _, row in tqdm(df.iterrows(), total=len(df)):
            isic_id = row['isic_id']
            target = row['target']
            img_bytes = f[isic_id][()]
            image = Image.open(io.BytesIO(img_bytes)).convert('RGB')
            image = image.resize(target_size)
            image = np.array(image) / 255.0
            images.append(image)
            labels.append(target)
    return np.array(images), np.array(labels)

In [5]:
# training/test set split
train_hdf5_path = 'train-image.hdf5'
X_train, y_train = load_images_from_hdf5(train_hdf5_path, train_df)
X_val, y_val = load_images_from_hdf5(train_hdf5_path, val_df)

100%|████████████████████████████████████| 10000/10000 [00:25<00:00, 384.87it/s]
100%|██████████████████████████████████████| 2000/2000 [00:04<00:00, 443.79it/s]


In [9]:
# CNN model architecture
model = Sequential([
    Input(shape=(224, 224, 3)),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

In [10]:
# model compiling
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

In [11]:
# fitting our model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32
)

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 271ms/step - accuracy: 0.9857 - auc: 0.4344 - loss: 0.7980 - val_accuracy: 0.9990 - val_auc: 0.5000 - val_loss: 0.7053
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 217ms/step - accuracy: 0.9983 - auc: 0.4757 - loss: 0.4854 - val_accuracy: 0.9990 - val_auc: 0.5000 - val_loss: 0.5757
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 190ms/step - accuracy: 0.9993 - auc: 0.3342 - loss: 0.1766 - val_accuracy: 0.9990 - val_auc: 0.5000 - val_loss: 0.0825
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 186ms/step - accuracy: 0.9978 - auc: 0.4762 - loss: 0.0726 - val_accuracy: 0.9990 - val_auc: 0.5000 - val_loss: 0.3998
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 197ms/step - accuracy: 0.9987 - auc: 0.4611 - loss: 0.1938 - val_accuracy: 0.9990 - val_auc: 0.5000 - val_loss: 0.0121
Epoch 6/10
[1m

In [13]:
# finding validation accuracy
val_loss, val_acc, val_auc = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {val_acc:.4f}, AUC: {val_auc:.4f}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9993 - auc: 0.2578 - loss: 0.0133
Validation Accuracy: 0.9990, AUC: 0.5000


In [None]:
sample_ids = val_df['isic_id'].sample(5, random_state=1).tolist()
with h5py.File(train_hdf5_path, 'r') as f:
    fig, axs = plt.subplots(1, 5, figsize=(20, 4))
    for i, isic_id in enumerate(sample_ids):
        img_bytes = f[isic_id][()]
        image = Image.open(io.BytesIO(img_bytes)).convert('RGB').resize((224, 224))
        img_arr = np.array(image) / 255.0
        pred = model.predict(np.expand_dims(img_arr, axis=0))[0][0]
        axs[i].imshow(image)
        axs[i].axis('off')
        axs[i].set_title(f"{isic_id}\nPred: {pred:.2f}")
    plt.tight_layout()
    plt.show()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
