In [None]:
# Notebook 2 — Binary CNN
# Colab-ready. Run top-to-bottom.

# 0) Install / imports (no TF reinstall — use environment's TF)
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.metrics import classification_report, confusion_matrix

# Report the TensorFlow build and the GPUs TensorFlow can see in this runtime.
visible_gpus = tf.config.list_physical_devices('GPU')
print("TensorFlow:", tf.__version__)
print("GPU devices:", visible_gpus)



In [None]:
# 1) GPU safety: turn on memory growth for every visible GPU so TensorFlow
#    does not reserve all GPU memory up front.
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except Exception as e:
    # Best effort only: report the problem and carry on with default behaviour.
    print("Could not set memory growth:", e)
else:
    # Stay silent on CPU-only runtimes (empty GPU list).
    if gpus:
        print("Enabled memory growth on GPU(s).")



In [None]:
# 2) Config — update paths if needed
IMAGE_FOLDER = "/content/drive/MyDrive/HAM10000_images"   # images folder created by Notebook 1
SPLITS_DIR = "/content/drive/MyDrive/splits"              # output from Notebook 1
OUTPUT_DIR = "/content/drive/MyDrive/models"              # trained models are written here
os.makedirs(OUTPUT_DIR, exist_ok=True)

IMG_SIZE = 224      # images are resized to IMG_SIZE x IMG_SIZE before entering the network
BATCH_SIZE = 32     # batch size shared by the train/val/test generators
EPOCHS = 10         # upper bound on training epochs (early stopping may end sooner)
RANDOM_STATE = 42   # single seed for everything stochastic in this notebook

# Actually apply the seed: without this RANDOM_STATE was defined but unused, so
# weight initialisation, shuffling and augmentation differed on every run.
# set_random_seed seeds Python's `random`, NumPy and TensorFlow in one call.
tf.keras.utils.set_random_seed(RANDOM_STATE)



In [None]:
# Sanity check: list what is visible under /content/drive before using the paths above.
!ls "/content/drive/"

In [None]:
# Search everything under /content/drive for the three split CSVs by file name.
# The printed paths show which directory SPLITS_DIR should point to.
!find /content/drive -type f -name "df_train.csv"
!find /content/drive -type f -name "df_val.csv"
!find /content/drive -type f -name "df_test.csv"


In [None]:
# 3) Load CSV splits (Notebook 1 must have created these)
split_files = ("df_train.csv", "df_val.csv", "df_test.csv")
df_train, df_val, df_test = [
    pd.read_csv(os.path.join(SPLITS_DIR, csv_name)) for csv_name in split_files
]

# If Notebook1 saved oversampled train as df_train.csv then use it directly; else adapt accordingly.
n_train, n_val, n_test = len(df_train), len(df_val), len(df_test)
print("Train rows:", n_train, "Val rows:", n_val, "Test rows:", n_test)



In [None]:
# 4) Make sure every split has a 'filepath' column; when it is missing, derive
#    'filename' from 'image_id' and join it onto IMAGE_FOLDER.
for split_df in (df_train, df_val, df_test):
    if 'filepath' in split_df.columns:
        continue  # this split already carries paths, leave it untouched
    split_df['filename'] = split_df['image_id'].astype(str) + '.jpg'
    split_df['filepath'] = [
        os.path.join(IMAGE_FOLDER, image_name) for image_name in split_df['filename']
    ]



In [None]:
# 5) Add a 'binary_label' column where missing: 'dx' values listed in
#    benign_classes become 'benign', every other value becomes 'malignant'.
benign_classes = ['bkl', 'df', 'nv', 'vasc']
for split_df in (df_train, df_val, df_test):
    if 'binary_label' in split_df.columns:
        continue  # keep an existing label column as-is
    is_benign = split_df['dx'].isin(benign_classes)
    split_df['binary_label'] = is_benign.map({True: 'benign', False: 'malignant'})

train_label_counts = df_train['binary_label'].value_counts()
print("Example binary label counts (train):\n", train_label_counts)



In [None]:
# 6) Data generators
# Only the training generator augments; validation and test images are just
# multiplied by 1/255 so evaluation sees unmodified pictures.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.10,
    height_shift_range=0.10,
    zoom_range=0.10,
    horizontal_flip=True,
    fill_mode='nearest'
)

val_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

# Pin the label -> index mapping to the training split. Left to itself, each
# flow infers its classes from its own dataframe, so a split that happens to
# miss a class would silently get a different index for the remaining one.
# sorted(unique) matches the order Keras would have inferred for the train split.
class_names = sorted(df_train['binary_label'].unique())


def _make_flow(datagen, frame, shuffle):
    """Build a dataframe iterator with the notebook's shared settings.

    Args:
        datagen: ImageDataGenerator that preprocesses/augments the images.
        frame: split dataframe with 'filepath' and 'binary_label' columns.
        shuffle: whether to shuffle sample order each epoch.

    Returns:
        The iterator returned by ``datagen.flow_from_dataframe``.
    """
    return datagen.flow_from_dataframe(
        frame, x_col='filepath', y_col='binary_label',
        target_size=(IMG_SIZE, IMG_SIZE), color_mode='rgb',
        classes=class_names, class_mode='categorical',
        batch_size=BATCH_SIZE, shuffle=shuffle,
        seed=RANDOM_STATE,  # reproducible shuffling / augmentation
    )


train_flow = _make_flow(train_datagen, df_train, shuffle=True)
# Validation and test keep file order so predictions line up with `.classes`.
val_flow = _make_flow(val_datagen, df_val, shuffle=False)
test_flow = _make_flow(test_datagen, df_test, shuffle=False)



In [None]:
# 7) Build Binary CNN model (matches your architecture but with Input & BatchNorm)
def build_binary_cnn(input_shape=(IMG_SIZE,IMG_SIZE,3), num_classes=2,
                     conv_filters=(32, 64, 128), dropout_rate=0.5,
                     learning_rate=1e-4):
    """Build and compile the small convolutional classifier used in this notebook.

    The network is a stack of Conv2D(3x3, relu) -> MaxPooling2D -> BatchNormalization
    blocks, followed by Flatten -> Dense(128, relu) -> Dropout -> Dense(softmax).
    With the default arguments the architecture and compile settings are exactly
    the original hard-coded ones.

    Args:
        input_shape: shape of one input image, (height, width, channels).
        num_classes: number of softmax outputs.
        conv_filters: filter count of each convolutional block, in order.
        dropout_rate: dropout rate applied before the output layer.
        learning_rate: learning rate passed to the Adam optimizer.

    Returns:
        A compiled Sequential model (categorical cross-entropy loss, accuracy metric).
    """
    layers = [Input(shape=input_shape)]
    for n_filters in conv_filters:
        layers.extend([
            Conv2D(n_filters, (3,3), activation='relu', padding='valid'),
            MaxPooling2D(2,2),
            BatchNormalization(),
        ])
    layers.extend([
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(dropout_rate),
        Dense(num_classes, activation='softmax'),
    ])

    model = Sequential(layers)
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

binary_model = build_binary_cnn()
binary_model.summary()



In [None]:
# 8) Callbacks and checkpoint
checkpoint_path = os.path.join(OUTPUT_DIR, "binary_cnn_best.h5")

# All three callbacks watch validation loss.
early_stop = EarlyStopping(
    monitor='val_loss', patience=6, restore_best_weights=True, verbose=1
)
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss', factor=0.3, patience=3, verbose=1
)
best_checkpoint = ModelCheckpoint(
    checkpoint_path, monitor='val_loss', save_best_only=True, verbose=1
)
callbacks = [early_stop, lr_on_plateau, best_checkpoint]

# 9) Train
history = binary_model.fit(
    x=train_flow,
    validation_data=val_flow,
    epochs=EPOCHS,
    callbacks=callbacks,
)




In [None]:
# 10) Persist the model as it stands after training; the best-by-val_loss
#     weights were already written by the checkpoint callback.
last_model_path = os.path.join(OUTPUT_DIR, "binary_cnn_last.h5")
binary_model.save(last_model_path)
print("Saved models to", OUTPUT_DIR)

# 11) Plot training curves: accuracy on the left panel, loss on the right,
#     each with the training and validation series.
curves = history.history
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12,4))

acc_ax.plot(curves['accuracy'], label='train_acc')
acc_ax.plot(curves['val_accuracy'], label='val_acc')
acc_ax.legend()
acc_ax.set_title('Accuracy')

loss_ax.plot(curves['loss'], label='train_loss')
loss_ax.plot(curves['val_loss'], label='val_loss')
loss_ax.legend()
loss_ax.set_title('Loss')

plt.show()

# 12) Evaluate on test set
# Reload the checkpointed model from disk and score it on one full pass of the
# test generator.
best = load_model(checkpoint_path)

# Round up so the final, smaller batch is included.
batches_needed = np.ceil(test_flow.n / test_flow.batch_size)
test_steps = int(batches_needed)

test_metrics = best.evaluate(test_flow, steps=test_steps, verbose=1)
loss, acc = test_metrics
print(f"Test accuracy: {acc*100:.2f}%, test loss: {loss:.4f}")

# 13) Predictions -> confusion matrix and classification report
# Rewind the generator so predictions start at the first test sample and line
# up with test_flow.classes (the flow was created with shuffle=False).
test_flow.reset()
y_prob = best.predict(test_flow, steps=test_steps, verbose=1)
y_pred = np.argmax(y_prob, axis=1)
y_true = test_flow.classes  # keras flow_from_dataframe stores integer indices in .classes

# Decode indices with the TEST flow's own mapping. y_true comes from test_flow,
# so using train_flow.class_indices would mislabel results whenever the two
# flows inferred their classes differently.
label_map = test_flow.class_indices  # e.g. {'benign':0,'malignant':1}
inv_label_map = {v:k for k,v in label_map.items()}
# Class names in index order; pins row/column order for the report and matrix.
class_names = [inv_label_map[i] for i in sorted(inv_label_map)]
y_pred_labels = [inv_label_map[int(i)] for i in y_pred]
y_true_labels = [inv_label_map[int(i)] for i in y_true]

print("\nClassification Report (binary):")
print(classification_report(y_true_labels, y_pred_labels, labels=class_names))

cm = confusion_matrix(y_true_labels, y_pred_labels, labels=class_names)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted'); plt.ylabel('True'); plt.title('Confusion Matrix')
plt.show()



In [None]:
# List the models directory to see which model files are present on Drive.
!ls "/content/drive/MyDrive/models"


In [None]:
from tensorflow.keras.models import load_model

# Reload the best checkpoint from disk. The path is derived from OUTPUT_DIR
# (as in the training cell) rather than hard-coded, so changing the config
# cell cannot leave this cell loading a stale file from the old location.
checkpoint_path = os.path.join(OUTPUT_DIR, "binary_cnn_best.h5")
best = load_model(checkpoint_path)
print("Model loaded!")


In [None]:
# Score the reloaded model on one full pass of the test generator.
# Round up so the last partial batch is counted.
batches_needed = np.ceil(test_flow.n / test_flow.batch_size)
test_steps = int(batches_needed)
test_metrics = best.evaluate(test_flow, steps=test_steps)
loss, acc = test_metrics
print(f"Test accuracy: {acc*100:.2f}%, Test loss: {loss:.4f}")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Rewind the generator, predict class probabilities for the whole test set and
# take the most probable class index per sample.
test_flow.reset()
y_prob = best.predict(test_flow, steps=test_steps)
y_pred = np.argmax(y_prob, axis=1)
y_true = test_flow.classes

# Invert name -> index into index -> name so indices can be decoded.
label_map = test_flow.class_indices
inv_label_map = dict((index, name) for name, index in label_map.items())

y_pred_labels = list(map(inv_label_map.__getitem__, y_pred))
y_true_labels = list(map(inv_label_map.__getitem__, y_true))

print("\nClassification Report:")
report_text = classification_report(y_true_labels, y_pred_labels)
print(report_text)


In [None]:
# Class names in index order. Passing them as `labels=` fixes the row/column
# order of the matrix so it always matches the tick labels; without it sklearn
# orders (and may drop) classes based on what occurs in the data.
class_names = [inv_label_map[i] for i in sorted(inv_label_map)]
cm = confusion_matrix(y_true_labels, y_pred_labels, labels=class_names)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel("Predicted"); plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()


In [None]:
# 14) Inference: upload an image in Colab and predict
# Run the cell below and select an image file to upload. It will print predicted label and confidence.
from google.colab import files
from tensorflow.keras.preprocessing.image import load_img, img_to_array

def predict_single_image(model, img_path, target_size=(IMG_SIZE,IMG_SIZE), label_map=None):
    """Classify one image file with a trained model.

    The image is loaded at ``target_size`` and divided by 255, mirroring the
    ``rescale=1./255`` used by the data generators.

    Args:
        model: model exposing ``predict``; called on a batch of one image.
        img_path: path of the image file to classify.
        target_size: (height, width) the image is resized to on load.
        label_map: optional mapping from class index to class name. When
            omitted, the notebook-level ``inv_label_map`` is used, which is
            the original behaviour.

    Returns:
        Tuple ``(label, confidence, probabilities)``: the predicted class name,
        its probability, and the full probability vector for the image.
    """
    if label_map is None:
        # Resolved at call time so the most recent notebook mapping is used.
        label_map = inv_label_map
    img = load_img(img_path, target_size=target_size)
    arr = img_to_array(img) / 255.0
    arr = np.expand_dims(arr, axis=0)  # add the batch dimension
    prob = model.predict(arr)[0]
    idx = int(np.argmax(prob))         # plain int key for the mapping lookup
    label = label_map[idx]
    return label, prob[idx], prob

# Prompt for one or more files, then classify each uploaded file name with the
# best model and print the result.
print("To run inference: upload a file using files.upload()")
uploaded = files.upload()  # interactive: pick file(s)
for fn in uploaded:
    prediction = predict_single_image(best, fn)
    label, conf, allprob = prediction
    print(f"File: {fn}  -> Predicted: {label} ({conf*100:.2f}%)  probs: {allprob}")
