In [1]:
# === Cell 1: Setup & Imports ===
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0

# reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# allow GPU memory growth
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)


2025-04-17 02:37:52.026307: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744857472.229529      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744857472.286253      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# === Cell 2: Paths & DataFrame Preparation ===
TRAIN_DIR  = '/kaggle/input/histopathologic-cancer-detection/train'
TEST_DIR   = '/kaggle/input/histopathologic-cancer-detection/test'
LABELS_CSV = '/kaggle/input/histopathologic-cancer-detection/train_labels.csv'

# read labels and add filename column
df = pd.read_csv(LABELS_CSV)
df['filename'] = df['id'] + '.tif'

# check dataset mounting
print("Number of train images:", len(os.listdir(TRAIN_DIR)))
print("Number of test images: ", len(os.listdir(TEST_DIR)))
print("Sample train file exists:", os.path.exists(os.path.join(TRAIN_DIR, df['filename'].iloc[0])))

# stratified train/validation split (80/20)
train_df, val_df = train_test_split(
    df, test_size=0.2, stratify=df['label'], random_state=SEED
)
print(f"Train samples: {len(train_df)}, Validation samples: {len(val_df)}")

# image parameters
IMG_SIZE = 96
BATCH    = 32


Number of train images: 220025
Number of test images:  57458
Sample train file exists: True
Train samples: 176020, Validation samples: 44005


In [4]:
# === Cell 3: Generator Sanity‑check (Small Sample) ===
# sample a small balanced subset for quick test
SAMPLE_N    = 2000
train_small = train_df.groupby('label').sample(n=SAMPLE_N//2, random_state=SEED)
val_small   = val_df  .groupby('label').sample(n=SAMPLE_N//2, random_state=SEED)

dgen = ImageDataGenerator(rescale=1./255)
gen_sm = dgen.flow_from_dataframe(
    dataframe   = train_small,
    directory   = TRAIN_DIR,
    x_col       = 'filename',
    y_col       = 'label',
    target_size = (IMG_SIZE, IMG_SIZE),
    class_mode  = 'raw',
    batch_size  = BATCH,
    shuffle     = True,
)

# fetch one batch and inspect shapes
x_batch, y_batch = next(gen_sm)
print("x_batch.shape:", x_batch.shape)  # expected (BATCH, IMG_SIZE, IMG_SIZE, 3)
print("y_batch.shape:", y_batch.shape)  # expected (BATCH,)
print("y_batch sample:", y_batch[:10])


Found 2000 validated image filenames.
x_batch.shape: (32, 96, 96, 3)
y_batch.shape: (32,)
y_batch sample: [1 0 1 0 1 1 0 1 0 1]


In [5]:
# === Cell 4: Baseline CNN on Small Sample ===
def build_baseline():
    model = models.Sequential([
        layers.Input((IMG_SIZE, IMG_SIZE, 3)),
        layers.Conv2D(32, 3, activation='relu', padding='same'), layers.MaxPooling2D(),
        layers.Conv2D(64, 3, activation='relu', padding='same'), layers.MaxPooling2D(),
        layers.Conv2D(128, 3, activation='relu', padding='same'), layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(128, activation='relu'), layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# build and train on the small subset
baseline = build_baseline()
history_sm = baseline.fit(
    gen_sm,
    validation_data=dgen.flow_from_dataframe(
        val_small, TRAIN_DIR, x_col='filename', y_col='label',
        target_size=(IMG_SIZE, IMG_SIZE), class_mode='raw',
        batch_size=BATCH, shuffle=False
    ),
    epochs=3,
    verbose=2
)

# evaluate on small subset
preds_sm = baseline.predict(gen_sm).ravel()
auc_sm = roc_auc_score(train_small['label'], preds_sm[:len(train_small)])
print("Small-sample ROC AUC:", auc_sm)


I0000 00:00:1744857520.868782      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Found 2000 validated image filenames.
Epoch 1/3


  self._warn_if_super_not_called()
I0000 00:00:1744857531.695463      94 service.cc:148] XLA service 0x7ae118006010 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1744857531.696232      94 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1744857532.072230      94 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1744857534.865927      94 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


63/63 - 30s - 473ms/step - accuracy: 0.6600 - loss: 0.6085 - val_accuracy: 0.7615 - val_loss: 0.5050
Epoch 2/3
63/63 - 5s - 83ms/step - accuracy: 0.7940 - loss: 0.4760 - val_accuracy: 0.7670 - val_loss: 0.5223
Epoch 3/3
63/63 - 5s - 80ms/step - accuracy: 0.7535 - loss: 0.5025 - val_accuracy: 0.7645 - val_loss: 0.4998
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step
Small-sample ROC AUC: 0.502706


In [8]:
# === Cell 5: Baseline CNN on Full Data ===
# prepare full-data generators
full_train_gen = dgen.flow_from_dataframe(
    train_df, TRAIN_DIR, x_col='filename', y_col='label',
    target_size=(IMG_SIZE, IMG_SIZE), class_mode='raw',
    batch_size=BATCH, shuffle=True
)
full_val_gen = dgen.flow_from_dataframe(
    val_df, TRAIN_DIR, x_col='filename', y_col='label',
    target_size=(IMG_SIZE, IMG_SIZE), class_mode='raw',
    batch_size=BATCH, shuffle=False
)

# train baseline model
baseline = build_baseline()
history_full = baseline.fit(
    full_train_gen,
    validation_data=full_val_gen,
    epochs=5,
    callbacks=[callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)],
    verbose=2
)

# evaluate
val_preds_full = baseline.predict(full_val_gen).ravel()
val_auc_full    = roc_auc_score(val_df['label'], val_preds_full)
print("Final Baseline ROC AUC:", val_auc_full)


Found 176020 validated image filenames.
Found 44005 validated image filenames.
Epoch 1/5


  self._warn_if_super_not_called()


5501/5501 - 285s - 52ms/step - accuracy: 0.8175 - loss: 0.4091 - val_accuracy: 0.8466 - val_loss: 0.3503
Epoch 2/5
5501/5501 - 276s - 50ms/step - accuracy: 0.8736 - loss: 0.3051 - val_accuracy: 0.8772 - val_loss: 0.2951
Epoch 3/5
5501/5501 - 284s - 52ms/step - accuracy: 0.8930 - loss: 0.2638 - val_accuracy: 0.8992 - val_loss: 0.2550
Epoch 4/5
5501/5501 - 278s - 51ms/step - accuracy: 0.9058 - loss: 0.2368 - val_accuracy: 0.9066 - val_loss: 0.2387
Epoch 5/5
5501/5501 - 278s - 51ms/step - accuracy: 0.9163 - loss: 0.2126 - val_accuracy: 0.9046 - val_loss: 0.2342
[1m1376/1376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 40ms/step
Final Baseline ROC AUC: 0.9664226137262383


In [None]:
# === Cell 6: Transfer Learning with EfficientNetB0 ===
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras import layers, models, callbacks

# 1) Create augmented training generator
train_aug = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    horizontal_flip=True,
    vertical_flip=True,
    brightness_range=(0.8, 1.2)
)
train_gen = train_aug.flow_from_dataframe(
    dataframe   = train_df,
    directory   = TRAIN_DIR,
    x_col       = 'filename',
    y_col       = 'label',
    target_size = (IMG_SIZE, IMG_SIZE),
    class_mode  = 'raw',
    batch_size  = BATCH,
    shuffle     = True
)

# 2) Create validation generator (no augmentation)
val_gen = ImageDataGenerator(rescale=1./255).flow_from_dataframe(
    dataframe   = val_df,
    directory   = TRAIN_DIR,
    x_col       = 'filename',
    y_col       = 'label',
    target_size = (IMG_SIZE, IMG_SIZE),
    class_mode  = 'raw',
    batch_size  = BATCH,
    shuffle     = False
)

# 3) Build the transfer‑learning model
base_model = EfficientNetB0(
    weights='imagenet',
    include_top=False,
    input_shape=(IMG_SIZE, IMG_SIZE, 3)
)
base_model.trainable = False  # freeze backbone

tl_model = models.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dropout(0.3),
    layers.Dense(128, activation='swish'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')
])
tl_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# 4) Prepare callbacks (must save as .keras)
checkpoint = callbacks.ModelCheckpoint(
    'best_efficient.keras',     # <— ends with .keras
    monitor='val_loss',
    save_best_only=True
)
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6
)

# 5) Train the head layers
tl_model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=5,
    callbacks=[checkpoint, reduce_lr],
    verbose=2
)

# 6) Unfreeze the backbone for fine‑tuning
base_model.trainable = True
tl_model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
tl_model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=5,
    callbacks=[checkpoint, reduce_lr],
    verbose=2
)

# 7) Load the best weights and evaluate
tl_model.load_weights('best_efficient.keras')
val_preds_tl = tl_model.predict(val_gen).ravel()
print("Transfer Learning ROC AUC:", roc_auc_score(val_df['label'], val_preds_tl))


Found 176020 validated image filenames.
Found 44005 validated image filenames.
Epoch 1/5


  self._warn_if_super_not_called()


5501/5501 - 796s - 145ms/step - accuracy: 0.5943 - loss: 0.6771 - val_accuracy: 0.5950 - val_loss: 0.6749 - learning_rate: 0.0010
Epoch 2/5
5501/5501 - 773s - 140ms/step - accuracy: 0.5947 - loss: 0.6754 - val_accuracy: 0.5950 - val_loss: 0.6750 - learning_rate: 0.0010
Epoch 3/5
5501/5501 - 767s - 139ms/step - accuracy: 0.5949 - loss: 0.6751 - val_accuracy: 0.5950 - val_loss: 0.6750 - learning_rate: 0.0010
Epoch 4/5
5501/5501 - 772s - 140ms/step - accuracy: 0.5950 - loss: 0.6750 - val_accuracy: 0.5950 - val_loss: 0.6748 - learning_rate: 5.0000e-04
Epoch 5/5
5501/5501 - 767s - 139ms/step - accuracy: 0.5950 - loss: 0.6750 - val_accuracy: 0.5950 - val_loss: 0.6747 - learning_rate: 5.0000e-04
Epoch 1/5


E0000 00:00:1744867232.545466      93 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1744867232.728251      93 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1744867233.227215      93 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1744867233.417614      93 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1744867233.705380      93 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:0

5501/5501 - 936s - 170ms/step - accuracy: 0.7595 - loss: 0.8462 - val_accuracy: 0.8380 - val_loss: 0.4238 - learning_rate: 1.0000e-05
Epoch 2/5
5501/5501 - 792s - 144ms/step - accuracy: 0.8306 - loss: 0.4230 - val_accuracy: 0.8620 - val_loss: 0.3302 - learning_rate: 1.0000e-05
Epoch 3/5
5501/5501 - 794s - 144ms/step - accuracy: 0.8611 - loss: 0.3367 - val_accuracy: 0.8888 - val_loss: 0.2762 - learning_rate: 1.0000e-05
Epoch 4/5
5501/5501 - 799s - 145ms/step - accuracy: 0.8825 - loss: 0.2967 - val_accuracy: 0.8986 - val_loss: 0.2561 - learning_rate: 1.0000e-05
Epoch 5/5
5501/5501 - 813s - 148ms/step - accuracy: 0.8946 - loss: 0.2667 - val_accuracy: 0.9075 - val_loss: 0.2355 - learning_rate: 1.0000e-05
[1m 623/1376[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m32s[0m 43ms/step

In [13]:
# === Cell 7: Test Prediction & Submission ===
test_files = sorted([f for f in os.listdir(TEST_DIR) if f.endswith('.tif')])
test_df    = pd.DataFrame({'filename': test_files})

test_gen = dgen.flow_from_dataframe(
    test_df, TEST_DIR, x_col='filename', y_col=None,
    target_size=(IMG_SIZE, IMG_SIZE), class_mode=None,
    batch_size=BATCH, shuffle=False
)

predictions = tl_model.predict(test_gen, verbose=1).ravel()
submission  = pd.DataFrame({
    'id':    [f.replace('.tif','') for f in test_files],
    'label': predictions
})
submission.to_csv('submission.csv', index=False)
print("submission.csv has been created.")


Found 57458 validated image filenames.
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 41ms/step
submission.csv has been created.
