<a href="https://colab.research.google.com/github/rbuzmaa/MLproject/blob/main/MobileNetColonCancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d andrewmvd/lung-and-colon-cancer-histopathological-images


Dataset URL: https://www.kaggle.com/datasets/andrewmvd/lung-and-colon-cancer-histopathological-images
License(s): CC-BY-SA-4.0
Downloading lung-and-colon-cancer-histopathological-images.zip to /content
100% 1.76G/1.76G [00:13<00:00, 155MB/s]
100% 1.76G/1.76G [00:13<00:00, 144MB/s]


In [6]:
import os, shutil, random
from sklearn.model_selection import train_test_split

SEED = 42
random.seed(SEED)

# ---- Paths (based on your Kaggle unzip structure) ----
src_base = "/content/lung_colon_image_set/colon_image_sets"
src_aca  = os.path.join(src_base, "colon_aca")
src_n    = os.path.join(src_base, "colon_n")

# --- Added Code to Handle Missing Dataset ---
if not os.path.exists(src_base):
    print(f"Dataset not found at {src_base}. Attempting to download and unzip from Kaggle...")
    # Kaggle API setup
    if not os.path.exists("/root/.kaggle"):
        os.makedirs("/root/.kaggle")
    !cp kaggle.json /root/.kaggle/
    !chmod 600 /root/.kaggle/kaggle.json

    # Download and Unzip (specific to 'lung-and-colon-cancer-histopathological-images')
    !kaggle datasets download -d balabaskar/lung-and-colon-cancer-histopathological-images -p /content/
    # The zip file itself is in /content/, unzip it there. The content will be in a new folder like /content/lung_colon_image_set
    !unzip -q /content/lung-and-colon-cancer-histopathological-images.zip -d /content/
    print("Kaggle dataset downloaded and unzipped.")
# --- End Added Code ---

# ---- Output split folder ----
out_base = "/content/colon_split"
train_out = os.path.join(out_base, "train")
val_out   = os.path.join(out_base, "val")
test_out  = os.path.join(out_base, "test")

classes = ["colon_aca", "colon_n"]

for split in [train_out, val_out, test_out]:
    for c in classes:
        os.makedirs(os.path.join(split, c), exist_ok=True)

def list_images(folder):
    return sorted([f for f in os.listdir(folder) if f.lower().endswith((".jpg",".jpeg",".png"))])

aca_files = list_images(src_aca)
n_files   = list_images(src_n)

# Labels: 1 = aca, 0 = n (only used for stratified split)
X = [("colon_aca", f) for f in aca_files] + [("colon_n", f) for f in n_files]
y = [1]*len(aca_files) + [0]*len(n_files)

# ---- 70/15/15 stratified split ----
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=SEED
)

print("Train:", len(X_train), "Val:", len(X_val), "Test:", len(X_test))

def copy_split(items, split_dir):
    for cls, fname in items:
        src_dir = src_aca if cls == "colon_aca" else src_n
        src_path = os.path.join(src_dir, fname)
        dst_path = os.path.join(split_dir, cls, fname)
        shutil.copy2(src_path, dst_path)

copy_split(X_train, train_out)
copy_split(X_val, val_out)
copy_split(X_test, test_out)

print("✅ Colon-only split created at:", out_base)
print("Train ACA:", len(os.listdir(os.path.join(train_out, "colon_aca"))))
print("Train N  :", len(os.listdir(os.path.join(train_out, "colon_n"))))

Dataset not found at /content/lung_colon_image_set/colon_image_sets. Attempting to download and unzip from Kaggle...
403 Client Error: Forbidden for url: https://www.kaggle.com/api/v1/datasets/metadata/balabaskar/lung-and-colon-cancer-histopathological-images
Kaggle dataset downloaded and unzipped.
Train: 7000 Val: 1500 Test: 1500
✅ Colon-only split created at: /content/colon_split
Train ACA: 3500
Train N  : 3500


In [16]:
import os

drive_base = "/content/drive/MyDrive/FYP_Project"
os.makedirs(drive_base, exist_ok=True)

print("Drive folder ready:", drive_base)

Drive folder ready: /content/drive/MyDrive/FYP_Project


In [18]:
import shutil

src = "/content/colon_split"
dst = "/content/drive/MyDrive/FYP_Project/colon_split"

# Remove old copy if exists (prevents duplication error)
if os.path.exists(dst):
    shutil.rmtree(dst)

shutil.copytree(src, dst)

print("colon_split successfully saved to Drive")

KeyboardInterrupt: 

In [19]:
import os
from google.colab import drive

# Remove the directory if it's not a mount point and contains files
if os.path.exists('/content/drive') and not os.path.ismount('/content/drive'):
    # Only remove if it's not a real mountpoint but a directory with content
    # This handles cases where a previous run created directories inside /content/drive
    # but the mount itself failed or was interrupted.
    try:
        os.rmdir('/content/drive') # Try removing as empty directory first
    except OSError:
        # If not empty, remove contents and then directory
        import shutil
        shutil.rmtree('/content/drive', ignore_errors=True)

drive.mount('/content/drive', force_remount=True)

KeyboardInterrupt: 

In [20]:
import os

drive_path = "/content/drive/MyDrive/FYP_Project"

print("Exists in Colab:", os.path.exists(drive_path))

Exists in Colab: True


In [21]:
import os

base = "/content/colon_split"

for split in ["train", "val", "test"]:
    print(f"\n{split.upper()}")
    for cls in ["colon_aca", "colon_n"]:
        path = os.path.join(base, split, cls)
        print(cls, ":", len(os.listdir(path)))


TRAIN
colon_aca : 3500
colon_n : 3500

VAL
colon_aca : 750
colon_n : 750

TEST
colon_aca : 750
colon_n : 750


In [23]:
import tensorflow as tf
from tensorflow.keras import layers, models

IMG_SIZE = 224
BATCH_SIZE = 32
SEED = 42

train_dir = "/content/colon_split/train"
val_dir   = "/content/colon_split/val"
test_dir  = "/content/colon_split/test"

train_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    image_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    label_mode="binary",
    seed=SEED,
    shuffle=True
)

val_ds = tf.keras.utils.image_dataset_from_directory(
    val_dir,
    image_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    label_mode="binary",
    shuffle=False
)

test_ds = tf.keras.utils.image_dataset_from_directory(
    test_dir,
    image_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    label_mode="binary",
    shuffle=False
)

AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.prefetch(AUTOTUNE)
val_ds   = val_ds.prefetch(AUTOTUNE)
test_ds  = test_ds.prefetch(AUTOTUNE)

# Augment only training
data_augmentation = tf.keras.Sequential([
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.05),
    layers.RandomZoom(0.10),
    layers.RandomContrast(0.10),
])

preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

inputs = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3))
x = data_augmentation(inputs)
x = preprocess(x)

base_model = tf.keras.applications.MobileNetV2(
    include_top=False,
    weights="imagenet",
    input_shape=(IMG_SIZE, IMG_SIZE, 3)
)
base_model.trainable = False

x = base_model(x, training=False)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = models.Model(inputs, outputs)
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")]
)

callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.5)
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
    callbacks=callbacks
)

print("\n✅ Test evaluation:")
model.evaluate(test_ds)


Found 7000 files belonging to 2 classes.
Found 1500 files belonging to 2 classes.
Found 1500 files belonging to 2 classes.
Epoch 1/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m508s[0m 2s/step - accuracy: 0.8725 - auc: 0.9363 - loss: 0.2837 - val_accuracy: 0.9947 - val_auc: 0.9997 - val_loss: 0.0426 - learning_rate: 0.0010
Epoch 2/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m490s[0m 2s/step - accuracy: 0.9865 - auc: 0.9987 - loss: 0.0509 - val_accuracy: 0.9960 - val_auc: 0.9999 - val_loss: 0.0240 - learning_rate: 0.0010
Epoch 3/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m503s[0m 2s/step - accuracy: 0.9917 - auc: 0.9993 - loss: 0.0349 - val_accuracy: 0.9967 - val_auc: 0.9999 - val_loss: 0.0185 - learning_rate: 0.0010
Epoch 4/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m504s[0m 2s/step - accuracy: 0.9936 - auc: 0.9997 - loss: 0.0273 - val_accuracy: 0.9973 - val_auc: 1.0000 - val_loss: 0.0132 - learning_rate: 0.0010
Epoch

[0.01517850998789072, 0.996666669845581, 0.999934196472168]

In [24]:
base_model.trainable = True
for layer in base_model.layers[:-30]:
    layer.trainable = False

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")]
)

history_ft = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
    callbacks=callbacks
)

print("\n✅ Test evaluation after fine-tuning:")
model.evaluate(test_ds)


Epoch 1/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m619s[0m 3s/step - accuracy: 0.9400 - auc: 0.9933 - loss: 0.1532 - val_accuracy: 0.9927 - val_auc: 1.0000 - val_loss: 0.0184 - learning_rate: 1.0000e-05
Epoch 2/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m596s[0m 3s/step - accuracy: 0.9883 - auc: 0.9991 - loss: 0.0371 - val_accuracy: 0.9947 - val_auc: 1.0000 - val_loss: 0.0115 - learning_rate: 1.0000e-05
Epoch 3/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m627s[0m 3s/step - accuracy: 0.9924 - auc: 0.9996 - loss: 0.0240 - val_accuracy: 0.9973 - val_auc: 1.0000 - val_loss: 0.0061 - learning_rate: 1.0000e-05
Epoch 4/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m618s[0m 3s/step - accuracy: 0.9951 - auc: 0.9999 - loss: 0.0160 - val_accuracy: 0.9987 - val_auc: 1.0000 - val_loss: 0.0039 - learning_rate: 1.0000e-05
Epoch 5/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m582s[0m 3s/step - accuracy: 0.9957 - auc: 

[0.004535323940217495, 0.9993333220481873, 1.0]

In [25]:
import tensorflow as tf

IMG_SIZE = 224
BATCH_SIZE = 32
test_dir = "/content/colon_split/test"

test_ds = tf.keras.utils.image_dataset_from_directory(
    test_dir,
    image_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    label_mode="binary",
    shuffle=False
)

Found 1500 files belonging to 2 classes.


In [26]:
results = model.evaluate(test_ds, verbose=1)
print("Test results (loss, accuracy, auc if compiled):", results)

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 2s/step - accuracy: 0.9999 - auc: 0.5208 - loss: 0.0028
Test results (loss, accuracy, auc if compiled): [0.004535323940217495, 0.9993333220481873, 1.0]


In [27]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# True labels
y_true = np.concatenate([y.numpy().ravel() for _, y in test_ds]).astype(int)

# Predicted probabilities
y_prob = model.predict(test_ds, verbose=0).ravel()

# Default threshold 0.5
y_pred = (y_prob >= 0.5).astype(int)

print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))
print("\nClassification report:\n",
      classification_report(y_true, y_pred, target_names=["colon_n", "colon_aca"], digits=4))

Confusion matrix:
 [[750   0]
 [  1 749]]

Classification report:
               precision    recall  f1-score   support

     colon_n     0.9987    1.0000    0.9993       750
   colon_aca     1.0000    0.9987    0.9993       750

    accuracy                         0.9993      1500
   macro avg     0.9993    0.9993    0.9993      1500
weighted avg     0.9993    0.9993    0.9993      1500

