In [1]:
# Install and setup Kaggle
!pip install kaggle

from google.colab import files
files.upload()

import os
import shutil  # IMPORTANT: Add this import
import pandas as pd
from sklearn.model_selection import train_test_split

# Create Kaggle directory and set permissions
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset
!kaggle datasets download -d awsaf49/cbis-ddsm-breast-cancer-image-dataset
!unzip -q /content/cbis-ddsm-breast-cancer-image-dataset.zip -d /content/breast_cancer_data

# ============================================
# STEP 1: Load and Combine Datasets
# ============================================
csv_path = "/content/breast_cancer_data/csv"
jpeg_path = "/content/breast_cancer_data/jpeg"

# Load both calc and mass datasets
calc_train_df = pd.read_csv(os.path.join(csv_path, "calc_case_description_train_set.csv"))
mass_train_df = pd.read_csv(os.path.join(csv_path, "mass_case_description_train_set.csv"))

# Combine the datasets
combined_df = pd.concat([calc_train_df, mass_train_df], ignore_index=True)

# ============================================
# STEP 2: Process Paths and Labels
# ============================================
# Fix image paths and extract unique_id
combined_df["image file path"] = combined_df["image file path"].str.replace("\\", "/")
combined_df["unique_id"] = combined_df["image file path"].apply(lambda x: x.strip("/").split("/")[-2])

# Simplify pathology labels (binary classification: benign vs malignant)
combined_df["pathology"] = combined_df["pathology"].str.lower().replace({
    "benign_without_callback": "benign"
})

print("Dataset shape:", combined_df.shape)
print("\nClass distribution:")
print(combined_df["pathology"].value_counts())

# ============================================
# STEP 3: Organize Images into Folders
# ============================================
def organize_images(df, dest_dir):
    """Copy images from source to destination organized by label"""
    for _, row in df.iterrows():
        unique_id = row["unique_id"]
        label = row["pathology"]
        src_dir = os.path.join(jpeg_path, unique_id)

        if not os.path.exists(src_dir):
            print(f"Skipping missing directory: {src_dir}")
            continue

        dest_folder = os.path.join(dest_dir, label)
        os.makedirs(dest_folder, exist_ok=True)

        # Copy all images from the subdirectory
        for file in os.listdir(src_dir):
            src_file = os.path.join(src_dir, file)
            dest_file = os.path.join(dest_folder, file)
            if os.path.isfile(src_file):
                shutil.copy(src_file, dest_file)

# Split data into train/test
train_df, test_df = train_test_split(combined_df, test_size=0.2, random_state=42, stratify=combined_df["pathology"])

print(f"\nTrain samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")

# Organize images
organize_images(train_df, "/content/train")
organize_images(test_df, "/content/test")

# ============================================
# STEP 4: Data Generators
# ============================================
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Data augmentation for training
train_datagen = ImageDataGenerator(
    rescale=1.0/255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Only rescale for testing
test_datagen = ImageDataGenerator(rescale=1.0/255)

# Create generators
train_generator = train_datagen.flow_from_directory(
    "/content/train",
    target_size=(224, 224),
    batch_size=32,
    class_mode="binary",  # Binary classification (benign/malignant)
    shuffle=True
)

test_generator = test_datagen.flow_from_directory(
    "/content/test",
    target_size=(224, 224),
    batch_size=32,
    class_mode="binary",
    shuffle=False
)

print("\nClass indices:", train_generator.class_indices)

# ============================================
# STEP 5: Build Model
# ============================================
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model

# Load pre-trained EfficientNetB0
base_model = EfficientNetB0(
    weights="imagenet",
    include_top=False,
    input_shape=(224, 224, 3)
)

# Freeze the base model
base_model.trainable = False

# Add custom layers
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation="relu")(x)
x = Dropout(0.5)(x)  # Add dropout to prevent overfitting
predictions = Dense(1, activation="sigmoid")(x)  # Binary classification

# Build the final model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()

# ============================================
# STEP 6: Train Model
# ============================================
history = model.fit(
    train_generator,
    steps_per_epoch=train_generator.samples // train_generator.batch_size,
    epochs=10,
    validation_data=test_generator,
    validation_steps=test_generator.samples // test_generator.batch_size
)

# ============================================
# STEP 7: Evaluate Model
# ============================================
test_loss, test_accuracy = model.evaluate(test_generator)
print(f"\nTest Accuracy: {test_accuracy:.4f}")
print(f"Test Loss: {test_loss:.4f}")

# Save the model
model.save("/content/breast_cancer_model.h5")
print("\nModel saved!")



Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/awsaf49/cbis-ddsm-breast-cancer-image-dataset
License(s): CC-BY-SA-3.0
Downloading cbis-ddsm-breast-cancer-image-dataset.zip to /content
 99% 4.92G/4.95G [01:13<00:01, 28.6MB/s]
100% 4.95G/4.95G [01:14<00:00, 71.9MB/s]
Dataset shape: (2864, 18)

Class distribution:
pathology
benign       1683
malignant    1181
Name: count, dtype: int64

Train samples: 2291
Test samples: 573
Found 591 images belonging to 2 classes.
Found 367 images belonging to 2 classes.

Class indices: {'benign': 0, 'malignant': 1}
Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


  self._warn_if_super_not_called()


Epoch 1/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 9s/step - accuracy: 0.5086 - loss: 0.8157 - val_accuracy: 0.4119 - val_loss: 0.8057
Epoch 2/10
[1m 1/18[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m39s[0m 2s/step - accuracy: 0.5938 - loss: 0.8081



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 3s/step - accuracy: 0.5938 - loss: 0.8081 - val_accuracy: 0.4119 - val_loss: 0.9014
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 9s/step - accuracy: 0.5237 - loss: 0.7572 - val_accuracy: 0.5881 - val_loss: 0.6853
Epoch 4/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 3s/step - accuracy: 0.6250 - loss: 0.6409 - val_accuracy: 0.5881 - val_loss: 0.6837
Epoch 5/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 8s/step - accuracy: 0.5106 - loss: 0.7256 - val_accuracy: 0.5881 - val_loss: 0.6780
Epoch 6/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 3s/step - accuracy: 0.6250 - loss: 0.6852 - val_accuracy: 0.5881 - val_loss: 0.6800
Epoch 7/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 11s/step - accuracy: 0.4600 - loss: 0.7427 - val_accuracy: 




Test Accuracy: 0.5640
Test Loss: 0.6859

Model saved!
