## Mattis' code

### 1. Imports

In [1]:
# Base imports
import pandas as pd
import numpy as np
import os
import sys
import cv2

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Tensorflow imports
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, BatchNormalization, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import optimizers

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Custom imports
from classification.ml_logic.preprocessor import Preprocessor

In [2]:
preprocessor = Preprocessor()

### 2. Data imports

In [None]:
# Define file path
MASTER_PATH = '../raw_data/MINI-DDSM-Complete-JPEG-8'

# Load Excel file
df = pd.read_excel(os.path.join(MASTER_PATH, 'DataWMask.xlsx'))
print(f'Data loaded successfully. {len(df)} records found.')
print(df.head())

# Replace backslashes with forward slashes in fullPath
df['fullPath'] = df['fullPath'].str.replace('\\', '/', regex=False)
# Ensure fullPath is a string
df['fullPath'] = df['fullPath'].astype(str)

# Filter out mask files (keep only original images)
df_images = df[~df['fileName'].str.contains('Mask', na=False)]
print(f'Filtered images: {len(df_images)} records remaining after removing masks.')

# Extract patiend ID from fileName (format: C_{patient_id}_1_LATERALITY_VIEW.jpg)
df_images['fileName'] = df_images['fileName'].astype(str).str.strip()
df_images['patient_id'] = df_images['fileName'].str.extract(r'\w_(\d+)_1')

# Create full image paths
#df_images['full_image_path'] = df_images['fullPath']

# Binary mapping: Cancer = 1, Benign and Normal = 0
def create_binary_labels(status):
    if status == 'Cancer':
        return 1
    else: # Benign or Normal
        return 0

df_images['binary_label'] = df_images['Status'].apply(create_binary_labels)
print(df_images.tail())

# Remove any rows with missing labels
df_images = df_images.dropna(subset=['binary_label', 'patient_id'])
print(f'Final dataset size after removing missing labels: {len(df_images)} records.')

print(f"Total images: {len(df_images)}")
print(f"Total unique patients: {df_images['patient_id'].nunique()}")
print(f"Original class distribution:\n{df_images['Status'].value_counts()}")
print(f"\nBinary class distribution:")
print(f"Non-Cancer (Benign + Normal): {len(df_images[df_images['binary_label'] == 0])}")
print(f"Cancer: {len(df_images[df_images['binary_label'] == 1])}")

Data loaded successfully. 7808 records found.
                             fullPath                fileName View   Side  \
0    Benign\0029\C_0029_1.LEFT_CC.jpg    C_0029_1.LEFT_CC.jpg   CC   LEFT   
1   Benign\0029\C_0029_1.LEFT_MLO.jpg   C_0029_1.LEFT_MLO.jpg  MLO   LEFT   
2   Benign\0029\C_0029_1.RIGHT_CC.jpg   C_0029_1.RIGHT_CC.jpg   CC  RIGHT   
3  Benign\0029\C_0029_1.RIGHT_MLO.jpg  C_0029_1.RIGHT_MLO.jpg  MLO  RIGHT   
4    Benign\0033\C_0033_1.LEFT_CC.jpg    C_0033_1.LEFT_CC.jpg   CC   LEFT   

   Status                          Tumour_Contour Tumour_Contour2   Age  \
0  Benign   Benign\0029\C_0029_1.LEFT_CC_Mask.jpg               -  66.0   
1  Benign  Benign\0029\C_0029_1.LEFT_MLO_Mask.jpg               -  66.0   
2  Benign                                       -               -  66.0   
3  Benign                                       -               -  66.0   
4  Benign                                       -               -  60.0   

   Density  
0        3  
1        3  
2

### 3. Patient-based Train/Val/Test split

In [None]:
def patient_based_split(df_images, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_state=42):
    """
    Split data by patient ID to avoid data leakage.
    Args:
        df_images (pd.DataFrame): DataFrame containing image data with 'patient_id' and 'Status'.
        train_ratio (float): Proportion of data to use for training.
        val_ratio (float): Proportion of data to use for validation.
        test_ratio (float): Proportion of data to use for testing.
        random_state (int): Random seed for reproducibility.
    Returns:
        tuple: DataFrames for train, validation, and test sets.
    """
    # Get unique patients with their status
    patient_info = df_images.groupby('patient_id')['Status'].first().reset_index()
    patient_info['binary_label'] = patient_info['Status'].apply(create_binary_labels)

    unique_patients = patient_info['patient_id'].unique()

    # Shuffle patients
    np.random.seed(random_state)
    np.random.shuffle(unique_patients)

    # Calculate split sizes
    total_patients = len(unique_patients)
    train_size = int(train_ratio * total_patients)
    val_size = int(val_ratio * total_patients)

    # Split patient IDs
    train_patients = unique_patients[:train_size]
    val_patients = unique_patients[train_size:train_size+val_size]
    test_patients = unique_patients[train_size+val_size:]

    print(f"\nPatient distribution:")
    print(f"Train patients: {len(train_patients)}")
    print(f"Validation patients: {len(val_patients)}")
    print(f"Test patients: {len(test_patients)}")

    # Assign split labels to all images based on patient ID
    def assign_split(patient_id):
        if patient_id in train_patients:
            return 'train'
        elif patient_id in val_patients:
            return 'val'
        else:
            return 'test'

    df_images['split'] = df_images['patient_id'].apply(assign_split)

    # Create separate dataframes
    train_df = df_images[df_images['split'] == 'train'].copy()
    val_df = df_images[df_images['split'] == 'val'].copy()
    test_df = df_images[df_images['split'] == 'test'].copy()

    # Print detailed statistics
    print(f"\nImage distribution:")
    print(f"Train images: {len(train_df)} (Cancer: {sum(train_df['binary_label'])}, Non-Cancer: {len(train_df) - sum(train_df['binary_label'])})")
    print(f"Val images: {len(val_df)} (Cancer: {sum(val_df['binary_label'])}, Non-Cancer: {len(val_df) - sum(val_df['binary_label'])})")
    print(f"Test images: {len(test_df)} (Cancer: {sum(test_df['binary_label'])}, Non-Cancer: {len(test_df) - sum(test_df['binary_label'])})")

    # Verify no patient leakage
    train_patients_set = set(train_df['patient_id'].unique())
    val_patients_set = set(val_df['patient_id'].unique())
    test_patients_set = set(test_df['patient_id'].unique())

    assert len(train_patients_set.intersection(val_patients_set)) == 0, "Patient leakage between train and val!"
    assert len(train_patients_set.intersection(test_patients_set)) == 0, "Patient leakage between train and test!"
    assert len(val_patients_set.intersection(test_patients_set)) == 0, "Patient leakage between val and test!"

    print("✓ No patient leakage detected!")

    return train_df, val_df, test_df

# Execute the split
train_df, val_df, test_df = patient_based_split(df_images)


Patient distribution:
Train patients: 1184
Validation patients: 253
Test patients: 255

Image distribution:
Train images: 5480 (Cancer: 1880, Non-Cancer: 3600)
Val images: 1144 (Cancer: 412, Non-Cancer: 732)
Test images: 1184 (Cancer: 424, Non-Cancer: 760)
✓ No patient leakage detected!


### 4. Data loading function

In [None]:
def load_and_preprocess_data(df, preprocess_image_func):
    """
    Load images and apply preprocessing for a specific split
    Args:
        df (pd.DataFrame): DataFrame containing image paths and labels.
        preprocess_image_func (function): Function to preprocess images.
    Returns:
        tuple: Numpy arrays of images, labels, and patient IDs.
    """
    images = []
    labels = []
    patient_ids = []

    for idx, row in df.iterrows():
        try:
            # Load image
            img_path = os.path.join(MASTER_PATH, row['fullPath'])
            if os.path.exists(img_path):
                image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

                # Apply preprocessing
                processed_image = preprocess_image_func(image)

                images.append(processed_image)
                labels.append(row['binary_label'])
                patient_ids.append(row['patient_id'])
            else:
                print(f"Warning: Image not found at {img_path}")

        except Exception as e:
            print(f"Error processing {row['fileName']}: {str(e)}")

    return np.array(images), np.array(labels), patient_ids


### 5. Create Model

In [None]:
def create_binary_model(input_shape):
    """
    Create CNN model for binary classification (Cancer vs Non-Cancer)
    """
    model = Sequential()

    model.add(Conv2D(16, (3, 3), activation='relu', input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(3, 3))

    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(3, 3))

    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(3, 3))

    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))  # Single output for binary classification

    return model

### 6. Main Training Pipeline

In [None]:
def train_mammography_classifier(train_df, val_df, test_df, preprocess_image_func):
    """
    Complete binary classification training pipeline with patient-based splits
    Args:
        train_df (pd.DataFrame): DataFrame for training set.
        val_df (pd.DataFrame): DataFrame for validation set.
        test_df (pd.DataFrame): DataFrame for test set.
        preprocess_image_func (function): Function to preprocess images.
    Returns:
        tuple: Trained model, training history, and test data.
    """
    # Load and preprocess data for each split
    print("Loading and preprocessing images...")
    X_train, y_train, train_patient_ids = load_and_preprocess_data(train_df, preprocess_image_func)
    X_val, y_val, val_patient_ids = load_and_preprocess_data(val_df, preprocess_image_func)
    X_test, y_test, test_patient_ids = load_and_preprocess_data(test_df, preprocess_image_func)

    print(f"Training samples: {len(X_train)} from {len(set(train_patient_ids))} patients")
    print(f"Validation samples: {len(X_val)} from {len(set(val_patient_ids))} patients")
    print(f"Test samples: {len(X_test)} from {len(set(test_patient_ids))} patients")

    # Create model
    input_shape = X_train.shape[1:]
    model = create_binary_model(input_shape)

    # Compile model
    optimizer = optimizers.Adam(learning_rate=0.001)
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy', 'recall', 'precision']
    )

    print("\nModel Summary:")
    model.summary()

    # Data augmentation
    datagen = ImageDataGenerator(
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        fill_mode='nearest'
    )

    # Callbacks
    es = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    plateau = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=0.00001
    )

    # Train model
    print("Starting training...")
    history = model.fit(
        # datagen.flow(X_train, y_train, batch_size=64),
        X_train, y_train, batch_size=64,
        epochs=30,
        # steps_per_epoch=len(X_train) // 32,
        callbacks=[es, plateau],
        validation_data=(X_val, y_val),
        verbose=1
    )

    return model, history, X_train, y_train, X_val, y_val, X_test, y_test

### 7. Model Evaluation function

In [None]:
def evaluate_final_model(model, X_test, y_test):
    """
    Evaluate the trained model on the test set
    Args:
        model (tf.keras.Model): Trained model.
        X_test (np.ndarray): Test images.
        y_test (np.ndarray): Test labels.
    Returns:
        None
    """
    # Make predictions
    y_pred_prob = model.predict(X_test)
    y_pred = (y_pred_prob > 0.5).astype(int).flatten()

    # Print classification report
    print("\nFinal Test Set Evaluation:")
    print("="*50)
    target_names = ['Non-Cancer (Benign+Normal)', 'Cancer']
    print(classification_report(y_test, y_pred, target_names=target_names))

    # Confusion matrix
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print("[[True Non-Cancer, False Cancer]")
    print(" [False Non-Cancer, True Cancer]]")

    # Calculate medical metrics
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # Recall for cancer detection
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  # Recall for non-cancer detection
    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0  # Positive Predictive Value
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0  # Negative Predictive Value

    print(f"\nMedical Performance Metrics:")
    print(f"Sensitivity (Cancer Detection Rate): {sensitivity:.3f}")
    print(f"Specificity (Non-Cancer Detection Rate): {specificity:.3f}")
    print(f"Positive Predictive Value (PPV): {ppv:.3f}")
    print(f"Negative Predictive Value (NPV): {npv:.3f}")


### 8. Execute pipeline and evaluate on test set

In [9]:
!pip install tensorflow-metal



In [10]:
import tensorflow as tf
import sys

print("Python version:", sys.version)
print("TensorFlow version:", tf.__version__)
print("Physical devices:", tf.config.list_physical_devices())
print("GPU devices:", tf.config.list_physical_devices('GPU'))

# Check if tensorflow-metal is installed
try:
    import tensorflow_metal
    print("tensorflow-metal is installed")
except ImportError:
    print("tensorflow-metal is NOT installed")


Python version: 3.10.6 (main, Jun  2 2025, 11:35:41) [Clang 17.0.0 (clang-1700.0.13.5)]
TensorFlow version: 2.16.2
Physical devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
tensorflow-metal is NOT installed


In [None]:
with tf.device('/GPU:O'):
    # Execute the pipeline
    model, history, X_train, y_train, X_val, y_val, X_test, y_test = train_mammography_classifier(train_df, val_df, test_df, Preprocessor.preprocess_image)

    # Evaluate the final model
    evaluate_final_model(model, X_test, y_test)