## Phase 1: Data Understanding & Preparation
### Step 1: Dataset Exploration

In [None]:
import os
import cv2
import warnings

In [20]:
# 1) Root folder of the dataset (machine-specific absolute path - edit for your setup)
data_dir = "C:/Users/People/Desktop/PPLafrica/Specialisation/AI-SoftwareDev/week4-intelSoft/assignment/src/predictive_analytics/data/complete_set"

# 2) Build the train/test sub-paths
# TRAIN_PATH is the training root that the later cells hand to
# flow_from_directory and join 'benign' / 'malignant' onto.
# NOTE(review): if TRAIN_PATH is also assigned in another cell, confirm both
# point at the same folder.
TRAIN_PATH             = os.path.join(data_dir, 'training_set')
training_benign_dir    = os.path.join(TRAIN_PATH, 'benign')
training_malignant_dir = os.path.join(TRAIN_PATH, 'malignant')
testing_data_dir       = os.path.join(data_dir, 'testing_set')

### Step 2: Image Visualization & Analysis

In [None]:
def visualize_samples():
    """Show up to five sample images per class in a 2x5 grid and save it.

    Images are read from the ``benign`` and ``malignant`` sub-folders of the
    module-level ``TRAIN_PATH``. Files are taken in sorted name order so the
    figure is reproducible, and entries OpenCV cannot decode (sub-folders,
    hidden files, corrupt images) are skipped instead of crashing.

    Side effects: writes ``sample_images.png`` to the working directory and
    displays the figure.
    """
    samples_per_class = 5
    # (sub-folder name, title prefix) for each row of the grid
    class_rows = (("benign", "Benign"), ("malignant", "Malignant"))

    _, axes = plt.subplots(len(class_rows), samples_per_class, figsize=(15, 8))

    for row, (folder, label) in enumerate(class_rows):
        # Hide every axis up front so cells left empty (fewer than five
        # readable images) do not show stray ticks.
        for ax in axes[row]:
            ax.axis('off')

        class_path = os.path.join(TRAIN_PATH, folder)
        shown = 0
        for file in sorted(os.listdir(class_path)):
            if shown == samples_per_class:
                break

            img = cv2.imread(os.path.join(class_path, file))
            if img is None:
                # cv2.imread signals "not a readable image" with None
                continue

            # OpenCV loads BGR; matplotlib expects RGB
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            axes[row, shown].imshow(img)
            axes[row, shown].set_title(f"{label} {shown+1}")
            shown += 1

    plt.tight_layout()
    plt.savefig('sample_images.png', dpi=300, bbox_inches='tight')
    plt.show()

# Render the sample grid once; this displays the figure and writes sample_images.png
visualize_samples()

### Step 3: Image Preprocessing & Data Pipeline

In [None]:
# Image parameters
IMG_SIZE = 224  # Standard for transfer learning models
BATCH_SIZE = 32
EPOCHS = 50
VALIDATION_SPLIT = 0.2  # 20% for validation

# NOTE(review): every backbone is fed images rescaled to [0, 1]; the Keras
# application models each ship their own preprocess_input - confirm that this
# shared scaling is what is intended for all of them.

# Training data generator with augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    zoom_range=0.2,
    validation_split=VALIDATION_SPLIT
)

# Validation data generator: rescale only. Augmenting validation images makes
# val_loss noisy and non-repeatable, which misleads EarlyStopping /
# ReduceLROnPlateau and any later evaluation. It must use the same
# validation_split as train_datagen so the two subsets stay disjoint
# (the split is taken by position in the per-class file listing).
validation_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT
)

# Training generator
train_generator = train_datagen.flow_from_directory(
    TRAIN_PATH,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='binary',
    subset='training',
    seed=42
)

# Validation generator
# shuffle=False keeps batches in file order, so predictions made from this
# generator line up with validation_generator.classes.
validation_generator = validation_datagen.flow_from_directory(
    TRAIN_PATH,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    class_mode='binary',
    subset='validation',
    shuffle=False,
    seed=42
)

# Test generator (no augmentation)
test_datagen = ImageDataGenerator(rescale=1./255)

print(f"Training samples: {train_generator.samples}")
print(f"Validation samples: {validation_generator.samples}")
print(f"Class indices: {train_generator.class_indices}")

## Phase 2: Model Architecture & Training
### Step 4: Transfer Learning Implementation

In [None]:
def create_transfer_learning_model(base_model_name='VGG16'):
    """Build a binary classifier on top of a frozen ImageNet backbone.

    Args:
        base_model_name: Name of the ``tf.keras.applications`` backbone to
            use; one of ``'VGG16'``, ``'ResNet50'`` or ``'EfficientNetB0'``.

    Returns:
        An uncompiled ``Sequential`` model: the frozen backbone followed by
        global average pooling, dropout, a 128-unit ReLU layer, dropout and
        a single sigmoid output unit.

    Raises:
        ValueError: If ``base_model_name`` is not one of the supported names.
    """
    supported_backbones = ('VGG16', 'ResNet50', 'EfficientNetB0')

    # Fail early with a clear message; an unknown name would otherwise only
    # surface later as an unbound ``base_model`` variable.
    if base_model_name not in supported_backbones:
        raise ValueError(
            f"Unsupported base model {base_model_name!r}; "
            f"expected one of {supported_backbones}"
        )

    # All supported backbones take the same constructor arguments.
    backbone_factory = getattr(tf.keras.applications, base_model_name)
    base_model = backbone_factory(
        weights='imagenet',
        include_top=False,
        input_shape=(IMG_SIZE, IMG_SIZE, 3)
    )

    # Freeze base model
    base_model.trainable = False

    # Add custom head
    model = models.Sequential([
        base_model,
        layers.GlobalAveragePooling2D(),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')  # Binary classification
    ])

    return model

# Train one frozen-backbone classifier per candidate architecture and keep
# each fitted model together with its training history.
models_to_try = ['VGG16', 'ResNet50', 'EfficientNetB0']
trained_models = {}

for model_name in models_to_try:
    print(f"\n{'='*50}")
    print(f"Training {model_name}")
    print(f"{'='*50}")

    model = create_transfer_learning_model(model_name)

    # Compile model
    # Precision/Recall are passed as metric objects because the plain string
    # aliases are not accepted by every Keras version; the explicit names keep
    # the history keys identical ('precision', 'recall') for every model.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall')
        ]
    )

    # Callbacks: stop when val_loss stalls, decay the learning rate on
    # plateaus, and keep the best checkpoint on disk.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=5,
            min_lr=1e-7
        ),
        tf.keras.callbacks.ModelCheckpoint(
            f'best_model_{model_name.lower()}.h5',
            monitor='val_loss',
            save_best_only=True
        )
    ]

    # Train model
    # len(generator) is the number of batches including the final partial one,
    # so no samples are dropped and the step count is never zero for a
    # dataset smaller than one batch.
    history = model.fit(
        train_generator,
        steps_per_epoch=len(train_generator),
        validation_data=validation_generator,
        validation_steps=len(validation_generator),
        epochs=EPOCHS,
        callbacks=callbacks,
        verbose=1
    )

    trained_models[model_name] = {
        'model': model,
        'history': history
    }

### Step 5: Model Fine-tuning

In [None]:
def fine_tune_model(model, base_model_name):
    """Unfreeze the upper half of the backbone and continue training.

    Args:
        model: A model whose first layer (``model.layers[0]``) is the
            backbone network to fine-tune.
        base_model_name: Name of the backbone. Currently unused; kept so
            existing callers do not need to change.

    Returns:
        A ``(model, history_fine)`` tuple: the same model object, recompiled
        and further trained, and the ``History`` returned by ``fit``.
    """
    # Unfreeze some layers of the base model
    base_model = model.layers[0]
    base_model.trainable = True

    # Fine-tune from this layer onwards
    fine_tune_at = len(base_model.layers) // 2

    # Freeze all the layers before fine_tune_at
    for layer in base_model.layers[:fine_tune_at]:
        layer.trainable = False

    # Keep BatchNormalization layers frozen in the unfrozen half as well:
    # updating their statistics on a small dataset tends to undo what the
    # pretrained backbone learned (see the Keras fine-tuning guide).
    for layer in base_model.layers[fine_tune_at:]:
        if isinstance(layer, tf.keras.layers.BatchNormalization):
            layer.trainable = False

    # Recompile so the trainable changes take effect, with a much lower
    # learning rate than the initial training run.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall')
        ]
    )

    # Fine-tune: epoch numbering continues after the EPOCHS of the first phase
    fine_tune_epochs = 10
    total_epochs = EPOCHS + fine_tune_epochs

    history_fine = model.fit(
        train_generator,
        # len(generator) counts the final partial batch, so no samples are
        # dropped and the step count is never zero.
        steps_per_epoch=len(train_generator),
        validation_data=validation_generator,
        validation_steps=len(validation_generator),
        epochs=total_epochs,
        initial_epoch=EPOCHS,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=5,
                restore_best_weights=True
            )
        ]
    )

    return model, history_fine

## Phase 3: Model Evaluation & Analysis
### Step 6: Comprehensive Evaluation

In [None]:
def evaluate_model(model, model_name):
    """Evaluate a binary classifier on the module-level validation generator.

    Predictions and labels are collected batch by batch from the same
    generator batches, so they stay paired even if the generator shuffles.

    Args:
        model: Trained model exposing ``predict_on_batch``; expected to
            output one probability per image.
        model_name: Label used in the printed report, the plot title and the
            confusion-matrix file name.

    Returns:
        Dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``,
        ``'f1'`` and ``'auc'``.

    Raises:
        ValueError: If the validation generator yields no batches.

    Side effects: prints the metrics, shows the confusion-matrix heatmap and
    saves it as ``confusion_matrix_<model_name lowercased>.png``.
    """
    import numpy as np
    from sklearn.metrics import (
        accuracy_score,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    # Predictions on validation set
    validation_generator.reset()
    num_batches = len(validation_generator)
    if num_batches == 0:
        raise ValueError("validation_generator yielded no batches; nothing to evaluate")

    batch_scores = []
    batch_labels = []
    for batch_index in range(num_batches):
        # Each indexed batch returns its images together with their labels.
        images, labels = validation_generator[batch_index]
        batch_scores.append(np.asarray(model.predict_on_batch(images)).reshape(-1))
        batch_labels.append(np.asarray(labels).reshape(-1))

    predictions = np.concatenate(batch_scores)
    predicted_classes = (predictions > 0.5).astype(int)

    # True labels, in the same order as the predictions
    true_classes = np.concatenate(batch_labels).astype(int)

    # Metrics
    accuracy = accuracy_score(true_classes, predicted_classes)
    precision = precision_score(true_classes, predicted_classes)
    recall = recall_score(true_classes, predicted_classes)
    f1 = f1_score(true_classes, predicted_classes)
    auc = roc_auc_score(true_classes, predictions)

    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")

    # Confusion Matrix
    cm = confusion_matrix(true_classes, predicted_classes)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(f'confusion_matrix_{model_name.lower()}.png', dpi=300, bbox_inches='tight')
    plt.show()

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc
    }

# Run the evaluation for every trained model and collect the metric
# dictionaries under the model's name.
results = {}
for model_name in trained_models:
    model_data = trained_models[model_name]
    fitted_model = model_data['model']
    results[model_name] = evaluate_model(fitted_model, model_name)