In [None]:
# ==========================================
# Medical Diagnosis ML Pipeline
# Breast Cancer Classification
# ==========================================

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score

# ----------------------------
# 1. Load Dataset
# ----------------------------
# Pull scikit-learn's bundled breast cancer dataset, then separate the label
# vector from the feature matrix. Features go into a DataFrame so the columns
# carry their feature names.
data = load_breast_cancer()
y = data.target
X = pd.DataFrame(data=data.data, columns=data.feature_names)

# Quick sanity check of (n_samples, n_features).
print("Dataset shape:", X.shape)

# ----------------------------
# 2. Train-Test Split
# ----------------------------
# Hold out 20% of the rows as a test split. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# ----------------------------
# 3. Define Models as Pipelines
# ----------------------------
def _make_pipeline(estimator, n_components=10):
    """Wrap *estimator* in the shared scale -> PCA -> model pipeline.

    Args:
        estimator: Unfitted classifier placed in the final "model" step.
        n_components: Number of principal components kept by the PCA step.

    Returns:
        A new, unfitted ``Pipeline`` with steps "scaler", "pca" and "model".
    """
    return Pipeline([
        ("scaler", StandardScaler()),
        # random_state only matters for PCA's randomized/arpack solvers; it is
        # pinned so results stay repeatable whichever solver gets selected.
        ("pca", PCA(n_components=n_components, random_state=42)),
        ("model", estimator),
    ])


# Candidate classifiers, keyed by display name. Every candidate gets its own
# scaler and PCA instances so no fitted state is shared between models.
models = {
    "Logistic Regression": _make_pipeline(LogisticRegression(max_iter=5000)),
    "Random Forest": _make_pipeline(
        RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)
    ),
    # probability=True fits an internal cross-validated calibration that
    # shuffles the data, so it is seeded like the other models to keep
    # predict_proba (and the test AUC derived from it) repeatable across runs.
    "Support Vector Machine": _make_pipeline(
        SVC(probability=True, random_state=42)
    ),
}

# ----------------------------
# 4. Cross Validation
# ----------------------------
# Seeded, shuffled 5-fold stratified splitter shared by every model so that
# all candidates are scored on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One summary row (dict) per model, collected for the comparison table.
results = []

for name, pipeline in models.items():
    # Cross-validated accuracy and ROC AUC, computed on the training split only.
    cv_scores = cross_validate(
        pipeline, X_train, y_train, cv=cv, scoring=["accuracy", "roc_auc"]
    )

    # Fit on the whole training split, then evaluate on the held-out test split.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Second column of predict_proba is used as the score for the AUC.
    y_prob = pipeline.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_prob)

    results.append(
        {
            "Model": name,
            "CV Accuracy Mean": cv_scores["test_accuracy"].mean(),
            "CV AUC Mean": cv_scores["test_roc_auc"].mean(),
            "Test AUC": test_auc,
        }
    )

    # Banner with the model name, followed by the per-class test report.
    print(
        "\n============================",
        name,
        "============================",
        sep="\n",
    )
    print(classification_report(y_test, y_pred))

# ----------------------------
# 5. Results Table
# ----------------------------
# Tabulate the per-model summary rows and show them best test AUC first.
results_df = pd.DataFrame(results)
print(
    "\nFinal Model Comparison:",
    results_df.sort_values("Test AUC", ascending=False),
    sep="\n",
)

Dataset shape: (569, 30)

Logistic Regression
              precision    recall  f1-score   support

           0       0.95      0.98      0.96        42
           1       0.99      0.97      0.98        72

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


Random Forest
              precision    recall  f1-score   support

           0       0.91      0.93      0.92        42
           1       0.96      0.94      0.95        72

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114


Support Vector Machine
              precision    recall  f1-score   support

           0       0.95      0.98      0.96        42
           1       0.99      0.97      0.98        72

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97