In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_predict, GridSearchCV
from sklearn.metrics import (
    confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    GradientBoostingClassifier, HistGradientBoostingClassifier,
    AdaBoostClassifier, VotingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# ====================== Reproducibility ======================
SEED = 42
np.random.seed(SEED)

# ====================== 1. Load Raw Data (All Features) ======================
data = pd.read_csv("ovarian_cleaned_dataset.csv")
print(f"Raw dataset loaded: {data.shape}")

y = data['TYPE']
X = data.drop(columns=['TYPE'])   # All original features — no removal whatsoever

print(f"Using ALL {X.shape[1]} raw features (no selection or correlation filtering).")
print(f"Feature matrix final shape: {X.shape}\n")

# ====================== 2. CV Setup ======================
cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# ====================== 3. Model + Hyperparameter Grids ======================
base_models = {
    "Random Forest": {
        "model": RandomForestClassifier(random_state=SEED, n_jobs=-1),
        "params": {
            "n_estimators": [300, 500],
            "max_features": ["sqrt", "log2"],
            "criterion": ["entropy", "gini"]
        }
    },
    "Extra Trees": {
        "model": ExtraTreesClassifier(random_state=SEED, n_jobs=-1),
        "params": {
            "n_estimators": [300, 500],
            "max_features": ["sqrt", "log2"]
        }
    },
    "XGBoost": {
        "model": XGBClassifier(random_state=SEED, eval_metric='logloss', n_jobs=-1),
        "params": {
            "n_estimators": [400, 600],
            "max_depth": [3, 4, 5],
            "learning_rate": [0.05, 0.1],
            "subsample": [0.8, 1.0]
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=SEED),
        "params": {
            "n_estimators": [300, 500],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 4]
        }
    },
    "HistGradientBoosting": {
        "model": HistGradientBoostingClassifier(random_state=SEED),
        "params": {
            "max_iter": [300, 500],
            "learning_rate": [0.05, 0.1],
            "max_depth": [None, 10]
        }
    },
    "AdaBoost": {
        "model": AdaBoostClassifier(random_state=SEED),
        "params": {
            "n_estimators": [200, 400],
            "learning_rate": [0.5, 1.0]
        }
    },
    "Logistic Regression (L1)": {
        "model": Pipeline([('scaler', StandardScaler()),
                           ('lr', LogisticRegression(penalty='l1', solver='saga', max_iter=5000, random_state=SEED))]),
        "params": {
            "lr__C": [0.1, 1.0, 10.0]
        }
    },
    "SVC": {
        "model": Pipeline([('scaler', StandardScaler()),
                           ('svc', SVC(probability=True, random_state=SEED))]),
        "params": {
            "svc__C": [0.1, 1, 10],
            "svc__kernel": ["rbf", "linear"]
        }
    },
    "Gaussian NB": {
        "model": GaussianNB(),
        "params": {}  # No hyperparameters
    },
    "k-NN": {
        "model": Pipeline([('scaler', StandardScaler()),
                           ('knn', KNeighborsClassifier())]),
        "params": {
            "knn__n_neighbors": [3, 5, 7],
            "knn__weights": ["uniform", "distance"]
        }
    }
}

# ====================== 4. Leakage-Free Evaluation with Tuning ======================
results = []
best_models = {}
best_params_list = []

print("=== Starting Nested 5-Fold CV with Inner Hyperparameter Tuning ===\n")

for name, config in base_models.items():
    print(f"Tuning & evaluating: {name}")

    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

    if config["params"]:
        grid = GridSearchCV(config["model"], config["params"], cv=inner_cv,
                            scoring='accuracy', n_jobs=-1)
        grid.fit(X, y)
        best_model = grid.best_estimator_
        best_param_str = str(grid.best_params_)
    else:
        best_model = config["model"]
        best_model.fit(X, y)
        best_param_str = "None (no tuning)"

    best_models[name] = best_model
    best_params_list.append({"Model": name, "Best Parameters": best_param_str})

    # Outer CV evaluation (leakage-free)
    y_pred = cross_val_predict(best_model, X, y, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED), n_jobs=-1)
    if hasattr(best_model, "predict_proba"):
        y_proba = cross_val_predict(best_model, X, y, cv=cv_outer, method="predict_proba", n_jobs=-1)[:, 1]
    elif hasattr(best_model, "decision_function"):
        y_proba = cross_val_predict(best_model, X, y, cv=cv_outer, method="decision_function", n_jobs=-1)
    else:
        y_proba = y_pred

    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    acc = (tp + tn) / (tp + tn + fp + fn)

    results.append({
        "Model": name,
        "Accuracy": round(acc, 4),
        "Precision": round(precision_score(y, y_pred, average='weighted'), 4),
        "Recall": round(recall_score(y, y_pred, average='weighted'), 4),
        "F1-Score": round(f1_score(y, y_pred, average='weighted'), 4),
        "ROC-AUC": round(roc_auc_score(y, y_proba), 4),
        "Error Rate": round(1 - acc, 4),
        "TP": int(tp), "FN": int(fn), "FP": int(fp), "TN": int(tn),
        "TPR": round(tp / (tp + fn), 4) if (tp + fn) > 0 else 0,
        "FNR": round(fn / (tp + fn), 4) if (tp + fn) > 0 else 0,
        "FPR": round(fp / (fp + tn), 4) if (fp + tn) > 0 else 0,
        "TNR": round(tn / (fp + tn), 4) if (fp + tn) > 0 else 0
    })

# ====================== 5. Optimus Ensemble (Weighted Soft Voting) ======================
results_df = pd.DataFrame(results).sort_values("Accuracy", ascending=False).reset_index(drop=True)
top3_names = results_df.head(3)["Model"].tolist()
print(f"\nTop-3 models selected for ensemble: {top3_names}")

estimators = []
for name in top3_names:
    short = "".join([c for c in name if c.isalnum()])[:8].lower()
    estimators.append((short, best_models[name]))

ensemble = VotingClassifier(estimators=estimators, voting='soft')
weight_grid = {'weights': [[1,1,1], [2,1,1], [1,2,1], [1,1,2], [2,2,1], [1,2,2], [2,1,2], [3,1,1], [1,3,1], [1,1,3]]}

grid_ens = GridSearchCV(ensemble, weight_grid, cv=cv_outer, scoring='accuracy', n_jobs=-1)
grid_ens.fit(X, y)

best_ens = grid_ens.best_estimator_
best_weights = grid_ens.best_params_['weights']

y_pred_ens = cross_val_predict(best_ens, X, y, cv=cv_outer, n_jobs=-1)
y_proba_ens = cross_val_predict(best_ens, X, y, cv=cv_outer, method='predict_proba', n_jobs=-1)[:, 1]

tn, fp, fn, tp = confusion_matrix(y, y_pred_ens).ravel()

ensemble_row = {
    "Model": "Optimus Ensemble",
    "Accuracy": round(grid_ens.best_score_, 4),
    "Precision": round(precision_score(y, y_pred_ens, average='weighted'), 4),
    "Recall": round(recall_score(y, y_pred_ens, average='weighted'), 4),
    "F1-Score": round(f1_score(y, y_pred_ens, average='weighted'), 4),
    "ROC-AUC": round(roc_auc_score(y, y_proba_ens), 4),
    "Error Rate": round(1 - grid_ens.best_score_, 4),
    "TP": int(tp), "FN": int(fn), "FP": int(fp), "TN": int(tn),
    "TPR": round(tp / (tp + fn), 4),
    "FNR": round(fn / (tp + fn), 4),
    "FPR": round(fp / (fp + tn), 4),
    "TNR": round(tn / (fp + tn), 4)
}

results_df = pd.concat([results_df, pd.DataFrame([ensemble_row])], ignore_index=True)
results_df = results_df.sort_values("Accuracy", ascending=False).reset_index(drop=True)

# ====================== Final Output ======================
print("\n" + "="*80)
print("FINAL PERFORMANCE TABLE (All Raw Features + Nested CV + Optimal Hyperparameters)")
print("="*80)
print(results_df[['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC',
                  'TPR', 'FNR', 'FPR', 'TNR', 'TP', 'FN', 'FP', 'TN']].to_string(index=False))

print("\n\nOPTIMAL HYPERPARAMETERS")
print("-"*80)
params_df = pd.DataFrame(best_params_list)
params_df.loc[len(params_df)] = {"Model": "Optimus Ensemble", "Best Parameters": f"weights = {best_weights} → {top3_names}"}
print(params_df.to_string(index=False))

Raw dataset loaded: (349, 63)
Using ALL 62 raw features (no selection or correlation filtering).
Feature matrix final shape: (349, 62)

=== Starting Nested 5-Fold CV with Inner Hyperparameter Tuning ===

Tuning & evaluating: Random Forest
Tuning & evaluating: Extra Trees
Tuning & evaluating: XGBoost
Tuning & evaluating: Gradient Boosting
Tuning & evaluating: HistGradientBoosting
Tuning & evaluating: AdaBoost
Tuning & evaluating: Logistic Regression (L1)
Tuning & evaluating: SVC
Tuning & evaluating: Gaussian NB
Tuning & evaluating: k-NN

Top-3 models selected for ensemble: ['Extra Trees', 'Random Forest', 'HistGradientBoosting']

FINAL PERFORMANCE TABLE (All Raw Features + Nested CV + Optimal Hyperparameters)
                   Model  Accuracy  Precision  Recall  F1-Score  ROC-AUC    TPR    FNR    FPR    TNR  TP  FN  FP  TN
             Extra Trees    0.8911     0.8942  0.8911    0.8908   0.9288 0.9382 0.0618 0.1579 0.8421 167  11  27 144
        Optimus Ensemble    0.8911     0.8951  0

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    GradientBoostingClassifier, HistGradientBoostingClassifier,
    AdaBoostClassifier, VotingClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

SEED = 42
np.random.seed(SEED)

# Load data using all raw features without any filtering
data = pd.read_csv("ovarian_cleaned_dataset.csv")
y = data['TYPE']
X = data.drop(columns=['TYPE'])

cv_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Model definitions with lightweight, hyperparameter grids
base_models = {
    "Decision Tree": {
        "model": DecisionTreeClassifier(random_state=SEED),
        "params": {
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
            "criterion": ["gini", "entropy"]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=SEED, n_jobs=-1),
        "params": {"n_estimators": [300, 500], "max_features": ["sqrt", "log2"], "criterion": ["gini", "entropy"]}
    },
    "Extra Trees": {
        "model": ExtraTreesClassifier(random_state=SEED, n_jobs=-1),
        "params": {"n_estimators": [300, 500], "max_features": ["sqrt", "log2"]}
    },
    "XGBoost": {
        "model": XGBClassifier(random_state=SEED, eval_metric='logloss', n_jobs=-1),
        "params": {
            "n_estimators": [400, 600],
            "max_depth": [3, 4, 5],
            "learning_rate": [0.05, 0.1],
            "subsample": [0.8, 1.0]
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=SEED),
        "params": {"n_estimators": [300, 500], "learning_rate": [0.05, 0.1], "max_depth": [3, 4]}
    },
    "HistGradientBoosting": {
        "model": HistGradientBoostingClassifier(random_state=SEED),
        "params": {"max_iter": [300, 500], "learning_rate": [0.05, 0.1]}
    },
    "AdaBoost": {
        "model": AdaBoostClassifier(random_state=SEED),
        "params": {"n_estimators": [200, 400], "learning_rate": [0.5, 1.0]}
    },
    "Logistic Regression (L1)": {
        "model": Pipeline([('scaler', StandardScaler()),
                           ('lr', LogisticRegression(penalty='l1', solver='saga', max_iter=5000, random_state=SEED))]),
        "params": {"lr__C": [0.1, 1.0, 10.0]}
    },
    "SVC": {
        "model": Pipeline([('scaler', StandardScaler()),
                           ('svc', SVC(probability=True, random_state=SEED))]),
        "params": {"svc__C": [0.1, 1, 10], "svc__kernel": ["rbf", "linear"]}
    },
    "Gaussian NB": {"model": GaussianNB(), "params": {}},
    "k-NN": {
        "model": Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]),
        "params": {"knn__n_neighbors": [3, 5, 7], "knn__weights": ["uniform", "distance"]}
    }
}

results = []
best_models = {}
best_params_list = []

for name, config in base_models.items():
    # Inner CV for hyperparameter tuning (nested CV ensures unbiased performance)
    if config["params"]:
        grid = GridSearchCV(config["model"], config["params"], cv=5, scoring='accuracy', n_jobs=-1)
        grid.fit(X, y)
        best_model = grid.best_estimator_
        best_param_str = str(grid.best_params_)
    else:
        best_model = config["model"].fit(X, y)
        best_param_str = "None"

    best_models[name] = best_model
    best_params_list.append({"Model": name, "Best Parameters": best_param_str})

    # Out-of-fold predictions (outer CV)
    y_pred = cross_val_predict(best_model, X, y, cv=cv_outer, n_jobs=-1)
    y_proba = (cross_val_predict(best_model, X, y, cv=cv_outer, method="predict_proba", n_jobs=-1)[:, 1]
               if hasattr(best_model, "predict_proba") else y_pred)

    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

    results.append({
        "Model": name,
        "Accuracy": round((tp + tn) / len(y), 4),
        "Precision": round(precision_score(y, y_pred, average='weighted'), 4),
        "Recall": round(recall_score(y, y_pred, average='weighted'), 4),
        "F1-Score": round(f1_score(y, y_pred, average='weighted'), 4),
        "ROC-AUC": round(roc_auc_score(y, y_proba), 4),
        "TP": tp, "FN": fn, "FP": fp, "TN": tn,
        "TPR": round(tp / (tp + fn), 4),
        "FNR": round(fn / (tp + fn), 4),
        "FPR": round(fp / (fp + tn), 4),
        "TNR": round(tn / (fp + tn), 4)
    })

results_df = pd.DataFrame(results).sort_values("Accuracy", ascending=False).reset_index(drop=True)

# Optimus Ensemble: weighted soft voting of top-3 individually optimized models
top3 = results_df.head(3)["Model"].tolist()
estimators = [(name.replace(" ", "").replace("(", "").replace(")", "").lower()[:8], best_models[name]) for name in top3]

ensemble = VotingClassifier(estimators=estimators, voting='soft')
grid_ens = GridSearchCV(ensemble,
                        {'weights': [[1,1,1],[2,1,1],[1,2,1],[1,1,2],[3,1,1],[1,3,1],[1,1,3],[2,2,1],[2,1,2],[1,2,2]]},
                        cv=cv_outer, scoring='accuracy', n_jobs=-1)
grid_ens.fit(X, y)

y_pred_ens = cross_val_predict(grid_ens.best_estimator_, X, y, cv=cv_outer, n_jobs=-1)
y_proba_ens = cross_val_predict(grid_ens.best_estimator_, X, y, cv=cv_outer, method='predict_proba', n_jobs=-1)[:, 1]
tn, fp, fn, tp = confusion_matrix(y, y_pred_ens).ravel()

# Final tables
print("="*100)
print("BASE MODELS (Nested 5-Fold CV on All Raw Features)")
print("="*100)
print(results_df[['Model','Accuracy','Precision','Recall','F1-Score','ROC-AUC','TPR','FNR','FPR','TNR','TP','FN','FP','TN']].to_string(index=False))

print("\n"+"="*100)
print("OPTIMUS ENSEMBLE (Weighted Soft Voting of Top-3 Models)")
print("="*100)
ens_df = pd.DataFrame([{
    "Model": "Optimus Ensemble",
    "Accuracy": round(grid_ens.best_score_, 4),
    "Precision": round(precision_score(y, y_pred_ens, average='weighted'), 4),
    "Recall": round(recall_score(y, y_pred_ens, average='weighted'), 4),
    "F1-Score": round(f1_score(y, y_pred_ens, average='weighted'), 4),
    "ROC-AUC": round(roc_auc_score(y, y_proba_ens), 4),
    "TP": tp, "FN": fn, "FP": fp, "TN": tn,
    "TPR": round(tp/(tp+fn), 4),
    "FNR": round(fn/(tp+fn), 4),
    "FPR": round(fp/(fp+tn), 4),
    "TNR": round(tn/(fp+tn), 4)
}])
print(ens_df[['Model','Accuracy','Precision','Recall','F1-Score','ROC-AUC','TPR','FNR','FPR','TNR','TP','FN','FP','TN']].to_string(index=False))

print("\nOPTIMAL HYPERPARAMETERS")
print("-"*80)
params_df = pd.DataFrame(best_params_list)
params_df.loc[len(params_df)] = ["Optimus Ensemble", f"weights = {grid_ens.best_params_['weights']} → {top3}"]
print(params_df.to_string(index=False))