In [2]:
import pandas as pd
data = pd.read_csv('/content/cleaned_dataset_Thyroid1.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3771 entries, 0 to 3770
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        3771 non-null   float64
 1   sex                        3771 non-null   float64
 2   on thyroxine               3771 non-null   int64  
 3   query on thyroxine         3771 non-null   int64  
 4   on antithyroid medication  3771 non-null   int64  
 5   sick                       3771 non-null   int64  
 6   pregnant                   3771 non-null   int64  
 7   thyroid surgery            3771 non-null   int64  
 8   I131 treatment             3771 non-null   int64  
 9   query hypothyroid          3771 non-null   int64  
 10  query hyperthyroid         3771 non-null   int64  
 11  lithium                    3771 non-null   int64  
 12  goitre                     3771 non-null   int64  
 13  tumor                      3771 non-null   int64

In [4]:
data.isnull().sum()

Unnamed: 0,0
age,0
sex,0
on thyroxine,0
query on thyroxine,0
on antithyroid medication,0
sick,0
pregnant,0
thyroid surgery,0
I131 treatment,0
query hypothyroid,0


In [5]:
# Features and target
X = data.drop(columns=["binaryClass"])
y = data["binaryClass"]


In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3771 entries, 0 to 3770
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        3771 non-null   float64
 1   sex                        3771 non-null   float64
 2   on thyroxine               3771 non-null   int64  
 3   query on thyroxine         3771 non-null   int64  
 4   on antithyroid medication  3771 non-null   int64  
 5   sick                       3771 non-null   int64  
 6   pregnant                   3771 non-null   int64  
 7   thyroid surgery            3771 non-null   int64  
 8   I131 treatment             3771 non-null   int64  
 9   query hypothyroid          3771 non-null   int64  
 10  query hyperthyroid         3771 non-null   int64  
 11  lithium                    3771 non-null   int64  
 12  goitre                     3771 non-null   int64  
 13  tumor                      3771 non-null   int64

LR

In [6]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = LogisticRegression(
        max_iter=1000,
        random_state=42,
        class_weight='balanced'
    )

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[682  14]
 [  4  55]]
Accuracy: 0.9762
Precision: 0.7971
Recall (Sensitivity): 0.9322
Specificity: 0.9799
F1-score: 0.8594
AUC-ROC: 0.9936
AUPRC: 0.9519

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[684  12]
 [ 10  48]]
Accuracy: 0.9708
Precision: 0.8000
Recall (Sensitivity): 0.8276
Specificity: 0.9828
F1-score: 0.8136
AUC-ROC: 0.9602
AUPRC: 0.8281

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[683  13]
 [  9  49]]
Accuracy: 0.9708
Precision: 0.7903
Recall (Sensitivity): 0.8448
Specificity: 0.9813
F1-score: 0.8167
AUC-ROC: 0.9516
AUPRC: 0.8798

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[681  15]
 [ 10  48]]
Accuracy: 0.9668
Precision: 0.7619
Recall (Sensitivity)

RF

In [7]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[691   5]
 [  0  59]]
Accuracy: 0.9934
Precision: 0.9219
Recall (Sensitivity): 1.0000
Specificity: 0.9928
F1-score: 0.9593
AUC-ROC: 0.9999
AUPRC: 0.9988

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[693   3]
 [  0  58]]
Accuracy: 0.9960
Precision: 0.9508
Recall (Sensitivity): 1.0000
Specificity: 0.9957
F1-score: 0.9748
AUC-ROC: 1.0000
AUPRC: 1.0000

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[689   7]
 [  0  58]]
Accuracy: 0.9907
Precision: 0.8923
Recall (Sensitivity): 1.0000
Specificity: 0.9899
F1-score: 0.9431
AUC-ROC: 0.9973
AUPRC: 0.9528

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[688   8]
 [  2  56]]
Accuracy: 0.9867
Precision: 0.8750
Recall (Sensitivity)

KNN

In [8]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = KNeighborsClassifier(n_neighbors=5)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[613  83]
 [ 25  34]]
Accuracy: 0.8570
Precision: 0.2906
Recall (Sensitivity): 0.5763
Specificity: 0.8807
F1-score: 0.3864
AUC-ROC: 0.8155
AUPRC: 0.3659

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[609  87]
 [ 29  29]]
Accuracy: 0.8462
Precision: 0.2500
Recall (Sensitivity): 0.5000
Specificity: 0.8750
F1-score: 0.3333
AUC-ROC: 0.7700
AUPRC: 0.2721

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[616  80]
 [ 33  25]]
Accuracy: 0.8501
Precision: 0.2381
Recall (Sensitivity): 0.4310
Specificity: 0.8851
F1-score: 0.3067
AUC-ROC: 0.7511
AUPRC: 0.2993

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[631  65]
 [ 26  32]]
Accuracy: 0.8793
Precision: 0.3299
Recall (Sensitivity)

GNB

In [9]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = GaussianNB()

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[275 421]
 [  4  55]]
Accuracy: 0.4371
Precision: 0.1155
Recall (Sensitivity): 0.9322
Specificity: 0.3951
F1-score: 0.2056
AUC-ROC: 0.6763
AUPRC: 0.1182

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[276 420]
 [  9  49]]
Accuracy: 0.4310
Precision: 0.1045
Recall (Sensitivity): 0.8448
Specificity: 0.3966
F1-score: 0.1860
AUC-ROC: 0.6259
AUPRC: 0.1020

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[262 434]
 [ 11  47]]
Accuracy: 0.4098
Precision: 0.0977
Recall (Sensitivity): 0.8103
Specificity: 0.3764
F1-score: 0.1744
AUC-ROC: 0.6135
AUPRC: 0.0999

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[300 396]
 [ 11  47]]
Accuracy: 0.4602
Precision: 0.1061
Recall (Sensitivity)

MLP

In [10]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = MLPClassifier(hidden_layer_sizes=(150, 100, 50), max_iter=1000, random_state=42)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[690   6]
 [  9  50]]
Accuracy: 0.9801
Precision: 0.8929
Recall (Sensitivity): 0.8475
Specificity: 0.9914
F1-score: 0.8696
AUC-ROC: 0.9887
AUPRC: 0.9443

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[693   3]
 [ 13  45]]
Accuracy: 0.9788
Precision: 0.9375
Recall (Sensitivity): 0.7759
Specificity: 0.9957
F1-score: 0.8491
AUC-ROC: 0.9642
AUPRC: 0.8752

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[682  14]
 [  6  52]]
Accuracy: 0.9735
Precision: 0.7879
Recall (Sensitivity): 0.8966
Specificity: 0.9799
F1-score: 0.8387
AUC-ROC: 0.9594
AUPRC: 0.9129

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[691   5]
 [  9  49]]
Accuracy: 0.9814
Precision: 0.9074
Recall (Sensitivity)

XGB

In [11]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[694   2]
 [  0  59]]
Accuracy: 0.9974
Precision: 0.9672
Recall (Sensitivity): 1.0000
Specificity: 0.9971
F1-score: 0.9833
AUC-ROC: 1.0000
AUPRC: 1.0000

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[696   0]
 [  0  58]]
Accuracy: 1.0000
Precision: 1.0000
Recall (Sensitivity): 1.0000
Specificity: 1.0000
F1-score: 1.0000
AUC-ROC: 1.0000
AUPRC: 1.0000

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[692   4]
 [  1  57]]
Accuracy: 0.9934
Precision: 0.9344
Recall (Sensitivity): 0.9828
Specificity: 0.9943
F1-score: 0.9580
AUC-ROC: 0.9981
AUPRC: 0.9480

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[696   0]
 [  2  56]]
Accuracy: 0.9973
Precision: 1.0000
Recall (Sensitivity)

ADB

In [12]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = AdaBoostClassifier(n_estimators=100, random_state=42)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[692   4]
 [  0  59]]
Accuracy: 0.9947
Precision: 0.9365
Recall (Sensitivity): 1.0000
Specificity: 0.9943
F1-score: 0.9672
AUC-ROC: 1.0000
AUPRC: 1.0000

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[695   1]
 [  1  57]]
Accuracy: 0.9973
Precision: 0.9828
Recall (Sensitivity): 0.9828
Specificity: 0.9986
F1-score: 0.9828
AUC-ROC: 0.9999
AUPRC: 0.9991

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[689   7]
 [  0  58]]
Accuracy: 0.9907
Precision: 0.8923
Recall (Sensitivity): 1.0000
Specificity: 0.9899
F1-score: 0.9431
AUC-ROC: 0.9969
AUPRC: 0.9429

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[688   8]
 [  3  55]]
Accuracy: 0.9854
Precision: 0.8730
Recall (Sensitivity)

GBC

In [62]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.001, max_depth=3, random_state=42)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[693   3]
 [  0  59]]
Accuracy: 0.9960
Precision: 0.9516
Recall (Sensitivity): 1.0000
Specificity: 0.9957
F1-score: 0.9752
AUC-ROC: 0.9993
AUPRC: 0.9833

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[694   2]
 [  0  58]]
Accuracy: 0.9973
Precision: 0.9667
Recall (Sensitivity): 1.0000
Specificity: 0.9971
F1-score: 0.9831
AUC-ROC: 1.0000
AUPRC: 1.0000

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[684  12]
 [  0  58]]
Accuracy: 0.9841
Precision: 0.8286
Recall (Sensitivity): 1.0000
Specificity: 0.9828
F1-score: 0.9062
AUC-ROC: 0.9958
AUPRC: 0.9095

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[690   6]
 [  3  55]]
Accuracy: 0.9881
Precision: 0.9016
Recall (Sensitivity)

ETC

In [66]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = ExtraTreesClassifier(
        n_estimators=500,
        max_depth=None,
        criterion="entropy",
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=None,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    )


    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[692   4]
 [  0  59]]
Accuracy: 0.9947
Precision: 0.9365
Recall (Sensitivity): 1.0000
Specificity: 0.9943
F1-score: 0.9672
AUC-ROC: 0.9999
AUPRC: 0.9983

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[694   2]
 [  2  56]]
Accuracy: 0.9947
Precision: 0.9655
Recall (Sensitivity): 0.9655
Specificity: 0.9971
F1-score: 0.9655
AUC-ROC: 0.9998
AUPRC: 0.9979

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[690   6]
 [  0  58]]
Accuracy: 0.9920
Precision: 0.9062
Recall (Sensitivity): 1.0000
Specificity: 0.9914
F1-score: 0.9508
AUC-ROC: 0.9995
AUPRC: 0.9934

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[695   1]
 [  2  56]]
Accuracy: 0.9960
Precision: 0.9825
Recall (Sensitivity)

In [15]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = X_train_res
    X_test_scaled = X_test

    # =========================
    # Logistic Regression
    # =========================
    model = ExtraTreesClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=10,
        min_samples_leaf=3,
        max_features="sqrt",
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[677  19]
 [  9  50]]
Accuracy: 0.9629
Precision: 0.7246
Recall (Sensitivity): 0.8475
Specificity: 0.9727
F1-score: 0.7812
AUC-ROC: 0.9854
AUPRC: 0.9102

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[684  12]
 [ 10  48]]
Accuracy: 0.9708
Precision: 0.8000
Recall (Sensitivity): 0.8276
Specificity: 0.9828
F1-score: 0.8136
AUC-ROC: 0.9838
AUPRC: 0.8783

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[681  15]
 [ 10  48]]
Accuracy: 0.9668
Precision: 0.7619
Recall (Sensitivity): 0.8276
Specificity: 0.9784
F1-score: 0.7934
AUC-ROC: 0.9859
AUPRC: 0.8905

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[687   9]
 [ 11  47]]
Accuracy: 0.9735
Precision: 0.8393
Recall (Sensitivity)

LGBM

In [64]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.0005, num_leaves=31, random_state=42)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})
[LightGBM] [Info] Number of positive: 2775, number of negative: 2775
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1614
[LightGBM] [Info] Number of data points in the train set: 5550, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Confusion Matrix:
[[691   5]
 [  1  58]]
Accuracy: 0.9921
Precision: 0.9206
Recall (Sensitivity): 0.9831
Specificity: 0.9928
F1-score: 0.9508
AUC-ROC: 0.9979
AUPRC: 0.9577

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})
[LightGBM] [Info] Number of positive: 2778, number of negative: 2778
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the over

CB

In [17]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [65]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = CatBoostClassifier(
      iterations=100,
      learning_rate=0.005,
      depth=7,
      random_state=42,
      verbose=0  # Suppress verbose output
  )
    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[690   6]
 [  0  59]]
Accuracy: 0.9921
Precision: 0.9077
Recall (Sensitivity): 1.0000
Specificity: 0.9914
F1-score: 0.9516
AUC-ROC: 0.9999
AUPRC: 0.9987

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[694   2]
 [  1  57]]
Accuracy: 0.9960
Precision: 0.9661
Recall (Sensitivity): 0.9828
Specificity: 0.9971
F1-score: 0.9744
AUC-ROC: 0.9999
AUPRC: 0.9988

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[689   7]
 [  0  58]]
Accuracy: 0.9907
Precision: 0.8923
Recall (Sensitivity): 1.0000
Specificity: 0.9899
F1-score: 0.9431
AUC-ROC: 0.9977
AUPRC: 0.9668

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[688   8]
 [  2  56]]
Accuracy: 0.9867
Precision: 0.8750
Recall (Sensitivity)

BNB

In [19]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = BernoulliNB()
    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[563 133]
 [ 11  48]]
Accuracy: 0.8093
Precision: 0.2652
Recall (Sensitivity): 0.8136
Specificity: 0.8089
F1-score: 0.4000
AUC-ROC: 0.8923
AUPRC: 0.6659

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[560 136]
 [ 22  36]]
Accuracy: 0.7905
Precision: 0.2093
Recall (Sensitivity): 0.6207
Specificity: 0.8046
F1-score: 0.3130
AUC-ROC: 0.7793
AUPRC: 0.4433

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[564 132]
 [ 15  43]]
Accuracy: 0.8050
Precision: 0.2457
Recall (Sensitivity): 0.7414
Specificity: 0.8103
F1-score: 0.3691
AUC-ROC: 0.8386
AUPRC: 0.6103

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[549 147]
 [ 19  39]]
Accuracy: 0.7798
Precision: 0.2097
Recall (Sensitivity)

CNB

In [20]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = ComplementNB()
    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[290 406]
 [  8  51]]
Accuracy: 0.4517
Precision: 0.1116
Recall (Sensitivity): 0.8644
Specificity: 0.4167
F1-score: 0.1977
AUC-ROC: 0.8249
AUPRC: 0.6517

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[281 415]
 [ 13  45]]
Accuracy: 0.4324
Precision: 0.0978
Recall (Sensitivity): 0.7759
Specificity: 0.4037
F1-score: 0.1737
AUC-ROC: 0.7076
AUPRC: 0.4443

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[304 392]
 [ 21  37]]
Accuracy: 0.4523
Precision: 0.0862
Recall (Sensitivity): 0.6379
Specificity: 0.4368
F1-score: 0.1520
AUC-ROC: 0.6763
AUPRC: 0.4713

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[296 400]
 [ 14  44]]
Accuracy: 0.4509
Precision: 0.0991
Recall (Sensitivity)

MNB

In [21]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = MultinomialNB()
    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[290 406]
 [  8  51]]
Accuracy: 0.4517
Precision: 0.1116
Recall (Sensitivity): 0.8644
Specificity: 0.4167
F1-score: 0.1977
AUC-ROC: 0.8249
AUPRC: 0.6517

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[281 415]
 [ 13  45]]
Accuracy: 0.4324
Precision: 0.0978
Recall (Sensitivity): 0.7759
Specificity: 0.4037
F1-score: 0.1737
AUC-ROC: 0.7076
AUPRC: 0.4443

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[304 392]
 [ 21  37]]
Accuracy: 0.4523
Precision: 0.0862
Recall (Sensitivity): 0.6379
Specificity: 0.4368
F1-score: 0.1520
AUC-ROC: 0.6763
AUPRC: 0.4713

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[296 400]
 [ 14  44]]
Accuracy: 0.4509
Precision: 0.0991
Recall (Sensitivity)

HGB

In [22]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = HistGradientBoostingClassifier(
        max_iter=100,
        learning_rate=0.5,
        max_depth=5,
        random_state=42
    )
    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[320 376]
 [  5  54]]
Accuracy: 0.4954
Precision: 0.1256
Recall (Sensitivity): 0.9153
Specificity: 0.4598
F1-score: 0.2209
AUC-ROC: 0.6913
AUPRC: 0.1224

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[694   2]
 [  1  57]]
Accuracy: 0.9960
Precision: 0.9661
Recall (Sensitivity): 0.9828
Specificity: 0.9971
F1-score: 0.9744
AUC-ROC: 0.9999
AUPRC: 0.9991

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[692   4]
 [  0  58]]
Accuracy: 0.9947
Precision: 0.9355
Recall (Sensitivity): 1.0000
Specificity: 0.9943
F1-score: 0.9667
AUC-ROC: 0.9979
AUPRC: 0.9547

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[693   3]
 [  4  54]]
Accuracy: 0.9907
Precision: 0.9474
Recall (Sensitivity)

NC

In [23]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = NearestCentroid()
    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 2784, 1: 232})
After SMOTE-Tomek:  Counter({0: 2775, 1: 2775})

Confusion Matrix:
[[551 145]
 [ 17  42]]
Accuracy: 0.7854
Precision: 0.2246
Recall (Sensitivity): 0.7119
Specificity: 0.7917
F1-score: 0.3415
AUC-ROC: 0.8675
AUPRC: 0.5847

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2778, 1: 2778})

Confusion Matrix:
[[550 146]
 [ 22  36]]
Accuracy: 0.7772
Precision: 0.1978
Recall (Sensitivity): 0.6207
Specificity: 0.7902
F1-score: 0.3000
AUC-ROC: 0.7931
AUPRC: 0.4609

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2777, 1: 2777})

Confusion Matrix:
[[559 137]
 [ 16  42]]
Accuracy: 0.7971
Precision: 0.2346
Recall (Sensitivity): 0.7241
Specificity: 0.8032
F1-score: 0.3544
AUC-ROC: 0.8417
AUPRC: 0.5398

Before SMOTE-Tomek: Counter({0: 2784, 1: 233})
After SMOTE-Tomek:  Counter({0: 2779, 1: 2779})

Confusion Matrix:
[[560 136]
 [ 19  39]]
Accuracy: 0.7944
Precision: 0.2229
Recall (Sensitivity)

ANN

In [7]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix
)

from imblearn.combine import SMOTETomek

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target Series (0/1)

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# CV Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split
    X_train_raw, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train_raw, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", dict(pd.Series(y_train_raw).value_counts()))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train_raw, y_train_raw)

    print("After SMOTE-Tomek: ", dict(pd.Series(y_train_res).value_counts()))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Build ANN
    # =========================
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=0
    )
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-6,
        verbose=1
    )

    model.fit(
        X_train_scaled, y_train_res,
        epochs=300,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stop, reduce_lr],
        verbose=0
    )

    # =========================
    # Predictions
    # =========================
    y_proba = model.predict(X_test_scaled).ravel()
    y_pred = (y_proba >= 0.8).astype(int)

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# CV Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary (ANN) ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: {0: np.int64(2784), 1: np.int64(232)}
After SMOTE-Tomek:  {0: np.int64(2775), 1: np.int64(2775)}

Epoch 42: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 53: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 58: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 63: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

Confusion Matrix:
[[694   2]
 [  4  55]]
Accuracy: 0.9921
Precision: 0.9649
Recall (Sensitivity): 0.9322
Specificity: 0.9971
F1-score: 0.9483
AUC-ROC: 0.9988
AUPRC: 0.9870

Before SMOTE-Tomek: {0: np.int64(2784), 1: np.int64(233)}
After SMOTE-Tomek:  {0: np.int64(2778), 1: np.int64(2778)}

Epoch 31: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 41: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 46: ReduceLROnPlateau reducing learning rate

In [8]:
model.save('ann_model.keras')