In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
differentiated_thyroid_cancer_recurrence = fetch_ucirepo(id=915)

# data (as pandas dataframes)
X = differentiated_thyroid_cancer_recurrence.data.features
y = differentiated_thyroid_cancer_recurrence.data.targets

# metadata
#print(differentiated_thyroid_cancer_recurrence.metadata)

# variable information
print(differentiated_thyroid_cancer_recurrence.variables)


                    name     role         type demographic description units  \
0                    Age  Feature      Integer         Age        None  None   
1                 Gender  Feature  Categorical      Gender        None  None   
2                Smoking  Feature  Categorical        None        None  None   
3             Hx Smoking  Feature  Categorical        None        None  None   
4        Hx Radiothreapy  Feature  Categorical        None        None  None   
5       Thyroid Function  Feature  Categorical        None        None  None   
6   Physical Examination  Feature  Categorical        None        None  None   
7             Adenopathy  Feature  Categorical        None        None  None   
8              Pathology  Feature  Categorical        None        None  None   
9               Focality  Feature  Categorical        None        None  None   
10                  Risk  Feature  Categorical        None        None  None   
11                     T  Feature  Categ

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Age                   383 non-null    int64 
 1   Gender                383 non-null    object
 2   Smoking               383 non-null    object
 3   Hx Smoking            383 non-null    object
 4   Hx Radiothreapy       383 non-null    object
 5   Thyroid Function      383 non-null    object
 6   Physical Examination  383 non-null    object
 7   Adenopathy            383 non-null    object
 8   Pathology             383 non-null    object
 9   Focality              383 non-null    object
 10  Risk                  383 non-null    object
 11  T                     383 non-null    object
 12  N                     383 non-null    object
 13  M                     383 non-null    object
 14  Stage                 383 non-null    object
 15  Response              383 non-null    ob

In [4]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Recurred  383 non-null    object
dtypes: object(1)
memory usage: 3.1+ KB


**One-Hot Encoding**

In [5]:
import pandas as pd

# One-hot encode all categorical columns (drop_first=True avoids multicollinearity)
X = pd.get_dummies(X, drop_first=True)

# Optionally encode the target variable too (if needed for ML)
y = y.copy()
y['Recurred'] = y['Recurred'].map({'No': 0, 'Yes': 1})  # adjust if labels differ


In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 40 columns):
 #   Column                                            Non-Null Count  Dtype
---  ------                                            --------------  -----
 0   Age                                               383 non-null    int64
 1   Gender_M                                          383 non-null    bool 
 2   Smoking_Yes                                       383 non-null    bool 
 3   Hx Smoking_Yes                                    383 non-null    bool 
 4   Hx Radiothreapy_Yes                               383 non-null    bool 
 5   Thyroid Function_Clinical Hypothyroidism          383 non-null    bool 
 6   Thyroid Function_Euthyroid                        383 non-null    bool 
 7   Thyroid Function_Subclinical Hyperthyroidism      383 non-null    bool 
 8   Thyroid Function_Subclinical Hypothyroidism       383 non-null    bool 
 9   Physical Examination_Multinodular goiter   

In [7]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Recurred  383 non-null    int64
dtypes: int64(1)
memory usage: 3.1 KB


In [8]:
X.isnull().sum()

Unnamed: 0,0
Age,0
Gender_M,0
Smoking_Yes,0
Hx Smoking_Yes,0
Hx Radiothreapy_Yes,0
Thyroid Function_Clinical Hypothyroidism,0
Thyroid Function_Euthyroid,0
Thyroid Function_Subclinical Hyperthyroidism,0
Thyroid Function_Subclinical Hypothyroidism,0
Physical Examination_Multinodular goiter,0


In [9]:
y.isnull().sum()

Unnamed: 0,0
Recurred,0


**LR**

In [26]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = LogisticRegression(
        max_iter=1000,
        random_state=42,
        class_weight='balanced'
    )

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[85  7]
 [ 2 34]]
Accuracy: 0.9297
Precision: 0.8293
Recall (Sensitivity): 0.9444
Specificity: 0.9239
F1-score: 0.8831
AUC-ROC: 0.9683
AUPRC: 0.9680

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[89  3]
 [ 4 32]]
Accuracy: 0.9453
Precision: 0.9143
Recall (Sensitivity): 0.8889
Specificity: 0.9674
F1-score: 0.9014
AUC-ROC: 0.9701
AUPRC: 0.8912

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[89  2]
 [ 4 32]]
Accuracy: 0.9528
Precision: 0.9412
Recall (Sensitivity): 0.8889
Specificity: 0.9780
F1-score: 0.9143
AUC-ROC: 0.9664
AUPRC: 0.9651


Accuracy: 0.9426 ± 0.0118
Precision: 0.8949 ± 0.0584
Recall (Sensitivity): 0.9074 ± 0.0321
Specificity: 0.9564 ± 0.0287
F1-score: 0.8996 ± 0.0157
AUC-ROC: 0.9683 ± 0.0018
AUPRC: 0.9414 ± 0.0436


**RF**

In [25]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[55  0]
 [ 1 21]]
Accuracy: 0.9870
Precision: 1.0000
Recall (Sensitivity): 0.9545
Specificity: 1.0000
F1-score: 0.9767
AUC-ROC: 0.9926
AUPRC: 0.9868

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[53  2]
 [ 2 20]]
Accuracy: 0.9481
Precision: 0.9091
Recall (Sensitivity): 0.9091
Specificity: 0.9636
F1-score: 0.9091
AUC-ROC: 0.9760
AUPRC: 0.9576

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[52  3]
 [ 2 20]]
Accuracy: 0.9351
Precision: 0.8696
Recall (Sensitivity): 0.9091
Specificity: 0.9455
F1-score: 0.8889
AUC-ROC: 0.9893
AUPRC: 0.9781

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[53  2]
 [ 1 20]]
Accuracy: 0.9605
Precision: 0.9091
Recall (Sensitivity): 0.9524
Specificity: 0.9636
F1-scor

**KNN**

In [17]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = KNeighborsClassifier(n_neighbors=5)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[54  1]
 [ 4 18]]
Accuracy: 0.9351
Precision: 0.9474
Recall (Sensitivity): 0.8182
Specificity: 0.9818
F1-score: 0.8780
AUC-ROC: 0.9599
AUPRC: 0.9219

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[50  5]
 [ 2 20]]
Accuracy: 0.9091
Precision: 0.8000
Recall (Sensitivity): 0.9091
Specificity: 0.9091
F1-score: 0.8511
AUC-ROC: 0.9426
AUPRC: 0.8605

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[53  2]
 [ 3 19]]
Accuracy: 0.9351
Precision: 0.9048
Recall (Sensitivity): 0.8636
Specificity: 0.9636
F1-score: 0.8837
AUC-ROC: 0.9607
AUPRC: 0.9333

Before SMOTE-Tomek: Counter({0: 220, 1: 87})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[53  2]
 [ 4 17]]
Accuracy: 0.9211
Precision: 0.8947
Recall (Sensitivity): 0.8095
Specificity: 0.9636
F1-

**GNB**

In [18]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = GaussianNB()

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[55  0]
 [ 9 13]]
Accuracy: 0.8831
Precision: 1.0000
Recall (Sensitivity): 0.5909
Specificity: 1.0000
F1-score: 0.7429
AUC-ROC: 0.9967
AUPRC: 0.9925

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[53  2]
 [ 4 18]]
Accuracy: 0.9221
Precision: 0.9000
Recall (Sensitivity): 0.8182
Specificity: 0.9636
F1-score: 0.8571
AUC-ROC: 0.9678
AUPRC: 0.8842

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[53  2]
 [ 4 18]]
Accuracy: 0.9221
Precision: 0.9000
Recall (Sensitivity): 0.8182
Specificity: 0.9636
F1-score: 0.8571
AUC-ROC: 0.9636
AUPRC: 0.8759

Before SMOTE-Tomek: Counter({0: 220, 1: 87})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[55  0]
 [ 5 16]]
Accuracy: 0.9342
Precision: 1.0000
Recall (Sensitivity): 0.7619
Specificity: 1.0000
F1-

**MLP**

In [19]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = MLPClassifier(hidden_layer_sizes=(150, 100, 50), max_iter=1000, random_state=42)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[52  3]
 [ 1 21]]
Accuracy: 0.9481
Precision: 0.8750
Recall (Sensitivity): 0.9545
Specificity: 0.9455
F1-score: 0.9130
AUC-ROC: 0.9835
AUPRC: 0.9745

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[54  1]
 [ 2 20]]
Accuracy: 0.9610
Precision: 0.9524
Recall (Sensitivity): 0.9091
Specificity: 0.9818
F1-score: 0.9302
AUC-ROC: 0.9628
AUPRC: 0.9443

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[53  2]
 [ 2 20]]
Accuracy: 0.9481
Precision: 0.9091
Recall (Sensitivity): 0.9091
Specificity: 0.9636
F1-score: 0.9091
AUC-ROC: 0.9868
AUPRC: 0.9784

Before SMOTE-Tomek: Counter({0: 220, 1: 87})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[53  2]
 [ 2 19]]
Accuracy: 0.9474
Precision: 0.9048
Recall (Sensitivity): 0.9048
Specificity: 0.9636
F1-

**XGB**

In [28]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = XGBClassifier(
        n_estimators=100,        # ↓ fewer trees
        max_depth=3,             # ↓ shallow trees
        learning_rate=0.05,      # ↓ slower learning
        subsample=0.7,           # ↓ stochastic training
        colsample_bytree=0.7,    # ↓ fewer features per tree
        random_state=42,
        eval_metric='logloss',
        use_label_encoder=False
    )



    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[53  2]
 [ 1 21]]
Accuracy: 0.9610
Precision: 0.9130
Recall (Sensitivity): 0.9545
Specificity: 0.9636
F1-score: 0.9333
AUC-ROC: 0.9893
AUPRC: 0.9819

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[54  1]
 [ 2 20]]
Accuracy: 0.9610
Precision: 0.9524
Recall (Sensitivity): 0.9091
Specificity: 0.9818
F1-score: 0.9302
AUC-ROC: 0.9711
AUPRC: 0.9385

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[52  3]
 [ 2 20]]
Accuracy: 0.9351
Precision: 0.8696
Recall (Sensitivity): 0.9091
Specificity: 0.9455
F1-score: 0.8889
AUC-ROC: 0.9917
AUPRC: 0.9821

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[53  2]
 [ 1 20]]
Accuracy: 0.9605
Precision: 0.9091
Recall (Sensitivity): 0.9524
Specificity: 0.9636
F1-scor

**ADB**

In [21]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = AdaBoostClassifier(n_estimators=100, random_state=42)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[52  3]
 [ 1 21]]
Accuracy: 0.9481
Precision: 0.8750
Recall (Sensitivity): 0.9545
Specificity: 0.9455
F1-score: 0.9130
AUC-ROC: 0.9917
AUPRC: 0.9858

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[54  1]
 [ 2 20]]
Accuracy: 0.9610
Precision: 0.9524
Recall (Sensitivity): 0.9091
Specificity: 0.9818
F1-score: 0.9302
AUC-ROC: 0.9752
AUPRC: 0.9476

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[50  5]
 [ 3 19]]
Accuracy: 0.8961
Precision: 0.7917
Recall (Sensitivity): 0.8636
Specificity: 0.9091
F1-score: 0.8261
AUC-ROC: 0.9835
AUPRC: 0.9650

Before SMOTE-Tomek: Counter({0: 220, 1: 87})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[55  0]
 [ 1 20]]
Accuracy: 0.9868
Precision: 1.0000
Recall (Sensitivity): 0.9524
Specificity: 1.0000
F1-

**GBC**

In [22]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[53  2]
 [ 1 21]]
Accuracy: 0.9610
Precision: 0.9130
Recall (Sensitivity): 0.9545
Specificity: 0.9636
F1-score: 0.9333
AUC-ROC: 0.9876
AUPRC: 0.9768

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[54  1]
 [ 2 20]]
Accuracy: 0.9610
Precision: 0.9524
Recall (Sensitivity): 0.9091
Specificity: 0.9818
F1-score: 0.9302
AUC-ROC: 0.9727
AUPRC: 0.9546

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[52  3]
 [ 2 20]]
Accuracy: 0.9351
Precision: 0.8696
Recall (Sensitivity): 0.9091
Specificity: 0.9455
F1-score: 0.8889
AUC-ROC: 0.9876
AUPRC: 0.9722

Before SMOTE-Tomek: Counter({0: 220, 1: 87})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[53  2]
 [ 1 20]]
Accuracy: 0.9605
Precision: 0.9091
Recall (Sensitivity): 0.9524
Specificity: 0.9636
F1-

**ETC**

In [11]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = ExtraTreesClassifier(n_estimators=100, max_depth=6, random_state=42)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[55  0]
 [ 3 19]]
Accuracy: 0.9610
Precision: 1.0000
Recall (Sensitivity): 0.8636
Specificity: 1.0000
F1-score: 0.9268
AUC-ROC: 0.9909
AUPRC: 0.9848

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[53  2]
 [ 2 20]]
Accuracy: 0.9481
Precision: 0.9091
Recall (Sensitivity): 0.9091
Specificity: 0.9636
F1-score: 0.9091
AUC-ROC: 0.9793
AUPRC: 0.9630

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[54  1]
 [ 2 20]]
Accuracy: 0.9610
Precision: 0.9524
Recall (Sensitivity): 0.9091
Specificity: 0.9818
F1-score: 0.9302
AUC-ROC: 0.9909
AUPRC: 0.9808

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[54  1]
 [ 1 20]]
Accuracy: 0.9737
Precision: 0.9524
Recall (Sensitivity): 0.9524
Specificity: 0.9818
F1-scor

In [23]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = ExtraTreesClassifier(
        n_estimators=500,
        max_depth=None,
        criterion="gini",
        min_samples_split=2,
        min_samples_leaf=1,
        max_features="sqrt",
        class_weight="balanced_subsample",
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[53  2]
 [ 2 20]]
Accuracy: 0.9481
Precision: 0.9091
Recall (Sensitivity): 0.9091
Specificity: 0.9636
F1-score: 0.9091
AUC-ROC: 0.9868
AUPRC: 0.9759

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[53  2]
 [ 2 20]]
Accuracy: 0.9481
Precision: 0.9091
Recall (Sensitivity): 0.9091
Specificity: 0.9636
F1-score: 0.9091
AUC-ROC: 0.9802
AUPRC: 0.9688

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[54  1]
 [ 2 20]]
Accuracy: 0.9610
Precision: 0.9524
Recall (Sensitivity): 0.9091
Specificity: 0.9818
F1-score: 0.9302
AUC-ROC: 0.9942
AUPRC: 0.9869

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[53  2]
 [ 1 20]]
Accuracy: 0.9605
Precision: 0.9091
Recall (Sensitivity): 0.9524
Specificity: 0.9636
F1-scor

**LGBM**

In [26]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.5, num_leaves=31, random_state=42)

    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})
[LightGBM] [Info] Number of positive: 218, number of negative: 218
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000276 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 131
[LightGBM] [Info] Number of data points in the train set: 436, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Confusion Matrix:
[[53  2]
 [ 2 20]]
Accuracy: 0.9481
Precision: 0.9091
Recall (Sensitivity): 0.9091
Specificity: 0.9636
F1-score: 0.9091
AUC-ROC: 0.9851
AUPRC: 0.9719

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})
[LightGBM] [Info] Number of positive: 219, number of negative: 219
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing wa

**CB**

In [30]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [32]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = CatBoostClassifier(
      iterations=100,
      learning_rate=0.1,
      depth=3,
      random_state=42,
      verbose=0  # Suppress verbose output
  )
    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[53  2]
 [ 1 21]]
Accuracy: 0.9610
Precision: 0.9130
Recall (Sensitivity): 0.9545
Specificity: 0.9636
F1-score: 0.9333
AUC-ROC: 0.9876
AUPRC: 0.9803

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[54  1]
 [ 2 20]]
Accuracy: 0.9610
Precision: 0.9524
Recall (Sensitivity): 0.9091
Specificity: 0.9818
F1-score: 0.9302
AUC-ROC: 0.9769
AUPRC: 0.9595

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[53  2]
 [ 2 20]]
Accuracy: 0.9481
Precision: 0.9091
Recall (Sensitivity): 0.9091
Specificity: 0.9636
F1-score: 0.9091
AUC-ROC: 0.9909
AUPRC: 0.9808

Before SMOTE-Tomek: Counter({'Recurred': 1})
After SMOTE-Tomek:  Counter({'Recurred': 1})

Confusion Matrix:
[[54  1]
 [ 1 20]]
Accuracy: 0.9737
Precision: 0.9524
Recall (Sensitivity): 0.9524
Specificity: 0.9818
F1-scor

**BNB**

In [29]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = BernoulliNB()
    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[54  1]
 [ 4 18]]
Accuracy: 0.9351
Precision: 0.9474
Recall (Sensitivity): 0.8182
Specificity: 0.9818
F1-score: 0.8780
AUC-ROC: 0.9810
AUPRC: 0.9600

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[50  5]
 [ 2 20]]
Accuracy: 0.9091
Precision: 0.8000
Recall (Sensitivity): 0.9091
Specificity: 0.9091
F1-score: 0.8511
AUC-ROC: 0.9711
AUPRC: 0.9349

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[49  6]
 [ 1 21]]
Accuracy: 0.9091
Precision: 0.7778
Recall (Sensitivity): 0.9545
Specificity: 0.8909
F1-score: 0.8571
AUC-ROC: 0.9769
AUPRC: 0.9574

Before SMOTE-Tomek: Counter({0: 220, 1: 87})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[50  5]
 [ 3 18]]
Accuracy: 0.8947
Precision: 0.7826
Recall (Sensitivity): 0.8571
Specificity: 0.9091
F1-

**CNB**

In [30]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = ComplementNB()
    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[55  0]
 [ 4 18]]
Accuracy: 0.9481
Precision: 1.0000
Recall (Sensitivity): 0.8182
Specificity: 1.0000
F1-score: 0.9000
AUC-ROC: 0.9876
AUPRC: 0.9731

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[51  4]
 [ 2 20]]
Accuracy: 0.9221
Precision: 0.8333
Recall (Sensitivity): 0.9091
Specificity: 0.9273
F1-score: 0.8696
AUC-ROC: 0.9727
AUPRC: 0.9456

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[49  6]
 [ 1 21]]
Accuracy: 0.9091
Precision: 0.7778
Recall (Sensitivity): 0.9545
Specificity: 0.8909
F1-score: 0.8571
AUC-ROC: 0.9810
AUPRC: 0.9636

Before SMOTE-Tomek: Counter({0: 220, 1: 87})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[51  4]
 [ 3 18]]
Accuracy: 0.9079
Precision: 0.8182
Recall (Sensitivity): 0.8571
Specificity: 0.9273
F1-

**MNB**

In [31]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = MultinomialNB()
    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[55  0]
 [ 4 18]]
Accuracy: 0.9481
Precision: 1.0000
Recall (Sensitivity): 0.8182
Specificity: 1.0000
F1-score: 0.9000
AUC-ROC: 0.9876
AUPRC: 0.9731

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[51  4]
 [ 2 20]]
Accuracy: 0.9221
Precision: 0.8333
Recall (Sensitivity): 0.9091
Specificity: 0.9273
F1-score: 0.8696
AUC-ROC: 0.9727
AUPRC: 0.9456

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[49  6]
 [ 1 21]]
Accuracy: 0.9091
Precision: 0.7778
Recall (Sensitivity): 0.9545
Specificity: 0.8909
F1-score: 0.8571
AUC-ROC: 0.9810
AUPRC: 0.9636

Before SMOTE-Tomek: Counter({0: 220, 1: 87})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[51  4]
 [ 3 18]]
Accuracy: 0.9079
Precision: 0.8182
Recall (Sensitivity): 0.8571
Specificity: 0.9273
F1-

**HGB**

In [32]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = HistGradientBoostingClassifier(
        max_iter=100,
        learning_rate=0.5,
        max_depth=5,
        random_state=42
    )
    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[52  3]
 [ 3 19]]
Accuracy: 0.9221
Precision: 0.8636
Recall (Sensitivity): 0.8636
Specificity: 0.9455
F1-score: 0.8636
AUC-ROC: 0.9752
AUPRC: 0.9618

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[54  1]
 [ 2 20]]
Accuracy: 0.9610
Precision: 0.9524
Recall (Sensitivity): 0.9091
Specificity: 0.9818
F1-score: 0.9302
AUC-ROC: 0.9868
AUPRC: 0.9766

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[52  3]
 [ 1 21]]
Accuracy: 0.9481
Precision: 0.8750
Recall (Sensitivity): 0.9545
Specificity: 0.9455
F1-score: 0.9130
AUC-ROC: 0.9893
AUPRC: 0.9790

Before SMOTE-Tomek: Counter({0: 220, 1: 87})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[52  3]
 [ 1 20]]
Accuracy: 0.9474
Precision: 0.8696
Recall (Sensitivity): 0.9524
Specificity: 0.9455
F1-

**NC**

In [33]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    average_precision_score
)

from imblearn.combine import SMOTETomek
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target column as Series (0/1)
# Example:
# X = df.drop("Recurred", axis=1)
# y = df["Recurred"]

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# Cross-Validation Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", Counter(y_train))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

    print("After SMOTE-Tomek: ", Counter(y_train_res))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Logistic Regression
    # =========================
    model = NearestCentroid()
    model.fit(X_train_scaled, y_train_res)

    # =========================
    # Predictions
    # =========================
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    # Print metrics
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# Cross-Validation Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[52  3]
 [ 5 17]]
Accuracy: 0.8961
Precision: 0.8500
Recall (Sensitivity): 0.7727
Specificity: 0.9455
F1-score: 0.8095
AUC-ROC: 0.9777
AUPRC: 0.9528

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[50  5]
 [ 2 20]]
Accuracy: 0.9091
Precision: 0.8000
Recall (Sensitivity): 0.9091
Specificity: 0.9091
F1-score: 0.8511
AUC-ROC: 0.9698
AUPRC: 0.9128

Before SMOTE-Tomek: Counter({0: 220, 1: 86})
After SMOTE-Tomek:  Counter({0: 218, 1: 218})

Confusion Matrix:
[[49  6]
 [ 1 21]]
Accuracy: 0.9091
Precision: 0.7778
Recall (Sensitivity): 0.9545
Specificity: 0.8909
F1-score: 0.8571
AUC-ROC: 0.9793
AUPRC: 0.9611

Before SMOTE-Tomek: Counter({0: 220, 1: 87})
After SMOTE-Tomek:  Counter({0: 219, 1: 219})

Confusion Matrix:
[[50  5]
 [ 4 17]]
Accuracy: 0.8816
Precision: 0.7727
Recall (Sensitivity): 0.8095
Specificity: 0.9091
F1-

**ANN**

In [35]:
# =========================
# Required Libraries
# =========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix
)

from imblearn.combine import SMOTETomek

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings("ignore")

# =========================
# Features & Target
# =========================
# X -> after one-hot encoding
# y -> target Series (0/1)

# =========================
# Stratified 5-Fold CV
# =========================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold = 1
all_metrics = []

# =========================
# CV Loop
# =========================
for train_idx, test_idx in skf.split(X, y):

    print(f"\n================ Fold {fold} =================")

    # Split
    X_train_raw, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train_raw, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print("Before SMOTE-Tomek:", dict(pd.Series(y_train_raw).value_counts()))

    # =========================
    # SMOTE-Tomek (TRAIN ONLY)
    # =========================
    smt = SMOTETomek(random_state=42)
    X_train_res, y_train_res = smt.fit_resample(X_train_raw, y_train_raw)

    print("After SMOTE-Tomek: ", dict(pd.Series(y_train_res).value_counts()))

    # =========================
    # Feature Scaling
    # =========================
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_res)
    X_test_scaled = scaler.transform(X_test)

    # =========================
    # Build ANN
    # =========================
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=0
    )
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-6,
        verbose=1
    )

    model.fit(
        X_train_scaled, y_train_res,
        epochs=300,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stop, reduce_lr],
        verbose=0
    )

    # =========================
    # Predictions
    # =========================
    y_proba = model.predict(X_test_scaled).ravel()
    y_pred = (y_proba >= 0.8).astype(int)

    # =========================
    # Confusion Matrix
    # =========================
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(cm)

    # =========================
    # Metrics
    # =========================
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Specificity": tn / (tn + fp) if (tn + fp) > 0 else 0,
        "F1-score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUPRC": average_precision_score(y_test, y_proba)
    }

    all_metrics.append(metrics)

    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    fold += 1

# =========================
# CV Summary
# =========================
metrics_df = pd.DataFrame(all_metrics)

summary = pd.DataFrame({
    "Mean": metrics_df.mean(),
    "Std": metrics_df.std()
})

print("\n=========== 5-Fold CV Summary (ANN) ===========\n")
for metric in summary.index:
    print(f"{metric}: {summary.loc[metric,'Mean']:.4f} ± {summary.loc[metric,'Std']:.4f}")



Before SMOTE-Tomek: {0: np.int64(220), 1: np.int64(86)}
After SMOTE-Tomek:  {0: np.int64(218), 1: np.int64(218)}

Epoch 39: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 44: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step

Confusion Matrix:
[[55  0]
 [ 2 20]]
Accuracy: 0.9740
Precision: 1.0000
Recall (Sensitivity): 0.9091
Specificity: 1.0000
F1-score: 0.9524
AUC-ROC: 0.9884
AUPRC: 0.9811

Before SMOTE-Tomek: {0: np.int64(220), 1: np.int64(86)}
After SMOTE-Tomek:  {0: np.int64(219), 1: np.int64(219)}

Epoch 43: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 61: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 66: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step

Confusion Matrix:
[[54  1]
 [ 2 20]]
Accuracy: 0.9610
Precision: 0.



[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 180ms/step



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step

Confusion Matrix:
[[54  1]
 [ 2 20]]
Accuracy: 0.9610
Precision: 0.9524
Recall (Sensitivity): 0.9091
Specificity: 0.9818
F1-score: 0.9302
AUC-ROC: 0.9835
AUPRC: 0.9725

Before SMOTE-Tomek: {0: np.int64(220), 1: np.int64(87)}
After SMOTE-Tomek:  {0: np.int64(219), 1: np.int64(219)}

Epoch 45: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 50: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step

Confusion Matrix:
[[53  2]
 [ 1 20]]
Accuracy: 0.9605
Precision: 0.9091
Recall (Sensitivity): 0.9524
Specificity: 0.9636
F1-score: 0.9302
AUC-ROC: 0.9844
AUPRC: 0.9712

Before SMOTE-Tomek: {0: np.int64(220), 1: np.int64(87)}
After SMOTE-Tomek:  {0: np.int64(218), 1: np.int64(218)}

Epoch 37: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 47: ReduceLROnPlateau reducing learning rate to 