## CVD Prediction - Mendeley Dataset (Source: https://data.mendeley.com/datasets/dzz48mvjht/1)
Model Training and Evaluation

In [1]:
# Read the preprocessed CSV files produced by the earlier preparation steps.
import pandas as pd

# Held-out evaluation split: features and labels are stored in separate files.
y_test = pd.read_csv("./data_splits/y_test.csv")
X_test = pd.read_csv("./data_splits/X_test.csv")

# Training subset used throughout this notebook.
train_df = pd.read_csv("./data_subsets/train_75M_25F.csv")

# Preview the first rows (the last expression of a cell is what gets displayed).
train_df.head()

Unnamed: 0,source_id,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
0,71,77,1,1,125,135.0,0,0,100,0,1.8,2,1,0
1,139,23,1,3,143,221.0,0,0,152,1,2.0,2,0,0
2,589,21,1,0,126,139.0,0,0,150,1,1.4,2,1,0
3,713,53,1,2,171,328.877508,0,1,147,0,5.3,3,3,1
4,234,69,1,1,120,231.0,0,0,77,0,4.4,2,0,0


In [2]:
# Column roles shared by every later cell.
TARGET = "target"      # label column
SENSITIVE = "gender"   # 1 = Male, 0 = Female

# Features treated as categories (one-hot encoded further down).
categorical_cols = [
    'gender',
    'chestpain',
    'fastingbloodsugar',
    'restingrelectro',
    'exerciseangia',
    'slope',
    'noofmajorvessels',
]

# Features treated as real-valued measurements (standardised further down).
continuous_cols = [
    'age',
    'restingBP',
    'serumcholestrol',
    'maxheartrate',
    'oldpeak',
]

# Separate the label from the features. X_train keeps every non-target column
# of train_df; later cells pick the model features explicitly by name via the
# two lists above.
y_train = train_df[TARGET]
X_train = train_df.drop(TARGET, axis=1)

In [3]:
# Standardise the continuous features only; categorical codes pass through untouched.

import pandas as pd
from sklearn.preprocessing import StandardScaler


def _as_numeric_frame(values, like):
    """Wrap a scaled array in a DataFrame labelled with `continuous_cols` and `like`'s index."""
    return pd.DataFrame(values, columns=continuous_cols, index=like.index)


# The scaler's statistics are learned from the training rows alone and then
# re-used, unchanged, on the test rows.
scaler = StandardScaler()
X_train_num_scaled = _as_numeric_frame(scaler.fit_transform(X_train[continuous_cols]), X_train)
X_test_num_scaled = _as_numeric_frame(scaler.transform(X_test[continuous_cols]), X_test)

# Stitch the raw categorical columns back together with the scaled numerics.
# Both halves get a fresh 0..n-1 index first so the column-wise concat lines up
# row by row.
X_train_scaled = pd.concat(
    [part.reset_index(drop=True) for part in (X_train[categorical_cols], X_train_num_scaled)],
    axis=1,
)
X_test_scaled = pd.concat(
    [part.reset_index(drop=True) for part in (X_test[categorical_cols], X_test_num_scaled)],
    axis=1,
)

# Sanity check: after standardisation the training columns should have
# mean ~0 and (population, ddof=0) standard deviation ~1.
train_numeric = X_train_scaled[continuous_cols]
print("Train numeric means (≈0):")
print(train_numeric.mean().round(3))
print("\nTrain numeric stds (≈1):")
print(train_numeric.std(ddof=0).round(3))

# Persist the scaled training features for later steps.
X_train_scaled.to_csv("data_subsets/train_75M_25F_scaled_only.csv", index=False)
# The scaled test split is currently not written out:
#X_test_scaled.to_csv("data_splits/X_test_scaled_only.csv", index=False)


Train numeric means (≈0):
age                0.0
restingBP          0.0
serumcholestrol    0.0
maxheartrate       0.0
oldpeak            0.0
dtype: float64

Train numeric stds (≈1):
age                1.0
restingBP          1.0
serumcholestrol    1.0
maxheartrate       1.0
oldpeak            1.0
dtype: float64


In [4]:
# One-hot encode the categorical columns; the already-scaled numeric columns
# are appended unchanged.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Inputs from earlier cells: categorical_cols, continuous_cols,
# X_train_scaled, X_test_scaled.

# The category vocabulary is learned from the training split only.
#   drop="if_binary"        -> two-level features become a single 0/1 column
#   handle_unknown="ignore" -> categories unseen during fit do not raise at transform time
#   sparse_output=False     -> dense arrays, so they can be wrapped in DataFrames directly
ohe = OneHotEncoder(handle_unknown="ignore", drop="if_binary", sparse_output=False).fit(
    X_train_scaled[categorical_cols]
)
encoded_names = ohe.get_feature_names_out(categorical_cols)

# Encode both splits with the same fitted encoder, keeping each split's index.
X_train_cat = pd.DataFrame(
    ohe.transform(X_train_scaled[categorical_cols]),
    columns=encoded_names,
    index=X_train_scaled.index,
)
X_test_cat = pd.DataFrame(
    ohe.transform(X_test_scaled[categorical_cols]),
    columns=encoded_names,
    index=X_test_scaled.index,
)

# Final model inputs: encoded categoricals first, scaled numerics after.
X_train_ready = pd.concat([X_train_cat, X_train_scaled[continuous_cols]], axis=1)
X_test_ready = pd.concat([X_test_cat, X_test_scaled[continuous_cols]], axis=1)

print("Final feature shapes:", X_train_ready.shape, X_test_ready.shape)

Final feature shapes: (600, 22) (200, 22)


### Traditional ML Models - Baseline: K-Nearest Neighbors (KNN) & Decision Tree (DT)

In [5]:
#import required libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

# Shared reporting helper used for every model in this notebook.
def evaluate_model(y_true, y_pred, model_name):
    """Print a standard block of classification metrics for one model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions, aligned with ``y_true``.
    model_name : str
        Label used in the report header.

    Prints accuracy, then precision / recall / F1 computed with
    ``average='binary'``, followed by the full classification report and the
    confusion matrix. Nothing is returned.
    """
    print(f"=== {model_name} Evaluation ===")
    print("Accuracy :", accuracy_score(y_true, y_pred))

    # The three positive-class metrics share the same call signature.
    binary_metrics = (
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1 Score :", f1_score),
    )
    for label, metric in binary_metrics:
        print(label, metric(y_true, y_pred, average='binary'))

    print("\nClassification Report:\n", classification_report(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\n" + "="*40 + "\n")

In [6]:
# Baseline 1: k-nearest neighbours with k=5 and otherwise default settings.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_ready, y_train)

y_pred_knn = knn.predict(X_test_ready)
# Keep the second predict_proba column as the score for the positive label
# (class 1 when the labels are 0/1).
knn_proba = knn.predict_proba(X_test_ready)
y_prob_knn = knn_proba[:, 1]

evaluate_model(y_test, y_pred_knn, "KNN")


# Baseline 2: an unconstrained decision tree with a fixed seed.
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42).fit(X_train_ready, y_train)

y_pred_dt = dt.predict(X_test_ready)
dt_proba = dt.predict_proba(X_test_ready)
y_prob_dt = dt_proba[:, 1]

evaluate_model(y_test, y_pred_dt, "Decision Tree")

=== KNN Evaluation ===
Accuracy : 0.91
Precision: 0.9454545454545454
Recall   : 0.896551724137931
F1 Score : 0.9203539823008849

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.93      0.90        84
           1       0.95      0.90      0.92       116

    accuracy                           0.91       200
   macro avg       0.91      0.91      0.91       200
weighted avg       0.91      0.91      0.91       200

Confusion Matrix:
 [[ 78   6]
 [ 12 104]]


=== Decision Tree Evaluation ===
Accuracy : 0.89
Precision: 0.9351851851851852
Recall   : 0.8706896551724138
F1 Score : 0.9017857142857143

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.92      0.88        84
           1       0.94      0.87      0.90       116

    accuracy                           0.89       200
   macro avg       0.89      0.89      0.89       200
weighted avg       0.89      0.89      0

## KNN: Evaluation

- **Accuracy:** 0.910  
- **Precision:** 0.945  
- **Recall:** **0.897**  
- **F1 Score:** 0.920  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 78           | 6            |
| **Actual: 1** | 12           | 104          |

- **False negatives:** **12** (missed CVD)  
- **False positives:** **6** (healthy flagged)

**Interpretation:**  
High precision with strong recall; few false alarms and a modest number of missed CVD cases. Suitable when both correctness of positive flags and avoiding misses matter.

---

## Decision Tree: Evaluation

- **Accuracy:** 0.890  
- **Precision:** 0.935  
- **Recall:** **0.871**  
- **F1 Score:** 0.902  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 77           | 7            |
| **Actual: 1** | 15           | 101          |

- **False negatives:** **15** (missed CVD)  
- **False positives:** **7** (healthy flagged)

**Interpretation:**  
Good overall performance with solid precision and slightly lower recall; a few more missed CVD cases relative to its positives. Appropriate when controlling false alarms while maintaining reasonable sensitivity.

---

### KNN Improvement
The code improves the KNN model by running a **grid search** over key hyperparameters (`n_neighbors`, `weights`, and the distance `metric`) and selecting the configuration with the best cross-validated F1 score. The selected model is then evaluated on the test set using its default class predictions. **Decision threshold tuning** to boost recall, which is critical in medical prediction tasks, is a possible follow-up based on the stored probabilities (`y_prob_knn_best`), but it is not carried out in the cell below.

In [7]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

# 1) Hyperparameter tuning for KNN
# "minkowski" is deliberately not listed as a metric: KNeighborsClassifier
# defaults to p=2, for which minkowski *is* euclidean. Listing both only
# re-fits 60 duplicate candidates (x5 folds) and cannot change the winner,
# because tied candidates resolve to the earlier (euclidean) entry anyway.
param_grid = {
    "n_neighbors": list(range(1, 31)),
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}

# Stratified folds keep the class ratio stable across splits; the seed makes
# the fold assignment repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=param_grid,
    cv=cv,
    scoring="f1",        # model selection criterion: F1 of the positive class
    n_jobs=-1,
    verbose=0,
    refit=True           # refit the best configuration on the full training set
)

# Fit
grid.fit(X_train_ready, y_train)

print("Best KNN params:", grid.best_params_)
print("Best CV F1:", grid.best_score_)

best_knn = grid.best_estimator_

# 2) Evaluate best KNN on TEST
y_pred_knn_best = best_knn.predict(X_test_ready)
# Second predict_proba column = score for the positive label (class 1 for 0/1 labels).
y_prob_knn_best = best_knn.predict_proba(X_test_ready)[:, 1]

evaluate_model(y_test, y_pred_knn_best, "KNN (best params)")

Best KNN params: {'metric': 'manhattan', 'n_neighbors': 8, 'weights': 'distance'}
Best CV F1: 0.9360741590062324
=== KNN (best params) Evaluation ===
Accuracy : 0.93
Precision: 0.9722222222222222
Recall   : 0.9051724137931034
F1 Score : 0.9375

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92        84
           1       0.97      0.91      0.94       116

    accuracy                           0.93       200
   macro avg       0.93      0.93      0.93       200
weighted avg       0.93      0.93      0.93       200

Confusion Matrix:
 [[ 81   3]
 [ 11 105]]




## KNN: Tuned

**Best KNN params**: `{'metric': 'manhattan', 'n_neighbors': 8, 'weights': 'distance'}`  
**Best CV F1**: **0.9361**

---

### KNN (Best Params)
- **Accuracy:** 0.930  
- **Precision:** 0.972  
- **Recall:** **0.905**  
- **F1 Score:** 0.938  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 81           | 3            |
| **Actual: 1** | 11           | 105          |

- **False negatives:** **11** (missed positives)  
- **False positives:** **3** (healthy flagged)

**Interpretation:**  
This tuned KNN delivers **very high precision** with **strong recall**, yielding few false alarms and a manageable number of missed cases. The model setup is suitable when **correct positive identification** is prioritized while still **capturing most true cases**. Consider calibration and fairness checks before deployment.

---

### Further KNN Improvement - Implementing PCA 

In [8]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import numpy as np

# 1) Build and fit the two-step pipeline: PCA projection, then KNN.
#    A fractional n_components asks PCA for as many components as are needed
#    to reach that share of explained variance (0.95 here).
#    The KNN settings are fixed by hand in this cell (k=15).
reducer = PCA(n_components=0.95, random_state=42)
neighbours = KNeighborsClassifier(weights='distance', metric='manhattan', n_neighbors=15)
pca_knn = Pipeline(steps=[('pca', reducer), ('knn', neighbours)]).fit(X_train_ready, y_train)

# Report how many components PCA actually kept and how much variance they cover.
fitted_pca = pca_knn.named_steps['pca']
n_comp = fitted_pca.n_components_
expl_var = fitted_pca.explained_variance_ratio_.sum()
print(f"PCA components: {n_comp} | Explained variance retained: {expl_var:.3f}")

# 2) Score the pipeline on the held-out split.
y_pred_pca_knn = pca_knn.predict(X_test_ready)
pca_knn_proba = pca_knn.predict_proba(X_test_ready)
probs_pca_knn = pca_knn_proba[:, 1]

evaluate_model(y_test, y_pred_pca_knn, "PCA+KNN")

PCA components: 15 | Explained variance retained: 0.965
=== PCA+KNN Evaluation ===
Accuracy : 0.915
Precision: 0.9541284403669725
Recall   : 0.896551724137931
F1 Score : 0.9244444444444444

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.94      0.90        84
           1       0.95      0.90      0.92       116

    accuracy                           0.92       200
   macro avg       0.91      0.92      0.91       200
weighted avg       0.92      0.92      0.92       200

Confusion Matrix:
 [[ 79   5]
 [ 12 104]]




## PCA + KNN

**PCA components:** 15  
**Explained variance retained:** **0.965**

---

### Evaluation
- **Accuracy:** 0.915  
- **Precision:** 0.954  
- **Recall:** **0.897**  
- **F1 Score:** 0.924  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 79           | 5            |
| **Actual: 1** | 12           | 104          |

- **False negatives:** **12** (missed positives)  
- **False positives:** **5** (healthy flagged)

**Interpretation:**  
Dimensionality reduction to 15 components retains **96.5%** of variance while maintaining **strong precision** and **good recall**. The model produces **few false alarms** and a **manageable number of misses**, indicating a balanced operating point with potential gains in stability and efficiency.

---

# KNN Model Comparison (CVD Diagnosis) – Recall Priority

## 1. Baseline KNN
- **Accuracy**: 0.91  
- **Precision**: 0.945  
- **Recall**: 0.897  
- **F1**: 0.920  
- **Confusion Matrix**: [[78, 6], [12, 104]]

**Interpretation**:  
Strong precision (~95%) and balanced performance overall. Recall is just under 90%, meaning 12 CVD-positive patients were missed. This makes it good, but not the most sensitive option.

---

## 2. Best KNN (Tuned: `n_neighbors=8`, `metric=manhattan`, `weights=distance`)
- **Accuracy**: **0.93 (highest)**  
- **Precision**: **0.972 (highest)**  
- **Recall**: **0.905 (highest)**  
- **F1**: **0.938 (highest)**  
- **Confusion Matrix**: [[81, 3], [11, 105]]

**Interpretation**:  
The tuned KNN is the strongest performer overall: it improves both accuracy and F1, while recall increases to ~91%. It misses only 11 positives (slightly better than baseline) and greatly reduces false positives (just 3). This is the **best-balanced KNN**, with high sensitivity and outstanding precision.

---

## 3. PCA + KNN (15 components, 96.5% variance retained)
- **Accuracy**: 0.915  
- **Precision**: 0.954  
- **Recall**: 0.897  
- **F1**: 0.924  
- **Confusion Matrix**: [[79, 5], [12, 104]]

**Interpretation**:  
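Very close to baseline KNN. Accuracy and precision are slightly higher, but recall remains the same (~90%). Dimensionality reduction does not improve sensitivity, suggesting PCA does not add value here (dataset likely not high-dimensional enough). Note that the PCA pipeline uses hand-set KNN hyperparameters (`n_neighbors=15`, `metric='manhattan'`, `weights='distance'`) rather than the baseline's `n_neighbors=5` with default settings, so this comparison reflects both changes rather than the effect of PCA alone.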

---

## Overview Table (Ranked by Recall Priority)

| Model                 | Accuracy | Precision | Recall | F1   | FN (Missed CVD) |
|------------------------|----------|-----------|--------|------|-----------------|
| **Best KNN (Tuned)**  | **0.93** | **0.972** | **0.905** | **0.938** | 11 |
| Baseline KNN          | 0.91     | 0.945     | 0.897  | 0.920 | 12 |
| PCA + KNN             | 0.915    | 0.954     | 0.897  | 0.924 | 12 |

---

## Final Takeaway
- **Best KNN (tuned)** is the clear winner: it has the **highest recall (~91%)**, **highest accuracy**, and **highest precision**, making it the most reliable choice for diagnosis.  
- **Baseline KNN** and **PCA+KNN** are very close, with recall just under 90% (12 missed positives). PCA does not meaningfully improve performance.  
- For CVD diagnosis, where **sensitivity matters most**, the **Tuned KNN** provides the best balance: it improves recall slightly while significantly boosting overall reliability.  


In [9]:
# Persist the tuned KNN model and its test-set predictions for the fairness evaluation.
import joblib, pandas as pd, numpy as np

# y_test comes from read_csv as a DataFrame; collapse it to a Series so the
# label array below is one-dimensional. (This rebinds y_test for later cells.)
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.squeeze("columns")

# Model artefact.
model_filename = "knn_best_model.pkl"
joblib.dump(best_knn, model_filename)

# Labels, hard predictions and positive-class scores as plain arrays.
y_true = y_test.to_numpy() if hasattr(y_test, "to_numpy") else np.asarray(y_test)
y_pred = best_knn.predict(X_test_ready)
y_prob = best_knn.predict_proba(X_test_ready)[:, 1]

# Carry the sensitive attribute along when the raw test frame has it;
# otherwise fill the column with NaN so the output schema stays the same.
has_gender = isinstance(X_test, pd.DataFrame) and "gender" in X_test.columns
gender_vals = (
    X_test["gender"].to_numpy()
    if has_gender
    else np.full(shape=len(y_true), fill_value=np.nan)
)

# One row per test sample.
results = pd.DataFrame(dict(gender=gender_vals, y_true=y_true, y_prob=y_prob, y_pred=y_pred))

preds_filename = "MendeleyData_75M25F_KNN_best_predictions.csv"
results.to_csv(preds_filename, index=False)

print(f"Saved tuned KNN model → {model_filename}")
print(f"Saved predictions → {preds_filename}")

Saved tuned KNN model → knn_best_model.pkl
Saved predictions → MendeleyData_75M25F_KNN_best_predictions.csv


### Improvement - Decision Tree (DT)

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# 1) Base model (GridSearchCV clones it, so `dt` itself stays unfitted)
dt = DecisionTreeClassifier(random_state=42)

# 2) Hyperparameter grid: split criterion plus three size/regularisation knobs
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 5, 7, 9, None],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 6, 10],
}

# 3) Cross-validation setup (stratified, shuffled, seeded for repeatability)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 4) Grid search — candidates are ranked by recall of the positive class
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=cv,
    scoring="recall",
    n_jobs=-1,
    verbose=0
)

grid_dt.fit(X_train_ready, y_train)

print("Best Decision Tree params:", grid_dt.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric,
# which is recall here, so it must not be labelled as F1.
print("Best CV Recall:", grid_dt.best_score_)

# 5) Train & evaluate best DT (best_estimator_ is already refit on the full training set)
best_dt = grid_dt.best_estimator_
y_pred_dt_best = best_dt.predict(X_test_ready)
y_prob_dt_best = best_dt.predict_proba(X_test_ready)[:, 1]

evaluate_model(y_test, y_pred_dt_best, "Tuned Decision Tree")

Best Decision Tree params: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best CV F1: 0.9233333333333332
=== Tuned Decision Tree Evaluation ===
Accuracy : 0.905
Precision: 0.907563025210084
Recall   : 0.9310344827586207
F1 Score : 0.9191489361702128

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.87      0.88        84
           1       0.91      0.93      0.92       116

    accuracy                           0.91       200
   macro avg       0.90      0.90      0.90       200
weighted avg       0.90      0.91      0.90       200

Confusion Matrix:
 [[ 73  11]
 [  8 108]]




## Decision Tree: Tuned

**Best DT params**: `{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}`  
**Best CV Recall**: **0.9233** (the grid search was run with `scoring="recall"`, so this is a recall score, not F1)

---

### Tuned Decision Tree
- **Accuracy:** 0.905  
- **Precision:** 0.908  
- **Recall:** **0.931**  
- **F1 Score:** 0.919  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 73           | 11           |
| **Actual: 1** | 8            | 108          |

- **False negatives:** **8** (missed CVD)  
- **False positives:** **11** (healthy flagged)  

**Interpretation (general):**  
The model emphasizes **high recall**, capturing most CVD cases while keeping precision solid. The error profile shows **few missed cases** (FN=8) and a **manageable number of false alarms** (FP=11). This model is suitable when **avoiding missed CVD** is a priority. 

---

In [11]:
# Alternative DT tuning: simpler trees + class balancing + cost-complexity pruning
#
# The resulting model and predictions are stored under their own names
# (`pruned_dt`, `y_pred_dt_pruned`, `y_prob_dt_pruned`). Re-using `best_dt`
# here would silently replace the grid-searched tree from the previous tuning
# cell, and the export cell further down pickles whatever `best_dt` points to.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

# Stage A: bias toward simpler trees with class_weight="balanced"
base_dt = DecisionTreeClassifier(random_state=42, class_weight="balanced")

param_grid_simple = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 4, 5, 6, 7],
    "min_samples_split": [5, 10, 20],
    "min_samples_leaf": [2, 4, 6],
    "min_impurity_decrease": [0.0, 1e-4, 1e-3],  # tiny regularization
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_simple = GridSearchCV(
    estimator=base_dt,
    param_grid=param_grid_simple,
    cv=cv,
    scoring="recall",        # recall-focused search
    n_jobs=-1,
    verbose=0,
    refit=True
)
grid_simple.fit(X_train_ready, y_train)

print("Stage A — Best simple DT params:", grid_simple.best_params_)
print("Stage A — Best CV Recall:", grid_simple.best_score_)
simple_dt = grid_simple.best_estimator_

# Stage B: cost-complexity pruning on the best simple DT
path = simple_dt.cost_complexity_pruning_path(X_train_ready, y_train)
ccp_alphas = path.ccp_alphas

# Up to 20 evenly spaced alphas across the range of the pruning path,
# plus 0.0 so that "no extra pruning" is always one of the candidates.
unique_alphas = np.unique(np.round(ccp_alphas, 6))
candidate_alphas = np.linspace(unique_alphas.min(), unique_alphas.max(), num=min(20, len(unique_alphas)))
candidate_alphas = np.unique(np.concatenate([candidate_alphas, [0.0]]))  # include no-pruning baseline

# Settings shared by every Stage-B candidate and by the final model: the
# Stage-A winner's hyperparameters, with only ccp_alpha left to vary.
stage_a_settings = dict(
    random_state=42,
    class_weight="balanced",
    criterion=simple_dt.criterion,
    max_depth=simple_dt.max_depth,
    min_samples_split=simple_dt.min_samples_split,
    min_samples_leaf=simple_dt.min_samples_leaf,
    min_impurity_decrease=simple_dt.min_impurity_decrease,
)

cv_scores = []
for alpha in candidate_alphas:
    dt_alpha = DecisionTreeClassifier(**stage_a_settings, ccp_alpha=alpha)
    # recall-focused CV
    recall_cv = cross_val_score(dt_alpha, X_train_ready, y_train, cv=cv, scoring="recall", n_jobs=-1).mean()
    cv_scores.append((alpha, recall_cv))

# Highest mean CV recall wins; on ties the first (smallest) alpha is kept.
best_alpha, best_cv_recall = max(cv_scores, key=lambda pair: pair[1])
print(f"Stage B — Best ccp_alpha: {best_alpha:.6f} | CV Recall: {best_cv_recall:.4f}")

# Final model fit with the chosen ccp_alpha
pruned_dt = DecisionTreeClassifier(**stage_a_settings, ccp_alpha=best_alpha).fit(X_train_ready, y_train)

# Evaluation
y_pred_dt_pruned = pruned_dt.predict(X_test_ready)
y_prob_dt_pruned = pruned_dt.predict_proba(X_test_ready)[:, 1]

evaluate_model(y_test, y_pred_dt_pruned, "Alternative Tuned & Pruned DT")

Stage A — Best simple DT params: {'criterion': 'gini', 'max_depth': 4, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 5}
Stage A — Best CV Recall: 0.9199999999999999
Stage B — Best ccp_alpha: 0.000000 | CV Recall: 0.9200
=== Alternative Tuned & Pruned DT Evaluation ===
Accuracy : 0.88
Precision: 0.8709677419354839
Recall   : 0.9310344827586207
F1 Score : 0.9

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.81      0.85        84
           1       0.87      0.93      0.90       116

    accuracy                           0.88       200
   macro avg       0.88      0.87      0.88       200
weighted avg       0.88      0.88      0.88       200

Confusion Matrix:
 [[ 68  16]
 [  8 108]]




### Decision Tree: Tuned & Pruned (Alternative)

**Stage A — Best simple DT params**: `{'criterion': 'gini', 'max_depth': 4, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 5}`  
**Stage A — Best CV Recall**: **0.9200**  
**Stage B — Best `ccp_alpha`**: **0.000000** (no extra pruning) | **CV Recall**: **0.9200**

---

### Test Evaluation
- **Accuracy:** 0.880  
- **Precision:** 0.871  
- **Recall:** **0.931**  
- **F1 Score:** 0.900  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 68           | 16           |
| **Actual: 1** | 8            | 108          |

- **False negatives:** **8** (missed CVD)  
- **False positives:** **16** (healthy flagged)  

**Summary:**  
This DT is tuned for **high recall** (captures ~93% of CVD cases), accepting more **false positives** (specificity ≈ 0.81). Suitable when **missing CVD is costlier** than extra follow-ups.

---

In [12]:
# Recall-oriented DT search: class weights are tuned together with the tree
# shape, then a light cost-complexity pruning pass is cross-validated.
#  - no calibration step: predictions come straight from the fitted tree
#  - class_weight is part of the grid (positives may be weighted up to 4x)
#  - somewhat deeper trees are allowed, with min_samples_* and a tiny
#    min_impurity_decrease acting as regularisers
#  - only the leading (smallest) alphas of the pruning path are tried

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import numpy as np

# ---- Stage A: joint search over tree shape and class weights ----
base_dt = DecisionTreeClassifier(random_state=42)

param_grid_simple = {
    # add "log_loss" if your sklearn supports it
    "criterion": ["gini", "entropy"],
    # a bit deeper to help recall
    "max_depth": [4, 5, 6, 7, 8, 9, 10],
    "min_samples_split": [5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 6],
    "min_impurity_decrease": [0.0, 1e-4, 1e-3],
    # stronger push toward positives
    "class_weight": ["balanced", {0:1,1:2}, {0:1,1:3}, {0:1,1:4}],
}

# 5 folds repeated twice -> 10 train/validation splits per candidate.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)

# Candidates are ranked by recall of class 1; the winner is refit on all training rows.
grid_simple = GridSearchCV(
    base_dt,
    param_grid_simple,
    scoring="recall",
    cv=cv,
    n_jobs=-1,
    verbose=0,
    refit=True,
).fit(X_train_ready, y_train)

best_params = grid_simple.best_params_
print("Stage A — Best DT params:", best_params)
print("Stage A — Best CV Recall:", round(grid_simple.best_score_, 4))


def _tree_with_alpha(alpha):
    """Return an unfitted tree using the Stage-A winner's settings and the given ccp_alpha."""
    return DecisionTreeClassifier(random_state=42, **best_params, ccp_alpha=alpha)


# ---- Stage B: gentle cost-complexity pruning ----
# An unpruned fit of the winner supplies the pruning path.
dt0 = _tree_with_alpha(0.0).fit(X_train_ready, y_train)
path = dt0.cost_complexity_pruning_path(X_train_ready, y_train)
ccp_alphas = path.ccp_alphas

# Candidate set: 0.0 (no pruning) plus at most the first 30 alphas of the path.
candidate_alphas = np.unique(np.concatenate(([0.0], ccp_alphas[:30])))

# Mean CV recall for each candidate alpha, under the same CV scheme as Stage A.
cv_scores = [
    (
        alpha,
        cross_val_score(
            _tree_with_alpha(alpha), X_train_ready, y_train,
            cv=cv, scoring="recall", n_jobs=-1,
        ).mean(),
    )
    for alpha in candidate_alphas
]

# Highest recall wins; ties go to the earliest (smallest) alpha.
best_alpha, best_cv_recall = max(cv_scores, key=lambda pair: pair[1])
print(f"Stage B — Best ccp_alpha: {best_alpha:.6f} | CV Recall: {best_cv_recall:.4f}")

alt_best_dt = _tree_with_alpha(best_alpha).fit(X_train_ready, y_train)


# ---- Evaluation on the held-out split ----
y_pred = alt_best_dt.predict(X_test_ready)
y_prob = alt_best_dt.predict_proba(X_test_ready)[:, 1]

evaluate_model(y_test, y_pred, "Alternative Tuned & Pruned Decision Tree")

Stage A — Best DT params: {'class_weight': {0: 1, 1: 4}, 'criterion': 'gini', 'max_depth': 4, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 6, 'min_samples_split': 5}
Stage A — Best CV Recall: 0.965
Stage B — Best ccp_alpha: 0.016578 | CV Recall: 0.9700
=== Alternative Tuned & Pruned Decision Tree Evaluation ===
Accuracy : 0.81
Precision: 0.7635135135135135
Recall   : 0.9741379310344828
F1 Score : 0.8560606060606061

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.58      0.72        84
           1       0.76      0.97      0.86       116

    accuracy                           0.81       200
   macro avg       0.85      0.78      0.79       200
weighted avg       0.84      0.81      0.80       200

Confusion Matrix:
 [[ 49  35]
 [  3 113]]




## Decision Tree Model Comparison (CVD Diagnosis) – Recall Priority

### 1. Baseline Decision Tree
- **Accuracy**: 0.89  
- **Precision**: **0.935** (highest)  
- **Recall**: 0.871 (lowest)  
- **F1**: 0.902  
- **Confusion Matrix**: [[77, 7], [15, 101]]

**Interpretation**:  
Strong precision but weak recall — misses 15 positives. Too risky for diagnosis because it overlooks too many patients with CVD.

---

### 2. Tuned Decision Tree (`gini`, `max_depth=5`, `min_samples_split=2`, `min_samples_leaf=1`)
- **Accuracy**: 0.905  
- **Precision**: 0.908  
- **Recall**: 0.931  
- **F1**: 0.919  
- **Confusion Matrix**: [[73, 11], [8, 108]]

**Interpretation**:  
Balanced and reliable: recall improves significantly to ~93%, reducing missed positives to just 8. Excellent trade-off for diagnosis, since accuracy and precision remain strong. This is the **most practical model** for deployment.

---

### 3. Alternative Tuned & Pruned DT (`gini`, `max_depth=4`, `min_samples_split=5`, `min_samples_leaf=4`)
- **Accuracy**: 0.88  
- **Precision**: 0.871  
- **Recall**: 0.931  
- **F1**: 0.900  
- **Confusion Matrix**: [[68, 16], [8, 108]]

**Interpretation**:  
Recall stays high (~93%), but accuracy and precision drop. Still misses only 8 positives, but generates more false alarms (16 false positives). Good for sensitivity, weaker for balance.

---

### 4. Class-Weighted Tuned & Pruned DT (`class_weight={0:1, 1:4}`, `max_depth=4`, `min_samples_leaf=6`)
- **Accuracy**: 0.81 (lowest)  
- **Precision**: 0.764 (lowest)  
- **Recall**: **0.974 (highest)**  
- **F1**: 0.856  
- **Confusion Matrix**: [[49, 35], [3, 113]]

**Interpretation**:  
Outstanding recall — only 3 missed positives (best sensitivity). But this comes at a **huge cost**: compared with the Tuned DT, accuracy drops by nearly 10 percentage points (0.905 → 0.81), and false positives rise sharply (35 healthy patients misclassified). The recall gain (~4 percentage points, 0.931 → 0.974) does not justify the >9-point drop in accuracy. Too aggressive for diagnosis.

---

### Overview Table (Ranked by Recall Priority)

| Model                                | Accuracy | Precision | Recall | F1   | FN (Missed CVD) |
|--------------------------------------|----------|-----------|--------|------|-----------------|
| **Class-Weighted Tuned & Pruned DT** | 0.81     | 0.764     | **0.974** | 0.856 | **3** |
| Tuned DT                             | 0.905    | 0.908     | 0.931 | 0.919 | 8 |
| Alt. Tuned & Pruned DT               | 0.88     | 0.871     | 0.931 | 0.900 | 8 |
| Baseline DT                          | 0.89     | **0.935** | 0.871 | 0.902 | 15 |

---

## Final Takeaway
- **Class-Weighted DT**: Achieves the highest recall (97%) but sacrifices too much accuracy and precision.  
- **Tuned DT**: Best **balanced option** — high recall (~93%), strong accuracy (91%), and reasonable precision. This makes it the most **clinically appropriate** choice.  
- **Alt. Tuned & Pruned DT**: Similar recall but weaker balance due to lower accuracy and more false positives.  
- **Baseline DT**: High precision but misses too many positives, unsuitable for diagnosis.  

**Decision:** For **screening-only scenarios** where every possible positive must be flagged, the Class-Weighted DT could be considered.  
**But for diagnosis, the Tuned DT is clearly the better choice** — it maintains very high recall without sacrificing reliability.  

In [13]:
import joblib, pandas as pd, numpy as np

# Save tuned Decision Tree model.
# The estimator is taken straight from the grid search (`grid_dt`) instead of
# via the `best_dt` name, so the pickled model is always the one that produced
# `y_pred_dt_best` / `y_prob_dt_best`, regardless of what `best_dt` is bound
# to when this cell runs.
model_filename = "tuned_dt_model.pkl"
joblib.dump(grid_dt.best_estimator_, model_filename)

# Ensure 1D labels, whether y_test is a Series or still a one-column DataFrame.
y_true = np.asarray(y_test).ravel()
# Use tuned predictions/probabilities from the best estimator
y_pred = y_pred_dt_best
y_prob = y_prob_dt_best

# Guard against misaligned inputs before anything is written to disk.
assert len(y_true) == len(y_pred) == len(y_prob), "labels and predictions differ in length"

# Optional gender column if present; otherwise NaN so the schema stays fixed.
if isinstance(X_test, pd.DataFrame) and "gender" in X_test.columns:
    gender_vals = X_test["gender"].to_numpy()
else:
    gender_vals = np.full(shape=len(y_true), fill_value=np.nan)

# Build and save results.
# NOTE(review): the prediction column is called "y_pred_dt" here, whereas the
# KNN export uses "y_pred" — confirm which name the fairness evaluation expects.
results = pd.DataFrame({
    "gender": gender_vals,
    "y_true": y_true,
    "y_pred_dt": y_pred,
    "y_prob": y_prob
})

preds_filename = "MendeleyData_75M25F_DT_tuned_predictions.csv"
results.to_csv(preds_filename, index=False)

print(f"Saved tuned DT model → {model_filename}")
print(f"Saved predictions → {preds_filename}")

Saved tuned DT model → tuned_dt_model.pkl
Saved predictions → MendeleyData_75M25F_DT_tuned_predictions.csv


### Ensemble Model - Random Forest (RF)

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Baseline Random Forest: library-default hyperparameters with a fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(random_state=42).fit(X_train_ready, y_train)

# Score the untuned forest on the held-out test split
y_pred_rf = rf.predict(X_test_ready)
evaluate_model(y_test, y_pred_rf, "Random Forest")

=== Random Forest Evaluation ===
Accuracy : 0.945
Precision: 0.9646017699115044
Recall   : 0.9396551724137931
F1 Score : 0.9519650655021834

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.95      0.94        84
           1       0.96      0.94      0.95       116

    accuracy                           0.94       200
   macro avg       0.94      0.95      0.94       200
weighted avg       0.95      0.94      0.95       200

Confusion Matrix:
 [[ 80   4]
 [  7 109]]




## Random Forest: Baseline

---

### Baseline Random Forest
- **Accuracy:** 0.945  
- **Precision:** 0.965  
- **Recall:** **0.940**  
- **F1 Score:** 0.952  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 80           | 4            |
| **Actual: 1** | 7            | 109          |

- **False negatives:** **7** (missed CVD)  
- **False positives:** **4** (healthy flagged)  

---

**Summary:**
- Strong overall performance; **detects most CVD cases** (7 missed).
- **Very few false alarms** (4), so precision is high.

---

### Improvement Random Forest (RF)

In [15]:
# Random Forest: exhaustive grid search, selecting the configuration on recall
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Unfitted template; GridSearchCV fits clones of it, one per candidate and fold
rf = RandomForestClassifier(random_state=42)

# 3 * 4 * 3 * 3 * 3 * 2 = 648 candidate configurations
param_grid = dict(
    n_estimators=[200, 400, 600],
    max_depth=[None, 8, 12, 16],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", 0.8],  # a float is a fraction: 0.8 = 80% of features
    class_weight=[None, "balanced"],
)

# Stratified, shuffled 5-fold split with a fixed seed so folds are reproducible
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    rf,
    param_grid,
    scoring="recall",     # rank candidates by recall
    cv=cv,
    refit=True,           # retrain the winning configuration on all training data
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train_ready, y_train)

best_rf = grid.best_estimator_
print("Best RF params:", grid.best_params_)
print("Best CV Recall:", grid.best_score_)

# Evaluate best RF on the held-out test split.
# y_pred_rf is reassigned here: it now holds the tuned model's predictions,
# not the baseline forest's.
y_prob_rf = best_rf.predict_proba(X_test_ready)[:, 1]
y_pred_rf = best_rf.predict(X_test_ready)

evaluate_model(y_test, y_pred_rf, "Random Forest (best)")

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Best RF params: {'class_weight': None, 'max_depth': 8, 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV Recall: 0.97
=== Random Forest (best) Evaluation ===
Accuracy : 0.955
Precision: 0.9652173913043478
Recall   : 0.9568965517241379
F1 Score : 0.961038961038961

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.95        84
           1       0.97      0.96      0.96       116

    accuracy                           0.95       200
   macro avg       0.95      0.95      0.95       200
weighted avg       0.96      0.95      0.96       200

Confusion Matrix:
 [[ 80   4]
 [  5 111]]




### Random Forest: Tuned

**Best RF params**: `{'class_weight': None, 'max_depth': 8, 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}`  
**Best CV Recall**: **0.9700**

---

### Random Forest (Best)
- **Accuracy:** 0.955  
- **Precision:** 0.965  
- **Recall:** **0.957**  
- **F1 Score:** 0.961  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 80           | 4            |
| **Actual: 1** | 5            | 111          |

- **False negatives:** **5** (missed CVD)  
- **False positives:** **4** (healthy flagged)

**Interpretation:**  
The tuned model achieves **high recall** and **strong precision**, indicating it captures most CVD cases with very few false alarms. Error rates are balanced across classes, suggesting a reliable setup for CVD screening.

---

## Random Forest Model Comparison (CVD Diagnosis) – Recall Priority

### 1. Baseline Random Forest
- **Accuracy**: 0.945  
- **Precision**: 0.965  
- **Recall**: 0.940  
- **F1**: 0.952  
- **Confusion Matrix**: [[80, 4], [7, 109]]

**Interpretation**:  
An excellent baseline model: very high recall (~94%), strong accuracy (~95%), and nearly perfect precision (~96%). It misses only 7 positives, making it already highly reliable for diagnosis.

---

### 2. Tuned Random Forest (Best Params: `n_estimators=200`, `max_depth=8`, `max_features=0.8`)
- **Accuracy**: **0.955 (highest)**  
- **Precision**: 0.965  
- **Recall**: **0.957 (highest)**  
- **F1**: **0.961 (highest)**  
- **Confusion Matrix**: [[80, 4], [5, 111]]

**Interpretation**:  
The tuned RF improves recall further to ~96%, missing only 5 positives (down from 7). Accuracy and F1 also increase slightly. It maintains the same excellent precision (~96%). This is a **clear upgrade** over the baseline and one of the best-performing models overall.

---

###  Overview Table (Ranked by Recall Priority)

| Model              | Accuracy | Precision | Recall | F1   | FN (Missed CVD) |
|---------------------|----------|-----------|--------|------|-----------------|
| **Tuned RF**       | **0.955** | 0.965     | **0.957** | **0.961** | **5** |
| Baseline RF        | 0.945    | 0.965     | 0.940  | 0.952 | 7 |

---

### Final Takeaway
- **Tuned RF** is the superior model: it achieves the **highest recall (~96%)**, **highest accuracy (~96%)**, and **highest F1**. It misses only 5 CVD-positive patients, making it one of the strongest candidates overall.  
- **Baseline RF** is already excellent, but slightly less sensitive (recall ~94%, 7 missed positives).  
- For CVD diagnosis where recall is the priority, the **Tuned Random Forest is the best choice**.  

In [16]:
# Save Tuned Random Forest Results
# Side effects: writes a pickle of the fitted estimator and a per-row CSV
# (gender, true label, predicted label, predicted probability) to the
# current working directory.
import joblib, pandas as pd, numpy as np  # imported here, as in the other save cells, so this cell runs on its own

# Save tuned Random Forest model
model_filename = "tuned_rf_model.pkl"
joblib.dump(best_rf, model_filename)

# Ensure 1D arrays for y_true and y_pred: ravel() flattens a single-column
# (n, 1) frame/array so the DataFrame constructor below accepts it; an
# already-1D Series is unchanged.
y_true = np.asarray(y_test).ravel()
y_pred = y_pred_rf  # from best_rf.predict(X_test_ready)
y_prob = y_prob_rf  # second predict_proba column of best_rf

# Optional gender column if present in test set. to_numpy() drops the index,
# so rows are matched to the predictions by position, not by label.
if isinstance(X_test, pd.DataFrame) and "gender" in X_test.columns:
    gender_vals = X_test["gender"].to_numpy()
else:
    # No gender information available: keep the column but fill it with NaN
    gender_vals = np.full(shape=len(y_true), fill_value=np.nan)

# Build and save results DataFrame
results = pd.DataFrame({
    "gender": gender_vals,
    "y_true": y_true,
    "y_pred_rf_tuned": y_pred,
    "y_prob": y_prob
})

preds_filename = "MendeleyData_75M25F_RF_tuned_predictions.csv"
results.to_csv(preds_filename, index=False)

print(f"Saved tuned RF model → {model_filename}")
print(f"Saved predictions → {preds_filename}")

Saved tuned RF model → tuned_rf_model.pkl
Saved predictions → MendeleyData_75M25F_RF_tuned_predictions.csv


### Deep Learning - Multi-layer Perceptron

In [17]:
#import required library 
from sklearn.neural_network import MLPClassifier

In [18]:
# Baseline MLP: a single 100-unit hidden layer trained with Adam.
# fit() returns the estimator itself, so construction and training are chained.
mlp = MLPClassifier(
    solver='adam',               # optimizer
    activation='relu',           # or 'tanh'
    hidden_layer_sizes=(100,),   # one hidden layer with 100 neurons
    max_iter=1000,               # raise further if a convergence warning appears
    random_state=42,
).fit(X_train_ready, y_train)

# Score on the held-out test split
y_pred_mlp = mlp.predict(X_test_ready)

evaluate_model(y_test, y_pred_mlp, "Multilayer Perceptron (MLP)")

=== Multilayer Perceptron (MLP) Evaluation ===
Accuracy : 0.905
Precision: 0.9369369369369369
Recall   : 0.896551724137931
F1 Score : 0.9162995594713657

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.92      0.89        84
           1       0.94      0.90      0.92       116

    accuracy                           0.91       200
   macro avg       0.90      0.91      0.90       200
weighted avg       0.91      0.91      0.91       200

Confusion Matrix:
 [[ 77   7]
 [ 12 104]]




## Multilayer Perceptron (MLP)

### Evaluation
- **Accuracy:** 0.905  
- **Precision:** 0.937  
- **Recall:** **0.897**  
- **F1 Score:** 0.916  
- **Support:** 0→84, 1→116

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 77           | 7            |
| **Actual: 1** | 12           | 104          |

- **False negatives:** **12** (missed CVD)  
- **False positives:** **7** (healthy flagged)

**Interpretation:**  
The MLP shows **high precision** and **good recall**, indicating few false alarms and a manageable number of missed cases. This is a balanced setup for CVD screening.

---

### Improvements - MLP

In [19]:
# Adam + early stopping
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Two-hidden-layer network trained with Adam. With early_stopping=True the
# estimator holds out validation_fraction of the training data and stops once
# the validation score has not improved by tol for n_iter_no_change epochs.
adammlp = MLPClassifier(
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(64, 32),   # two layers, each narrower than the 100-unit baseline
    alpha=1e-3,                    # L2 regularization to reduce overfitting
    learning_rate_init=1e-3,       # initial Adam step size
    batch_size=32,
    max_iter=1000,                 # same epoch cap as the baseline MLP
    tol=1e-4,
    early_stopping=True,           # use a validation split internally
    validation_fraction=0.15,
    n_iter_no_change=25,
    random_state=42,
).fit(X_train_ready, y_train)

# y_pred_mlp is reassigned here: it now holds this model's predictions,
# not the baseline MLP's.
y_prob_mlp = adammlp.predict_proba(X_test_ready)[:, 1]
y_pred_mlp = adammlp.predict(X_test_ready)

evaluate_model(y_test, y_pred_mlp, "(Adam + EarlyStopping)")

=== (Adam + EarlyStopping) Evaluation ===
Accuracy : 0.91
Precision: 0.9224137931034483
Recall   : 0.9224137931034483
F1 Score : 0.9224137931034483

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.89      0.89        84
           1       0.92      0.92      0.92       116

    accuracy                           0.91       200
   macro avg       0.91      0.91      0.91       200
weighted avg       0.91      0.91      0.91       200

Confusion Matrix:
 [[ 75   9]
 [  9 107]]




In [20]:
# MLP trained with the L-BFGS solver, a full-batch quasi-Newton optimizer that
# tends to converge quickly on small datasets. batch_size, early_stopping and
# the learning-rate settings only apply to the sgd/adam solvers, so none are set.
mlp_lbfgs = MLPClassifier(
    solver='lbfgs',
    activation='tanh',         # tanh + lbfgs often works nicely on tabular data
    hidden_layer_sizes=(64, 32),
    alpha=1e-3,                # L2 penalty
    max_iter=1000,
    random_state=42,
).fit(X_train_ready, y_train)

# Hard labels plus the second predict_proba column for the test split
y_prob_lbfgs = mlp_lbfgs.predict_proba(X_test_ready)[:, 1]
y_pred_lbfgs = mlp_lbfgs.predict(X_test_ready)

# NOTE(review): this label is identical to the baseline MLP cell's, so the two
# printed evaluation headers cannot be told apart in the output.
evaluate_model(y_test, y_pred_lbfgs, "Multilayer Perceptron (MLP)")

=== Multilayer Perceptron (MLP) Evaluation ===
Accuracy : 0.92
Precision: 0.923728813559322
Recall   : 0.9396551724137931
F1 Score : 0.9316239316239316

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.89      0.90        84
           1       0.92      0.94      0.93       116

    accuracy                           0.92       200
   macro avg       0.92      0.92      0.92       200
weighted avg       0.92      0.92      0.92       200

Confusion Matrix:
 [[ 75   9]
 [  7 109]]




## Multilayer Perceptron (MLP) - LBFGS solver

### Evaluation
- **Accuracy:** 0.920  
- **Precision:** 0.924  
- **Recall:** **0.940**  
- **F1 Score:** 0.932  
- **Support:** 0→84, 1→116

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 75           | 9            |
| **Actual: 1** | 7            | 109          |

- **False negatives:** **7** (missed CVD)  
- **False positives:** **9** (healthy flagged)

**Interpretation:**  
The model prioritizes **high recall**, catching most CVD cases while keeping **precision** strong. The error profile (FN=7, FP=9) suggests a balanced MLP suitable for CVD screening. 


### Further Improvement MLP 

In [21]:
# Recall-first MLP: randomized hyperparameter search, refit on F-beta (beta=2)
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, recall_score, fbeta_score, make_scorer
import numpy as np

# 1) Base model: Adam without early stopping; the search fills in the rest
base_mlp = MLPClassifier(
    solver="adam",
    early_stopping=False,
    max_iter=1000,             # per the original note, runs converged within 1000 iterations
    tol=1e-4,                  # tighten (e.g., 1e-5) for a stricter stopping criterion
    random_state=42,
)

# Search space: 4 * 2 * 4 * 4 * 3 = 384 combinations, of which n_iter are sampled
param_dist = dict(
    hidden_layer_sizes=[(64,), (128,), (64, 32), (128, 64)],
    activation=["relu", "tanh"],
    alpha=[1e-5, 1e-4, 3e-4, 1e-3],
    learning_rate_init=[1e-3, 5e-4, 3e-4, 1e-4],
    batch_size=[16, 32, 64],
)

# Stratified, shuffled 5-fold split with a fixed seed so folds are reproducible
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Track three metrics per candidate; only "fbeta2" decides the winner (see refit)
scoring = dict(
    f1=make_scorer(f1_score),
    recall=make_scorer(recall_score),
    fbeta2=make_scorer(fbeta_score, beta=2),  # emphasize recall
)

rs = RandomizedSearchCV(
    base_mlp,
    param_dist,
    n_iter=30,
    scoring=scoring,
    refit="fbeta2",    # with several scorers, refit must name the one to select on
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
rs.fit(X_train_ready, y_train)
best_mlp = rs.best_estimator_

# summarize CV metrics for the selected config
cvres = rs.cv_results_
best_idx = rs.best_index_
print("Best MLP params:", rs.best_params_)
print(f"Best CV F-beta (β=2): {rs.best_score_:.4f}")
print(f"Corresponding CV Recall: {cvres['mean_test_recall'][best_idx]:.4f}")
print(f"Corresponding CV F1: {cvres['mean_test_f1'][best_idx]:.4f}")

# 2) Evaluate on test
y_prob = best_mlp.predict_proba(X_test_ready)[:, 1]
y_pred = best_mlp.predict(X_test_ready)

evaluate_model(y_test, y_pred, model_name="Best MLP (Adam)")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best MLP params: {'learning_rate_init': 0.0001, 'hidden_layer_sizes': (128,), 'batch_size': 16, 'alpha': 0.0001, 'activation': 'relu'}
Best CV F-beta (β=2): 0.9492
Corresponding CV Recall: 0.9500
Corresponding CV F1: 0.9484
=== Best MLP (Adam) Evaluation ===
Accuracy : 0.905
Precision: 0.9369369369369369
Recall   : 0.896551724137931
F1 Score : 0.9162995594713657

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.92      0.89        84
           1       0.94      0.90      0.92       116

    accuracy                           0.91       200
   macro avg       0.90      0.91      0.90       200
weighted avg       0.91      0.91      0.91       200

Confusion Matrix:
 [[ 77   7]
 [ 12 104]]




## Multilayer Perceptron (MLP) Model Comparison (CVD Diagnosis) – Recall Priority

### 1. Baseline MLP (Adam, default settings)
- **Accuracy**: 0.905  
- **Precision**: 0.937  
- **Recall**: 0.897  
- **F1**: 0.916  
- **Confusion Matrix**: [[77, 7], [12, 104]]

**Interpretation**:  
Solid balance between accuracy and precision. However, recall is just under 90% (12 missed positives), which makes it slightly weaker for diagnosis.

---

### 2. MLP (Adam + EarlyStopping)
- **Accuracy**: 0.91  
- **Precision**: 0.922  
- **Recall**: 0.922  
- **F1**: 0.922  
- **Confusion Matrix**: [[75, 9], [9, 107]]

**Interpretation**:  
Improves recall to ~92%, reducing missed positives to 9. Accuracy remains strong, though precision drops slightly compared to baseline. A good trade-off: more sensitive while still balanced.

---

### 3. **MLP (LBFGS Solver, hidden_layer_sizes=(64,32), tanh activation)**
- **Accuracy**: 0.92  
- **Precision**: 0.924  
- **Recall**: **0.940 (highest)**  
- **F1**: 0.932  
- **Confusion Matrix**: [[75, 9], [7, 109]]

**Interpretation**:  
This LBFGS-based MLP achieves the **highest recall (94%)**, missing only 7 positives. Accuracy and F1 are also the best among the MLPs. It is the **best-performing MLP configuration** for CVD diagnosis in these experiments.

---

### 4. Best MLP (Adam, tuned params: `hidden_layer_sizes=(128,)`, `relu`, `alpha=0.0001`, `batch=16`, `lr=0.0001`)
- **Accuracy**: 0.905  
- **Precision**: 0.937  
- **Recall**: 0.897  
- **F1**: 0.916  
- **Confusion Matrix**: [[77, 7], [12, 104]]

**Interpretation**:  
Despite hyperparameter tuning, test performance is identical to the baseline MLP: the cross-validated recall of 0.95 did not carry over to the held-out test set. Recall stays at ~90%, which is weaker than both the EarlyStopping and LBFGS models.

---

###  Overview Table (Ranked by Recall Priority)

| Model                               | Accuracy | Precision | Recall | F1   | FN (Missed CVD) |
|-------------------------------------|----------|-----------|--------|------|-----------------|
| **MLP (LBFGS Solver, best)**        | 0.92     | 0.924     | **0.940** | 0.932 | **7** |
| MLP (Adam + EarlyStopping)          | 0.91     | 0.922     | 0.922  | 0.922 | 9 |
| Baseline MLP (Adam)                 | 0.905    | 0.937     | 0.897  | 0.916 | 12 |
| Tuned MLP (Adam, relu, 128 units)   | 0.905    | 0.937     | 0.897  | 0.916 | 12 |

---

### Final Takeaway
- **MLP (LBFGS Solver, hidden_layer_sizes=(64,32), tanh activation)** is the **best-performing MLP model**: it combines the **highest recall (94%)** with strong accuracy (92%) and F1.  
- **MLP with EarlyStopping** also performs well (~92% recall), a safer choice than baseline.  
- **Baseline MLP** and **Tuned Adam MLP** perform similarly, but their recall (~90%) is weaker for diagnosis.  

 **Recommendation within the MLP family:** The **LBFGS MLP** is the top choice for CVD diagnosis.  



In [22]:
# Save Tuned MLP Results
# Side effects: writes a pickle of the fitted estimator and a per-row CSV
# (gender, true label, predicted label, predicted probability) to the
# current working directory.
import joblib, pandas as pd, numpy as np

# Save MLP model
model_filename =  "mlp_lbfgs.pkl"
joblib.dump(mlp_lbfgs, model_filename)

# Ensure 1D arrays for y_true and y_pred: ravel() flattens a single-column
# (n, 1) frame/array so the DataFrame constructor below accepts it; an
# already-1D Series is unchanged.
y_true = np.asarray(y_test).ravel()
y_pred = y_pred_lbfgs  # from mlp_lbfgs.predict(X_test_ready)
y_prob = y_prob_lbfgs  # second predict_proba column of mlp_lbfgs

# Optional gender column if present in test set. to_numpy() drops the index,
# so rows are matched to the predictions by position, not by label.
if isinstance(X_test, pd.DataFrame) and "gender" in X_test.columns:
    gender_vals = X_test["gender"].to_numpy()
else:
    # No gender information available: keep the column but fill it with NaN
    gender_vals = np.full(shape=len(y_true), fill_value=np.nan)

# Build and save results DataFrame
results = pd.DataFrame({
    "gender": gender_vals,
    "y_true": y_true,
    "y_pred": y_pred,
    "y_prob" : y_prob
})

preds_filename = "MendeleyData_75M25F_MLP_lbfgs_predictions.csv"
results.to_csv(preds_filename, index=False)

print(f"Saved lbfgs MLP model → {model_filename}")
print(f"Saved predictions → {preds_filename}")

Saved lbfgs MLP model → mlp_lbfgs.pkl
Saved predictions → MendeleyData_75M25F_MLP_lbfgs_predictions.csv
