## CVD Prediction - Mendeley Dataset (Source: https://data.mendeley.com/datasets/dzz48mvjht/1)
Model Training and Evaluation

In [1]:
# Load the preprocessed training subset and the held-out test split
import pandas as pd

train_df = pd.read_csv("./data_subsets/train_25M_75F.csv")

# Test features and test labels are stored in two separate CSV files
X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data_splits/X_test.csv", "./data_splits/y_test.csv")
)

# Preview the first rows of the training data
train_df.head()

Unnamed: 0,source_id,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
0,744,20,1,0,137,291.0,0,0,131,1,3.8,1,0,0
1,6,33,1,0,97,354.0,0,0,160,0,2.1,2,1,0
2,506,65,1,0,127,258.0,0,0,158,0,4.1,1,3,0
3,530,24,0,0,136,164.0,0,0,91,1,1.8,1,1,0
4,684,80,0,1,191,433.0,1,1,154,1,3.2,3,3,1


In [2]:
# Column roles used throughout the notebook.
TARGET = "target"
# Column names are case-sensitive and the data uses lowercase 'gender';
# "Gender" would not match any column.
SENSITIVE = "gender"   # 1 = Male, 0 = Female

categorical_cols = ['gender','chestpain','fastingbloodsugar','restingrelectro','exerciseangia','slope','noofmajorvessels']
continuous_cols  = ['age','restingBP','serumcholestrol','maxheartrate','oldpeak']

# Fail early (with a readable message) if any configured column is absent
# from the loaded training data.
missing_cols = [
    col
    for col in [TARGET, SENSITIVE, *categorical_cols, *continuous_cols]
    if col not in train_df.columns
]
assert not missing_cols, f"train_df is missing expected columns: {missing_cols}"

# X_train keeps every non-target column (including the identifier columns);
# later cells pick the model features explicitly via categorical_cols and
# continuous_cols.
X_train = train_df.drop(columns=[TARGET])
y_train = train_df[TARGET]

In [3]:
# SCALE NUMERIC FEATURES ONLY 

import pandas as pd
from sklearn.preprocessing import StandardScaler


# 1) fit scaler on TRAIN numeric columns only (the test set never influences
#    the learned means / standard deviations)
scaler = StandardScaler()
X_train_num_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[continuous_cols]),
    columns=continuous_cols,
    index=X_train.index
)

# 2) transform TEST with the same (train-fitted) scaler
X_test_num_scaled = pd.DataFrame(
    scaler.transform(X_test[continuous_cols]),
    columns=continuous_cols,
    index=X_test.index
)

# 3) reassemble: raw categoricals + scaled numerics
#    Both halves get a fresh 0..n-1 index so the column-wise concat aligns by position.
X_train_scaled = pd.concat([X_train[categorical_cols].reset_index(drop=True),
                            X_train_num_scaled.reset_index(drop=True)], axis=1)
X_test_scaled  = pd.concat([X_test[categorical_cols].reset_index(drop=True),
                            X_test_num_scaled.reset_index(drop=True)], axis=1)

# OPTIONAL: quick sanity checks
print("Train numeric means (≈0):")
print(X_train_scaled[continuous_cols].mean().round(3))
print("\nTrain numeric stds (≈1):")
# ddof=0 matches the population standard deviation StandardScaler uses
print(X_train_scaled[continuous_cols].std(ddof=0).round(3))

# save for later steps
# The output name mirrors the training subset loaded by this notebook
# (train_25M_75F); writing to a "75M_25F" name would mislabel this data and
# could overwrite the scaled file of a different subset.
X_train_scaled.to_csv("data_subsets/train_25M_75F_scaled_only.csv", index=False)
#X_test_scaled.to_csv("data_splits/X_test_scaled_only.csv", index=False)


Train numeric means (≈0):
age               -0.0
restingBP          0.0
serumcholestrol   -0.0
maxheartrate      -0.0
oldpeak            0.0
dtype: float64

Train numeric stds (≈1):
age                1.0
restingBP          1.0
serumcholestrol    1.0
maxheartrate       1.0
oldpeak            1.0
dtype: float64


In [4]:
# one-hot encode categoricals, keep scaled numerics as is 

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# 1) Learn the category levels from the TRAIN categoricals only
ohe = OneHotEncoder(handle_unknown="ignore", drop="if_binary", sparse_output=False).fit(
    X_train_scaled[categorical_cols]
)

# 2) Encode TRAIN and TEST with the fitted encoder; both share the same column names
encoded_names = ohe.get_feature_names_out(categorical_cols)
X_train_cat = pd.DataFrame(
    ohe.transform(X_train_scaled[categorical_cols]),
    columns=encoded_names,
    index=X_train_scaled.index,
)
X_test_cat = pd.DataFrame(
    ohe.transform(X_test_scaled[categorical_cols]),
    columns=encoded_names,
    index=X_test_scaled.index,
)

# 3) Final design matrices: encoded categoricals first, then the scaled numerics
train_parts = [X_train_cat, X_train_scaled[continuous_cols]]
test_parts = [X_test_cat, X_test_scaled[continuous_cols]]
X_train_ready = pd.concat(train_parts, axis=1)
X_test_ready = pd.concat(test_parts, axis=1)

print("Final feature shapes:", X_train_ready.shape, X_test_ready.shape)

Final feature shapes: (600, 22) (200, 22)


### Traditional ML Models - Baseline: K-Nearest Neighbors (KNN) & Decision Tree (DT)

In [5]:
#import required libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

#define a function 
def evaluate_model(y_true, y_pred, model_name):
    """Print a standard evaluation summary for a binary classifier.

    Parameters
    ----------
    y_true : true labels of the evaluated samples.
    y_pred : predicted labels for the same samples.
    model_name : label shown in the report header.

    Prints accuracy, precision, recall and F1 (the latter three with
    ``average='binary'``), followed by the classification report and the
    confusion matrix. Nothing is returned.
    """
    print(f"=== {model_name} Evaluation ===")

    # (label, metric function, extra keyword arguments) in display order
    headline_metrics = (
        ("Accuracy :", accuracy_score, {}),
        ("Precision:", precision_score, {"average": 'binary'}),
        ("Recall   :", recall_score, {"average": 'binary'}),
        ("F1 Score :", f1_score, {"average": 'binary'}),
    )
    for label, metric_fn, extra_kwargs in headline_metrics:
        print(label, metric_fn(y_true, y_pred, **extra_kwargs))

    report_text = classification_report(y_true, y_pred)
    print("\nClassification Report:\n", report_text)
    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)
    print("\n" + "="*40 + "\n")

In [6]:
# Baseline KNN (k = 5): fit on the prepared training features, score the test set
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_ready, y_train)
y_pred_knn = knn.predict(X_test_ready)
knn_test_proba = knn.predict_proba(X_test_ready)
y_prob_knn = knn_test_proba[:, 1]  # second probability column
evaluate_model(y_test, y_pred_knn, "KNN")

# Baseline Decision Tree (default hyperparameters, fixed seed)
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42).fit(X_train_ready, y_train)
y_pred_dt = dt.predict(X_test_ready)
dt_test_proba = dt.predict_proba(X_test_ready)
y_prob_dt = dt_test_proba[:, 1]  # second probability column
evaluate_model(y_test, y_pred_dt, "Decision Tree")

=== KNN Evaluation ===
Accuracy : 0.89
Precision: 0.9122807017543859
Recall   : 0.896551724137931
F1 Score : 0.9043478260869565

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87        84
           1       0.91      0.90      0.90       116

    accuracy                           0.89       200
   macro avg       0.89      0.89      0.89       200
weighted avg       0.89      0.89      0.89       200

Confusion Matrix:
 [[ 74  10]
 [ 12 104]]


=== Decision Tree Evaluation ===
Accuracy : 0.9
Precision: 0.9444444444444444
Recall   : 0.8793103448275862
F1 Score : 0.9107142857142857

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.93      0.89        84
           1       0.94      0.88      0.91       116

    accuracy                           0.90       200
   macro avg       0.90      0.90      0.90       200
weighted avg       0.90      0.90      0.

## KNN

### Evaluation
- **Accuracy:** 0.890  
- **Precision:** 0.912  
- **Recall:** **0.897**  
- **F1 Score:** 0.904  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 74           | 10           |
| **Actual: 1** | 12           | 104          |

- **False negatives:** **12** (missed CVD)  
- **False positives:** **10** (healthy flagged)

**Interpretation:**  
Balanced performance with **high precision** and **good recall**; a moderate number of missed CVD cases and false alarms.

---

## Decision Tree

### Evaluation
- **Accuracy:** 0.900  
- **Precision:** 0.944  
- **Recall:** **0.879**  
- **F1 Score:** 0.911  
- **Support:** 0→84, 1→116

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 78           | 6            |
| **Actual: 1** | 14           | 102          |

- **False negatives:** **14** (missed CVD)  
- **False positives:** **6** (healthy flagged)

**Interpretation:**  
Strong **precision** with slightly lower recall; fewer false alarms but more missed CVD cases.

---

### KNN Improvement
The code tunes the KNN model with a **grid search** over key hyperparameters (`n_neighbors`, `weights`, and the distance `metric`), selecting the configuration with the best cross-validated F1. The selected model is then evaluated on the test set at the default 0.5 decision threshold. **Decision threshold tuning** to boost recall, which is critical in medical prediction tasks, is not implemented in this cell and remains a possible next step. 

In [7]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

# 1) Hyperparameter tuning for KNN
# "minkowski" is intentionally not listed: with the default p=2 it is identical
# to "euclidean", so including it only repeats the same fits.
param_grid = {
    "n_neighbors": list(range(1, 31)),
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}

# Stratified, shuffled 5-fold CV with a fixed seed for reproducible folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=param_grid,
    cv=cv,
    scoring="f1",
    n_jobs=-1,
    verbose=0,
    refit=True   # refit the winning configuration on the full training set
)

# Fit on your READY features
grid.fit(X_train_ready, y_train)

print("Best KNN params:", grid.best_params_)
print("Best CV F1:", grid.best_score_)

best_knn = grid.best_estimator_

# 2) Evaluate best KNN on TEST
y_pred_knn_best = best_knn.predict(X_test_ready)
y_prob_knn_best = best_knn.predict_proba(X_test_ready)[:, 1]
# Label now has its closing parenthesis so the report header reads correctly
evaluate_model(y_test, y_pred_knn_best, "KNN (best params)")

Best KNN params: {'metric': 'manhattan', 'n_neighbors': 1, 'weights': 'uniform'}
Best CV F1: 0.978150785738024
=== KNN (best params Evaluation ===
Accuracy : 0.895
Precision: 0.9279279279279279
Recall   : 0.8879310344827587
F1 Score : 0.9074889867841409

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.90      0.88        84
           1       0.93      0.89      0.91       116

    accuracy                           0.90       200
   macro avg       0.89      0.90      0.89       200
weighted avg       0.90      0.90      0.90       200

Confusion Matrix:
 [[ 76   8]
 [ 13 103]]




### KNN: Tuned

**Best KNN params**: `{'metric': 'manhattan', 'n_neighbors': 1, 'weights': 'uniform'}`  
**Best CV F1**: **0.9782**

---

### Evaluation
- **Accuracy:** 0.895  
- **Precision:** 0.928  
- **Recall:** **0.888**  
- **F1 Score:** 0.907  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 76           | 8            |
| **Actual: 1** | 13           | 103          |

- **False negatives:** **13** (missed CVD)  
- **False positives:** **8** (healthy flagged)

---

## Interpretation
The tuned KNN with a 0.5 threshold achieves **high precision** and **strong recall**, indicating few false alarms alongside a **manageable number of missed CVD cases**. This reflects a balanced operating point suitable for screening.

---


### Further KNN Improvement - Implementing PCA 

In [8]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import numpy as np

# 1) Two-step pipeline: PCA (keep 95% of the variance) feeding a KNN classifier
pca_step = PCA(n_components=0.95, random_state=42)
knn_step = KNeighborsClassifier(n_neighbors=15, metric='manhattan', weights='distance')
pca_knn = Pipeline([('pca', pca_step), ('knn', knn_step)])

pca_knn.fit(X_train_ready, y_train)

# Report how many components were kept and how much variance they explain
fitted_pca = pca_knn.named_steps['pca']
n_comp = fitted_pca.n_components_
expl_var = fitted_pca.explained_variance_ratio_.sum()
print(f"PCA components: {n_comp} | Explained variance retained: {expl_var:.3f}")

# 2) Score the held-out test set
y_pred_pca_knn = pca_knn.predict(X_test_ready)
pca_knn_test_proba = pca_knn.predict_proba(X_test_ready)
probs_pca_knn = pca_knn_test_proba[:, 1]

evaluate_model(y_test, y_pred_pca_knn, "PCA+KNN")

PCA components: 15 | Explained variance retained: 0.966
=== PCA+KNN Evaluation ===
Accuracy : 0.88
Precision: 0.9107142857142857
Recall   : 0.8793103448275862
F1 Score : 0.8947368421052632

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86        84
           1       0.91      0.88      0.89       116

    accuracy                           0.88       200
   macro avg       0.88      0.88      0.88       200
weighted avg       0.88      0.88      0.88       200

Confusion Matrix:
 [[ 74  10]
 [ 14 102]]




### PCA + KNN

**PCA components:** 15  
**Explained variance retained:** **0.966**

---

### Evaluation
- **Accuracy:** 0.880  
- **Precision:** 0.911  
- **Recall:** **0.879**  
- **F1 Score:** 0.895  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 74           | 10           |
| **Actual: 1** | 14           | 102          |

- **False negatives:** **14** (missed CVD)  
- **False positives:** **10** (healthy flagged)

---

## Interpretation
Reducing dimensionality to 15 components preserves **96.6%** of variance and yields **balanced performance**: high precision with **good** (though not maximal) recall. The error profile (FN=14, FP=10) suggests a moderate tendency to **miss some CVD cases** while keeping false alarms controlled. 

---

# KNN Model Comparison (CVD Diagnosis) – Recall Priority

## 1. Baseline KNN
- **Accuracy**: 0.89  
- **Precision**: 0.91  
- **Recall**: **0.90** (highest)  
- **F1**: 0.90  
- **Confusion Matrix**: [[74, 10], [12, 104]]

**Interpretation**:  
This model achieves the **highest recall** of all variants (~90%), meaning it misses the fewest CVD-positive patients (FN = 12). For medical diagnosis, this is crucial — better to flag more patients for further testing than to overlook cases.

---

## 2. Best KNN (Tuned Params: `n_neighbors=1`, `metric=manhattan`, `weights=uniform`)
- **Accuracy**: 0.895  
- **Precision**: **0.93** (highest)  
- **Recall**: 0.89  
- **F1**: **0.91**  
- **Confusion Matrix**: [[76, 8], [13, 103]]

**Interpretation**:  
Tuning improves precision and F1 slightly but recall **drops below the baseline**. This model is more conservative (fewer false positives) but misses **more true CVD cases** (FN = 13). In a clinical setting, that trade-off may not be acceptable if sensitivity is the main concern.

---

## 3. PCA + KNN (15 Components, 96.6% Variance Retained)
- **Accuracy**: 0.88  
- **Precision**: 0.91  
- **Recall**: 0.88 (lowest)  
- **F1**: 0.89  
- **Confusion Matrix**: [[74, 10], [14, 102]]

**Interpretation**:  
PCA reduces recall further (FN = 14). While dimensionality reduction retains most variance, it seems to lose subtle discriminative information critical for catching all positive cases. This makes it the **least favorable** for medical diagnosis.

---

## Overview Table 

| Model                  | Accuracy | Precision | Recall | F1   | FN (Missed CVD) |
|-------------------------|----------|-----------|--------|------|-----------------|
| **Baseline KNN**        | 0.89     | 0.91      | **0.90** | 0.90 | **12** |
| Best KNN (Tuned Params) | 0.895    | **0.93**  | 0.89   | **0.91** | 13 |
| PCA + KNN (15 comps)    | 0.88     | 0.91      | 0.88   | 0.89 | 14 |

---

### Final Takeaway
- **Baseline KNN** is the most clinically appropriate choice, as it maximizes **recall** and minimizes missed diagnoses.  
- **Best KNN (tuned)** offers higher precision and F1, but at the cost of slightly lower recall — more CVD patients would be missed.  
- **PCA+KNN** performs worst in terms of recall and should be avoided for diagnosis-focused tasks.  

For **CVD screening**, we prioritize catching every possible case (high recall), even if it means accepting some false alarms. Thus, **Baseline KNN** is the preferred model.  


In [9]:
import joblib, pandas as pd, numpy as np

# Collapse y_test to a Series when it is still a DataFrame
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.squeeze("columns")

# Persist the fitted baseline KNN
model_filename = "knn_model.pkl"
joblib.dump(knn, model_filename)

# Labels as a NumPy array, plus fresh KNN predictions/probabilities on the test set
if hasattr(y_test, "to_numpy"):
    y_true = y_test.to_numpy()
else:
    y_true = np.asarray(y_test)
y_pred = knn.predict(X_test_ready)
y_prob = knn.predict_proba(X_test_ready)[:, 1]

# Carry the test-set gender column along when available; otherwise fill with NaN
has_gender = isinstance(X_test, pd.DataFrame) and "gender" in X_test.columns
gender_vals = (
    X_test["gender"].to_numpy()
    if has_gender
    else np.full(shape=len(y_true), fill_value=np.nan)
)

# One row per test sample: gender, true label, predicted label, predicted probability
results = pd.DataFrame(
    {
        "gender": gender_vals,
        "y_true": y_true,
        "y_pred_knn": y_pred,
        "y_prob_knn": y_prob,
    }
)

preds_filename = "MendeleyData_75F25M_KNN_predictions.csv"
results.to_csv(preds_filename, index=False)

print(f"Saved KNN model → {model_filename}")
print(f"Saved predictions → {preds_filename}")

Saved KNN model → knn_model.pkl
Saved predictions → MendeleyData_75F25M_KNN_predictions.csv


### Improvement - Decision Tree (DT)

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# 1) Base model
dt = DecisionTreeClassifier(random_state=42)

# 2) Hyperparameter grid
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 5, 7, 9, None],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 6, 10],
}

# 3) Cross-validation setup (stratified, shuffled, fixed seed)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 4) Grid search — candidates are ranked by cross-validated RECALL
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=cv,
    scoring="recall",
    n_jobs=-1,
    verbose=0
)

grid_dt.fit(X_train_ready, y_train)

print("Best Decision Tree params:", grid_dt.best_params_)
# best_score_ is expressed in the metric given by `scoring`, i.e. recall here,
# so the label must say recall rather than F1.
print("Best CV Recall:", grid_dt.best_score_)

# 5) Train & evaluate best DT (best_estimator_ is already refit on all training data)
best_dt = grid_dt.best_estimator_
y_pred_dt_best = best_dt.predict(X_test_ready)
y_prob_dt_best = best_dt.predict_proba(X_test_ready)[:, 1]

evaluate_model(y_test, y_pred_dt_best, "Tuned Decision Tree")

Best Decision Tree params: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best CV F1: 0.9766666666666668
=== Tuned Decision Tree Evaluation ===
Accuracy : 0.9
Precision: 0.9
Recall   : 0.9310344827586207
F1 Score : 0.9152542372881356

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.86      0.88        84
           1       0.90      0.93      0.92       116

    accuracy                           0.90       200
   macro avg       0.90      0.89      0.90       200
weighted avg       0.90      0.90      0.90       200

Confusion Matrix:
 [[ 72  12]
 [  8 108]]




### Decision Tree: Tuned

**Best DT params**: `{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}`  
**Best CV Recall** (the grid search uses `scoring="recall"`, so this score is recall rather than F1): **0.9767**

---

### Evaluation
- **Accuracy:** 0.900  
- **Precision:** 0.900  
- **Recall:** **0.931**  
- **F1 Score:** 0.915  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 72           | 12           |
| **Actual: 1** | 8            | 108          |

- **False negatives (missed CVD):** **8**  
- **False positives (healthy flagged):** **12**  
- **Specificity (class 0):** ~0.857 (72/84)

---

## Interpretation
- The model is tuned toward **high sensitivity**, capturing most CVD cases (**recall ≈ 0.931**) with **8** misses.
- **Precision = 0.90** indicates a **moderate false-positive burden** (**12** cases), a common trade-off when prioritizing recall.
- Overall **accuracy (0.90)** and **F1 (0.915)** reflect a balanced setup suitable when **avoiding missed CVD** is more critical than avoiding false alarms.

In [11]:
# Alternative DT tuning: simpler trees + class balancing + cost-complexity pruning
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

# Stage A: recall-oriented grid search over shallow, regularized trees with
# balanced class weights
base_dt = DecisionTreeClassifier(random_state=42, class_weight="balanced")

param_grid_simple = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 4, 5, 6, 7],
    "min_samples_split": [5, 10, 20],
    "min_samples_leaf": [2, 4, 6],
    "min_impurity_decrease": [0.0, 1e-4, 1e-3],  # tiny regularization
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_simple = GridSearchCV(
    estimator=base_dt,
    param_grid=param_grid_simple,
    cv=cv,
    scoring="recall",        # recall-focused search
    n_jobs=-1,
    verbose=0,
    refit=True,
)
grid_simple.fit(X_train_ready, y_train)

print("Stage A — Best simple DT params:", grid_simple.best_params_)
print("Stage A — Best CV Recall:", grid_simple.best_score_)
simple_dt = grid_simple.best_estimator_

# Stage B: cost-complexity pruning on top of the Stage A winner
path = simple_dt.cost_complexity_pruning_path(X_train_ready, y_train)
ccp_alphas = path.ccp_alphas

# Evenly spaced candidates between the smallest and largest path alpha
# (at most 20), always including 0.0 as the no-pruning baseline
unique_alphas = np.unique(np.round(ccp_alphas, 6))
n_candidates = min(20, len(unique_alphas))
candidate_alphas = np.linspace(unique_alphas.min(), unique_alphas.max(), num=n_candidates)
candidate_alphas = np.unique(np.concatenate([candidate_alphas, [0.0]]))  # include no-pruning baseline

# Settings shared by every Stage B candidate and by the final model:
# the Stage A winner's hyperparameters plus the fixed seed / class weighting
stage_a_settings = dict(
    random_state=42,
    class_weight="balanced",
    criterion=simple_dt.criterion,
    max_depth=simple_dt.max_depth,
    min_samples_split=simple_dt.min_samples_split,
    min_samples_leaf=simple_dt.min_samples_leaf,
    min_impurity_decrease=simple_dt.min_impurity_decrease,
)

cv_scores = []
for alpha in candidate_alphas:
    dt_alpha = DecisionTreeClassifier(ccp_alpha=alpha, **stage_a_settings)
    # mean cross-validated recall for this pruning strength
    fold_recalls = cross_val_score(dt_alpha, X_train_ready, y_train, cv=cv, scoring="recall", n_jobs=-1)
    recall_cv = fold_recalls.mean()
    cv_scores.append((alpha, recall_cv))

# Highest mean recall first; the sort is stable, so ties keep the smaller alpha
ranked_scores = sorted(cv_scores, key=lambda pair: pair[1], reverse=True)
best_alpha, best_cv_recall = ranked_scores[0]
print(f"Stage B — Best ccp_alpha: {best_alpha:.6f} | CV Recall: {best_cv_recall:.4f}")

# Final model: Stage A settings + the selected ccp_alpha, fit on all training data
best_dt = DecisionTreeClassifier(ccp_alpha=best_alpha, **stage_a_settings)
best_dt.fit(X_train_ready, y_train)

# Evaluation on the held-out test set
y_pred_dt = best_dt.predict(X_test_ready)
y_prob_dt = best_dt.predict_proba(X_test_ready)[:, 1]
evaluate_model(y_test, y_pred_dt, "Alternative Tuned & Pruned DT")

Stage A — Best simple DT params: {'criterion': 'gini', 'max_depth': 5, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 10}
Stage A — Best CV Recall: 0.97
Stage B — Best ccp_alpha: 0.000000 | CV Recall: 0.9700
=== Alternative Tuned & Pruned DT Evaluation ===
Accuracy : 0.9
Precision: 0.9
Recall   : 0.9310344827586207
F1 Score : 0.9152542372881356

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.86      0.88        84
           1       0.90      0.93      0.92       116

    accuracy                           0.90       200
   macro avg       0.90      0.89      0.90       200
weighted avg       0.90      0.90      0.90       200

Confusion Matrix:
 [[ 72  12]
 [  8 108]]




## Decision Tree: Alternative Tuned & Pruned

**Stage A — Best simple DT params**: `{'criterion': 'gini', 'max_depth': 5, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 10}`  
**Stage A — Best CV Recall**: **0.9700**  
**Stage B — Best `ccp_alpha`**: **0.000000** | **CV Recall**: **0.9700**

---

### Test Evaluation
- **Accuracy:** 0.900  
- **Precision:** 0.900  
- **Recall:** **0.931**  
- **F1 Score:** 0.915  
- **Support:** 0→84, 1→116

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 72           | 12           |
| **Actual: 1** | 8            | 108          |

- **False negatives:** **8** (missed CVD)  
- **False positives:** **12** (healthy flagged)  

**Interpretation:**  
This DT (Stage B selected `ccp_alpha = 0`, so no pruning was ultimately applied) emphasizes **high recall**—it captures most CVD cases (FN=8)—with **moderate precision** and some increase in false positives (FP=12). This model suits scenarios where **missing CVD is costlier** than extra follow-ups.


In [12]:
# Alternative DT tuning focused on higher recall
# Changes vs previous:
#  - Remove calibration (predict uses raw tree probs at 0.5)
#  - Tune class_weight (heavier positive weights allowed)
#  - Broaden depth a bit but keep regularization via min_samples_* and tiny impurity decrease
#  - Prune only with very small ccp_alphas to avoid killing recall

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
import numpy as np

# Stage A — recall-oriented grid search that also tunes the class weights
base_dt = DecisionTreeClassifier(random_state=42)

param_grid_simple = {
    "criterion": ["gini", "entropy"],                  # add "log_loss" if your sklearn supports it
    "max_depth": [4, 5, 6, 7, 8, 9, 10],               # a bit deeper to help recall
    "min_samples_split": [5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 6],
    "min_impurity_decrease": [0.0, 1e-4, 1e-3],
    "class_weight": ["balanced", {0:1,1:2}, {0:1,1:3}, {0:1,1:4}],  # stronger push toward positives
}

# 5 folds repeated twice
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)

grid_simple = GridSearchCV(
    estimator=base_dt,
    param_grid=param_grid_simple,
    cv=cv,
    scoring="recall",      # prioritize sensitivity for class 1
    n_jobs=-1,
    verbose=0,
    refit=True,
)
grid_simple.fit(X_train_ready, y_train)

best_params = grid_simple.best_params_
print("Stage A — Best DT params:", best_params)
print("Stage A — Best CV Recall:", round(grid_simple.best_score_, 4))


def _dt_with_alpha(alpha_value):
    """Return an unfitted tree using the Stage A best params and the given ccp_alpha."""
    return DecisionTreeClassifier(random_state=42, **best_params, ccp_alpha=alpha_value)


# Unpruned reference tree (ccp_alpha = 0) used to derive the pruning path
dt0 = _dt_with_alpha(0.0).fit(X_train_ready, y_train)


# Stage B — gentle cost-complexity pruning (favor small alphas)
path = dt0.cost_complexity_pruning_path(X_train_ready, y_train)
ccp_alphas = path.ccp_alphas

# Only the first (at most 30) path alphas are considered, plus 0.0 for "no pruning"
small_slice = ccp_alphas[:30]
candidate_alphas = np.unique(np.concatenate(([0.0], small_slice)))

cv_scores = []
for alpha in candidate_alphas:
    dt_alpha = _dt_with_alpha(alpha)
    fold_recalls = cross_val_score(dt_alpha, X_train_ready, y_train, cv=cv, scoring="recall", n_jobs=-1)
    rec = fold_recalls.mean()
    cv_scores.append((alpha, rec))

# First candidate reaching the highest mean CV recall
best_alpha, best_cv_recall = max(cv_scores, key=lambda pair: pair[1])
print(f"Stage B — Best ccp_alpha: {best_alpha:.6f} | CV Recall: {best_cv_recall:.4f}")

alt_best_dt = _dt_with_alpha(best_alpha).fit(X_train_ready, y_train)


# Evaluation on the held-out test set
y_pred = alt_best_dt.predict(X_test_ready)
y_prob = alt_best_dt.predict_proba(X_test_ready)[:, 1]

evaluate_model(y_test, y_pred, "Alternative Tuned & Pruned Decision Tree")

Stage A — Best DT params: {'class_weight': {0: 1, 1: 4}, 'criterion': 'gini', 'max_depth': 6, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10}
Stage A — Best CV Recall: 0.9817
Stage B — Best ccp_alpha: 0.000000 | CV Recall: 0.9817
=== Alternative Tuned & Pruned Decision Tree Evaluation ===
Accuracy : 0.9
Precision: 0.8934426229508197
Recall   : 0.9396551724137931
F1 Score : 0.9159663865546218

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.85      0.88        84
           1       0.89      0.94      0.92       116

    accuracy                           0.90       200
   macro avg       0.90      0.89      0.90       200
weighted avg       0.90      0.90      0.90       200

Confusion Matrix:
 [[ 71  13]
 [  7 109]]




### Decision Tree Model Comparison (CVD Diagnosis) 

### 1. Baseline Decision Tree
- **Accuracy**: 0.90  
- **Precision**: **0.94** (highest)  
- **Recall**: 0.879 (lowest)  
- **F1**: 0.911  
- **Confusion Matrix**: [[78, 6], [14, 102]]

**Interpretation**:  
High precision but lower recall (misses 14 CVD patients). Not ideal for diagnosis.

---

### 2. Tuned Decision Tree (Best Params: `gini`, `max_depth=5`, `min_samples_split=10`, `min_samples_leaf=1`)
- **Accuracy**: 0.90  
- **Precision**: 0.90  
- **Recall**: 0.931  
- **F1**: 0.915  
- **Confusion Matrix**: [[72, 12], [8, 108]]

**Interpretation**:  
Much better balance — recall improves to ~93%, reducing false negatives from 14 to 8 compared with the baseline.

---

### 3. Alternative Tuned & Pruned DT (`class_weight="balanced"`, `min_samples_leaf=2`, selected `ccp_alpha=0`)
- **Accuracy**: 0.90  
- **Precision**: 0.90  
- **Recall**: 0.931  
- **F1**: 0.915  
- **Confusion Matrix**: [[72, 12], [8, 108]]

**Interpretation**:  
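Matches the tuned DT in test performance. Because Stage B selected `ccp_alpha = 0`, no pruning was actually applied; the model differs from the tuned DT only in `class_weight="balanced"` and `min_samples_leaf=2`, so it is at most marginally simpler. It remains a strong candidate for clinical deployment.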

---

### 4. Class-Weighted Tuned & Pruned DT (`class_weight={0:1, 1:4}`)
- **Accuracy**: 0.90  
- **Precision**: 0.893  
- **Recall**: **0.940 (highest)**  
- **F1**: 0.916  
- **Confusion Matrix**: [[71, 13], [7, 109]]

**Interpretation**:  
This model achieves the **highest recall** (~94%), missing only 7 CVD patients. Precision drops slightly compared to others (more false positives), but for diagnosis this is an acceptable trade-off. This makes it the **best DT for sensitivity**.

---

### Overview Table (Ranked by Recall Priority)

| Model                                   | Accuracy | Precision | Recall | F1   | FN (Missed CVD) |
|-----------------------------------------|----------|-----------|--------|------|-----------------|
| **Class-Weighted Tuned & Pruned DT**    | 0.90     | 0.893     | **0.940** | 0.916 | **7** |
| Tuned DT                                | 0.90     | 0.90      | 0.931 | 0.915 | 8 |
| Alt. Tuned & Pruned DT                  | 0.90     | 0.90      | 0.931 | 0.915 | 8 |
| Baseline DT                             | 0.90     | **0.94**  | 0.879 | 0.911 | 14 |

---

### Final Takeaway
- The **class-weighted DT** is the **best choice for diagnosis**: it maximizes recall (94%) while keeping reasonable precision.  
- The **tuned & pruned DTs** are close runners-up, still strong options with ~93% recall and better balance.  
- The **baseline DT** has the highest precision but unacceptably low recall for medical screening.  

**Recommended for CVD diagnosis: Class-Weighted Tuned & Pruned DT** — best recall, fewer missed cases, still interpretable.  


In [13]:
import joblib, pandas as pd, numpy as np

# Save the class-weighted tuned Decision Tree.
# The pickled estimator must be `alt_best_dt` — the class-weighted model that
# also produces the predictions saved below. `best_dt` refers to the earlier
# class_weight="balanced" tree, which is a different model from the one the
# file name and the predictions describe.
model_filename = "class_weighted_tuned_dt_model.pkl"
joblib.dump(alt_best_dt, model_filename)

# Ensure 1D arrays for y_true and y_pred
y_true_dt = y_test.to_numpy().ravel() if hasattr(y_test, "to_numpy") else np.asarray(y_test).ravel()
y_pred_dt = alt_best_dt.predict(X_test_ready)
y_prob_dt = alt_best_dt.predict_proba(X_test_ready)[:, 1]

# Optional gender column if present in test set (NaN placeholder otherwise)
if isinstance(X_test, pd.DataFrame) and "gender" in X_test.columns:
    gender_vals = X_test["gender"].to_numpy()
else:
    gender_vals = np.full(shape=len(y_true_dt), fill_value=np.nan)

# Build and save results DataFrame: one row per test sample
results = pd.DataFrame({
    "gender": gender_vals,
    "y_true": y_true_dt,
    "y_pred_dt": y_pred_dt,
    "y_prob_dt": y_prob_dt
})

preds_filename = "MendeleyData_75F25M_DT_classweightedtuned_predictions.csv"
results.to_csv(preds_filename, index=False)

print(f"Saved class weighted tuned DT model → {model_filename}")
print(f"Saved predictions → {preds_filename}")

Saved class weighted tuned DT model → class_weighted_tuned_dt_model.pkl
Saved predictions → MendeleyData_75F25M_DT_classweightedtuned_predictions.csv


### Ensemble Model - Random Forest (RF)

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Baseline Random Forest: only the seed is set; construct and fit in one step
rf = RandomForestClassifier(random_state=42).fit(X_train_ready, y_train)

# Test-set outputs: hard labels, plus column 1 of predict_proba as the score
y_pred_rf = rf.predict(X_test_ready)
y_prob_rf = rf.predict_proba(X_test_ready)[:, 1]

evaluate_model(y_test, y_pred_rf, "Random Forest")

=== Random Forest Evaluation ===
Accuracy : 0.925
Precision: 0.9469026548672567
Recall   : 0.9224137931034483
F1 Score : 0.9344978165938864

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.93      0.91        84
           1       0.95      0.92      0.93       116

    accuracy                           0.93       200
   macro avg       0.92      0.93      0.92       200
weighted avg       0.93      0.93      0.93       200

Confusion Matrix:
 [[ 78   6]
 [  9 107]]




## Random Forest

### Evaluation
- **Accuracy:** 0.925  
- **Precision:** 0.947  
- **Recall:** **0.922**  
- **F1 Score:** 0.934  

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 78           | 6            |
| **Actual: 1** | 9            | 107          |

- **False negatives:** **9** (missed CVD)  
- **False positives:** **6** (healthy flagged)

**Interpretation:**  
The model shows **strong overall performance** with **high precision** and **good recall**, indicating few false alarms and a manageable number of missed CVD cases. This is a balanced setup for CVD screening.

---

### Improving the Random Forest (RF)

In [15]:
# Random Forest: hyperparameter tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Template estimator for the search. It gets its own name on purpose: binding it
# to `rf` would replace any previously fitted `rf` with an unfitted object,
# because GridSearchCV fits clones and never the instance it is handed.
rf_template = RandomForestClassifier(random_state=42)

param_grid = {
    "n_estimators": [200, 400, 600],
    "max_depth": [None, 8, 12, 16],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", 0.8],  # 0.8 = 80% of features
    "class_weight": [None, "balanced"]
}

# Seeded, shuffled 5-fold stratified splitter
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=rf_template,
    param_grid=param_grid,
    cv=cv,
    scoring="recall",     # recall-focused
    n_jobs=-1,
    verbose=1,
    refit=True            # refit the best configuration on the full training data
)

grid.fit(X_train_ready, y_train)
best_rf = grid.best_estimator_
print("Best RF params:", grid.best_params_)
print("Best CV Recall:", grid.best_score_)

# Evaluate best RF on the test set
y_pred_rf = best_rf.predict(X_test_ready)
y_prob_rf = best_rf.predict_proba(X_test_ready)[:, 1]

evaluate_model(y_test, y_pred_rf, "Random Forest (best)")

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Best RF params: {'class_weight': None, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV Recall: 0.9866666666666667
=== Random Forest (best) Evaluation ===
Accuracy : 0.91
Precision: 0.9298245614035088
Recall   : 0.9137931034482759
F1 Score : 0.9217391304347826

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89        84
           1       0.93      0.91      0.92       116

    accuracy                           0.91       200
   macro avg       0.91      0.91      0.91       200
weighted avg       0.91      0.91      0.91       200

Confusion Matrix:
 [[ 76   8]
 [ 10 106]]




## Random Forest Model Comparison (CVD Diagnosis) 

### 1. Baseline Random Forest
- **Accuracy**: 0.925 (highest overall)  
- **Precision**: **0.947** (highest)  
- **Recall**: **0.922** (highest of RF models)  
- **F1**: 0.934 (highest overall)  
- **Confusion Matrix**: [[78, 6], [9, 107]]

**Interpretation**:  
The baseline RF is excellent: it achieves both high recall (~92%) and very high precision (~95%). It misses only 9 CVD-positive patients, fewer than most KNNs and DTs. Strong all-around model for diagnosis.

---

### 2. Tuned Random Forest (Best Params: `n_estimators=200`, `max_features='sqrt'`, `min_samples_split=2`, `min_samples_leaf=1`)
- **Accuracy**: 0.91  
- **Precision**: 0.930  
- **Recall**: 0.914  
- **F1**: 0.922  
- **Confusion Matrix**: [[76, 8], [10, 106]]

**Interpretation**:  
The tuned configuration reaches a very high cross-validation recall (≈0.987), but on the held-out test set its recall is slightly lower than the baseline's (~91% vs. ~92%). It misses 10 positives (vs. 9 for baseline). Still strong, but baseline edges it out in practice.

---

### Overview Table 

| Model                  | Accuracy | Precision | Recall | F1   | FN (Missed CVD) |
|-------------------------|----------|-----------|--------|------|-----------------|
| **Baseline RF**         | **0.925** | **0.947** | **0.922** | **0.934** | **9** |
| Tuned RF                | 0.910    | 0.930     | 0.914  | 0.922 | 10 |

---

### Final Takeaway
- The **baseline Random Forest** is the best performer: it combines the **highest recall (92%)**, **highest precision (95%)**, and **highest F1**. It misses just 9 positives.  
- The **tuned RF** is slightly weaker in both recall and precision, even though it looked promising during CV.  

**Recommended for CVD diagnosis (within RF family): Baseline Random Forest** — it maximizes sensitivity without sacrificing precision.  


In [16]:
# Save best Random Forest Results for fairness evaluation
import joblib, pandas as pd, numpy as np

# Fit BEFORE pickling. joblib.dump serialises the object as it is at call time,
# so dumping first would store the pre-fit state of `rf` — which may be an
# unfitted estimator if an earlier cell rebound `rf` to a fresh instance.
rf.fit(X_train_ready, y_train)

# Save Random Forest model
model_filename = "rf_model.pkl"
joblib.dump(rf, model_filename)

# Ensure 1D arrays for y_true and y_pred
y_true_rf = y_test.to_numpy().ravel() if hasattr(y_test, "to_numpy") else np.asarray(y_test).ravel()
y_pred_rf = rf.predict(X_test_ready)
y_prob_rf = rf.predict_proba(X_test_ready)[:, 1]  # column 1 of predict_proba

# Optional gender column if present in test set (NaN placeholder otherwise)
if isinstance(X_test, pd.DataFrame) and "gender" in X_test.columns:
    gender_vals = X_test["gender"].to_numpy()
else:
    gender_vals = np.full(shape=len(y_true_rf), fill_value=np.nan)

# Build and save results DataFrame
results = pd.DataFrame({
    "gender": gender_vals,
    "y_true": y_true_rf,
    "y_pred_rf": y_pred_rf,
    "y_prob": y_prob_rf
})

preds_filename = "MendeleyData_75F25M_RF_predictions.csv"
results.to_csv(preds_filename, index=False)

print(f"Saved Baseline RF model → {model_filename}")
print(f"Saved predictions → {preds_filename}")

Saved Baseline RF model → rf_model.pkl
Saved predictions → MendeleyData_75F25M_RF_predictions.csv


### Deep Learning - Multi-layer Perceptron

In [17]:
#import required library 
from sklearn.neural_network import MLPClassifier

In [18]:
# Baseline MLP: a single 100-unit hidden layer, ReLU activation, Adam optimizer.
# Constructed and fitted in one expression (fit returns the estimator itself).
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    max_iter=1000,      # iteration cap
    random_state=42,
).fit(X_train_ready, y_train)

# Test-set outputs: hard labels, plus column 1 of predict_proba as the score
y_pred_mlp = mlp.predict(X_test_ready)
y_prob_mlp = mlp.predict_proba(X_test_ready)[:, 1]

evaluate_model(y_test, y_pred_mlp, "Multilayer Perceptron (MLP)")

=== Multilayer Perceptron (MLP) Evaluation ===
Accuracy : 0.885
Precision: 0.9043478260869565
Recall   : 0.896551724137931
F1 Score : 0.9004329004329005

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.87      0.86        84
           1       0.90      0.90      0.90       116

    accuracy                           0.89       200
   macro avg       0.88      0.88      0.88       200
weighted avg       0.89      0.89      0.89       200

Confusion Matrix:
 [[ 73  11]
 [ 12 104]]




## Multilayer Perceptron (MLP)

### Evaluation
- **Accuracy:** 0.885  
- **Precision:** 0.904  
- **Recall:** **0.897**  
- **F1 Score:** 0.900  
- **Support:** 0→84, 1→116

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 73           | 11           |
| **Actual: 1** | 12           | 104          |

- **False negatives:** **12** (missed CVD)  
- **False positives:** **11** (healthy flagged)

---

## Interpretation
The MLP delivers **balanced performance near 0.90** across precision, recall, and F1. It **captures most CVD cases** while keeping false alarms moderate (11).

---


### Improvements - MLP

In [19]:
# MLP variant: Adam optimizer with early stopping
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

adammlp = MLPClassifier(
    # architecture
    hidden_layer_sizes=(64, 32),
    activation='relu',
    # optimizer
    solver='adam',
    learning_rate_init=1e-3,
    batch_size=32,
    max_iter=1000,
    tol=1e-4,
    # regularization (alpha is the L2 penalty strength)
    alpha=1e-3,
    # early stopping on an internal 15% validation split, patience of 25
    early_stopping=True,
    validation_fraction=0.15,
    n_iter_no_change=25,
    random_state=42,
).fit(X_train_ready, y_train)

# Test-set outputs: hard labels, plus column 1 of predict_proba as the score
y_pred_mlp = adammlp.predict(X_test_ready)
y_prob_mlp = adammlp.predict_proba(X_test_ready)[:, 1]

evaluate_model(y_test, y_pred_mlp, "(Adam + EarlyStopping)")

=== (Adam + EarlyStopping) Evaluation ===
Accuracy : 0.895
Precision: 0.9130434782608695
Recall   : 0.9051724137931034
F1 Score : 0.9090909090909091

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.88      0.88        84
           1       0.91      0.91      0.91       116

    accuracy                           0.90       200
   macro avg       0.89      0.89      0.89       200
weighted avg       0.90      0.90      0.90       200

Confusion Matrix:
 [[ 74  10]
 [ 11 105]]




In [20]:
# MLP variant trained with the L-BFGS (quasi-Newton) solver.
# Only solver-independent settings are passed: batch_size, early_stopping and
# learning-rate options apply to the sgd/adam solvers, not to lbfgs.
mlp_lbfgs = MLPClassifier(
    solver='lbfgs',
    activation='tanh',
    hidden_layer_sizes=(64, 32),
    alpha=1e-3,
    max_iter=1000,
    random_state=42,
).fit(X_train_ready, y_train)

# Test-set outputs: hard labels, plus column 1 of predict_proba as the score
y_pred_lbfgs = mlp_lbfgs.predict(X_test_ready)
y_prob_lbfgs = mlp_lbfgs.predict_proba(X_test_ready)[:, 1]

evaluate_model(y_test, y_pred_lbfgs, "Multilayer Perceptron (MLP)- LBFGS solver")

=== Multilayer Perceptron (MLP)- LBFGS solver Evaluation ===
Accuracy : 0.885
Precision: 0.8842975206611571
Recall   : 0.9224137931034483
F1 Score : 0.9029535864978903

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.83      0.86        84
           1       0.88      0.92      0.90       116

    accuracy                           0.89       200
   macro avg       0.89      0.88      0.88       200
weighted avg       0.89      0.89      0.88       200

Confusion Matrix:
 [[ 70  14]
 [  9 107]]




## Multilayer Perceptron (MLP) - LBFGS solver

### Evaluation
- **Accuracy:** 0.885  
- **Precision:** 0.884  
- **Recall:** **0.922**  
- **F1 Score:** 0.903  
- **Support:** 0→84, 1→116

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 70           | 14           |
| **Actual: 1** | 9            | 107          |

- **False negatives:** **9** (missed CVD)  
- **False positives:** **14** (healthy flagged)  
- **Specificity (class 0):** ~0.833 (70/84)

---

## Interpretation
The MLP emphasizes **high recall** (≈0.922), capturing most CVD cases with **few misses** (FN=9). This comes with a **moderate false-positive rate** (FP=14), reflected in precision (0.884). 

---


### Further Improvement of the MLP

In [21]:
# Recall-oriented randomized search over Adam-trained MLP configurations
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, recall_score, fbeta_score, make_scorer
import numpy as np

# Seeded, shuffled 5-fold stratified splitter
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Template estimator: Adam, no early stopping; the search fills in the rest
base_mlp = MLPClassifier(
    solver="adam",
    early_stopping=False,
    max_iter=1000,
    tol=1e-4,
    random_state=42,
)

# Candidate values sampled by the randomized search
param_dist = dict(
    hidden_layer_sizes=[(64,), (128,), (64, 32), (128, 64)],
    activation=["relu", "tanh"],
    alpha=[1e-5, 1e-4, 3e-4, 1e-3],
    learning_rate_init=[1e-3, 5e-4, 3e-4, 1e-4],
    batch_size=[16, 32, 64],
)

# All three metrics are tracked; F-beta with beta=2 (recall-weighted) is the
# one used to pick and refit the winning configuration
scoring = dict(
    f1=make_scorer(f1_score),
    recall=make_scorer(recall_score),
    fbeta2=make_scorer(fbeta_score, beta=2),
)

rs = RandomizedSearchCV(
    estimator=base_mlp,
    param_distributions=param_dist,
    n_iter=30,
    scoring=scoring,
    refit="fbeta2",
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
rs.fit(X_train_ready, y_train)

best_mlp = rs.best_estimator_
best_idx = rs.best_index_
cvres = rs.cv_results_

# Cross-validation summary for the selected configuration
print("Best MLP params:", rs.best_params_)
print(f"Best CV F-beta (β=2): {rs.best_score_:.4f}")
print(f"Corresponding CV Recall: {cvres['mean_test_recall'][best_idx]:.4f}")
print(f"Corresponding CV F1: {cvres['mean_test_f1'][best_idx]:.4f}")

# Test-set outputs: hard labels, plus column 1 of predict_proba as the score
y_pred = best_mlp.predict(X_test_ready)
y_prob = best_mlp.predict_proba(X_test_ready)[:, 1]

evaluate_model(y_test, y_pred, model_name="Best MLP (Adam)")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best MLP params: {'learning_rate_init': 0.001, 'hidden_layer_sizes': (64, 32), 'batch_size': 32, 'alpha': 0.001, 'activation': 'tanh'}
Best CV F-beta (β=2): 0.9766
Corresponding CV Recall: 0.9767
Corresponding CV F1: 0.9767
=== Best MLP (Adam) Evaluation ===
Accuracy : 0.905
Precision: 0.9292035398230089
Recall   : 0.9051724137931034
F1 Score : 0.9170305676855895

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.90      0.89        84
           1       0.93      0.91      0.92       116

    accuracy                           0.91       200
   macro avg       0.90      0.90      0.90       200
weighted avg       0.91      0.91      0.91       200

Confusion Matrix:
 [[ 76   8]
 [ 11 105]]




### MLP (Adam): Tuned

**Best MLP params**: `{'learning_rate_init': 0.001, 'hidden_layer_sizes': (64, 32), 'batch_size': 32, 'alpha': 0.001, 'activation': 'tanh'}`  
**Best CV F-beta (β=2)**: **0.9766** | **CV Recall**: **0.9767** | **CV F1**: **0.9767**

---

### Test Evaluation
- **Accuracy:** 0.905  
- **Precision:** 0.929  
- **Recall:** **0.905**  
- **F1 Score:** 0.917  
- **Support:** 0→84, 1→116

**Confusion Matrix**  
|               | Predicted: 0 | Predicted: 1 |
|--------------:|-------------:|-------------:|
| **Actual: 0** | 76           | 8            |
| **Actual: 1** | 11           | 105          |

- **False negatives:** **11** (missed CVD)  
- **False positives:** **8** (healthy flagged)

---

## Interpretation
The tuned MLP achieves **strong, balanced performance**: high **recall (≈0.905)** captures most CVD cases, while **precision (≈0.929)** keeps false alarms modest. The error profile (FN=11, FP=8) reflects a sensible screening trade-off. 

---


### Multilayer Perceptron (MLP) Model Comparison (CVD Diagnosis) – Recall Priority

### 1. Baseline MLP (default solver)
- **Accuracy**: 0.885  
- **Precision**: 0.904  
- **Recall**: 0.897  
- **F1**: 0.900  
- **Confusion Matrix**: [[73, 11], [12, 104]]

**Interpretation**:  
Decent baseline, but recall is under 90%. Misses 12 CVD cases.

---

### 2. MLP (Adam + EarlyStopping)
- **Accuracy**: 0.895  
- **Precision**: 0.913  
- **Recall**: 0.905  
- **F1**: 0.909  
- **Confusion Matrix**: [[74, 10], [11, 105]]

**Interpretation**:  
Improves recall slightly over baseline (0.897 → 0.905) while keeping decent precision. Misses 11 positives. Early stopping may have helped limit overfitting, making this a modest but stable improvement.

---

### 3. MLP (LBFGS Solver)
- **Accuracy**: 0.885  
- **Precision**: 0.884  
- **Recall**: **0.922** (highest)  
- **F1**: 0.903  
- **Confusion Matrix**: [[70, 14], [9, 107]]

**Interpretation**:  
Best recall (~92%), missing only 9 positives. Precision is lower (more false positives), but this is acceptable in diagnosis. Strongest for sensitivity.

---

### 4. Best MLP (Tuned Adam, `(64,32)` hidden layers, `tanh`, `alpha=0.001`, `batch=32`, `lr=0.001`)
- **Accuracy**: 0.905  
- **Precision**: **0.929**  
- **Recall**: 0.905  
- **F1**: **0.917**  
- **Confusion Matrix**: [[76, 8], [11, 105]]

**Interpretation**:  
Best overall balance: accuracy 91%, recall 91%, precision 93%. Misses 11 positives — same as EarlyStopping, but with higher precision and F1.

---

## Overview Table

| Model                         | Accuracy | Precision | Recall | F1   | FN (Missed CVD) |
|--------------------------------|----------|-----------|--------|------|-----------------|
| **MLP (LBFGS Solver)**         | 0.885    | 0.884     | **0.922** | 0.903 | **9** |
| Best MLP (Adam, tuned)         | 0.905    | **0.929** | 0.905  | **0.917** | 11 |
| MLP (Adam + EarlyStopping)     | 0.895    | 0.913     | 0.905  | 0.909 | 11 |
| Baseline MLP                   | 0.885    | 0.904     | 0.897  | 0.900 | 12 |

---

###  Final Takeaway
- **MLP (LBFGS Solver)** is the best for recall (92%), ideal for diagnosis where sensitivity is critical.  
- **Best MLP (Adam, tuned)** provides the strongest overall balance (accuracy, precision, F1) with still solid recall.  
- **MLP (Adam + EarlyStopping)** improves stability and recall over baseline but lags slightly behind tuned Adam.  
- **Baseline MLP** is the weakest.  

**For diagnosis: LBFGS MLP is most sensitive; for balance: tuned Adam is strongest.**



In [22]:
# Save the LBFGS MLP model and its test-set predictions
import joblib, pandas as pd, numpy as np

# Save LBFGS MLP model (filename must not carry a leading space)
model_filename = "mlp_lbfgs.pkl"
joblib.dump(mlp_lbfgs, model_filename)

# Ensure 1D arrays for y_true and y_pred
y_true = y_test.to_numpy().ravel() if hasattr(y_test, "to_numpy") else np.asarray(y_test).ravel()
y_pred_lbfgs = mlp_lbfgs.predict(X_test_ready)
y_prob_lbfgs = mlp_lbfgs.predict_proba(X_test_ready)[:, 1]  # column 1 of predict_proba

# Optional gender column if present in test set (NaN placeholder otherwise)
if isinstance(X_test, pd.DataFrame) and "gender" in X_test.columns:
    gender_vals = X_test["gender"].to_numpy()
else:
    gender_vals = np.full(shape=len(y_true), fill_value=np.nan)

# Build and save results DataFrame.
# The columns use the LBFGS outputs computed above; the generic `y_pred` /
# `y_prob` names would pick up whatever model last assigned them elsewhere.
results = pd.DataFrame({
    "gender": gender_vals,
    "y_true": y_true,
    "y_pred_lbfgs": y_pred_lbfgs,
    "y_prob_lbfgs": y_prob_lbfgs
})

preds_filename = "MendeleyData_75F25M_MLP_lbfgs_predictions.csv"
results.to_csv(preds_filename, index=False)

print(f"Saved lbfgs MLP model → {model_filename}")
print(f"Saved predictions → {preds_filename}")

Saved lbfgs MLP model →  mlp_lbfgs.pkl
Saved predictions → MendeleyData_75F25M_MLP_lbfgs_predictions.csv
