In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [2]:
df = pd.read_csv("heart.csv")

print("First 5 rows:")
display(df.head())

print("\nShape (rows, columns):", df.shape)


First 5 rows:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0



Shape (rows, columns): (1025, 14)


In [3]:
    print("Missing values per column:")
print(df.isna().sum())

print("\nTarget value counts:")
print(df["target"].value_counts())


Missing values per column:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

Target value counts:
target
1    526
0    499
Name: count, dtype: int64


In [4]:
target_col = "target"

X = df.drop(columns=[target_col])
y = df[target_col]

print("Feature shape:", X.shape)
print("Target shape:", y.shape)


Feature shape: (1025, 13)
Target shape: (1025,)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)


Train shape: (820, 13) (820,)
Test shape: (205, 13) (205,)


In [6]:
def evaluate_model(name, model, X_test, y_test):
    y_pred = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_proba)
    else:
        auc = np.nan

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)

    print(f"\n=== {name} ===")
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification report:\n", classification_report(y_test, y_pred, zero_division=0))

    return {
        "Model": name,
        "Accuracy": acc,
        "AUC": auc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "MCC": mcc,
    }


In [7]:
results = []

# 1. Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)
results.append(evaluate_model("Logistic Regression", log_reg, X_test_scaled, y_test))

# 2. Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
results.append(evaluate_model("Decision Tree", dt, X_test, y_test))

# 3. k-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
results.append(evaluate_model("kNN", knn, X_test_scaled, y_test))

# 4. Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
results.append(evaluate_model("Naive Bayes", nb, X_test, y_test))

# 5. Random Forest
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
results.append(evaluate_model("Random Forest", rf, X_test, y_test))

# 6. XGBoost
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="logloss"
)
xgb.fit(X_train, y_train)
results.append(evaluate_model("XGBoost", xgb, X_test, y_test))

results_df = pd.DataFrame(results)
results_df



=== Logistic Regression ===
Confusion matrix:
 [[70 30]
 [ 9 96]]
Classification report:
               precision    recall  f1-score   support

           0       0.89      0.70      0.78       100
           1       0.76      0.91      0.83       105

    accuracy                           0.81       205
   macro avg       0.82      0.81      0.81       205
weighted avg       0.82      0.81      0.81       205


=== Decision Tree ===
Confusion matrix:
 [[100   0]
 [  3 102]]
Classification report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       100
           1       1.00      0.97      0.99       105

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205


=== kNN ===
Confusion matrix:
 [[87 13]
 [15 90]]
Classification report:
               precision    recall  f1-score   support

           0       0.85      0.87   

Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.809756,0.92981,0.761905,0.914286,0.831169,0.630908
1,Decision Tree,0.985366,0.985714,1.0,0.971429,0.985507,0.971151
2,kNN,0.863415,0.962905,0.873786,0.857143,0.865385,0.726935
3,Naive Bayes,0.829268,0.904286,0.807018,0.87619,0.840183,0.660163
4,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0
5,XGBoost,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
os.makedirs("model", exist_ok=True)
results_df.to_csv("model/comparison_metrics.csv", index=False)

print("Saved metrics to model/comparison_metrics.csv")


Saved metrics to model/comparison_metrics.csv
