In [10]:
# Heart Disease (UCI) - Logistic Regression Project
# --------------------------------------------------
# Expects the dataset in ./data/ (heart_cleveland_upload.csv, heart.csv, or
# processed.cleveland.data); metrics and the confusion-matrix plot are written to ./outputs/.

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt

# Working directories: inputs are read from DATA_DIR, artefacts go to OUT_DIR.
DATA_DIR, OUT_DIR = "data", "outputs"

# Create both folders up front; exist_ok keeps re-running the cell harmless.
for _folder in (DATA_DIR, OUT_DIR):
    os.makedirs(_folder, exist_ok=True)

# Dataset files to look for, in priority order.
CANDIDATE_FILES = [
    os.path.join(DATA_DIR, filename)
    for filename in (
        "heart_cleveland_upload.csv",
        "heart.csv",
        "processed.cleveland.data",
    )
]

def load_dataset(candidates=None):
    """Load the first heart-disease dataset file that exists on disk.

    Parameters
    ----------
    candidates : iterable of str, optional
        File paths to try, in priority order. When omitted, the module-level
        ``CANDIDATE_FILES`` list is used.

    Returns
    -------
    pandas.DataFrame
        The raw table. Files ending in ``.data`` are read as header-less CSV
        with the 14 column names listed below and ``'?'`` parsed as missing;
        every other file is read with its own header row.

    Raises
    ------
    FileNotFoundError
        If none of the candidate paths is an existing file. The message lists
        every path that was tried.
    """
    if candidates is None:
        candidates = CANDIDATE_FILES
    candidates = list(candidates)

    for path in candidates:
        # isfile (not exists): a directory with a matching name is not a dataset.
        if not os.path.isfile(path):
            continue

        if path.endswith(".data"):
            # The .data file has no header row, so names are supplied here.
            cols = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
                    "exang", "oldpeak", "slope", "ca", "thal", "target"]
            df = pd.read_csv(path, header=None, names=cols, na_values='?')
        else:
            df = pd.read_csv(path)

        print(f"Loaded: {path}")
        return df

    # Name every supported file so the user knows all accepted options.
    raise FileNotFoundError(
        "Dataset not found. Place one of these files on disk: "
        + ", ".join(repr(p) for p in candidates)
    )

# ---------------------------------------------------------------------------
# 1. Load the data and normalise column names
# ---------------------------------------------------------------------------
df = load_dataset()
df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

# A CSV saved together with its pandas index is read back with an
# "Unnamed: 0" column ("unnamed:_0" after the normalisation above). That is a
# row counter, not a clinical feature, so drop it if present (no-op otherwise).
index_artifacts = [c for c in df.columns if c.startswith("unnamed:")]
if index_artifacts:
    df = df.drop(columns=index_artifacts)

# ---------------------------------------------------------------------------
# 2. Locate the target column and build a binary label
# ---------------------------------------------------------------------------
possible_targets = ["target","num","heart_disease","output", "condition"]
target_col = next((t for t in possible_targets if t in df.columns), None)
if target_col is None:
    raise ValueError("Target column not found. Expected one of: target, num, heart_disease, output, condition.")

df_clean = df.dropna().copy()

raw_target = df_clean[target_col]
if target_col == "num" or raw_target.nunique() > 2:
    # Severity-coded label (0 = no disease, higher = disease): collapse to 0/1.
    # This also covers the header-less .data file, whose last column is named
    # "target" by the loader but is not binarised there. Without this the
    # binary-average precision/recall/F1 calls below fail on a multi-class label.
    y = (raw_target > 0).astype(int)
else:
    y = raw_target.astype(int)

if target_col == "num":
    # Remove the raw severity column: leaving it among the features would
    # leak the label straight into the model.
    df_clean = df_clean.drop(columns=["num"])
    target_col = "target"
df_clean[target_col] = y

# ---------------------------------------------------------------------------
# 3. Features, train/test split, scaling
# ---------------------------------------------------------------------------
X = df_clean.drop(columns=[target_col])
X = pd.get_dummies(X, drop_first=True)  # only affects non-numeric columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Fit the scaler on the training split only so test statistics do not leak.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------------------------------------------------------
# 4. Fit and evaluate
# ---------------------------------------------------------------------------
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)
y_pred = log_reg.predict(X_test_scaled)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# ---------------------------------------------------------------------------
# 5. Persist artefacts
# ---------------------------------------------------------------------------
preprocessed_path = os.path.join(OUT_DIR, "preprocessed_heart.csv")
df_clean.to_csv(preprocessed_path, index=False)

with open(os.path.join(OUT_DIR, "metrics.txt"), "w") as f:
    f.write(f"Accuracy: {acc:.4f}\nPrecision: {prec:.4f}\nRecall: {rec:.4f}\nF1: {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(str(cm) + "\n")

# ---------------------------------------------------------------------------
# 6. Confusion-matrix figure (rows = actual, columns = predicted)
# ---------------------------------------------------------------------------
fig, ax = plt.subplots(figsize=(4,4))
ax.imshow(cm, cmap='Blues')
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
# One tick per cell instead of matplotlib's default fractional ticks.
ax.set_xticks(range(cm.shape[1]))
ax.set_yticks(range(cm.shape[0]))
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, cm[i, j], ha='center', va='center')
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "confusion_matrix.png"))
plt.close(fig)

Loaded: data/heart_cleveland_upload.csv
Accuracy: 0.8666666666666667
Precision: 0.9629629629629629
Recall: 0.7428571428571429
F1: 0.8387096774193549
Confusion Matrix:
 [[39  1]
 [ 9 26]]

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.97      0.89        40
           1       0.96      0.74      0.84        35

    accuracy                           0.87        75
   macro avg       0.89      0.86      0.86        75
weighted avg       0.88      0.87      0.86        75



In [4]:
# Create the ./data folder if it is missing (no error when it already exists).
import os
os.makedirs("data", exist_ok=True)


In [6]:
# List the contents of ./data via the shell.
!ls data/

heart_cleveland_upload.csv


In [7]:
# Quick look at the raw CSV. Note: this rebinds `df` to the unprocessed frame
# (original column names), replacing the normalised `df` built by the pipeline cell.
import pandas as pd
df = pd.read_csv("data/heart_cleveland_upload.csv")
df.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
