# PRCP-1016-HeartDiseasePred
## Team ID -  PTID-CDS-NOV-25-3300
### Muhammed Sayees

### Section 1 – Project setup & data loading

In [4]:
# ---- Section 1: Setup ----
import numpy as np
import pandas as pd

from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, average_precision_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Reproducibility: seed NumPy's global RNG and keep the seed in a constant so
# later cells can pass the same value as `random_state`.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Input CSVs are read from ./data, resolved against the notebook's working directory.
DATA_DIR = Path("data")

# ---- Load values and labels ----
# values.csv holds the per-patient feature columns; labels.csv holds the target.
# Both carry a `patient_id` column (visible in the previews below).
values = pd.read_csv(DATA_DIR / "values.csv")
labels = pd.read_csv(DATA_DIR / "labels.csv")

print("Values shape:", values.shape)
print("Labels shape:", labels.shape)

# Use rich display rather than print(): a printed 14-column frame wraps into
# several hard-to-read text chunks, and the merge cell already uses display().
display(values.head())
display(labels.head())


Values shape: (180, 14)
Labels shape: (180, 2)
  patient_id  slope_of_peak_exercise_st_segment               thal  \
0     0z64un                                  1             normal   
1     ryoo3j                                  2             normal   
2     yt1s1x                                  1             normal   
3     l2xjde                                  1  reversible_defect   
4     oyt4ek                                  3  reversible_defect   

   resting_blood_pressure  chest_pain_type  num_major_vessels  \
0                     128                2                  0   
1                     110                3                  0   
2                     125                4                  3   
3                     152                4                  0   
4                     178                1                  0   

   fasting_blood_sugar_gt_120_mg_per_dl  resting_ekg_results  \
0                                     0                    2   
1            

### Section 2 – Merge and quick sanity checks

In [5]:
# Join features to labels on patient_id and index the result by that id.
# validate="one_to_one" makes pandas raise MergeError if a patient_id is
# duplicated on either side, instead of silently multiplying rows.
# Chaining (rather than set_index(..., inplace=True)) keeps the cell a single
# expression that always rebuilds `df` from `values` and `labels`.
df = (
    values
    .merge(labels, on="patient_id", how="inner", validate="one_to_one")
    .set_index("patient_id")
)

# An inner join drops ids that are missing from labels; fail loudly rather
# than continue with a silently smaller dataset.
assert len(df) == len(values), (
    f"{len(values) - len(df)} row(s) in values have no matching label"
)

target_col = "heart_disease_present"  # change if your label column is named differently
X = df.drop(columns=[target_col])  # feature matrix: every column except the target
y = df[target_col]                 # target vector

print("Merged shape:", df.shape)
# normalize=True reports class proportions rather than raw counts.
print("Target distribution:\n", y.value_counts(normalize=True))

# Quick info
display(df.head())
df.info()


Merged shape: (180, 14)
Target distribution:
 heart_disease_present
0    0.555556
1    0.444444
Name: proportion, dtype: float64


Unnamed: 0_level_0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,heart_disease_present
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0,0
ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0,0
yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1,1
l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0,1
oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0,0


<class 'pandas.core.frame.DataFrame'>
Index: 180 entries, 0z64un to 2nx10r
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   slope_of_peak_exercise_st_segment     180 non-null    int64  
 1   thal                                  180 non-null    object 
 2   resting_blood_pressure                180 non-null    int64  
 3   chest_pain_type                       180 non-null    int64  
 4   num_major_vessels                     180 non-null    int64  
 5   fasting_blood_sugar_gt_120_mg_per_dl  180 non-null    int64  
 6   resting_ekg_results                   180 non-null    int64  
 7   serum_cholesterol_mg_per_dl           180 non-null    int64  
 8   oldpeak_eq_st_depression              180 non-null    float64
 9   sex                                   180 non-null    int64  
 10  age                                   180 non-null    int64  
 11  max_heart_rate_a

### Section 3 – Define feature types and preprocessing

In [6]:
# Columns that are standardised (zero mean, unit variance) by StandardScaler.
# List order is kept because it fixes the order of the transformed columns.
numeric_features = [
    "age", "resting_blood_pressure", "serum_cholesterol_mg_per_dl",
    "max_heart_rate_achieved", "oldpeak_eq_st_depression", "num_major_vessels",
    "fasting_blood_sugar_gt_120_mg_per_dl", "sex", "exercise_induced_angina",
]

# Columns that are one-hot encoded. Apart from `thal` these are integer-coded,
# but they are treated as unordered categories here.
categorical_features = [
    "thal", "chest_pain_type", "resting_ekg_results",
    "slope_of_peak_exercise_st_segment",
]

# Each branch is a one-step Pipeline so more steps can be added without
# changing how the ColumnTransformer is wired.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# handle_unknown="ignore": a category not seen during fit is encoded as
# all zeros at transform time instead of raising.
categorical_transformer = Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))])

# Route each column group to its branch; columns listed in neither group are
# dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


### Section 4 – Baseline model (Logistic Regression) with full metrics

In [7]:
# Hold out 20% of the rows as a test set, stratified on the label so both
# splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Baseline: the shared preprocessor followed by logistic regression.
# class_weight="balanced" reweights classes inversely to their frequency.
log_reg_clf = Pipeline([
    ("preprocess", preprocessor),
    (
        "model",
        LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            random_state=RANDOM_STATE,
        ),
    ),
])
log_reg_clf.fit(X_train, y_train)

# Hard labels feed the threshold-based metrics; the second predict_proba
# column (index 1) feeds the ranking metrics.
y_pred = log_reg_clf.predict(X_test)
y_proba = log_reg_clf.predict_proba(X_test)[:, 1]

# (printed label, metric function, predictions it is scored on), evaluated and
# printed one at a time in this order.
for metric_name, metric_fn, metric_input in (
    ("Accuracy", accuracy_score, y_pred),
    ("Precision", precision_score, y_pred),
    ("Recall", recall_score, y_pred),
    ("F1", f1_score, y_pred),
    ("ROC AUC", roc_auc_score, y_proba),
    ("PR AUC", average_precision_score, y_proba),
):
    print(f"{metric_name}:", metric_fn(y_test, metric_input))

# Rows are true classes, columns are predicted classes.
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8611111111111112
Precision: 0.7619047619047619
Recall: 1.0
F1: 0.8648648648648649
ROC AUC: 0.959375
PR AUC: 0.9480888483936084
Confusion matrix:
 [[15  5]
 [ 0 16]]
