In [74]:
# Step 0: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score, roc_curve,auc
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [58]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets 

# Combine into one DataFrame for easier processing
df = pd.concat([X, y], axis=1) 
# metadata 
print(heart_disease.metadata) 
  
# variable information 
print(heart_disease.variables) 


{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'ID': 231, 'type': 'NATIVE', 'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M

In [59]:
data=df.dropna()

In [60]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [61]:
categorical_cols = ['cp', 'restecg', 'slope', 'thal']
df_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

In [62]:
from sklearn.preprocessing import MinMaxScaler

# columns need scalling
num_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

df_scaled = df_encoded.copy()

scaler = MinMaxScaler()
df_scaled[num_cols] = scaler.fit_transform(df_scaled[num_cols])

In [63]:
df_scaled.head(10)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,num,cp_2,cp_3,cp_4,restecg_1,restecg_2,slope_2,slope_3,thal_6.0,thal_7.0
0,0.708333,1,0.481132,0.244292,1,0.603053,0,0.370968,0.0,0,False,False,False,False,True,False,True,True,False
1,0.791667,1,0.622642,0.365297,0,0.282443,1,0.241935,1.0,2,False,False,True,False,True,True,False,False,False
2,0.791667,1,0.245283,0.23516,0,0.442748,1,0.419355,0.666667,1,False,False,True,False,True,True,False,False,True
3,0.166667,1,0.339623,0.283105,0,0.885496,0,0.564516,0.0,0,False,True,False,False,False,False,True,False,False
4,0.25,0,0.339623,0.178082,0,0.770992,0,0.225806,0.0,0,True,False,False,False,True,False,False,False,False
5,0.5625,1,0.245283,0.251142,0,0.816794,0,0.129032,0.0,0,True,False,False,False,False,False,False,False,False
6,0.6875,0,0.433962,0.324201,0,0.679389,0,0.580645,0.666667,3,False,False,True,False,True,False,True,False,False
7,0.583333,0,0.245283,0.520548,0,0.70229,1,0.096774,0.0,0,False,False,True,False,False,False,False,False,False
8,0.708333,1,0.339623,0.292237,0,0.580153,0,0.225806,0.333333,2,False,False,True,False,True,True,False,False,True
9,0.5,1,0.433962,0.175799,1,0.641221,1,0.5,0.0,1,False,False,True,False,True,False,True,False,True


In [64]:
df_scaled.isnull().sum()

age          0
sex          0
trestbps     0
chol         0
fbs          0
thalach      0
exang        0
oldpeak      0
ca           0
num          0
cp_2         0
cp_3         0
cp_4         0
restecg_1    0
restecg_2    0
slope_2      0
slope_3      0
thal_6.0     0
thal_7.0     0
dtype: int64

In [65]:
# Convert multi-class target into binary (0 = no disease, 1 = disease)
df_scaled["num_binary"] = df_scaled["num"].apply(lambda x: 1 if x > 0 else 0)

# Use this as target
y = df_scaled["num_binary"]
X = df_scaled.drop(["num", "num_binary"], axis=1)

In [66]:
# Split data into train (80%) and test (20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [67]:
y.head()

0    0
1    1
2    1
3    0
4    0
Name: num_binary, dtype: int64

In [75]:
# ------------------- Logistic Regression -------------------
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10],
    "solver": ["liblinear", "lbfgs"]
}
grid_lr = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, cv=5, scoring="accuracy")
grid_lr.fit(X_train, y_train)

print("Best Logistic Regression Params:", grid_lr.best_params_)
print("Best Logistic Regression Score:", grid_lr.best_score_)


Best Logistic Regression Params: {'C': 0.1, 'solver': 'lbfgs'}
Best Logistic Regression Score: 0.8305851063829787


In [76]:
# ------------------- Random Forest -------------------
param_grid_rf = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10]
}
random_rf = RandomizedSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring="accuracy", n_iter=5, random_state=42)
random_rf.fit(X_train, y_train)

print("Best Random Forest Params:", random_rf.best_params_)
print("Best Random Forest Score:", random_rf.best_score_)

Best Random Forest Params: {'n_estimators': 200, 'min_samples_split': 10, 'max_depth': None}
Best Random Forest Score: 0.8349290780141845


In [77]:
# ------------------- SVM -------------------
param_grid_svm = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf", "poly"],
    "gamma": ["scale", "auto"]
}
grid_svm = GridSearchCV(SVC(probability=True), param_grid_svm, cv=5, scoring="accuracy")
grid_svm.fit(X_train, y_train)

print("Best SVM Params:", grid_svm.best_params_)
print("Best SVM Score:", grid_svm.best_score_)

Best SVM Params: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best SVM Score: 0.8430851063829786


In [80]:
# ------------------- Decision Tree -------------------
param_grid_dt = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}
grid_dt = GridSearchCV(DecisionTreeClassifier(), param_grid_dt, cv=5, scoring="accuracy")
grid_dt.fit(X_train, y_train)

print("Best Decision Tree Params:", grid_dt.best_params_)
print("Best Decision Tree Score:", grid_dt.best_score_)

Best Decision Tree Params: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Decision Tree Score: 0.8008865248226951
