In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Load preprocessed dataset
df = pd.read_csv("final_processed_dataset.csv")

# Identify target column
target_col = "Heart Disease Status"
X = df.drop(columns=[target_col])
y = df[target_col]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([("clf", LogisticRegression(max_iter=1000, solver="liblinear"))])
param_grid = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['liblinear', 'saga']
}


grid = GridSearchCV(pipe, param_grid, cv=cv, scoring="f1", n_jobs=-1)
grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)
y_prob = grid.predict_proba(X_test)[:, 1]

print("Best Parameters:", grid.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Best Parameters: {'clf__C': 0.01, 'clf__penalty': 'l1', 'clf__solver': 'saga'}
Accuracy: 0.5075
Precision: 0.5072727272727273
Recall: 0.523125
F1 Score: 0.5150769230769231
ROC AUC: 0.5071152343750001
Confusion Matrix:
 [[787 813]
 [763 837]]


In [3]:
y.value_counts()


Heart Disease Status
1    8000
0    8000
Name: count, dtype: int64