In [271]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split


In [272]:
# Load the dataset from the CSV path below (relative to the working directory).
data_path = "../data/heart_disease_selected.csv"
df = pd.read_csv(data_path)

In [273]:
# Split the frame into the label column (y) and every remaining column (X).
target_col = "target"
y = df[target_col]
X = df.drop(columns=[target_col])

In [274]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [275]:
# Search space for logistic regression: four values of C crossed with two solvers.
param_log = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
}

# Base estimator handed to the grid search; max_iter is raised to 1000.
log_reg = LogisticRegression(max_iter=1000)

In [276]:
# Exhaustive search over param_log with 5-fold CV, scored by accuracy,
# using all available cores.
grid_log = GridSearchCV(
    estimator=log_reg,
    param_grid=param_log,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_log.fit(X_train, y_train)

# Keep the winning estimator and its predictions on the held-out split.
best_log = grid_log.best_estimator_
y_pred_log = best_log.predict(X_test)

In [277]:
# Search space for the decision tree: split criterion plus three size limits.
param_tree = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [2, 4, 6],
}

# Seeded base estimator so repeated runs build the same trees.
tree = DecisionTreeClassifier(random_state=42)

# Exhaustive search with 10-fold CV, scored by accuracy, on all cores.
grid_tree = GridSearchCV(
    estimator=tree,
    param_grid=param_tree,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_tree.fit(X_train, y_train)

# Keep the winning estimator and its predictions on the held-out split.
best_tree = grid_tree.best_estimator_
y_pred_tree = best_tree.predict(X_test)

In [278]:
# Candidate values for the random forest; only a random subset is evaluated.
param_rf = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

# Seeded base estimator so repeated runs build the same forests.
rf = RandomForestClassifier(random_state=42)

# Randomized search: 10 sampled combinations, 5-fold CV, scored by accuracy.
# The search itself is seeded so the same combinations are drawn each run.
random_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_rf,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_rf.fit(X_train, y_train)

# Keep the winning estimator and its predictions on the held-out split.
best_rf = random_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

In [279]:
# Search space for the SVM: C, gamma and kernel.
# NOTE(review): gamma is a kernel coefficient that the linear kernel does not
# use, so the linear/gamma combinations are repeated fits of the same model.
param_svm = {
    'C': [0.1, 0.5, 1, 5],
    'gamma': ['scale', 0.001, 0.01, 0.1],
    'kernel': ['linear', 'rbf'],
}

# Base estimator with library defaults.
svm = SVC()

# Exhaustive search with 10-fold CV, scored by accuracy, on all cores.
grid_svm = GridSearchCV(
    estimator=svm,
    param_grid=param_svm,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_svm.fit(X_train, y_train)

# Keep the winning estimator and its predictions on the held-out split.
best_svm = grid_svm.best_estimator_
y_pred_svm = best_svm.predict(X_test)

In [280]:
# Hard-coded reference accuracies per model, used as the "before tuning" column.
# NOTE(review): presumably copied from an earlier untuned run — confirm the source.
baseline_acc = dict([
    ('Logistic Regression', 0.86),
    ('Random Forest', 0.88),
    ('SVM', 0.91),
    ('Decision Tree', 0.77),
])

In [281]:
# Pair each model's display name with its predictions on the held-out split.
test_predictions = [
    ('Logistic Regression', y_pred_log),
    ('Random Forest', y_pred_rf),
    ('SVM', y_pred_svm),
    ('Decision Tree', y_pred_tree),
]

# Test-set accuracy of every tuned model, keyed by display name.
tuned_results = {
    name: accuracy_score(y_test, preds)
    for name, preds in test_predictions
}

In [282]:
# Build the baseline-vs-tuned comparison table.
# Both accuracy columns are looked up by model name rather than paired by dict
# position, so a baseline can never be attached to the wrong model if the two
# dicts are declared in different orders. A model missing from baseline_acc
# raises KeyError instead of silently misaligning rows.
model_names = list(tuned_results)
results_df = pd.DataFrame({
    'Model': model_names,
    'Baseline Accuracy': [baseline_acc[name] for name in model_names],
    'Tuned Accuracy': [tuned_results[name] for name in model_names],
})

# Positive values mean tuning improved on the baseline accuracy.
results_df['Improvement'] = results_df['Tuned Accuracy'] - results_df['Baseline Accuracy']

# Report the winning hyperparameters from each search.
print("Best Hyperparameters:")
print("Logistic Regression:", grid_log.best_params_)
print("Decision Tree:", grid_tree.best_params_)
print("Random Forest:", random_rf.best_params_)
print("SVM:", grid_svm.best_params_)


Best Hyperparameters:
Logistic Regression: {'C': 10, 'solver': 'liblinear'}
Decision Tree: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 5}
Random Forest: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 5, 'bootstrap': True}
SVM: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}


In [283]:
# Rank the models from highest to lowest tuned accuracy before printing.
ranked_results = results_df.sort_values('Tuned Accuracy', ascending=False)

print("Model Comparison:")
print(ranked_results)

Model Comparison:
                 Model  Baseline Accuracy  Tuned Accuracy  Improvement
2                  SVM               0.91        0.950820     0.040820
1        Random Forest               0.88        0.885246     0.005246
0  Logistic Regression               0.86        0.868852     0.008852
3        Decision Tree               0.77        0.852459     0.082459


In [286]:
import joblib
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
# StandardScaler's public home is sklearn.preprocessing; importing it from
# sklearn.discriminant_analysis only works because that module imports it too.
from sklearn.preprocessing import StandardScaler

# The SVM selected by the grid search is the model chosen for deployment.
best_model = grid_svm.best_estimator_

# Pipeline does not copy its steps: fitting the pipeline would refit the very
# object held by grid_svm.best_estimator_ (and best_svm) on scaled, PCA-rotated
# features, corrupting the model that was evaluated earlier. clone() gives an
# unfitted copy with the same hyperparameters, leaving the original untouched.
final_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # preprocessing
    ('pca', PCA()),
    ('model', clone(best_model)),
])

final_pipeline.fit(X_train, y_train)

# The SVM accuracy reported in the comparison table was measured on the bare
# SVC fitted on unscaled features without PCA, and its hyperparameters were
# tuned in that setting. Report the accuracy of the pipeline that is actually
# persisted so the saved artefact has its own number.
print("Final pipeline test accuracy:", accuracy_score(y_test, final_pipeline.predict(X_test)))

# Keep the fitted pipeline as the cell's displayed value.
final_pipeline

In [287]:
import os

# Persist the fitted pipeline. joblib.dump does not create missing parent
# directories, so make sure the target folder exists first (no-op if it does).
model_path = "../models/final_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# joblib files are pickle-based: only load this artefact from trusted locations.
joblib.dump(final_pipeline, model_path)

['../models/final_model.pkl']