In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import joblib

In [5]:
df = pd.read_csv('../data/heart_disease_selected.csv')
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
param_grids = {
    'RandomForest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'LogisticRegression': {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }
}

In [7]:
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'SVM': SVC(probability=True, random_state=42)
}

In [13]:
best_models = {}

In [14]:
for model_name in models.keys():
    print(f"Tuning {model_name}")
    search = RandomizedSearchCV(
        estimator=models[model_name],
        param_distributions=param_grids[model_name],
        n_iter=20,
        cv=5,
        scoring='accuracy',
        random_state=42,
        n_jobs=-1
    )
    
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    train_score = best_model.score(X_train, y_train)
    test_score = best_model.score(X_test, y_test)
    
    best_models[model_name] = {
        'model': best_model,
        'best_params': search.best_params_,
        'train_score': train_score,
        'test_score': test_score
    }
    
    print(f"  Best params: {search.best_params_}")
    print(f"  Train score: {train_score:.3f}")
    print(f"  Test score: {test_score:.3f}")
    print("-" * 50)

Tuning RandomForest
  Best params: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}
  Train score: 1.000
  Test score: 0.617
--------------------------------------------------
Tuning LogisticRegression
  Best params: {'solver': 'liblinear', 'penalty': 'l1', 'C': 1}
  Train score: 0.616
  Test score: 0.633
--------------------------------------------------
Tuning SVM




  Best params: {'kernel': 'linear', 'gamma': 'scale', 'C': 1}
  Train score: 0.603
  Test score: 0.633
--------------------------------------------------


In [15]:
final_best_model = max(best_models.items(), key=lambda x: x[1]['test_score'])
print(f"Final best model: {final_best_model[0]} with test score: {final_best_model[1]['test_score']:.3f}")

Final best model: LogisticRegression with test score: 0.633


In [16]:
joblib.dump(final_best_model[1]['model'], '../models/final_model.pkl')

['../models/final_model.pkl']