In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier  # Example on RF
from sklearn.metrics import f1_score

# Load the pre-selected feature table and separate predictors from the label.
df_selected = pd.read_csv('selected_features_heart_disease.csv')
X = df_selected.drop(columns='target')
y = df_selected['target']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on `y`. Consider `stratify=y` if the
# classes are imbalanced -- doing so changes the split and every score reported below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyperparameter grid for Random Forest (example; repeat for others).
# 3 x 3 x 3 = 27 candidate combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}

# 5-fold cross-validated search on the training split only, scored with F1.
# n_jobs=-1 spreads the 27 x 5 = 135 fits over all available cores; the folds and
# the estimator seed are fixed, so the selected parameters are unaffected.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)

# Score the best estimator found by the search on the untouched test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Optimized F1:", f1_score(y_test, y_pred))

# Compare with baseline: same seed, default hyperparameters, same train/test split.
baseline_rf = RandomForestClassifier(random_state=42)
baseline_rf.fit(X_train, y_train)
print("Baseline F1:", f1_score(y_test, baseline_rf.predict(X_test)))

Best params: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}
Optimized F1: 0.8524590163934426
Baseline F1: 0.819672131147541


In [3]:
import joblib
import os

# Create the 'models' directory if it doesn't exist.
# exist_ok=True makes this a single idempotent call, so re-running the cell is safe
# and there is no gap between an existence check and the directory creation.
os.makedirs('models', exist_ok=True)

# Persist the tuned estimator (`best_model` comes from the grid-search cell above).
joblib.dump(best_model, 'models/final_model.pkl')
print("Model saved as 'final_model.pkl'.")

# For pipeline (preprocessing + model), use sklearn Pipeline
from sklearn.pipeline import Pipeline
# Example: pipeline = Pipeline([('scaler', StandardScaler()), ('model', best_model)])
# joblib.dump(pipeline, 'models/full_pipeline.pkl')

Model saved as 'final_model.pkl'.
