In [2]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import sys
sys.path.append('../utils')
from data_prep import prepare_data

# ---------------------------------------------------------------------------
# 1. Data: load features/target via the project helper, then binarize target.
# ---------------------------------------------------------------------------
# NOTE(review): prepare_data lives in ../utils; its exact return types are not
# visible here -- y is assumed to be a pandas Series (it is used via .apply).
X, y = prepare_data('../data/Heart_Disease.csv')

# Collapse the target to two classes: 0 stays 0, every other value becomes 1.
y_binary = y.apply(lambda label: int(label != 0))

# Hold out 20% of the rows with a fixed seed so the split is reproducible.
# X_test / y_test are produced here but not used further in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    random_state=42,
)

# ---------------------------------------------------------------------------
# 2. Model: fixed Random Forest hyperparameters (chosen in an earlier tuning
#    run, per the original note) inside a scale-then-classify pipeline.
# ---------------------------------------------------------------------------
best_params = {
    'max_depth': 36,
    'min_samples_split': 12,
    'n_estimators': 64,
}

forest = RandomForestClassifier(random_state=42, **best_params)
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', forest),
])

# ---------------------------------------------------------------------------
# 3. Train on the training split and persist the fitted pipeline with joblib.
# ---------------------------------------------------------------------------
pipeline.fit(X_train, y_train)

model_filename = 'final_model.pkl'
joblib.dump(pipeline, model_filename)

print(f"Model pipeline successfully exported as '{model_filename}'")

Model pipeline successfully exported as 'final_model.pkl'
