In [1]:
import sys
sys.path.append('../utils')
from data_prep import prepare_data
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

X, y = prepare_data('../data/Heart_Disease.csv')

# Convert the target variable to a binary format
y_binary = y.apply(lambda x: 0 if x == 0 else 1)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10]
}

# Create a GridSearchCV object
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)

# Fit the grid search to the training data
print("Starting GridSearchCV. This may take a few moments...")
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best hyperparameters found: ", grid_search.best_params_)
print("Best cross-validation AUC score: ", grid_search.best_score_)

# Evaluate the best model on the test set
best_rf_model = grid_search.best_estimator_
y_pred_best = best_rf_model.predict(X_test)
y_prob_best = best_rf_model.predict_proba(X_test)[:, 1]

# Print final evaluation metrics for the optimized model
print("\nOptimized Random Forest Model Performance on Test Set:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_best, average='weighted', zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_best, average='weighted', zero_division=0):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_best, average='weighted', zero_division=0):.4f}")
print(f"AUC Score: {roc_auc_score(y_test, y_prob_best):.4f}")

Starting GridSearchCV. This may take a few moments...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best hyperparameters found:  {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 100}
Best cross-validation AUC score:  0.8917909251242586

Optimized Random Forest Model Performance on Test Set:
Accuracy: 0.8689
Precision: 0.8689
Recall: 0.8689
F1-Score: 0.8689
AUC Score: 0.9440


In [2]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint

# Convert the target variable to a binary format
y_binary = y.apply(lambda x: 0 if x == 0 else 1)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Define the parameter distributions for RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(low=50, high=200),
    'max_depth': randint(low=10, high=50),
    'min_samples_split': randint(low=2, high=15)
}

# Create a RandomizedSearchCV object
rf_model = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_distributions, n_iter=20, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=42)

# Fit the random search to the training data
print("Starting RandomizedSearchCV. This may take a few moments...")
random_search.fit(X_train, y_train)

# Step 6: Print the best parameters and best score
print("Best hyperparameters found: ", random_search.best_params_)
print("Best cross-validation AUC score: ", random_search.best_score_)

# Evaluate the best model on the test set
best_random_model = random_search.best_estimator_
y_pred_best_random = best_random_model.predict(X_test)
y_prob_best_random = best_random_model.predict_proba(X_test)[:, 1]

# Print final evaluation metrics for the optimized model
print("\nOptimized Random Forest Model Performance on Test Set:")
print(f"AUC Score: {roc_auc_score(y_test, y_prob_best_random):.4f}")

Starting RandomizedSearchCV. This may take a few moments...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best hyperparameters found:  {'max_depth': 36, 'min_samples_split': 12, 'n_estimators': 64}
Best cross-validation AUC score:  0.8904120570787238

Optimized Random Forest Model Performance on Test Set:
AUC Score: 0.9472


In [3]:
y_binary = y.apply(lambda x: 0 if x == 0 else 1)
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

#  Define and train the baseline model (with default hyperparameters)
print("Training Baseline Random Forest Model...")
baseline_rf = RandomForestClassifier(random_state=42)
baseline_rf.fit(X_train, y_train)

# Make predictions and calculate performance metrics
y_pred_baseline = baseline_rf.predict(X_test)
y_prob_baseline = baseline_rf.predict_proba(X_test)[:, 1]

# Evaluate the baseline model
print("\nBaseline Random Forest Model Performance on Test Set:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_baseline):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_baseline, average='weighted', zero_division=0):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_baseline, average='weighted', zero_division=0):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_baseline, average='weighted', zero_division=0):.4f}")
print(f"AUC Score: {roc_auc_score(y_test, y_prob_baseline):.4f}")

Training Baseline Random Forest Model...

Baseline Random Forest Model Performance on Test Set:
Accuracy: 0.8525
Precision: 0.8566
Recall: 0.8525
F1-Score: 0.8525
AUC Score: 0.9418
