## Import

In [47]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score, recall_score, precision_score, accuracy_score
from imblearn.over_sampling import SMOTE

## Loading Data (processed)

In [48]:
processed_file_path = '../data/processed/stroke_data_processed.csv'
df = pd.read_csv(processed_file_path)

We have to split our dataset into training and testing dataset.  
(70/30 or 80/20 are good)

In [49]:
# Split the data into features and target
X = df.drop(columns=['stroke', 'id'])
y = df['stroke']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [50]:
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced_subsample')
param_grid = {
    'n_estimators': [100, 200],         # Number of trees
    'max_depth': [10, 20, None],        # Maximum depth of trees (None means unlimited)
    'min_samples_split': [2, 5],        # Minimum samples required to split a node
    'min_samples_leaf': [1, 3]          # Minimum samples required at a leaf node
}
scoring = make_scorer(f1_score, pos_label=1) # Focus on F1 for the stroke class
grid_search = GridSearchCV(estimator=rf_model,
                           param_grid=param_grid,
                           scoring=scoring,
                           cv=3,
                           n_jobs=-1,
                           verbose=2) # verbose=2 shows progress

print("Starting GridSearchCV...")
grid_search.fit(X_train, y_train) # Fit on the original (but stratified) training data
print("\nGridSearchCV finished.")
print("Best parameters found: ", grid_search.best_params_)
print(f"Best cross-validation score ({scoring}): {grid_search.best_score_:.4f}")

# Use the best estimator found by the grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred_tuned = best_model.predict(X_test)

# Evaluate the tuned model
print("\nClassification Report (Tuned Model):")
print(classification_report(y_test, y_pred_tuned))

print("\nConfusion Matrix (Tuned Model):")
print(confusion_matrix(y_test, y_pred_tuned))

Starting GridSearchCV...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.6s
[CV] END max_depth=10, min_samples_leaf=3, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] 

Mmmm.. results are not so good.  
They are actually to predict if a person will not have a stroke (0) with 94% accuracy, but only 50% for stroke (1).