In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Load the raw stroke dataset from the CSV next to the notebook and preview
# its first five rows as plain text.
df_stroke_prediction_original = pd.read_csv(
    "./healthcare-dataset-stroke-data.csv"
)
print(df_stroke_prediction_original.head(5))

      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  


In [2]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df_stroke_prediction_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [3]:
# Count rows per value of the `stroke` target to check how balanced the classes are.
df_stroke_prediction_original.value_counts('stroke')

stroke
0    4861
1     249
Name: count, dtype: int64

In [4]:
# Keep only rows whose age is at least 20, then renumber the index from 0
# so later positional concatenation lines up.
age_at_least_20 = df_stroke_prediction_original['age'] >= 20
df_stroke_prediction = df_stroke_prediction_original.loc[age_at_least_20].reset_index(drop=True)

In [5]:
# Target is the `stroke` column; features are every column except the
# target and the `id` column.
y = df_stroke_prediction['stroke']
X = df_stroke_prediction.drop(columns=['stroke', 'id'])

# Object-dtype columns are treated as categorical and one-hot encoded into
# a dense frame whose column names come from the fitted encoder.
categories = X.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = pd.DataFrame(
    encoder.fit_transform(X[categories]),
    columns=encoder.get_feature_names_out(categories),
)

print(X_encoded.head())

# Drop the original categorical columns, then place the remaining columns
# and the encoded columns side by side (both frames use a fresh 0..n-1 index).
X = X.drop(columns=categories).reset_index(drop=True)
X_final = pd.concat(
    [X, X_encoded.reset_index(drop=True)],
    axis=1,
)

   gender_Female  gender_Male  gender_Other  ever_married_No  \
0            0.0          1.0           0.0              0.0   
1            1.0          0.0           0.0              0.0   
2            0.0          1.0           0.0              0.0   
3            1.0          0.0           0.0              0.0   
4            1.0          0.0           0.0              0.0   

   ever_married_Yes  work_type_Govt_job  work_type_Never_worked  \
0               1.0                 0.0                     0.0   
1               1.0                 0.0                     0.0   
2               1.0                 0.0                     0.0   
3               1.0                 0.0                     0.0   
4               1.0                 0.0                     0.0   

   work_type_Private  work_type_Self-employed  Residence_type_Rural  \
0                1.0                      0.0                   0.0   
1                0.0                      1.0                   1.0   

In [28]:
# Recursive feature elimination (RFE) to identify the most important features.
# NOTE(review): RFE is fitted here on the full feature matrix, before the
# train/test split performed in a later cell, so rows that end up in the test
# set also influence which features are selected. Consider fitting RFE on the
# training split only.

from sklearn.feature_selection import RFE

# Create the RFE model and keep the top 4 features.
# The forest is seeded (same seed as the other stochastic steps in this
# notebook) so the selected feature set is reproducible across runs.
rfe = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=4)
rfe.fit(X_final, y)

0,1,2
,estimator,RandomForestClassifier()
,n_features_to_select,4
,step,1
,verbose,0
,importance_getter,'auto'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
# Column labels that RFE kept (boolean support mask applied to the columns),
# printed together with the shape of the full feature matrix.
selected_features = X_final.columns[rfe.support_]
print("Selected Features:", selected_features, X_final.shape, sep="\n")

Selected Features:
Index(['age', 'avg_glucose_level', 'bmi', 'Residence_type_Urban'], dtype='object')
(4144, 20)


In [44]:
# Split the dataset into training and testing sets (80% / 20%, fixed seed).
X_final_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

# Work on explicit copies: assigning columns directly into the frames
# returned by train_test_split can trigger pandas' SettingWithCopyWarning.
X_final_train = X_final_train.copy()
X_test = X_test.copy()

# Apply log transform to skewed numeric features (preserve outliers).
# log1p(x) = log(1 + x); missing values stay missing and are imputed later.
skewed_cols = ['bmi', 'avg_glucose_level']
X_final_train[skewed_cols] = np.log1p(X_final_train[skewed_cols])
X_test[skewed_cols] = np.log1p(X_test[skewed_cols])

In [45]:
from sklearn.impute import SimpleImputer

# Learn the per-column medians from the training split only, then use those
# same medians to fill missing values in both the training and test splits.
imputer = SimpleImputer(strategy='median')
imputer.fit(X_final_train)
X_final_train = imputer.transform(X_final_train)
X_test = imputer.transform(X_test)

In [59]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
# The stray `from torch import threshold` import was removed: the name is
# rebound to a float below before any use, so the import only added a torch
# dependency to this cell.

# Get the original feature names before SMOTE; they are used below to
# re-label the resampled training data and the imputed test data.
feature_names = X_final.columns

# Apply SMOTE to balance classes (training split only; the test split is
# left at its original class distribution).
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_final_train, y_train)

print("Before SMOTE:\n", y_train.value_counts())
print("\nAfter SMOTE:\n", y_train_res.value_counts())

# Convert back to dataframe and use selected features
X_selected = pd.DataFrame(X_train_res, columns=feature_names)[selected_features]
X_test_selected = pd.DataFrame(X_test, columns=feature_names)[selected_features]
y_train_res = pd.Series(y_train_res)

# Train Random Forest on the resampled, feature-selected training data
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_selected, y_train_res)

# Adjust threshold to improve recall for stroke cases: a row is predicted
# as class 1 when its predicted probability for class 1 is at least 0.2.
# NOTE(review): the threshold is evaluated directly on the test split;
# consider choosing it on a validation split instead.
threshold = 0.2
y_proba = rf.predict_proba(X_test_selected)[:, 1]
y_pred_adj = (y_proba >= threshold).astype(int)

# Evaluate the threshold-adjusted predictions.
# `y_pred` holds the model's default class predictions; it is not used in
# the report below and is kept only for comparison.
y_pred = rf.predict(X_test_selected)
print("Accuracy:", accuracy_score(y_test, y_pred_adj))
print(classification_report(y_test, y_pred_adj))

Before SMOTE:
 stroke
0    3132
1     183
Name: count, dtype: int64

After SMOTE:
 stroke
0    3132
1    3132
Name: count, dtype: int64
Accuracy: 0.6863691194209891
              precision    recall  f1-score   support

           0       0.96      0.69      0.80       765
           1       0.15      0.66      0.24        64

    accuracy                           0.69       829
   macro avg       0.55      0.67      0.52       829
weighted avg       0.90      0.69      0.76       829

