# CLEANING THE DATASET

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import ttest_ind
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

In [7]:
# Read the stroke dataset from its CSV export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine until it is replaced by a relative/configurable one.
df = pd.read_csv(
    r"C:\Users\Admin\Documents\Datasets\New folder\healthcare-dataset-stroke-data.csv"
)

# Display (rows, columns) as the cell output.
df.shape

(5110, 12)

In [8]:
# Cleaning, done as a single chain:
#   1. drop every row that has a null in any column,
#   2. discard rows whose gender is 'Other',
#   3. renumber the index 0..n-1 so later positional joins line up.
df = (
    df.dropna()
      .loc[lambda frame: frame['gender'] != 'Other']
      .reset_index(drop=True)
)

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [9]:
# List the distinct categories present in the work_type column.
df.loc[:, 'work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [10]:
# One-hot encode the categorical columns and attach the result to the
# remaining (numeric) columns of df.

# Listed once so the fit, the generated column names and the drop below
# can never drift apart.
categorical_cols = ['gender', 'work_type', 'Residence_type', 'smoking_status', 'ever_married']

# Create the encoder (dense output so it can go straight into a DataFrame)
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform
encoded_array = encoder.fit_transform(df[categorical_cols])

# Convert to a DataFrame that carries df's own index. pd.concat(axis=1) aligns
# rows by index label, so without index=df.index the join would only be
# correct when df happens to have a clean 0..n-1 index; otherwise rows would
# be silently misaligned and padded with NaN.
encoded_df = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Merge encoded_df with the non-categorical columns of the original df
merged_df = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

# Invariant: encoding must neither add nor lose rows.
assert merged_df.shape[0] == df.shape[0], "row count changed while merging encoded columns"

merged_df.head()



Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,work_type_Govt_job,...,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,ever_married_No,ever_married_Yes
0,9046,67.0,0,1,228.69,36.6,1,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,31112,80.0,0,1,105.92,32.5,1,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,60182,49.0,0,0,171.23,34.4,1,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,1665,79.0,1,0,174.12,24.0,1,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,56669,81.0,0,0,186.21,29.0,1,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


# EXPLORATORY DATA (STATISTICAL) ANALYSIS

In [11]:
# Is age distributed differently between patients with and without a stroke?
# (A significant result shows an association between age and stroke, not causation.)

# Split the age column by the stroke label.
group_0 = merged_df.loc[merged_df['stroke'] == 0, 'age']
group_1 = merged_df.loc[merged_df['stroke'] == 1, 'age']

# Independent two-sample t-test; equal_var=False selects Welch's variant,
# which does not assume the two groups share a variance.
t_stat, p_value = ttest_ind(group_0, group_1, equal_var=False)

# Group means for the printed summary.
mean_age_0, mean_age_1 = group_0.mean(), group_1.mean()

print(f"Mean age (target=0): {mean_age_0:.2f}")
print(f"Mean age (target=1): {mean_age_1:.2f}")
print(f"P-value: {p_value:.4f}")


Mean age (target=0): 41.76
Mean age (target=1): 67.71
P-value: 0.0000


In [12]:
# Is the average glucose level distributed differently between patients with
# and without a stroke? (Significance indicates association, not causation.)

# Split the avg_glucose_level column by the stroke label.
group_0 = merged_df.loc[merged_df['stroke'] == 0, 'avg_glucose_level']
group_1 = merged_df.loc[merged_df['stroke'] == 1, 'avg_glucose_level']

# Independent two-sample t-test; equal_var=False selects Welch's variant,
# which does not assume the two groups share a variance.
t_stat, p_value = ttest_ind(group_0, group_1, equal_var=False)

# Group means for the printed summary.
mean_glu_0, mean_glu_1 = group_0.mean(), group_1.mean()

print(f"Mean glu (target=0): {mean_glu_0:.2f}")
print(f"Mean glu (target=1): {mean_glu_1:.2f}")
print(f"P-value: {p_value:.4f}")


Mean glu (target=0): 104.00
Mean glu (target=1): 134.57
P-value: 0.0000


In [13]:
# Is BMI distributed differently between patients with and without a stroke?
# (Significance indicates association, not causation.)

# Split the bmi column by the stroke label.
group_0 = merged_df.loc[merged_df['stroke'] == 0, 'bmi']
group_1 = merged_df.loc[merged_df['stroke'] == 1, 'bmi']

# Independent two-sample t-test; equal_var=False selects Welch's variant,
# which does not assume the two groups share a variance.
t_stat, p_value = ttest_ind(group_0, group_1, equal_var=False)

# Group means for the printed summary.
mean_bmi_0, mean_bmi_1 = group_0.mean(), group_1.mean()

print(f"Mean bmi (target=0): {mean_bmi_0:.2f}")
print(f"Mean bmi (target=1): {mean_bmi_1:.2f}")
print(f"P-value: {p_value:.4f}")


Mean bmi (target=0): 28.82
Mean bmi (target=1): 30.47
P-value: 0.0003


# MODEL SELECTION AND TRAINING

In [22]:
# Target: the stroke label.
y = merged_df['stroke']

# Features: every remaining column except the record identifier and the target.
x = merged_df.drop(['id', 'stroke'], axis="columns")

In [23]:
# Split, scale and fit the model.
#
# The split happens BEFORE scaling: fitting the scaler on all rows and only
# then splitting would let the test rows influence the mean/std used to
# transform the training rows (data leakage), which biases the evaluation.

# Stratified 80/20 split so both parts keep the same stroke rate; the fixed
# seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, random_state=42, test_size=0.2
)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)  # learn mean/std from training rows only
x_test = scaler.transform(x_test)        # apply the training statistics unchanged

# Full feature matrix scaled with the training statistics; kept so that any
# cell still reading x_norm continues to work.
x_norm = scaler.transform(x)

# Logistic regression with scikit-learn's default settings.
LR = LogisticRegression().fit(x_train, y_train)

In [24]:
# Predictions on the held-out test set.

# Hard labels from the model's default decision rule (kept for reference).
y_pred = LR.predict(x_test)

# Class probabilities; the second column is taken as the probability of the
# positive class (presumably stroke == 1, given 0/1 labels — verify via LR.classes_).
y_pred_prob = LR.predict_proba(x_test)
y_pred_prob_custom = y_pred_prob[:, 1]

# Flag a stroke whenever its predicted probability exceeds 0.05: a deliberately
# low cut-off that trades precision for recall on the rare positive class.
custom = np.greater(y_pred_prob_custom, 0.05).astype(int)

# MODEL EVALUATION

In [25]:
# I've encountered a classic case of class imbalance, where the positive class (stroke being present in this case) is rare but critical.
# The model is biased towards the negative class because negatives dominate the dataset,
# so it predicts everything as negative and therefore has a high accuracy
# but very low or zero precision and recall for the positive class,
# and is therefore not useful.
# To solve this I considered adding the class_weight='balanced' option on a random forest.
# I also considered using the SMOTE technique to oversample the minority class (1).
# I also considered changing to a logistic regression model and raising the probability threshold for predicting a zero
# (equivalently, lowering the threshold for predicting a stroke).
# I also considered using XGBoost.

In [26]:
# Per-class precision / recall / F1 for the custom-threshold predictions,
# compared against the true test labels.
print(classification_report(y_true=y_test, y_pred=custom))

              precision    recall  f1-score   support

           0       0.98      0.77      0.86       940
           1       0.11      0.62      0.19        42

    accuracy                           0.77       982
   macro avg       0.54      0.70      0.52       982
weighted avg       0.94      0.77      0.83       982



In [None]:
# Future considerations include getting more data, and possibly more features too, guided by domain knowledge.