In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,ConfusionMatrixDisplay
import matplotlib.pyplot as plt


In [13]:
df = pd.read_csv(r"C:\Users\abdal\OneDrive\Desktop\heart_stroke\healthcare-dataset-stroke-data.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [14]:
df = df.drop("id", axis=1)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [16]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [17]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [18]:
df = df.dropna()

In [19]:
df["gender"] = df["gender"].map({"Male": 0 ,"Female":1})
df["ever_married"] = df["ever_married"].map({"No": 0, "Yes": 1})
df["Residence_type"] = df["Residence_type"].map({"Rural": 0, "Urban": 1})

In [20]:
encoder = OneHotEncoder(drop="first", sparse_output=False)

cat_cols=["smoking_status","work_type"]

encoded = encoder.fit_transform(df[cat_cols])

encoded_df = pd.DataFrame(encoded,columns=encoder.get_feature_names_out(cat_cols))

df = pd.concat([df.drop(cat_cols,axis=1),encoded_df],axis=1)

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,0.0,67.0,0.0,1.0,1.0,1.0,228.69,36.6,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,80.0,0.0,1.0,1.0,0.0,105.92,32.5,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,49.0,0.0,0.0,1.0,1.0,171.23,34.4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.0,79.0,1.0,0.0,1.0,0.0,174.12,24.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.0,81.0,0.0,0.0,1.0,1.0,186.21,29.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [21]:
scaler = StandardScaler()
df[["bmi","avg_glucose_level"]] =scaler.fit_transform(df[["bmi","avg_glucose_level"]])

In [22]:
x = df.drop("stroke",axis=1)
y=df["stroke"]

In [23]:
X_train,X_test,y_train,y_test =train_test_split(x,y,test_size=0.2,random_state=42)

In [24]:
X_train = pd.DataFrame(X_train).dropna()
y_train = y_train[X_train.index]

X_test = pd.DataFrame(X_test).dropna()
y_test = y_test[X_test.index]

In [25]:
lr = LogisticRegression(max_iter=10000, class_weight="balanced")
rf = RandomForestClassifier(n_estimators=300,
    max_depth=10,
    min_samples_split=5,
    class_weight="balanced",
    random_state=42)

dt = DecisionTreeClassifier(max_depth=2, random_state=1)

ada = AdaBoostClassifier(estimator=dt,
    n_estimators=300,
    learning_rate=0.8,
    random_state=42)

gb = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1, max_depth=3, random_state=42)

models = [
    ("AdaBoostClassifier", ada),
    ("Logistic Regression", lr),
    ("Decision Tree", dt),
    ("Random Forest", rf),
     ("Gradient Boosting", gb)
]

In [26]:
for clf_name, clf in models:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("{:s} : {:.3f}".format(clf_name, (accuracy_score(y_test, y_pred))))

AdaBoostClassifier : 0.947
Logistic Regression : 0.761
Decision Tree : 0.947
Random Forest : 0.904
Gradient Boosting : 0.948
