# **Stroke Prediction**

**Dataset**: [`https://www.kaggle.com/fedesoriano/stroke-prediction-dataset`](https://www.kaggle.com/fedesoriano/stroke-prediction-dataset)<br>
**GitHub Repo**: [`https://github.com/stefanosPanteli/EPL448_Team1_StrokePrediction`](https://github.com/stefanosPanteli/EPL448_Team1_StrokePrediction)

## Team Members:

- **Loukia Shikki**: UC1066315
- **Stefanos Panteli**: UC1065916
- **Rafael Mitilineos**: UC1066383

---

# Installs

In [25]:
# Placeholder cell: the command below is commented out, so nothing is installed here.
# If a package is ever required, prefer `%pip install <package>==<version>` so the
# install is pinned and targets the running kernel's environment.
#!pip install

---

# Imports

---

# Dataset Extraction and First Look

In [26]:
# Libraries
import pandas as pd

In [27]:
# Load the stroke dataset from the CSV in the current working directory and preview it
df = pd.read_csv(filepath_or_buffer='healthcare-dataset-stroke-data.csv')
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [28]:
# Dataset dimensions (rows, columns) followed by the ordered list of column names
(df.shape, list(df.columns))

((5110, 12),
 ['id',
  'gender',
  'age',
  'hypertension',
  'heart_disease',
  'ever_married',
  'work_type',
  'Residence_type',
  'avg_glucose_level',
  'bmi',
  'smoking_status',
  'stroke'])

In [29]:
# Print each column's name, non-null count and dtype, plus overall memory usage
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [30]:
# Count NaNs in every column and list the most incomplete columns first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

bmi                  201
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
smoking_status         0
stroke                 0
dtype: int64

In [31]:
# Summary statistics for all columns (numeric and categorical), one row per column
df.describe(include="all").transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,5110.0,,,,36517.829354,21161.721625,67.0,17741.25,36932.0,54682.0,72940.0
gender,5110.0,3.0,Female,2994.0,,,,,,,
age,5110.0,,,,43.226614,22.612647,0.08,25.0,45.0,61.0,82.0
hypertension,5110.0,,,,0.097456,0.296607,0.0,0.0,0.0,0.0,1.0
heart_disease,5110.0,,,,0.054012,0.226063,0.0,0.0,0.0,0.0,1.0
ever_married,5110.0,2.0,Yes,3353.0,,,,,,,
work_type,5110.0,5.0,Private,2925.0,,,,,,,
Residence_type,5110.0,2.0,Urban,2596.0,,,,,,,
avg_glucose_level,5110.0,,,,106.147677,45.28356,55.12,77.245,91.885,114.09,271.74
bmi,4909.0,,,,28.893237,7.854067,10.3,23.5,28.1,33.1,97.6


In [32]:
# Class balance of the target: per-class counts and percentages, plus an overall stroke-rate row
target = df["stroke"]

# One row per class value, ordered by class label
summary = (
    target.value_counts(dropna=False)
    .rename_axis("stroke")
    .sort_index()
    .to_frame("count")
)
# Share of each class among all rows, in percent
summary["percent"] = (summary["count"] / summary["count"].sum() * 100).round(2)

# Mean of the 0/1 target, i.e. the fraction of rows with stroke == 1
stroke_rate = target.mean()

# Extra row holding the total number of rows and the stroke rate in percent
stroke_rate_row = pd.DataFrame(
    data={"count": [len(target)], "percent": [round(stroke_rate * 100, 2)]},
    index=pd.Index(["Stroke rate"], name="stroke"),
)

pd.concat([summary, stroke_rate_row], axis=0)


Unnamed: 0_level_0,count,percent
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4861,95.13
1,249,4.87
Stroke rate,5110,4.87


In [33]:
# Build a separate analysis frame so the raw `df` stays untouched, forcing bmi to a numeric
# dtype; any value that cannot be parsed (e.g. an 'N/A' string) becomes NaN
df_clean = df.assign(bmi=pd.to_numeric(df["bmi"], errors="coerce"))

# Fraction of rows whose bmi is missing
df_clean[["bmi"]].isna().mean()

bmi    0.039335
dtype: float64

In [34]:
# Summary statistics for every numeric column of the cleaned frame, one row per column
num_cols = df_clean.select_dtypes(include="number").columns
summary_stats = ["count", "mean", "std", "min", "median", "max"]
df_clean.loc[:, num_cols].agg(summary_stats).transpose()

Unnamed: 0,count,mean,std,min,median,max
id,5110.0,36517.829354,21161.721625,67.0,36932.0,72940.0
age,5110.0,43.226614,22.612647,0.08,45.0,82.0
hypertension,5110.0,0.097456,0.296607,0.0,0.0,1.0
heart_disease,5110.0,0.054012,0.226063,0.0,0.0,1.0
avg_glucose_level,5110.0,106.147677,45.28356,55.12,91.885,271.74
bmi,4909.0,28.893237,7.854067,10.3,28.1,97.6
stroke,5110.0,0.048728,0.21532,0.0,0.0,1.0


In [35]:
# Group size and stroke rate (mean of the 0/1 target) per gender, highest rate first
stroke_by_gender = df_clean.groupby("gender")["stroke"].agg(["count", "mean"])
stroke_by_gender.sort_values(by="mean", ascending=False)

Unnamed: 0_level_0,count,mean
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,2115,0.051064
Female,2994,0.047094
Other,1,0.0


In [36]:
# Group size and stroke rate for every hypertension / heart-disease combination, highest rate first
comorbidity_keys = ["hypertension", "heart_disease"]
(
    df_clean
    .groupby(comorbidity_keys)["stroke"]
    .agg(["count", "mean"])
    .sort_values(by="mean", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean
hypertension,heart_disease,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,64,0.203125
0,1,212,0.160377
1,0,434,0.12212
0,0,4400,0.033864


In [37]:
# Group size and stroke rate per smoking-status category, highest rate first
stroke_by_smoking = df_clean.groupby(by="smoking_status")["stroke"].agg(["count", "mean"])
stroke_by_smoking.sort_values(by="mean", ascending=False)

Unnamed: 0_level_0,count,mean
smoking_status,Unnamed: 1_level_1,Unnamed: 2_level_1
formerly smoked,885,0.079096
smokes,789,0.053232
never smoked,1892,0.047569
Unknown,1544,0.03044


In [38]:
# Bucket age into labelled bands and report group size and stroke rate per band
age_bins = [0, 18, 30, 45, 60, 75, 120]
age_labels = ["0-18", "19-30", "31-45", "46-60", "61-75", "76+"]

# Bands are closed on the right edge; include_lowest also admits an age of exactly 0
df_clean["age_group"] = pd.cut(
    df_clean["age"],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)

# observed=True keeps only the bands that actually occur in the data
stroke_by_age = df_clean.groupby("age_group", observed=True)["stroke"].agg(["count", "mean"])
stroke_by_age.sort_values("age_group")

Unnamed: 0_level_0,count,mean
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
0-18,916,0.002183
19-30,654,0.0
31-45,1048,0.010496
46-60,1188,0.049663
61-75,839,0.096544
76+,465,0.206452


In [39]:
# Split glucose and BMI into quartile bins, then compare group size and stroke rate across bins
df_clean["glucose_q"] = pd.qcut(df_clean["avg_glucose_level"], q=4, duplicates="drop")
df_clean["bmi_q"] = pd.qcut(df_clean["bmi"], q=4, duplicates="drop")  # rows with NaN bmi get no bin

quantile_stats = ["count", "mean"]
glucose_summary = df_clean.groupby("glucose_q", observed=True)["stroke"].agg(quantile_stats)
bmi_summary = df_clean.groupby("bmi_q", observed=True)["stroke"].agg(quantile_stats)

# Give both summaries the same layout (bin column named "quantile", plus a "metric" tag)
# so they can be stacked into one table
glucose_long = glucose_summary.rename_axis("quantile").reset_index().assign(metric="glucose_q")
bmi_long = bmi_summary.rename_axis("quantile").reset_index().assign(metric="bmi_q")

stacked = pd.concat([glucose_long, bmi_long], ignore_index=True)
stacked[["metric", "quantile", "count", "mean"]]


Unnamed: 0,metric,quantile,count,mean
0,glucose_q,"(55.119, 77.245]",1278,0.043036
1,glucose_q,"(77.245, 91.885]",1277,0.029757
2,glucose_q,"(91.885, 114.09]",1278,0.037559
3,glucose_q,"(114.09, 271.74]",1277,0.084573
4,bmi_q,"(10.299000000000001, 23.5]",1232,0.017857
5,bmi_q,"(23.5, 28.1]",1251,0.05036
6,bmi_q,"(28.1, 33.1]",1216,0.055921
7,bmi_q,"(33.1, 97.6]",1210,0.046281


In [41]:
# For every object-dtype column, show how often each value occurs (count and percent of rows)
cat_cols = df.select_dtypes(include="object").columns
for col in cat_cols:
    # NaN is counted as its own value so the percentages cover every row
    freq = df[col].value_counts(dropna=False).to_frame("count")
    freq["percent"] = (freq["count"] / freq["count"].sum() * 100).round(2)
    display(freq)

Unnamed: 0_level_0,count,percent
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,2994,58.59
Male,2115,41.39
Other,1,0.02


Unnamed: 0_level_0,count,percent
ever_married,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,3353,65.62
No,1757,34.38


Unnamed: 0_level_0,count,percent
work_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Private,2925,57.24
Self-employed,819,16.03
children,687,13.44
Govt_job,657,12.86
Never_worked,22,0.43


Unnamed: 0_level_0,count,percent
Residence_type,Unnamed: 1_level_1,Unnamed: 2_level_1
Urban,2596,50.8
Rural,2514,49.2


Unnamed: 0_level_0,count,percent
smoking_status,Unnamed: 1_level_1,Unnamed: 2_level_1
never smoked,1892,37.03
Unknown,1544,30.22
formerly smoked,885,17.32
smokes,789,15.44


---

# Exploratory data analysis

---

# Data pre-processing

---

# Pre-processed dataset versions

---

# Selected machine learning techniques