## Import

In [40]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import jinja2
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Add the project root directory to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added '{project_root}' to sys.path")

import src.load_libs 
from src.load_data import load_stroke_data 


## Loading data

In [41]:
print("Loading data...")
df = load_stroke_data()
display(df.head())

Loading data...
Attempting to load file: healthcare-dataset-stroke-data.csv from dataset fedesoriano/stroke-prediction-dataset
Dataset loaded successfully!


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [42]:
df.shape

(5110, 12)

In [43]:
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

Mmm... empty datas on bmi.

Let's do median imputation for bmi.

In [44]:
if df is not None:
    # Check missing values before imputation
    print("Missing BMI values before imputation:", df['bmi'].isnull().sum())

    if df['bmi'].isnull().sum() > 0:
        # Calculate the median BMI (ignoring NaNs)
        median_bmi = df['bmi'].median()
        print(f"Median BMI: {median_bmi:.2f}")

        # Impute missing values using the median
        # This is the core line for imputation:
        df['bmi'].fillna(median_bmi, inplace=True)

        # Verify that missing values have been filled
        print("Missing BMI values after imputation:", df['bmi'].isnull().sum())
    else:
        print("No missing BMI values found.")
else:
    print("DataFrame 'df' not loaded.")

Missing BMI values before imputation: 201
Median BMI: 28.10
Missing BMI values after imputation: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(median_bmi, inplace=True)


## Preproc

In [45]:
median_bmi = df['bmi'].median()
df['bmi'].fillna(median_bmi, inplace=True)

# Drop ID
if 'id' in df.columns:
    df.drop('id', axis=1, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(median_bmi, inplace=True)


In [46]:
df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

## Encoding data

ML algos need numerical features, so we need to convert the categorical features to numerical features.

In [47]:
if df is not None:
    print("DataFrame shape before encoding:", df.shape)
    print("Columns before encoding:", df.columns.tolist())
    print("\nData types before encoding:")
    print(df.dtypes)

    # Identify categorical columns to encode (excluding binary 0/1 already numerical)
    # We'll encode 'object' type columns. We can also explicitly list them.
    categorical_to_encode = df.select_dtypes(include=['object']).columns.tolist()
    df_processed = pd.get_dummies(df, columns=categorical_to_encode, drop_first=True, dtype=int)
    print("Initial preprocessing done.")
    print("Columns:", df_processed.columns.tolist())




DataFrame shape before encoding: (5110, 11)
Columns before encoding: ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']

Data types before encoding:
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object
Initial preprocessing done.
Columns: ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke', 'gender_Male', 'gender_Other', 'ever_married_Yes', 'work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'Residence_type_Urban', 'smoking_status_formerly smoked', 'smoking_status_never smoked', 'smoking_status_smokes']


## Split data into training and testing

We gonna split here our dataset into training and testing dataset.  (BEFORE the scaling and standardization)  
We will use 80% of the dataset for training and 20% for testing, good for this case i guess.

In [48]:
# --- Define Features (X) and Target (y) ---
# Ensure 'stroke' column exists
if 'stroke' not in df_processed.columns:
    raise ValueError("'stroke' column not found in the DataFrame after preprocessing.")

X = df_processed.drop('stroke', axis=1)
y = df_processed['stroke']

# --- Split Data ---
print("Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Splitting data into train and test sets...
Train shape: (4088, 16) Test shape: (1022, 16)


## Scaling

Now we have to scale our features, ML perform better when numerical features are scaled.  
For example, age, avg_glucole_lvl, bmi ae on very different ranges, so we need to scale them.

To do this, we will use standardization.

In [49]:

# --- Scale Numerical Features (AFTER SPLIT) ---
# Identify numerical columns (excluding potentially added binary encoded ones)
# Adjust this list based on your actual columns after encoding
numerical_cols = ['age', 'avg_glucose_level', 'bmi', 'hypertension', 'heart_disease'] # Example list
# Ensure columns exist
numerical_cols = [col for col in numerical_cols if col in X_train.columns]
print(f"Scaling numerical columns: {numerical_cols}")

scaler = StandardScaler()
# Fit scaler ONLY on training data
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
# Transform test data using the SAME fitted scaler
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
print("Scaling complete.")

Scaling numerical columns: ['age', 'avg_glucose_level', 'bmi', 'hypertension', 'heart_disease']
Scaling complete.


## Save data

Now that our datas are cleaned, encoded and scaled, we can save them to a new file before modeling.

In [50]:

# Save the DataFrame to a CSV file
# index=False prevents pandas from writing the DataFrame index as a column
processed_file_path = '../data/processed/stroke_data_processed.csv'
df_processed.to_csv(processed_file_path, index=False)

print(f"Processed data saved successfully to: {processed_file_path}")
print(f"Final DataFrame shape: {df_processed.shape}")
print("Columns in saved data:", df_processed.columns.tolist())


Processed data saved successfully to: ../data/processed/stroke_data_processed.csv
Final DataFrame shape: (5110, 17)
Columns in saved data: ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke', 'gender_Male', 'gender_Other', 'ever_married_Yes', 'work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'Residence_type_Urban', 'smoking_status_formerly smoked', 'smoking_status_never smoked', 'smoking_status_smokes']
