## Import

In [18]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import jinja2

# Make the project's `src` package importable from this notebook.
# The parent of the current working directory is treated as the project root
# and appended to sys.path once, so re-running the cell does not add duplicates.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
    print(f"Added '{project_root}' to sys.path")

import src.load_libs 
from src.load_data import load_stroke_data 


## Loading data

In [22]:
# Load the stroke dataset through the project loader and preview its first rows.
# NOTE(review): the encoding cell further down checks `df is not None`, which
# suggests load_stroke_data() can return None on failure — confirm against
# src.load_data. The same guard is applied here so a failed load is reported
# instead of raising AttributeError on `None.head()`.
print("Loading data...")
df = load_stroke_data()
if df is not None:
    display(df.head())
else:
    print("DataFrame not loaded.")

Loading data...
Attempting to load file: healthcare-dataset-stroke-data.csv from dataset fedesoriano/stroke-prediction-dataset
Dataset loaded successfully!


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Encoding data

In [23]:
# One-hot encode every object-dtype (string) column of `df`.
#
# Columns that are already numeric (e.g. 0/1 flags) are not selected and are
# carried over unchanged. On success `df` is rebound to the encoded frame, and
# `df_encoded` refers to the same object. Re-running the cell after that finds
# no object columns and simply prints the "No categorical columns" message.
if df is not None:
    print("DataFrame shape before encoding:", df.shape)
    print("Columns before encoding:", df.columns.tolist())
    print("\nData types before encoding:")
    df.info()
    # A bare `df.head()` is only rendered when it is the last top-level
    # expression of a cell; inside this `if` block it has to go through display().
    display(df.head())

    # Pick the columns to encode by dtype. An explicit list would work as well,
    # e.g. ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'].
    categorical_to_encode = df.select_dtypes(include=['object']).columns.tolist()

    print(f"\nCategorical columns to one-hot encode: {categorical_to_encode}")

    if categorical_to_encode:
        # drop_first=True drops one indicator column per feature, which helps
        # reduce multicollinearity; dtype=int yields 0/1 integers instead of booleans.
        df_encoded = pd.get_dummies(df, columns=categorical_to_encode, drop_first=True, dtype=int)

        print("\nDataFrame shape after encoding:", df_encoded.shape)
        print("Columns after encoding:", df_encoded.columns.tolist())

        # Show the first rows including the new indicator columns.
        print("\nFirst 5 rows after encoding:")
        display(df_encoded.head())

        # Rebind `df` so that subsequent cells operate on the encoded frame.
        df = df_encoded
        print("\nDataFrame 'df' updated with encoded categorical variables.")
    else:
        print("\nNo categorical columns found or specified for encoding.")

else:
    print("DataFrame not loaded.")

DataFrame shape before encoding: (5110, 12)
Columns before encoding: ['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']

Data types before encoding:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,1,0,1,0,1,0,0,1,1,0,0
1,51676,61.0,0,0,202.21,,1,0,0,1,0,0,1,0,0,0,1,0
2,31112,80.0,0,1,105.92,32.5,1,1,0,1,0,1,0,0,0,0,1,0
3,60182,49.0,0,0,171.23,34.4,1,0,0,1,0,1,0,0,1,0,0,1
4,1665,79.0,1,0,174.12,24.0,1,0,0,1,0,0,1,0,0,0,1,0



DataFrame 'df' updated with encoded categorical variables.
