In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import warnings
# Install a catch-all "ignore" filter so library warnings do not clutter cell output.
# NOTE(review): this hides every warning category, including deprecation notices
# from pandas / scikit-learn; narrow it with `category=` if those should stay visible.
warnings.filterwarnings(action='ignore')

print("Libraries imported successfully")

Libraries imported successfully


In [10]:
# Load and preprocess data
# Reads the raw stroke CSV and applies the row/column cleaning the later cells build on.
path = "../data/raw/synthetic_stroke_data.csv"
df = pd.read_csv(path)

# Basic preprocessing
# Remove rows whose gender is 'Other' and renumber the index from 0.
df = df[df['gender'] != 'Other'].reset_index(drop=True)

# Fill missing BMI values with the column median.
# The result is assigned back to the column instead of calling
# `df['bmi'].fillna(..., inplace=True)`: that form mutates a temporary selection,
# emits a FutureWarning on pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is enabled, so the NaNs would silently survive.
# NOTE(review): the median is computed over the full dataset, i.e. before the
# train/test split performed later in this notebook — confirm that is acceptable.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

print("="*80)
print("FEATURE ENGINEERING")
print("="*80)
print(f"Starting dataset shape: {df.shape}")

FEATURE ENGINEERING
Starting dataset shape: (57281, 12)


## Feature Engineering

In [11]:
# Derive five extra columns (age_group, high_risk_score, glucose_level_cat,
# bmi_category, smoking_risk) on a copy so the cleaned `df` stays untouched.
df_processed = df.copy()

# All pd.cut calls below use right-closed bins, e.g. '30-40' means (30, 40].
# `include_lowest=True` keeps a value equal to the lowest edge (0) in the first
# bin, and the `np.inf` top edge keeps values above the former upper bound in
# the last bin. Without these, such rows get NaN, and a NaN category is later
# one-hot encoded as all zeros — the same encoding as the dropped first category.

# Create age groups
df_processed['age_group'] = pd.cut(df_processed['age'],
                                    bins=[0, 30, 40, 50, 60, np.inf],
                                    labels=['<30', '30-40', '40-50', '50-60', '60+'],
                                    include_lowest=True)
print("Created age_group feature")

# Create high risk score
# Count of risk flags per row: hypertension, heart disease, age over 50 and
# average glucose over 150 each contribute one point.
df_processed['high_risk_score'] = (df_processed['hypertension'] +
                                    df_processed['heart_disease'] +
                                    (df_processed['age'] > 50).astype(int) +
                                    (df_processed['avg_glucose_level'] > 150).astype(int))
print("Created high_risk_score feature")

# Create glucose level categories
df_processed['glucose_level_cat'] = pd.cut(df_processed['avg_glucose_level'],
                                           bins=[0, 100, 150, 200, np.inf],
                                           labels=['normal', 'prediabetic', 'diabetic', 'very_high'],
                                           include_lowest=True)
print("Created glucose_level_cat feature")

# Create BMI categories
df_processed['bmi_category'] = pd.cut(df_processed['bmi'],
                                      bins=[0, 18.5, 25, 30, np.inf],
                                      labels=['underweight', 'normal', 'overweight', 'obese'],
                                      include_lowest=True)
print("Created bmi_category feature")

# Create smoking risk
# Ordinal encoding of smoking_status; any status not listed here maps to NaN.
df_processed['smoking_risk'] = df_processed['smoking_status'].map({
    'never smoked': 0,
    'Unknown': 1,
    'formerly smoked': 2,
    'smokes': 3
})
print("Created smoking_risk feature")

print(f"\nDataset shape after feature engineering: {df_processed.shape}")

Created age_group feature
Created high_risk_score feature
Created glucose_level_cat feature
Created bmi_category feature
Created smoking_risk feature

Dataset shape after feature engineering: (57281, 17)


## Categorical Encoding

In [12]:
# Turn every remaining categorical column into numeric model inputs.
banner = "=" * 80
print("\n" + banner)
print("CATEGORICAL ENCODING")
print(banner)

df_model = df_processed.copy()

# Binary encoding: 1 for 'Male' / 'Yes', 0 for anything else.
is_male = df_model['gender'] == 'Male'
is_married = df_model['ever_married'] == 'Yes'
df_model['gender'] = is_male.astype(int)
df_model['ever_married'] = is_married.astype(int)
print("Applied binary encoding to gender and ever_married")

# One-hot encoding; drop_first removes the first level of each column.
one_hot_columns = [
    'work_type',
    'Residence_type',
    'age_group',
    'glucose_level_cat',
    'bmi_category',
]
df_model = pd.get_dummies(df_model, columns=one_hot_columns, drop_first=True)
print("Applied one-hot encoding to categorical features")

# Drop unnecessary columns (smoking_status is already represented by smoking_risk).
df_model = df_model.drop(['id', 'smoking_status'], axis=1)
print("Dropped id and smoking_status columns")

# Convert boolean columns to int in a single astype call.
bool_columns = [name for name in df_model.columns if df_model[name].dtype == 'bool']
df_model = df_model.astype({name: int for name in bool_columns})

total_columns = len(df_model.columns)
print(f"\nFinal model dataset shape: {df_model.shape}")
print(f"Total features: {total_columns - 1}")  # -1 for target
print("\nColumn names:")
print(df_model.columns.tolist())


CATEGORICAL ENCODING
Applied binary encoding to gender and ever_married
Applied one-hot encoding to categorical features
Dropped id and smoking_status columns

Final model dataset shape: (57281, 24)
Total features: 23

Column names:
['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'avg_glucose_level', 'bmi', 'stroke', 'high_risk_score', 'smoking_risk', 'work_type_Govt_job', 'work_type_Private', 'work_type_Self-employed', 'Residence_type_Urban', 'age_group_30-40', 'age_group_40-50', 'age_group_50-60', 'age_group_60+', 'glucose_level_cat_prediabetic', 'glucose_level_cat_diabetic', 'glucose_level_cat_very_high', 'bmi_category_normal', 'bmi_category_overweight', 'bmi_category_obese']


## Train-Test Split & Scaling

In [13]:
# Separate the target from the features and hold out 20% of the rows for testing.
banner = "=" * 80
print("\n" + banner)
print("TRAIN-TEST SPLIT & SCALING")
print(banner)

target_column = 'stroke'
y = df_model[target_column]
X = df_model.drop(target_column, axis=1)

# Stratify on the target so both splits keep the same stroke rate;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

train_rate = y_train.mean() * 100
test_rate = y_test.mean() * 100
print(f"\nTrain set: {X_train.shape[0]} samples (Stroke rate: {train_rate:.2f}%)")
print(f"Test set: {X_test.shape[0]} samples (Stroke rate: {test_rate:.2f}%)")


TRAIN-TEST SPLIT & SCALING

Train set: 45824 samples (Stroke rate: 16.95%)
Test set: 11457 samples (Stroke rate: 16.95%)


In [14]:
# Apply scaling
# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no information from the test rows reaches the fitted scaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames. Passing the original row index
# (not only the column names) keeps X_*_scaled aligned by label with
# y_train / y_test, which still carry the shuffled index from the split;
# with a fresh RangeIndex, label-based operations such as boolean masking
# with `y_train == 1` would misalign or fail.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)

print("Applied StandardScaler to features")
print(f"\nX_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}")

Applied StandardScaler to features

X_train_scaled shape: (45824, 23)
X_test_scaled shape: (11457, 23)


In [15]:
# Compute class weights for handling imbalance
# 'balanced' weights each class inversely to its frequency in the training labels.
observed_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=observed_classes,
                                     y=y_train)

# Plain {label: weight} mapping for the two stroke labels.
class_weight_dict = {label: class_weights[label] for label in (0, 1)}

print("\nClass weights (for handling imbalance):")
print(f"Class 0 (No Stroke): {class_weight_dict[0]:.4f}")
print(f"Class 1 (Stroke): {class_weight_dict[1]:.4f}")


Class weights (for handling imbalance):
Class 0 (No Stroke): 0.6020
Class 1 (Stroke): 2.9503


In [16]:
# Save processed data for next notebooks
# Each artifact is written without its index, so features and labels line up
# by row position when the files are read back.
outputs = {
    "../data/X_train_scaled.csv": X_train_scaled,
    "../data/X_test_scaled.csv": X_test_scaled,
    "../data/y_train.csv": y_train,
    "../data/y_test.csv": y_test,
}
for destination, artifact in outputs.items():
    artifact.to_csv(destination, index=False)

print("\nSaved processed data files for next notebooks")


Saved processed data files for next notebooks
