In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE, SMOTENC
from scipy.stats import chi2_contingency
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Read the raw stroke dataset from disk
data_path = 'C:/Users/chigu/Desktop/stroke_prediction_project/Data/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(data_path)

# Preview the first rows exactly as loaded (the 'id' column is still present here)
display(df.head())

# Remove the 'id' column before any further processing
df = df.drop(columns=['id'])

# Report how many nulls each remaining column contains
null_counts = df.isnull().sum()
print("\nMissing values:\n")
print(null_counts)

# Replace missing BMI entries with the column mean.
# NOTE(review): the mean is taken over the full frame, i.e. before the
# train/test split performed later in this cell - confirm that is acceptable.
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(bmi_mean)

# Encode categorical variables.
# A separate LabelEncoder is fitted for each column and stored in
# `label_encoders`, so every column's category -> integer mapping stays
# available after the loop (a single shared encoder only remembers the
# classes of the last column it was fitted on).
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder
# `encoder` remains bound to the last column's encoder once the loop ends.

# Feature selection using Chi-Square test.
# For each categorical column, build its contingency table against 'stroke'
# and keep element [0] of the chi2_contingency result (the test statistic).
chi_scores = {}
for col in categorical_columns:
    contingency = pd.crosstab(df[col], df['stroke'])
    chi_scores[col] = chi2_contingency(contingency)[0]
chi_sorted = sorted(chi_scores.items(), key=lambda item: item[1], reverse=True)
print("\nChi-Square Scores:", chi_sorted)

# Feature selection using ANOVA test.
# The score labels come from the very frame handed to f_classif, so names and
# F-values stay aligned no matter where the 'stroke' column sits in df.
anova_features = df.drop(columns=['stroke'])
# The second return value is bound to a real name rather than `_`, which
# IPython uses for the last cell output.
anova_scores, anova_p_values = f_classif(anova_features, df['stroke'])
anova_results = sorted(zip(anova_features.columns, anova_scores), key=lambda item: item[1], reverse=True)
print("\nANOVA Scores:", anova_results)

# Split features / target and hold out a stratified 20% test set.
# The split happens before resampling, so only training rows are oversampled.
X = df.drop(columns=['stroke'])
y = df['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Balancing the training set.
# Plain SMOTE interpolates every column, which yields fractional values for
# the label-encoded category codes and the hypertension / heart_disease flags.
# SMOTENC is told which columns are categorical and does not interpolate them.
smote_categorical = categorical_columns + ['hypertension', 'heart_disease']
# Positional indices are used because they are accepted for DataFrame input
# across imbalanced-learn versions.
smote_categorical_idx = [X.columns.get_loc(name) for name in smote_categorical]
smote = SMOTENC(categorical_features=smote_categorical_idx, random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Standardization: the scaler is fitted on the balanced training data only
# and then applied unchanged to the held-out test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_bal)
X_test_scaled = scaler.transform(X_test)

# Save preprocessed data.
# Every artifact path is derived from a single project directory instead of
# repeating the absolute path in each call.
# NOTE(review): the location is still machine-specific - consider making it
# relative to the notebook or configurable.
project_dir = Path('C:/Users/chigu/Desktop/stroke_prediction_project')
data_dir = project_dir / 'Data'
models_dir = project_dir / 'Models'

# Create the output folders if they are absent so the writes below cannot
# fail on a missing directory.
data_dir.mkdir(parents=True, exist_ok=True)
models_dir.mkdir(parents=True, exist_ok=True)

# Scaled feature matrices are NumPy arrays; wrap them to restore column names.
pd.DataFrame(X_train_scaled, columns=X.columns).to_csv(data_dir / 'X_train_preprocessed.csv', index=False)
pd.DataFrame(X_test_scaled, columns=X.columns).to_csv(data_dir / 'X_test_preprocessed.csv', index=False)
y_train_bal.to_csv(data_dir / 'y_train_preprocessed.csv', index=False)
y_test.to_csv(data_dir / 'y_test_preprocessed.csv', index=False)

# Persist the fitted scaler alongside the data.
joblib.dump(scaler, models_dir / 'scaler.pkl')

print("\nData preprocessing completed successfully!")

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1



Missing values:

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

Chi-Square Scores: [('ever_married', np.float64(58.923890259034195)), ('work_type', np.float64(49.163511976675295)), ('smoking_status', np.float64(29.147269191399264)), ('Residence_type', np.float64(1.0816367471627524)), ('gender', np.float64(0.47258662884530234))]

ANOVA Scores: [('age', np.float64(326.9165678586842)), ('heart_disease', np.float64(94.69840601636668)), ('avg_glucose_level', np.float64(90.50386961378622)), ('hypertension', np.float64(84.95354215995648)), ('ever_married', np.float64(60.66722965592002)), ('bmi', np.float64(7.759775654155833)), ('work_type', np.float64(5.340018517248394)), ('smoking_status', np.float64(4.043033245970619)), ('Residence_type', np.float64(1.