In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif, f_classif, chi2
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from scipy.stats import chi2_contingency

# Load dataset
data_path = 'C:/Users/chigu/Desktop/stroke_prediction_project/Data/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(data_path)

# Drop unnecessary column
df.drop(columns=['id'], inplace=True)

# Handle missing values
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

# Encode categorical variables
encoder = LabelEncoder()
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
for col in categorical_columns:
    df[col] = encoder.fit_transform(df[col])

# Feature selection using Mutual Information Score
mi_scores = mutual_info_classif(df.drop(columns=['stroke']), df['stroke'])
mi_results = sorted(zip(df.columns[:-1], mi_scores), key=lambda x: x[1], reverse=True)
print("Mutual Information Scores:", mi_results)

# Feature selection using Chi-Square test
chi_scores = {col: chi2_contingency(pd.crosstab(df[col], df['stroke']))[0] for col in categorical_columns}
chi_sorted = sorted(chi_scores.items(), key=lambda x: x[1], reverse=True)
print("\nChi-Square Scores:", chi_sorted)

# Feature selection using ANOVA test
anova_scores, _ = f_classif(df.drop(columns=['stroke']), df['stroke'])
anova_results = sorted(zip(df.columns[:-1], anova_scores), key=lambda x: x[1], reverse=True)
print("\nANOVA Scores:", anova_results)

# Feature scaling
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df.drop(columns=['stroke'])), columns=df.columns[:-1])
df_scaled['stroke'] = df['stroke']

# Save processed dataset
df_scaled.to_csv('C:/Users/chigu/Desktop/stroke_prediction_project/Data/feature_engineered_data.csv', index=False)

print("\nFeature engineering completed successfully!")


Mutual Information Scores: [('age', np.float64(0.038959746477661206)), ('heart_disease', np.float64(0.010484892875409546)), ('bmi', np.float64(0.010084893859771515)), ('work_type', np.float64(0.009623636164780347)), ('ever_married', np.float64(0.006243391508300888)), ('avg_glucose_level', np.float64(0.00503476256098101)), ('hypertension', np.float64(0.0034962983547299764)), ('smoking_status', np.float64(0.0015488216837975521)), ('Residence_type', np.float64(0.0010379337717520976)), ('gender', np.float64(0.0))]

Chi-Square Scores: [('ever_married', np.float64(58.923890259034195)), ('work_type', np.float64(49.163511976675295)), ('smoking_status', np.float64(29.147269191399264)), ('Residence_type', np.float64(1.0816367471627524)), ('gender', np.float64(0.47258662884530234))]

ANOVA Scores: [('age', np.float64(326.9165678586842)), ('heart_disease', np.float64(94.69840601636668)), ('avg_glucose_level', np.float64(90.50386961378622)), ('hypertension', np.float64(84.95354215995648)), ('ever_m