In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw heart-disease survey data; every later cell works on `df`.
df = pd.read_csv(filepath_or_buffer="../../data/heart_2020_cleaned.csv")

Normalize numerical features

In [67]:
from sklearn.preprocessing import MinMaxScaler

# Continuous columns that get rescaled; all other columns are left untouched.
numerical_features = [
    'BMI',
    'PhysicalHealth',
    'MentalHealth',
    'SleepTime',
]

# Learn each column's min/max from the full frame, then rescale with the
# scaler's default feature range. The fitted `scaler` stays in scope.
scaler = MinMaxScaler()
scaler.fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

Transform categorical features

In [68]:
# Keep only rows whose 'Diabetic' answer is exactly 'Yes' or 'No'; rows with
# any other value are dropped. `.copy()` makes the filtered result an
# independent frame so the column assignments below do not trigger
# SettingWithCopyWarning.
df = df[df['Diabetic'].isin(['Yes', 'No'])].copy()

# Columns holding "Yes"/"No" answers, encoded below as 1/0.
boolean_columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Diabetic', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']

# `.map` is used rather than `.replace`: swapping strings for ints through
# `.replace` relies on implicit object->int downcasting, which recent pandas
# versions deprecate. Values outside the mapping become NaN and are caught by
# the check at the end of this cell.
yes_no_mapping = {'Yes': 1, 'No': 0}
for column in boolean_columns:
    df[column] = df[column].map(yes_no_mapping)

df['Sex'] = df['Sex'].map({'Female': 0, 'Male': 1})

# Each age bracket is replaced by a single representative age
# (the open-ended top bracket is encoded as 85).
age_mapping = {
    '18-24': 21,
    '25-29': 27,
    '30-34': 32,
    '35-39': 37,
    '40-44': 42,
    '45-49': 47,
    '50-54': 52,
    '55-59': 57,
    '60-64': 62,
    '65-69': 67,
    '70-74': 72,
    '75-79': 77,
    '80 or older': 85
}
df['AgeCategory'] = df['AgeCategory'].map(age_mapping)

# Ordinal encoding of self-reported health: higher number = better health.
health_mapping = {
    'Excellent': 5,
    'Very good': 4,
    'Good': 3,
    'Fair': 2,
    'Poor': 1
}
df['GenHealth'] = df['GenHealth'].map(health_mapping)

# 'Race' is not encoded; it is removed from the frame entirely.
df = df.drop(columns='Race')

# Fail loudly if any category was missing from the mappings above: `.map`
# turns unmapped values into NaN, which would otherwise only surface later.
encoded_columns = boolean_columns + ['Sex', 'AgeCategory', 'GenHealth']
unmapped_counts = df[encoded_columns].isna().sum()
if unmapped_counts.any():
    raise ValueError(
        "Unmapped category values found in: "
        f"{unmapped_counts[unmapped_counts > 0].to_dict()}"
    )

df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,0.055294,1,0,0,0.1,1.0,0,0,57,1,1,4,0.173913,1,0,1
1,0,0.100447,0,0,1,0.0,0.0,0,0,85,0,1,4,0.26087,0,0,0
2,0,0.175782,1,0,0,0.666667,1.0,0,1,67,1,1,2,0.304348,1,0,0
3,0,0.147169,0,0,0,0.0,0.0,0,0,77,0,0,3,0.217391,0,0,1
4,0,0.141132,0,0,0,0.933333,0.0,1,0,42,0,1,4,0.304348,0,0,0


Balance the data

In [69]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Fixed seed so the resampled dataset is identical on every
# Restart & Run All; both resamplers are stochastic.
RANDOM_STATE = 42

# Separate features and target variable
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Two-step resampling:
#   1. SMOTE oversamples the minority class up to a 0.2 minority/majority ratio.
#   2. RandomUnderSampler then trims the majority class to a 1.0 ratio,
#      leaving both classes the same size.
# NOTE(review): SMOTE creates synthetic rows by interpolating between
# neighbours, so the 0/1 and ordinal-encoded columns can take fractional
# values in synthetic rows - confirm this is acceptable downstream (SMOTENC
# is the variant intended for categorical features).
# NOTE(review): resampling happens before any train/test split visible in
# this notebook - confirm evaluation data is held out from synthetic rows.
resampling_strategy = [
    ('over', SMOTE(sampling_strategy=0.2, random_state=RANDOM_STATE)),
    ('under', RandomUnderSampler(sampling_strategy=1.0, random_state=RANDOM_STATE)),
]
pipeline = Pipeline(steps=resampling_strategy)

# Apply the resampling to the dataset
X_resampled, y_resampled = pipeline.fit_resample(X, y)

# Check the class distribution after resampling
resampled_counts = pd.Series(y_resampled).value_counts()
print(resampled_counts)

HeartDisease
0    56795
1    56795
Name: count, dtype: int64


Save the preprocessed data to a CSV file

In [70]:
# Put the resampled features and target back into one frame
# (target column goes last).
resampled_data = pd.concat(objs=[X_resampled, y_resampled], axis="columns")

# Show the class balance of the frame that is about to be written.
heart_disease_counts = resampled_data['HeartDisease'].value_counts()
print(heart_disease_counts)

# Persist without the index so the CSV contains only the data columns.
resampled_data.to_csv(
    path_or_buf='../../data/preprocessed_data_with_resampling.csv',
    index=False,
)

HeartDisease
0    56795
1    56795
Name: count, dtype: int64
