In [60]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the processed dataset from disk.
# NOTE(review): this is the same path the standardization cell below writes to,
# so this cell appears to load the output of a previous run — confirm the
# intended input file.
df = pd.read_csv("../data/processed/standardized_data.csv")

# Split into target and predictors: y is the 'Heart Disease' column,
# X is every remaining column (the features describing each record).
y = df['Heart Disease']
X = df.drop(columns=['Heart Disease'])


In [65]:
from sklearn.preprocessing import StandardScaler

# Columns that get rescaled; every other column in df is left untouched.
numeric_cols = [
    'Age',
    'Cholesterol',
    'Blood Pressure',
    'Heart Rate',
    'Blood Sugar',
    'Exercise Hours',
    'Stress Level',
]

# Rescale each listed column to mean=0 and variance=1, overwriting it in df.
# NOTE(review): X was derived from df in the earlier cell via df.drop(), which
# returns a new frame, so this in-place rescaling is not reflected in X.
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Persist the rescaled frame (without the index column).
# NOTE(review): this overwrites the same file the notebook loads df from — confirm
# that is intended.
df.to_csv("../data/processed/standardized_data.csv", index=False)


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Univariate feature selection: score each column of X against y with the
# ANOVA F-test (f_classif) and keep the k highest-scoring columns.
k = 10
selector = SelectKBest(f_classif, k=k)
selector.fit(X, y)
X_selected = selector.transform(X)

# get_support() is a boolean mask over X's columns marking the kept features.
kept_mask = selector.get_support()
selected_features = X.columns[kept_mask]
print(f"The {k} best features by SelectKBest are: {selected_features}")

The 10 best features by SelectKBest are: Index(['Age', 'Cholesterol', 'Gender_Male', 'Smoking_Former', 'Smoking_Never',
       'Alcohol Intake_Moderate', 'Family History_Yes', 'Obesity_Yes',
       'Chest Pain Type_Atypical Angina', 'Chest Pain Type_Non-anginal Pain'],
      dtype='object')


In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Number of features RFE should keep. Defined in this cell so the selector and
# the printed message always agree, and so the cell does not depend on `k`
# from the SelectKBest cell (which may hold a different value or be undefined
# on a fresh kernel).
n_features_rfe = 10

# Base estimator for RFE; max_iter is raised to 1000 to give the solver more
# iterations to converge.
model = LogisticRegression(max_iter=1000)

# Recursive feature elimination: repeatedly fit the model and drop the weakest
# feature until n_features_rfe remain.
rfe = RFE(estimator=model, n_features_to_select=n_features_rfe)
X_rfe = rfe.fit_transform(X, y)

# rfe.support_ is a boolean mask over X's columns marking the kept features.
selected_features_rfe = X.columns[rfe.support_]
print(f"The {n_features_rfe} best features are: {selected_features_rfe}")

# Per-feature ranking from RFE, sorted ascending by rank (selected features are
# assigned rank 1).
ranking = pd.DataFrame({'Feature': X.columns, 'Rank': rfe.ranking_})
ranking = ranking.sort_values(by='Rank')

The 10 best features are: Index(['Age', 'Cholesterol', 'Gender_Male', 'Smoking_Former',
       'Family History_Yes', 'Diabetes_Yes', 'Exercise Induced Angina_Yes',
       'Chest Pain Type_Atypical Angina', 'Chest Pain Type_Non-anginal Pain',
       'Chest Pain Type_Typical Angina'],
      dtype='object')
