In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned dataset. It is deliberately NOT standardized yet.
df = pd.read_csv("../data/processed/cleaned_data.csv")

# Separate the label column (y) from the predictor columns (X).
y = df["Heart Disease"]
X = df.drop(columns="Heart Disease")

# Hold out 20% of the rows for testing BEFORE anything is fitted.
# The fixed random_state makes the split reproducible across runs.
# NOTE(review): no `stratify=` is passed, so class proportions in the two
# splits are left to chance - confirm this is acceptable for this target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [2]:
# from sklearn.preprocessing import StandardScaler
#
# # Standardize data to have mean=0 and variance=1
# scaler = StandardScaler()
# df[['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Blood Sugar', 'Exercise Hours', 'Stress Level']] = scaler.fit_transform(df[['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Blood Sugar','Exercise Hours', 'Stress Level']])
#
# # Save the standardized data to a new CSV file
# df.to_csv("../data/processed/standardized_data.csv", index=False)


In [3]:
# [OPTIONAL] Run this cell only if standardization is needed; otherwise comment it out.
from sklearn.preprocessing import StandardScaler

# Standardize the numerical features. The scaler is fitted on the training
# split only and then applied to both splits, so no statistics from the test
# rows leak into the preprocessing.
numerical_cols = ['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Blood Sugar', 'Exercise Hours', 'Stress Level']
scaler = StandardScaler()

# Take explicit copies before assigning into the frames: the splits returned by
# train_test_split are derived from X, and writing into such derived frames can
# raise pandas' SettingWithCopyWarning (depending on library versions).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on training data, transform both train and test.
# NOTE: this overwrites the columns in X_train / X_test. Re-run the split cell
# before re-running this one, otherwise the scaler is refit on already-scaled
# data and no longer holds the statistics of the raw features.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])  # reuse the training-set statistics

In [4]:
from sklearn.feature_selection import SelectKBest, f_classif

# Univariate feature selection: score every feature against the target with
# the ANOVA F-test and keep the k highest-scoring ones.
k = 10
selector = SelectKBest(score_func=f_classif, k=k)

# Learn the scores from the training split only, then apply the same column
# mask to both splits.
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Map the positions of the retained columns back to their names.
selected_features = X_train.columns[selector.get_support(indices=True)]
print(f"The {k} best features by SelectKBest are: {selected_features}")

The 10 best features by SelectKBest are: Index(['Age', 'Cholesterol', 'Blood Pressure', 'Stress Level',
       'Smoking_Former', 'Smoking_Never', 'Alcohol Intake_Moderate',
       'Family History_Yes', 'Obesity_Yes',
       'Chest Pain Type_Non-anginal Pain'],
      dtype='object')


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# RFE: repeatedly fit the logistic regression and eliminate the weakest
# feature until 10 remain. Fitted on the training split only.
# NOTE(review): RFE ranks features using the fitted estimator's coefficients,
# which depend on feature scale - presumably the standardization cell should
# be run first; confirm.
model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)  # apply the same feature mask to the test split

# Get selected features
selected_features_rfe = X_train.columns[rfe.support_]
# Report the count RFE actually kept rather than `k` from the SelectKBest
# cell: `k` is unrelated to n_features_to_select, so the message would be
# wrong if `k` changed and would fail if that cell had not been run.
print(f"The {rfe.n_features_} best features are: {selected_features_rfe}")

The 10 best features are: Index(['Age', 'Cholesterol', 'Blood Pressure', 'Gender_Male', 'Smoking_Former',
       'Alcohol Intake_Moderate', 'Family History_Yes',
       'Chest Pain Type_Atypical Angina', 'Chest Pain Type_Non-anginal Pain',
       'Chest Pain Type_Typical Angina'],
      dtype='object')
