In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Read the preprocessed table and split it by position: every column but the
# last becomes the feature matrix X, the last column becomes the label y.
df = pd.read_csv('../data/heart_disease_preprocessed.csv')
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [24]:
# Rank every feature by the impurity-based importance of a random forest that
# is fitted on the full feature matrix.
# NOTE(review): the forest sees all rows of X; if a held-out test set is
# evaluated later, confirm that selecting features on it is acceptable.
rf = RandomForestClassifier(random_state=42).fit(X, y)

feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': rf.feature_importances_}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)
print(feature_importance)

# The eight highest-ranked feature names, most important first.
top_rf_features = feature_importance['feature'].head(8).tolist()
print(f"Top 8 RF features: {top_rf_features}")

     feature  importance
7    thalach    0.138926
9    oldpeak    0.117851
0        age    0.115244
4       chol    0.113633
3   trestbps    0.099319
11        ca    0.094761
12      thal    0.082430
2         cp    0.075682
8      exang    0.046423
10     slope    0.042554
6    restecg    0.029849
1        sex    0.027128
5        fbs    0.016198
Top 8 RF features: ['thalach', 'oldpeak', 'age', 'chol', 'trestbps', 'ca', 'thal', 'cp']


In [25]:
# Univariate selection: keep the 8 features with the highest ANOVA F-score
# against the label.
f_selector = SelectKBest(score_func=f_classif, k=8)
X_f_selected = f_selector.fit_transform(X, y)

# Translate the selector's kept-column positions back into column names.
kept_positions = f_selector.get_support(indices=True)
selected_features_f = X.columns[kept_positions]
print(f"F-test selected features: {selected_features_f.tolist()}")

F-test selected features: ['sex', 'cp', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']


In [26]:
# Recursive feature elimination driven by the random forest, pruning down to
# 8 surviving features.
rfe = RFE(estimator=rf, n_features_to_select=8).fit(X, y)

# Boolean mask over X's columns -> names of the features RFE kept.
selected_features_rfe = X.columns[rfe.get_support()]

In [27]:
# Consensus selection: keep only the features chosen by BOTH the F-test and
# RFE. The overlap is built as an ordered list (following selected_features_f)
# rather than list(set & set): a set of strings iterates in hash order, which
# differs between interpreter sessions, so the previous form produced a
# different feature/column order on every kernel restart.
rfe_selected = set(selected_features_rfe)
final_features = [name for name in selected_features_f if name in rfe_selected]

# If the two methods agree on fewer than 6 features the consensus is too thin;
# fall back to (a copy of) the first 8 entries of the random-forest ranking.
if len(final_features) < 6:
    final_features = top_rf_features[:8]

print(f"\n Final Selected Features: {final_features}")


 Final Selected Features: ['thalach', 'oldpeak', 'age', 'chol', 'trestbps', 'ca', 'thal', 'cp']


In [28]:
# Restrict X to the chosen columns, put the label column back on the right,
# and write the reduced dataset to disk without the index.
X_selected = X.loc[:, final_features]
frames_to_join = [X_selected, y]
df_selected = pd.concat(frames_to_join, axis=1)
df_selected.to_csv('../data/heart_disease_selected.csv', index=False)
print(" Selected features dataset saved!")

 Selected features dataset saved!
