# Improving model accuracy using feature selection
Dataset: Heart Disease Dataset (https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset/data)

### 1. Prerequisites

In [1]:
# Importing modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report

### 2. Data Loading

In [2]:
# Reading data
df = pd.read_csv("heart.csv")

# Feature matrix
x = df.drop("target", axis=1)

# Target labels
y = df["target"]

### 3. Preprocessing

In [3]:
# Train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Feature scaling
sds = StandardScaler()

x_train = pd.DataFrame(sds.fit_transform(x_train), columns=x_train.columns)

x_test = pd.DataFrame(sds.fit_transform(x_test), columns=x_test.columns)

### 4. ML without feature selection

In [4]:
# Model
model = LogisticRegression()

# Training
model.fit(x_train, y_train)

# Testing
y_pred = model.predict(x_test)

# Model performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.69      0.76       102
           1       0.74      0.87      0.80       103

    accuracy                           0.78       205
   macro avg       0.79      0.78      0.78       205
weighted avg       0.79      0.78      0.78       205



### 5. ML with feature selection

### 5.1. Feature selection

In [5]:
# Model
model = LogisticRegression()

# Feature selection using recursive feature elimination (RFE)
selector = RFE(model, n_features_to_select=6)
selector = selector.fit(x_train, y_train)
selected_features = [x.columns[i] for i in range(len(x.columns)) if selector.support_[i] == True]

# Feature matrix with selected features
x_train_sel = x_train[selected_features]
x_test_sel = x_test[selected_features]

### 5.2. Machine learning

In [6]:
# Model
model = LogisticRegression()

# Training
model.fit(x_train_sel, y_train)

# Testing
y_pred = model.predict(x_test_sel)

# Model performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.73      0.78       102
           1       0.76      0.86      0.81       103

    accuracy                           0.80       205
   macro avg       0.80      0.79      0.79       205
weighted avg       0.80      0.80      0.79       205



**Conclusion:** 
- From the classification report it can be seen that the accuracy of the model has increased post feature selection.
- Better performance has been achieved using less number of features.