**Feature Selection**

Using feature selection, we can select the set of features that are most relevant to the target variable. This ends up reducing the complexity of the model, as well as minimizing the resources required for training and inference.

In [None]:
# Import packages
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, SelectFromModel, chi2, f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Location of the input CSV, given relative to the current working directory
csv_filepath = "../input/heart-disease-uci/heart.csv"

In [None]:
# Load the dataset, treating the literal strings 'na' and '--' as missing values
df = pd.read_csv(csv_filepath, na_values=['na', '--'])

# Preview the first rows
display(df.head())

# Summarise column dtypes and non-null counts.
# df.info() prints its report and returns None, so it is called directly;
# wrapping it in display() would additionally render a stray "None".
df.info()

In [None]:
# Summary statistics for every column (numeric and non-numeric alike)
print(df.describe(include="all"))

**Check NaN values and remove the unwanted features**

In [None]:
# Count the missing values in each column
print(df.isna().sum())

**Split the data**

Now, we split the dataset into the feature matrix X and the target vector Y (the `target` column, i.e. heart disease) to fit a RandomForestClassifier.

In [None]:
# Separate the predictors (every column except "target") from the label column
Y = df["target"]
X = df.drop(columns=["target"])

**Random Forest Model**

In [None]:
def RF_model(X, Y):
  """Fit and return a random forest classifier on the given data.

  The forest uses the entropy split criterion and a fixed random seed (42)
  so that repeated runs build the same model.

  Args:
    X: Training feature matrix.
    Y: Training labels.

  Returns:
    The fitted RandomForestClassifier.
  """
  # fit() returns the estimator itself, so construction and training can be chained
  return RandomForestClassifier(criterion="entropy", random_state=42).fit(X, Y)

# Calculate metrics
def cal_accuracy(model, X_test_scaled, Y_test):
  """Score a fitted binary classifier on held-out data and print each metric.

  Args:
    model: Fitted classifier exposing predict(); predict_proba() or
      decision_function() is used for the ROC AUC when available.
    X_test_scaled: Test feature matrix, preprocessed the same way as the
      data the model was trained on.
    Y_test: True labels for the test rows.

  Returns:
    Tuple (accuracy, roc, precision, recall, f1).
  """
  # Hard class predictions drive the threshold-based metrics
  y_pred = model.predict(X_test_scaled)

  # ROC AUC measures how well the model *ranks* samples, so it needs a
  # continuous score. Feeding it thresholded 0/1 predictions collapses the
  # curve to a single operating point. Fall back to y_pred only when the
  # model exposes no score at all.
  if hasattr(model, "predict_proba"):
    # Column 1 holds the probability of the positive class (model.classes_[1])
    y_score = model.predict_proba(X_test_scaled)[:, 1]
  elif hasattr(model, "decision_function"):
    y_score = model.decision_function(X_test_scaled)
  else:
    y_score = y_pred

  roc = roc_auc_score(Y_test, y_score)
  print(f'roc score is : {roc}')

  accuracy = accuracy_score(Y_test, y_pred)
  print(f'Accuracy score is : {accuracy}')

  precision = precision_score(Y_test, y_pred)
  print(f'Precision score is : {precision}')

  recall = recall_score(Y_test, y_pred)
  print(f'Recall score is : {recall}')

  f1 = f1_score(Y_test, y_pred)
  print(f'f1 score is : {f1}')

  return accuracy, roc, precision, recall, f1

def train_model(X, Y):
  """Train a random forest on a stratified 80/20 split and score it.

  Args:
    X: Feature matrix.
    Y: Labels; also used to stratify the split.

  Returns:
    Tuple (accuracy, roc, precision, recall, f1) as produced by cal_accuracy.
  """
  # Hold out 20% of the rows, preserving the class balance of Y
  train_X, test_X, train_Y, test_Y = train_test_split(
      X, Y, test_size=0.2, stratify=Y, random_state=42)

  # Standardise using statistics learned from the training rows only
  standardizer = StandardScaler()
  train_X_std = standardizer.fit_transform(train_X)
  test_X_std = standardizer.transform(test_X)

  # Fit the classifier, then evaluate it on the held-out rows
  fitted = RF_model(train_X_std, train_Y)
  return cal_accuracy(fitted, test_X_std, test_Y)

def evaluate_model(X, Y):
  """Train and score a model on X, returning the metrics as a one-row DataFrame.

  Args:
    X: Feature matrix; its column count is reported as "Feature Count".
    Y: Labels.

  Returns:
    DataFrame with a single row and the columns Accuracy, ROC, Precision,
    Recall, F1 Score and Feature Count.
  """
  column_labels = ["Accuracy", "ROC", "Precision", "Recall", "F1 Score", "Feature Count"]

  # train_model returns the five scores in the same order as the first five labels
  scores = train_model(X, Y)
  row = [*scores, X.shape[1]]

  return pd.DataFrame([row], columns=column_labels)

In [None]:
# Baseline: score the model using every available feature.
# `results` starts out as this baseline row and is extended by later cells.
results = metrics_df = evaluate_model(X, Y)
results.index = ["All features"]

display(results.head())


**Calculate and Visualize the Correlation Matrix**

We compute the pairwise correlation matrix to find which features have the highest correlation.

In [None]:
# Pairwise correlation between all columns of the dataset
cor = df.corr()

# Render the matrix as an annotated heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(data=cor, annot=True, cmap="PuBu")
plt.show()

**Correlation with the target variable (heart disease)**

In [None]:
# Absolute correlation of every column with the target
target_correlation = cor["target"].abs()

# Select highly correlated features (threshold = 0.2)
high_corr_feature = target_correlation[target_correlation > 0.2]

# Collect the feature names, leaving out the target itself (its correlation
# with itself is 1, so it always passes the threshold). The index is iterated
# directly because Series.iteritems() was removed in pandas 2.0.
names = [name for name in high_corr_feature.index if name != "target"]
print("features are strongly correlated with the target : {}".format(names))

In [None]:
# Check the strongly correlated features with the model
df_strong_feature = evaluate_model(df[names], Y)
df_strong_feature.index = ["Strong Features"]

# Add to the previous results.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
results = pd.concat([results, df_strong_feature])
display(results.head())

In [None]:
# Pairwise correlation restricted to the strongly correlated features
cor_features = df.loc[:, names].corr()

# Annotated heatmap of that reduced matrix
plt.figure(figsize=(12, 10))
sns.heatmap(data=cor_features, annot=True, cmap="PuBu")
plt.show()

**Subset of features**

In [None]:
# Correlation among a hand-picked subset of features
new_feature_corr = df.loc[:, ['cp', 'thalach', 'exang', 'oldpeak', 'slope']].corr()

# Annotated heatmap of the subset
plt.figure(figsize=(12,10))
sns.heatmap(data=new_feature_corr, annot=True, cmap="Blues")
plt.show()

In [None]:
# Drop the features that are highly correlated with other features from the
# list of strongly correlated features
subset_features = [i for i in names if i not in ['cp', 'thalach', 'exang', 'oldpeak']]

# Check the new features with the model
subset_features_df = evaluate_model(df[subset_features], Y)
subset_features_df.index = ["Subset Features"]

# Add to the previous results.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
results = pd.concat([results, subset_features_df])
display(results.head())

**Filter methods for feature selection**

There are three ways to filter the features.

**1- Univariate Selection with Sci-Kit Learn**

Scikit-learn has several methods which can be used for feature selection/dimensionality reduction on sample sets. For more information, you can visit [this](https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection) website.

In this section, we are going to use the SelectKBest() method to select the top 7 features.

In [None]:
def univariate_selection(k=7):
  """Select the k best features according to the ANOVA F-test.

  Works on the module-level X and Y: the data is split 80/20 (stratified,
  seed 42), the training part is standardised, and SelectKBest with
  f_classif is fitted on it. A "name: True/False" line is printed for every
  feature to show whether it was kept.

  Args:
    k: Number of top-scoring features to keep (default 7).

  Returns:
    pandas Index holding the names of the selected features.
  """
  # Only the training part is needed; the selector must not see test rows
  X_train, _, Y_train, _ = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

  # Normalize feature values
  X_train_scaled = StandardScaler().fit_transform(X_train)

  # Rank the features with the F-test and keep the k best
  selector = SelectKBest(f_classif, k=k)
  selector.fit(X_train_scaled, Y_train)

  # Boolean mask aligned with the columns of X
  feature_index = selector.get_support()
  for name, keep in zip(X.columns, feature_index):
    print(f"{name}: {keep}")

  return X.columns[feature_index]

In [None]:
names_univariate_feature = univariate_selection()

# Check the univariate features with the model
df_univariate_feature = evaluate_model(df[names_univariate_feature], Y)
df_univariate_feature.index = ["F-test"]

# Add to the previous results.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
results = pd.concat([results, df_univariate_feature])
display(results.head())

**2- Wrapper Methods**

Wrapper methods use a specific machine learning algorithm for feature selection process trying to measure the effectiveness of a particular subset of features.

Most commonly used techniques under wrapper methods are:

1- Forward selection : uses k-fold cross validation scores to decide which features to add or remove

2- Backward elimination: starts with all predictors and eliminates one-by-one iteratively. One of the most popular algorithms is Recursive Feature Elimination (RFE) which eliminates less important predictors based on feature importance ranking.

3- Bi-directional elimination(Stepwise Selection): is based on a combination of forward selection and backward elimination.

**Recursive Feature Elimination**

We are going to use Recursive Feature Elimination, which wraps around the selected model (random forest in this case) to perform feature selection.

A popular Python implementation of a wrapper method for feature selection is the Recursive Feature Elimination (RFE) class from Scikit-learn.

For more details, you can visit [official doc](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html) 

In [None]:
def recursive_feature_selection(n_features_to_select=7):
  """Select features with Recursive Feature Elimination around a random forest.

  Works on the module-level X and Y: the data is split 80/20 (stratified,
  seed 42), the training part is standardised, and RFE is fitted on it.

  Args:
    n_features_to_select: Number of features RFE should retain (default 7).

  Returns:
    pandas Index holding the names of the retained features.
  """
  # Only the training part is needed; the selector must not see test rows
  X_train, _, Y_train, _ = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

  # Normalize feature values
  X_train_scaled = StandardScaler().fit_transform(X_train)

  # Base estimator whose feature importances drive the elimination.
  # Named `estimator` so it does not shadow the RF_model() helper function.
  estimator = RandomForestClassifier(criterion="entropy", random_state=42)

  # n_features_to_select is keyword-only in scikit-learn >= 1.0; passing it
  # positionally raises a TypeError there.
  rfe = RFE(estimator, n_features_to_select=n_features_to_select)

  # Train the RFE
  rfe.fit(X_train_scaled, Y_train)

  # The support mask is aligned with the columns of X
  return X.columns[rfe.get_support()]

In [None]:
feature_names_rfe = recursive_feature_selection()

# Check the rfe features with the model
df_rfe_feature = evaluate_model(df[feature_names_rfe], Y)
df_rfe_feature.index = ["RFE"]

# Add to the previous results.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
results = pd.concat([results, df_rfe_feature])
display(results.head())

**Embedded Methods**

Embedded methods perform feature selection as part of the construction of the machine learning model itself.\
In fact, embedded methods address some of the problems encountered with the filter and wrapper methods.

In this section, we want to explore two embedded feature selection methods, namely tree-based methods and regularization.

**1- Tree-based methods**

Tree-based algorithms and models, such as random forest, are well-established algorithms that are able to specify the feature importance to select features.

In order to select features from the trained model, we can use [SelectFromModel](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html).

In [None]:
# Get Feature importance from RandomForestClassifier model
def tree_based_feature_importance_fn(X, Y):
  """Fit a random forest and plot its feature importances.

  The data is split 80/20 (stratified, seed 42), the training part is
  standardised, and a RandomForestClassifier is fitted on it. The
  importances are drawn as a horizontal bar chart labelled with X's columns.

  Args:
    X: Feature DataFrame; its column names label the bars.
    Y: Labels; also used to stratify the split.

  Returns:
    The fitted RandomForestClassifier.
  """
  # Only the training part is needed to compute importances
  X_train, _, Y_train, _ = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

  # Normalize feature values
  X_train_scaled = StandardScaler().fit_transform(X_train)

  # Seed the forest like every other model in this notebook: unseeded
  # importances change from run to run, which would make any fixed
  # importance threshold applied afterwards select different features.
  rf_model = RandomForestClassifier(random_state=42)
  rf_model = rf_model.fit(X_train_scaled, Y_train)

  # Plot feature importance
  plt.figure(figsize=(12,10))
  feature_importance = pd.Series(rf_model.feature_importances_, index=X.columns)
  feature_importance.sort_values(ascending=False).plot(kind='barh')
  plt.show()

  return rf_model

def select_features(model, threshold=0.013):
  """Return the names of features whose importance reaches the threshold.

  Column names are taken from the module-level df with "target" removed.
  NOTE(review): this assumes `model` was fitted on exactly those columns, in
  that order - confirm against the caller.

  Args:
    model: Already-fitted estimator exposing feature importances.
    threshold: Minimum importance a feature needs to be kept
      (default 0.013, the value previously hard-coded here).

  Returns:
    pandas Index holding the names of the selected features.
  """
  # prefit=True reuses the fitted model instead of training a new one
  selection = SelectFromModel(model, prefit=True, threshold=threshold)

  # Boolean mask marking the features that pass the threshold
  selected_features = selection.get_support()

  return df.drop("target", axis=1).columns[selected_features]

In [None]:
# Fit the forest (this also plots the importances), then keep the features
# whose importance passes the selection threshold
tree_based_model = tree_based_feature_importance_fn(X=X, Y=Y)
feature_names = select_features(model=tree_based_model)

In [None]:
# Check the features chosen by feature importance with the model
df_feature_importance = evaluate_model(df[feature_names], Y)
df_feature_importance.index = ["Feature Importance"]

# Add to the previous results.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
results = pd.concat([results, df_feature_importance])
display(results.head(n=10))

**2- Regularization**

Regularization introduces a penalty to the different parameters of a model to reduce its freedom.

There are three main types of regularization for linear models; in this stage we are going to use **lasso regression, i.e. L1 regularization**.

**L1 Regularization**

L1 regularization adds a penalty to the loss function which leads to the least important features being eliminated.

For learning algorithm, we can use Linear Support Vector Classification or LinearSVC (for more information, see [this](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html) document). 

Moreover, after training the LinearSVC model, we use SelectFromModel() to select features.

In [None]:
def l1_regularization_fn(X, Y):
  """Select features using an L1-penalised LinearSVC.

  The data is split 80/20 (stratified, seed 42), the training part is
  standardised, and SelectFromModel is fitted around a LinearSVC with an L1
  penalty.

  Args:
    X: Feature DataFrame; the returned names come from its columns.
    Y: Labels; also used to stratify the split.

  Returns:
    pandas Index holding the names of the selected features.
  """
  # Only the training part is needed; the selector must not see test rows
  X_train, _, Y_train, _ = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

  # Normalize feature values
  X_train_scaled = StandardScaler().fit_transform(X_train)

  # Select L1 regulated features from LinearSVC model
  model = LinearSVC(C=1, penalty="l1", dual=False)   # Prefer dual=False when n_samples > n_features
  selection = SelectFromModel(model)
  selection.fit(X_train_scaled, Y_train)

  # Map the support mask onto the columns of the X that was passed in.
  # Using the global df here would break for any X that is not the full
  # feature set, because the mask length would no longer match.
  return X.columns[selection.get_support()]

In [None]:
lr_regularization_feature_names = l1_regularization_fn(X, Y)

# Check the L1-regularization features with the model
df_l1_regularization = evaluate_model(df[lr_regularization_feature_names], Y)
df_l1_regularization.index = ["L1 Regularization"]

# Add to the previous results.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
results = pd.concat([results, df_l1_regularization])
display(results.head(n=10))