In [None]:
!pip install boruta mrmr_selection skrebate kydavra ucimlrepo

# Feature Importance Workshop

In [None]:
import os
import pandas as pd
import numpy as np
import textwrap
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo
from skrebate import ReliefF
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, RFE, SequentialFeatureSelector, SelectFromModel
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, classification_report
from sklearn.impute import SimpleImputer
import shap
import time
from boruta import BorutaPy
from mrmr import mrmr_classif
from kydavra import ReliefFSelector
from sklearn.inspection import permutation_importance

# The Task

In this workshop, you will use XGBoost to classify individuals with diabetes. The main task is to reduce the number of features in the model while maximising recall, so that individuals with diabetes are correctly identified.

### Data Preparation:

* **Train and validation:** Used for model training and feature selection.
* **Test:** A set is held out for final performance evaluation.

### Feature Selection Techniques to Try:
* Model-based Importance:
  * XGBoost built-in feature importance
  * Permutation importance
  * SHAP values
* Wrapper Methods for Feature Selection (Iterative):
  * Recursive Feature Elimination (RFE)
  * Boruta algorithm
  * Sequential feature selection
* Statistical Methods:
  * ANOVA F-test
  * Chi-squared
  * Mutual Information
  * Minimum Redundancy Maximum Relevance (MRMR)

### Workflow:

1. Apply feature selection methods to identify top features.
2. Train XGBoost on selected features using train & validation sets.
3. Evaluate model on test set using metrics like AUC, Recall, Accuracy, F1-score.
4. Record feature subset and performance metrics in a summary table.


## Dataset Overview: CDC Diabetes Health Indicators

This dataset comprises healthcare statistics and lifestyle survey information about individuals in the United States. It was collected by the Centers for Disease Control and Prevention (CDC) and is publicly available through the UCI Machine Learning Repository.

In [None]:
# Notebook display settings: show every column and never truncate cell text,
# with the plain-text display width set to 80 characters.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.width", 80)

# Fetch dataset 891 from the UCI ML Repository and keep only its "variables" entry.
df_info = fetch_ucirepo(id=891)["variables"]
df_info

The cell below loads the same dataset, but with the classes balanced.

In [None]:
# This creates your dataframe to record runs in.
# Note: re-running this cell resets it and discards any runs recorded so far.
output_df = pd.DataFrame()

# Load the class-balanced dataset from the workshop's GitHub repository.
url = "https://github.com/scarlett-k-nhs/feature_selection_classification/blob/main/data/balanced_df.csv?raw=true"
df = pd.read_csv(url)

# Keep the preview as the final expression: a notebook cell only renders its last expression.
df.head()

The cell below defines your train, validation and test datasets.

In [None]:
# Every candidate predictor column available to the model.
feature_cols = [
    'HighBP',
    'HighChol',
    'CholCheck',
    'BMI',
    'Smoker',
    'Stroke',
    'HeartDiseaseorAttack',
    'PhysActivity',
    'Fruits',
    'Veggies',
    'HvyAlcoholConsump',
    'AnyHealthcare',
    'NoDocbcCost',
    'GenHlth',
    'MentHlth',
    'PhysHlth',
    'DiffWalk',
    'Sex',
    'Age',
    'Education',
    'Income',
]

# Binary label to predict.
target_col = "Diabetes_binary"

X = df[feature_cols]
y = df[target_col]

# Step 1: keep 70% for training and hold out the remaining 30%, stratified on the target.
# The hold-out gets its own names so that X_test/y_test only ever refer to the final test split.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Step 2: halve the hold-out into test and validation sets (roughly 15% of the data each).
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
)

# Sanity check: the three splits must partition the full dataset.
assert len(X_train) + len(X_val) + len(X_test) == len(X), "train/val/test sizes do not add up"

## Record a Run

Use this cell to record each of your runs as you try the various feature selection techniques from the feature_importance_cheatsheet.

1. **Base Model**: Run this cell first with all features selected to get baseline metrics. Set comment = "All Features Selected".
2. **Feature Selection Runs**: After identifying important features (e.g., via SHAP, permutation importance, Boruta), update selected_features and change comment to reflect the source.
3. **Recording Results:** Each run appends a row to output_df with the selected features, their count, training time, and performance metrics (Recall, AUC, F1, Precision, Accuracy).

In [None]:
comment = "All Features Selected" # Change this to reflect the decision made e.g. Model embedded importance of cover has been run.

# Change this to reflect which features have been selected.
selected_features = [
                'HighBP',
                'HighChol',
                'CholCheck',
                'BMI',
                'Smoker',
                'Stroke',
                'HeartDiseaseorAttack',
                'PhysActivity',
                'Fruits',
                'Veggies',
                'HvyAlcoholConsump',
                'AnyHealthcare',
                'NoDocbcCost',
                'GenHlth',
                'MentHlth',
                'PhysHlth',
                'DiffWalk',
                'Sex',
                'Age',
                'Education',
                'Income'
                ]

# Build the XGBoost classifier. `random_state` alone fixes the seed; the
# redundant `seed` alias has been dropped.
model = xgb.XGBClassifier(
    n_estimators=500,
    eval_metric='logloss',
    random_state=42
)

# Train on the selected features and record the training time.
# perf_counter() is a monotonic clock intended for measuring elapsed intervals.
start_time = time.perf_counter()
model.fit(X_train[selected_features], y_train)
train_time = time.perf_counter() - start_time

# Test set predictions & metrics
y_pred_test = model.predict(X_test[selected_features])
# Column 1 of predict_proba is used as the positive-class score for AUC.
y_pred_proba_test = model.predict_proba(X_test[selected_features])[:, 1]

auc_test = roc_auc_score(y_test, y_pred_proba_test)
acc_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)

# Append results to output_df (one new row per execution of this cell)
output_df = pd.concat([
    output_df,
    pd.DataFrame([{
        'Comment': comment,
        'Selected_Features': selected_features,
        'Num_Features': len(selected_features),
        'Train_Time(s)': train_time,
        'Recall': recall_test,
        'AUC': auc_test,
        'F1': f1_test,
        'Precision': precision_test,
        'Accuracy': acc_test
    }])
], ignore_index=True)

# Display the updated dataframe
output_df



## Feature Selection Code

Bring the feature selection code over from the cheatsheet to work out which features you would like to input into the model.