# Feature Engineering

In this step, we create **9 feature sets**, each using a different feature selection technique.
Each method selects **5 top features from each scale** (PSS-10, GAD-7, PHQ-9), for a total of 15 features.
The target variable is **Depression Label** (encoded).

We then split each feature set into **80/20 Train/Test**.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import RFE, SelectKBest, chi2, f_classif, mutual_info_classif, mutual_info_regression, VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr

# Locations: processed data lives in DATA_DIR, and every feature set is
# written into its own sub-folder under FEATURES_DIR.
DATA_DIR = Path("../data/processed")
FEATURES_DIR = DATA_DIR.joinpath("features")
FEATURES_DIR.mkdir(parents=True, exist_ok=True)

# Read the processed dataset produced by the earlier preprocessing step.
processed_csv = DATA_DIR.joinpath("mhp_processed.csv")
df = pd.read_csv(processed_csv)

print("âœ… Processed dataset loaded successfully!")

✅ Processed dataset loaded successfully!


## Prepare Data
We define:
- Feature groups: PSS (Stress), GAD (Anxiety), PHQ (Depression)
- Target variable: Depression Label (encoded)

In [2]:
# Item columns of each questionnaire, numbered from 1 within the scale
# (10 PSS items, 7 GAD items, 9 PHQ items).
pss_cols = [f"PSS{i+1}" for i in range(10)]
gad_cols = [f"GAD{i+1}" for i in range(7)]
phq_cols = [f"PHQ{i+1}" for i in range(9)]

# Turn the textual depression labels into integer codes; the fitted encoder
# is kept in `le` so the class names stay available.
le = LabelEncoder().fit(df["Depression Label"])
df["DepressionEncoded"] = le.transform(df["Depression Label"])

# Target first, then the feature matrix made of all scale items.
y = df["DepressionEncoded"]
X = df[[*pss_cols, *gad_cols, *phq_cols]]

print(f"PSS features: {len(pss_cols)}, GAD features: {len(gad_cols)}, PHQ features: {len(phq_cols)}")
print(f"Target classes: {list(le.classes_)}")

PSS features: 10, GAD features: 7, PHQ features: 9
Target classes: ['Mild', 'Minimal', 'Moderate', 'Moderately Severe', 'Severe']


## Utility Functions for Feature Selection and Saving
These functions:
- Select top `n` features by different algorithms
- Save train/test CSV files in dedicated folders

In [3]:
# Method names accepted by select_top_features (compared case-insensitively).
_SELECTION_METHODS = frozenset(
    {"rfe", "skb", "fscs", "etc", "pc", "mi", "mir", "mu", "vt"}
)


def _rank_top(columns, scores, top_n):
    """Return the `top_n` highest-scoring column names, lowest of them first."""
    scores = np.asarray(scores)
    # argsort places NaN last, so an undefined score (e.g. from a constant
    # column) would otherwise be picked as the "best" feature. Demote NaN.
    scores = np.where(np.isnan(scores), -np.inf, scores)
    return columns[np.argsort(scores)[-top_n:]].tolist()


def select_top_features(method_name, X, y, feature_groups, top_n=5):
    """Select the top `top_n` features from each scale group.

    Parameters
    ----------
    method_name : str
        Selection method, case-insensitive. One of:
        "rfe"  - recursive feature elimination with logistic regression,
        "skb"  - ANOVA F-score (`f_classif`),
        "fscs" - chi-squared score on the absolute feature values,
        "etc"  - ExtraTreesClassifier feature importances,
        "pc"   - absolute Pearson correlation with the target,
        "mi"   - mutual information (classification estimator),
        "mir"  - mutual information (regression estimator),
        "mu"   - number of unique values per feature,
        "vt"   - feature variance.
    X : pandas.DataFrame
        Feature matrix containing every column listed in `feature_groups`.
    y : array-like
        Target values aligned with the rows of `X`.
    feature_groups : dict
        Maps a group name to the list of column names in that group.
    top_n : int, default 5
        Number of features to keep per group; must be at least 1. A group
        with fewer columns contributes all of its columns for the
        score-ranking methods.

    Returns
    -------
    list of str
        Selected column names, concatenated in the iteration order of
        `feature_groups`. Within a group, score-ranked methods list the
        lowest-scoring selected feature first, "rfe" keeps column order, and
        "mu" lists the feature with the most unique values first.

    Raises
    ------
    ValueError
        If `method_name` is not a known method or `top_n` is less than 1.
    """
    method = method_name.lower()
    # Validate once, up front, so a bad name fails even with no groups.
    if method not in _SELECTION_METHODS:
        raise ValueError(f"Unknown method: {method_name}")
    # A slice of [-0:] would return every column, so reject it explicitly.
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    selected_features = []

    for cols in feature_groups.values():
        X_group = X[cols]
        columns = X_group.columns

        if method == "rfe":
            model = LogisticRegression(max_iter=200)
            selector = RFE(model, n_features_to_select=top_n)
            selector.fit(X_group, y)
            feats = columns[selector.support_].tolist()

        elif method == "skb":
            # Only the scores are used, so k="all" avoids the k-vs-width check.
            selector = SelectKBest(score_func=f_classif, k="all")
            selector.fit(X_group, y)
            feats = _rank_top(columns, selector.scores_, top_n)

        elif method == "fscs":
            # chi2 requires non-negative inputs, hence the absolute value.
            selector = SelectKBest(score_func=chi2, k="all")
            selector.fit(X_group.abs(), y)
            feats = _rank_top(columns, selector.scores_, top_n)

        elif method == "etc":
            model = ExtraTreesClassifier(n_estimators=200, random_state=42)
            model.fit(X_group, y)
            feats = _rank_top(columns, model.feature_importances_, top_n)

        elif method == "pc":
            scores = []
            for col in columns:
                corr = abs(pearsonr(X_group[col], y)[0])
                # pearsonr is NaN for a constant column; rank it last
                # instead of letting NaN corrupt the sort below.
                scores.append(-np.inf if np.isnan(corr) else corr)
            # Sorting (score, name) pairs breaks score ties by column name.
            ranked = sorted(zip(scores, columns))
            feats = [col for _, col in ranked[-top_n:]]

        elif method == "mi":
            scores = mutual_info_classif(X_group, y, random_state=42)
            feats = _rank_top(columns, scores, top_n)

        elif method == "mir":
            scores = mutual_info_regression(X_group, y, random_state=42)
            feats = _rank_top(columns, scores, top_n)

        elif method == "mu":
            # Manual Uniqueness - features with the most distinct values.
            uniq_scores = X_group.nunique().sort_values(ascending=False)
            feats = uniq_scores.head(top_n).index.tolist()

        else:  # method == "vt": rank by per-feature variance
            selector = VarianceThreshold()
            selector.fit(X_group)
            feats = _rank_top(columns, selector.variances_, top_n)

        selected_features.extend(feats)

    return selected_features


def save_feature_set(X, y, features, method_name):
    """Split the selected features into train/test, standardize, and save CSVs.

    Parameters
    ----------
    X : pandas.DataFrame
        Full feature matrix; only the columns named in `features` are used.
    y : array-like
        Encoded target aligned with the rows of `X`. It drives the stratified
        split and is written to each CSV as the `DepressionEncoded` column.
    features : list of str
        Column names to keep.
    method_name : str
        Selection-method name; its lower-cased form is the output sub-folder
        created under `FEATURES_DIR`.

    Side effects
    ------------
    Writes `train.csv` and `test.csv` (80/20 stratified split with
    `random_state=42`) into the output folder and prints a short summary.
    """
    X_sel = X[features]
    X_train, X_test, y_train, y_test = train_test_split(
        X_sel, y, test_size=0.2, random_state=42, stratify=y
    )

    # The scaler is fitted on the training rows only and then applied to both
    # splits, so test-set statistics do not influence the scaling.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    out_dir = FEATURES_DIR / method_name.lower()
    out_dir.mkdir(parents=True, exist_ok=True)

    # The scaled arrays get a fresh 0..n-1 index, while y_train / y_test keep
    # the shuffled index from the split. Assigning a Series would align on
    # index and pair rows with the wrong labels (or NaN), so the labels are
    # attached positionally as plain arrays.
    train_df = pd.DataFrame(X_train_scaled, columns=features)
    train_df["DepressionEncoded"] = np.asarray(y_train)
    train_df.to_csv(out_dir / "train.csv", index=False)

    test_df = pd.DataFrame(X_test_scaled, columns=features)
    test_df["DepressionEncoded"] = np.asarray(y_test)
    test_df.to_csv(out_dir / "test.csv", index=False)

    print(f"ðŸ’¾ Saved feature set: {method_name} â†’ {len(features)} features")
    print(f"Train/Test saved in: {out_dir}")

## Run All 9 Feature Selection Methods

Each method will:
- Select 5 features from each scale (15 total)
- Save Train/Test CSVs in a dedicated folder

In [4]:
# Scale name -> item columns belonging to that scale.
feature_groups = dict(PSS10=pss_cols, GAD7=gad_cols, PHQ9=phq_cols)

# The nine selection techniques, in the order they are run.
methods = ["RFE", "SKB", "FSCS", "ETC", "PC", "MI", "MIR", "MU", "VT"]

# method name -> list of features that method selected
summary = {}

for method in methods:
    print(f"\nðŸ”¹ Running feature selection using: {method}")
    selected = select_top_features(method, X, y, feature_groups, top_n=5)
    save_feature_set(X, y, selected, method)
    summary[method] = selected

# One column per method; wrapping each list in a Series lets pandas pad
# columns of unequal length instead of raising.
summary_df = pd.DataFrame(
    {name: pd.Series(chosen) for name, chosen in summary.items()}
)
summary_path = DATA_DIR / "feature_selection_summary.csv"
summary_df.to_csv(summary_path, index=False)
print("\nâœ… Feature selection complete! Summary saved to data/processed/feature_selection_summary.csv")
summary_df


🔹 Running feature selection using: RFE
💾 Saved feature set: RFE → 15 features
Train/Test saved in: ..\data\processed\features\rfe

🔹 Running feature selection using: SKB
💾 Saved feature set: SKB → 15 features
Train/Test saved in: ..\data\processed\features\skb

🔹 Running feature selection using: FSCS
💾 Saved feature set: FSCS → 15 features
Train/Test saved in: ..\data\processed\features\fscs

🔹 Running feature selection using: ETC
💾 Saved feature set: ETC → 15 features
Train/Test saved in: ..\data\processed\features\etc

🔹 Running feature selection using: PC
💾 Saved feature set: PC → 15 features
Train/Test saved in: ..\data\processed\features\pc

🔹 Running feature selection using: MI
💾 Saved feature set: MI → 15 features
Train/Test saved in: ..\data\processed\features\mi

🔹 Running feature selection using: MIR
💾 Saved feature set: MIR → 15 features
Train/Test saved in: ..\data\processed\features\mir

ðŸ”¹ Running feature selectio

Unnamed: 0,RFE,SKB,FSCS,ETC,PC,MI,MIR,MU,VT
0,PSS1,PSS1,PSS9,PSS9,PSS1,PSS9,PSS9,PSS1,PSS1
1,PSS2,PSS4,PSS1,PSS6,PSS4,PSS3,PSS1,PSS2,PSS3
2,PSS3,PSS2,PSS2,PSS8,PSS2,PSS4,PSS2,PSS3,PSS10
3,PSS9,PSS3,PSS4,PSS7,PSS3,PSS1,PSS10,PSS4,PSS9
4,PSS10,PSS10,PSS10,PSS5,PSS10,PSS10,PSS4,PSS5,PSS4
5,GAD1,GAD6,GAD1,GAD1,GAD6,GAD1,GAD1,GAD1,GAD5
6,GAD3,GAD1,GAD6,GAD2,GAD1,GAD6,GAD6,GAD2,GAD6
7,GAD4,GAD5,GAD5,GAD4,GAD4,GAD5,GAD4,GAD3,GAD3
8,GAD6,GAD4,GAD4,GAD3,GAD5,GAD4,GAD5,GAD4,GAD2
9,GAD7,GAD7,GAD7,GAD7,GAD7,GAD7,GAD7,GAD5,GAD7
