# Step 4 — Feature Engineering

In this notebook we create **9 feature sets** from the processed dataset.

Each method selects the **top 5 features from each scale** (PSS-10, GAD-7, PHQ-9) → 15 features total.

We then split the data (80/20, stratified) for model training later.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import (
    RFE, SelectKBest, chi2, mutual_info_classif, mutual_info_regression, VarianceThreshold
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import f_classif

# Base paths (relative, so they resolve against the kernel's working directory)
# Input: the processed dataset CSV that this notebook reads.
DATA_PATH = Path("../data/processed/mhp_processed.csv")
# Output root: each feature-selection method writes its own sub-folder here.
OUT_BASE = Path("../data/processed/features")
# Create the output root up front; exist_ok=True keeps re-runs from failing.
OUT_BASE.mkdir(parents=True, exist_ok=True)

## Load and prepare the dataset
We load `mhp_processed.csv`, encode `Depression Label` → `DepressionEncoded`,  
and identify columns by scale: PSS, GAD, PHQ.

In [2]:
df = pd.read_csv(DATA_PATH)

# Identify questionnaire item columns by scale prefix (done once, up front,
# because the PHQ columns are also needed for the label fallback below).
pss_cols = [c for c in df.columns if c.startswith("PSS")]
gad_cols = [c for c in df.columns if c.startswith("GAD")]
phq_cols = [c for c in df.columns if c.startswith("PHQ")]

# Encode Depression Label as an ordinal integer (0 = Minimal ... 4 = Severe).
label_map = {
    "Minimal": 0, "Mild": 1, "Moderate": 2,
    "Moderately Severe": 3, "Severe": 4
}
df["DepressionEncoded"] = df["Depression Label"].map(label_map)

# Labels that are missing or not in label_map come out as NaN; recompute those
# from the PHQ item sum. Only the affected rows are binned, so an unexpected
# sum on a row that already has a valid label can no longer break the cast.
missing_label = df["DepressionEncoded"].isna()
if missing_label.any():
    phq_sum = df.loc[missing_label, phq_cols].sum(axis=1)
    recomputed = pd.cut(
        phq_sum,
        bins=[-1, 4, 9, 14, 19, 27],
        labels=[0, 1, 2, 3, 4]
    )
    if recomputed.isna().any():
        # A sum outside (-1, 27] falls in no bin, so no label can be derived.
        raise ValueError(
            "Cannot derive DepressionEncoded: PHQ item sum outside 0-27 for "
            f"{int(recomputed.isna().sum())} row(s) with a missing label"
        )
    df.loc[missing_label, "DepressionEncoded"] = recomputed.astype(int)

# The column was float while it could hold NaN; every row is filled now.
df["DepressionEncoded"] = df["DepressionEncoded"].astype(int)

print(f"PSS cols: {len(pss_cols)}, GAD cols: {len(gad_cols)}, PHQ cols: {len(phq_cols)}")

PSS cols: 10, GAD cols: 7, PHQ cols: 9


## Feature Selection Helpers

Each function returns the top N features for a given scale.

In [3]:
def select_rfe(X, y, n=5):
    """Pick ``n`` columns of ``X`` by recursive feature elimination.

    A logistic-regression estimator (``max_iter=500``) is wrapped in ``RFE``
    and fitted on ``X``/``y``. The columns flagged in the fitted selector's
    support mask are returned as a list, in their original column order.
    """
    eliminator = RFE(LogisticRegression(max_iter=500), n_features_to_select=n)
    eliminator.fit(X, y)
    kept = X.columns[eliminator.support_]
    return kept.tolist()

def select_skb(X, y, n=5):
    """Return the ``n`` columns of ``X`` kept by ``SelectKBest`` with ``f_classif``.

    The result is a list of column names in their original column order.
    """
    selector = SelectKBest(score_func=f_classif, k=n)
    selector.fit(X, y)
    keep_mask = selector.get_support()
    return X.columns[keep_mask].tolist()

def select_fscs(X, y, n=5):
    """Return the ``n`` columns of ``X`` kept by ``SelectKBest`` with ``chi2``.

    The scoring function used here is the chi-squared statistic. Because
    ``chi2`` requires non-negative input, every column is shifted by its own
    minimum before fitting; the shift is used for scoring only and the
    returned names are taken from the unshifted ``X``.
    """
    shifted = X.sub(X.min(), axis="columns")  # per-column minimum becomes 0
    selector = SelectKBest(score_func=chi2, k=n)
    selector.fit(shifted, y)
    keep_mask = selector.get_support()
    return X.columns[keep_mask].tolist()

def select_etc(X, y, n=5):
    """Return the ``n`` columns with the largest Extra-Trees feature importance.

    An ``ExtraTreesClassifier`` seeded with ``random_state=42`` is fitted on
    ``X``/``y``; column names are returned from most to least important.
    """
    forest = ExtraTreesClassifier(random_state=42)
    forest.fit(X, y)
    importances = pd.Series(forest.feature_importances_, index=X.columns)
    top = importances.nlargest(n)
    return top.index.tolist()

def select_pc(X, y, n=5):
    """Return the ``n`` columns of ``X`` most strongly correlated with ``y``.

    Strength is the absolute value of ``X.corrwith(y)``, so the sign of the
    relationship is ignored. Names are returned from strongest to weakest.
    """
    strength = X.corrwith(y).abs()
    ranked = strength.sort_values(ascending=False)
    return ranked.index[:n].tolist()

def select_mi(X, y, n=5):
    """Return the ``n`` columns with the highest ``mutual_info_classif`` score.

    Scores are computed with ``random_state=42``. The names come from the tail
    of an ascending argsort, so the list runs from the weakest of the selected
    columns to the strongest.

    NOTE(review): ``mutual_info_classif`` is called with its default
    ``discrete_features`` setting. If the questionnaire items are
    integer-coded, ``discrete_features=True`` may be the better choice;
    confirm against the data.
    """
    scores = mutual_info_classif(X, y, random_state=42)
    ascending_order = np.argsort(scores)
    top_positions = ascending_order[-n:]
    return X.columns[top_positions].tolist()

def select_mir(X, y, n=5):
    """Return the ``n`` columns with the highest ``mutual_info_regression`` score.

    Scores are computed with ``random_state=42``. The names come from the tail
    of an ascending argsort, so the list runs from the weakest of the selected
    columns to the strongest.
    """
    scores = mutual_info_regression(X, y, random_state=42)
    ascending_order = np.argsort(scores)
    top_positions = ascending_order[-n:]
    return X.columns[top_positions].tolist()

def select_mu(X, y, n=5):
    """Return the ``n`` columns of ``X`` with the most distinct values.

    ``y`` is accepted so the signature matches the other selectors, but it is
    not used. Names are returned from most to fewest distinct values.
    """
    distinct_counts = X.nunique()
    ranked = distinct_counts.sort_values(ascending=False)
    return ranked.index[:n].tolist()

def select_vt(X, y, n=5):
    """Return the ``n`` columns of ``X`` with the largest variance.

    Variances are read from the ``variances_`` attribute of a default
    ``VarianceThreshold`` fitted on ``X``. ``y`` is accepted so the signature
    matches the other selectors, but it is not used. Names are returned from
    highest to lowest variance.
    """
    thresholder = VarianceThreshold()
    thresholder.fit(X)
    variances = pd.Series(thresholder.variances_, index=X.columns)
    top = variances.nlargest(n)
    return top.index.tolist()

## Run all nine feature selection methods

Each method selects 5 features per scale, combines them into 15 features, scales them, splits 80/20 (stratified), and saves train/test CSVs.

In [4]:
# Short method key -> selector function; the key is also the output folder name.
methods = {
    "rfe": select_rfe,
    "skb": select_skb,
    "fscs": select_fscs,
    "etc": select_etc,
    "pc": select_pc,
    "mi": select_mi,
    "mir": select_mir,
    "mu": select_mu,
    "vt": select_vt
}

summary = []

X = df[pss_cols + gad_cols + phq_cols]
y = df["DepressionEncoded"]

# Split ONCE, before any selection or scaling, so that held-out rows never
# influence which features are chosen or the scaler's mean/std (data leakage).
# The split depends only on y, the sample count and the seed, so every method
# gets exactly the same train/test rows.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

for name, func in methods.items():
    print(f"\nüèóÔ∏è Feature Selection: {name.upper()}")

    # Top 5 features per scale, chosen from the training rows only.
    selected = []
    for scale in [pss_cols, gad_cols, phq_cols]:
        sel = func(X_train_all[scale], y_train, n=5)
        selected.extend(sel)

    # Fit the scaler on the training fold, then apply the same transform to test.
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train_all[selected]), columns=selected
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test_all[selected]), columns=selected
    )

    out_dir = OUT_BASE / name
    out_dir.mkdir(parents=True, exist_ok=True)

    # .values attaches the target by position; the scaled frames carry a fresh
    # RangeIndex while y_train/y_test keep the original row labels.
    train_df = X_train.copy()
    train_df["DepressionEncoded"] = y_train.values
    test_df = X_test.copy()
    test_df["DepressionEncoded"] = y_test.values

    train_df.to_csv(out_dir / "train.csv", index=False)
    test_df.to_csv(out_dir / "test.csv", index=False)

    summary.append({"Method": name, "Selected Features": selected})
    print(f"‚úÖ Saved feature set for {name.upper()} ({len(selected)} features)")


üèóÔ∏è Feature Selection: RFE
‚úÖ Saved feature set for RFE (15 features)

üèóÔ∏è Feature Selection: SKB
‚úÖ Saved feature set for SKB (15 features)

üèóÔ∏è Feature Selection: FSCS
‚úÖ Saved feature set for FSCS (15 features)

üèóÔ∏è Feature Selection: ETC
‚úÖ Saved feature set for ETC (15 features)

üèóÔ∏è Feature Selection: PC
‚úÖ Saved feature set for PC (15 features)

üèóÔ∏è Feature Selection: MI
‚úÖ Saved feature set for MI (15 features)

üèóÔ∏è Feature Selection: MIR
‚úÖ Saved feature set for MIR (15 features)

üèóÔ∏è Feature Selection: MU
‚úÖ Saved feature set for MU (15 features)

üèóÔ∏è Feature Selection: VT
‚úÖ Saved feature set for VT (15 features)


In [5]:
# One row per method. Each "Selected Features" cell holds a Python list, so the
# CSV stores that list's string form.
summary_df = pd.DataFrame(summary)
summary_df.to_csv(
    "../data/processed/feature_selection_summary.csv",
    index=False,
)
print("üìÑ Saved feature_selection_summary.csv")

üìÑ Saved feature_selection_summary.csv
