# Feature Engineering

In this notebook we create **9 feature sets** from the processed dataset:

1. **Recursive Feature Elimination**
2. **Select K Best**
3. **Fisher Score Chi-Square**
4. **Extra Trees Classifier**
5. **Pearson Correlation**
6. **Mutual Information**
7. **Mutual Info Regression**
8. **Manual Uniqueness**
9. **Variance Threshold**

For each method, we select:
- 5 features from PSS-10
- 5 features from GAD-7
- 5 features from PHQ-9

Combine these into 15 features → standardize

Each feature set is split 80/20 (stratified on the target) into train and test files for later model training.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import (
    RFE, SelectKBest, chi2, VarianceThreshold, f_classif,
    mutual_info_classif, mutual_info_regression
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from scipy.stats import pearsonr

# Project root: two levels above the current working directory.
BASE_DIR = Path.cwd().parents[1]

# Input table and the root folder for per-method outputs (created if absent).
DATA_IN = BASE_DIR.joinpath("data", "processed", "tabular", "mhp_processed.csv")
OUT_BASE = BASE_DIR.joinpath("features", "First Working")
OUT_BASE.mkdir(parents=True, exist_ok=True)

# File that receives the table of selected features per method.
SUMMARY_PATH = BASE_DIR.joinpath("summary", "features", "first_working_features_summary")

# Seed passed to the estimators and to the train/test split.
RANDOM_STATE = 42

## Load and prepare the dataset

In [None]:
df = pd.read_csv(DATA_IN)
print("Loaded:", DATA_IN, "shape:", df.shape)

# Questionnaire columns are located by a case-insensitive name prefix.
pss_cols, gad_cols, phq_cols = (
    [c for c in df.columns if c.upper().startswith(prefix)]
    for prefix in ("PSS", "GAD", "PHQ")
)

print("Detected columns counts:", len(pss_cols), len(gad_cols), len(phq_cols))
print("PSS cols:", pss_cols)
print("GAD cols:", gad_cols)
print("PHQ cols:", phq_cols)

# Derive the target only when it is absent or has gaps; in that case the whole
# column is rebuilt from the PHQ columns (existing labels are overwritten).
need_target = "DepressionEncoded" not in df.columns
if need_target or df["DepressionEncoded"].isna().any():
    # Non-numeric entries become NaN and are then counted as 0.
    df_phq = df[phq_cols].apply(pd.to_numeric, errors="coerce").fillna(0)
    phq_sum = df_phq.sum(axis=1)
    # Right-closed bins: (-1, 4], (4, 9], (9, 14], (14, 19], (19, 27] -> 0..4.
    # NOTE(review): these edges look like the PHQ-9 severity bands — confirm.
    # NOTE(review): a target derived from the PHQ columns is a deterministic
    # function of features selected later — confirm this is intended.
    severity_edges = [-1, 4, 9, 14, 19, 27]
    severity_codes = [0, 1, 2, 3, 4]
    df["DepressionEncoded"] = pd.cut(
        phq_sum, bins=severity_edges, labels=severity_codes
    ).astype(int)

print("Target distribution (DepressionEncoded):")
print(df["DepressionEncoded"].value_counts())

## Feature Selection Helpers

In [None]:
def select_rfe(X, y, k):
    """Select ``k`` columns of ``X`` with recursive feature elimination.

    RFE wraps a logistic regression (``max_iter=1000``, seeded with
    ``RANDOM_STATE``) and keeps ``k`` features.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Class labels aligned with the rows of ``X``.
    k : int
        Number of features to keep.

    Returns
    -------
    list
        Names of the retained columns, in the column order of ``X``.
    """
    model = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    sel = RFE(model, n_features_to_select=k)
    # Zero-fill missing values, as the other selectors in this notebook do;
    # without it the fit fails as soon as a column contains NaN.
    sel.fit(X.fillna(0), y)
    return X.columns[sel.support_].tolist()

def select_skb(X, y, k):
    """Return the ``k`` columns of ``X`` kept by ``SelectKBest`` with ``f_classif``.

    Missing values in ``X`` are replaced by 0 before scoring. The returned
    names follow the column order of ``X``.
    """
    selector = SelectKBest(k=k, score_func=f_classif)
    selector.fit(X.fillna(0), y)
    keep_mask = selector.get_support()
    return X.columns[keep_mask].tolist()

def select_fscs(X, y, k):
    """Select ``k`` columns of ``X`` by chi-square score against ``y``.

    Each column is shifted so its minimum is 0 (chi2 needs non-negative input),
    missing values are then filled with 0, and ``SelectKBest(chi2)`` picks the
    top ``k`` columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns (numeric).
    y : array-like
        Class labels aligned with the rows of ``X``.
    k : int
        Number of features to keep.

    Returns
    -------
    list
        Names of the retained columns, in the column order of ``X``.
    """
    # Subtraction already yields a new frame, so no defensive copy is needed.
    shifted = X - X.min()
    sel = SelectKBest(score_func=chi2, k=k)
    # Values are passed through unchanged: casting to int would truncate
    # fractional values and can collapse a scaled column to a constant.
    # Integer-valued data gives the same scores either way.
    sel.fit(shifted.fillna(0), y)
    return X.columns[sel.get_support()].tolist()

def select_etc(X, y, k):
    """Return the ``k`` columns of ``X`` with the largest extra-trees importances.

    A 200-tree ``ExtraTreesClassifier`` seeded with ``RANDOM_STATE`` is fitted
    on the zero-filled frame; names come back ordered by decreasing importance.
    """
    forest = ExtraTreesClassifier(n_estimators=200, random_state=RANDOM_STATE)
    forest.fit(X.fillna(0), y)
    top_k = pd.Series(forest.feature_importances_, index=X.columns).nlargest(k)
    return top_k.index.tolist()

def select_pc(X, y, k):
    """Select the ``k`` columns of ``X`` most correlated (in absolute value) with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns; missing values are replaced by the column mean.
    y : array-like
        Target values aligned with the rows of ``X``.
    k : int
        Number of features to keep.

    Returns
    -------
    list
        Column names ordered by decreasing ``|r|``. Ties are broken by column
        name in descending order, because ``(score, name)`` tuples are sorted
        in reverse.
    """
    scores = []
    for c in X.columns:
        try:
            r = pearsonr(X[c].fillna(X[c].mean()), y)[0]
        except Exception:
            # Best effort: a column that cannot be scored ranks last instead
            # of aborting the whole selection.
            r = 0.0
        # Constant (or all-NaN) columns give r = NaN. NaN compares False with
        # everything, so leaving it in makes the sort order undefined and can
        # push an uninformative column into the top k.
        if not np.isfinite(r):
            r = 0.0
        scores.append((abs(r), c))
    scores.sort(reverse=True)
    return [c for _, c in scores[:k]]

def select_mi(X, y, k):
    """Return the ``k`` columns of ``X`` with the highest ``mutual_info_classif`` score.

    NaNs are zero-filled and the estimator is seeded with ``RANDOM_STATE``;
    names are ordered by decreasing score.
    """
    mi_scores = mutual_info_classif(X.fillna(0), y, random_state=RANDOM_STATE)
    descending = np.argsort(mi_scores)[::-1]
    top_positions = descending[:k]
    return X.columns[top_positions].tolist()

def select_mir(X, y, k):
    """Return the ``k`` columns of ``X`` with the highest ``mutual_info_regression`` score.

    Same procedure as the classification variant, but ``y`` is scored with the
    regression estimator. NaNs are zero-filled and the estimator is seeded
    with ``RANDOM_STATE``; names are ordered by decreasing score.
    """
    mi_scores = mutual_info_regression(X.fillna(0), y, random_state=RANDOM_STATE)
    descending = np.argsort(mi_scores)[::-1]
    top_positions = descending[:k]
    return X.columns[top_positions].tolist()

def select_mu(X, y, k):
    """Return the ``k`` columns of ``X`` that have the most distinct values.

    ``y`` is accepted only so the signature matches the other selectors; it is
    not used. Names are ordered by decreasing distinct-value count.
    """
    distinct_counts = X.nunique()
    ranked = distinct_counts.sort_values(ascending=False)
    return ranked.index[:k].tolist()

def select_vt(X, y, k):
    """Return the ``k`` columns of ``X`` with the largest variance.

    This ranks columns by variance rather than applying a fixed threshold.
    ``y`` is accepted only so the signature matches the other selectors; it is
    not used. Names are ordered by decreasing variance.
    """
    variances = X.var()
    ranked = variances.sort_values(ascending=False)
    return ranked.index[:k].tolist()

## Run all nine feature selection methods

In [None]:
# Registry of selectors: short method code -> function(X, y, k) returning a
# list of column names. Insertion order is the order in which methods run.
methods = dict(
    rfe=select_rfe,
    skb=select_skb,
    fscs=select_fscs,
    etc=select_etc,
    pc=select_pc,
    mi=select_mi,
    mir=select_mir,
    mu=select_mu,
    vt=select_vt,
)

summary = []

# The target, seed and split ratio are the same for every method, so the
# 80/20 stratified partition is computed once on row positions. With the same
# seed this is the same partition a per-method split would produce.
target = df["DepressionEncoded"]
train_pos, test_pos = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=target,
)
train_rows, test_rows = df.iloc[train_pos], df.iloc[test_pos]
y_train, y_test = target.iloc[train_pos], target.iloc[test_pos]

for name, func in methods.items():
    print(f"\n‚û° Running method: {name.upper()}")

    # Rank features on the training rows only. Selecting on the full table
    # would let the held-out rows (and their labels) influence which features
    # are kept, which leaks test information into training.
    selected = []
    sel_pss = func(train_rows[pss_cols], y_train, k=5)
    sel_gad = func(train_rows[gad_cols], y_train, k=5)
    sel_phq = func(train_rows[phq_cols], y_train, k=5)

    selected.extend(sel_pss)
    selected.extend(sel_gad)
    selected.extend(sel_phq)

    print(f"‚úÖ Selected ({len(selected)}):", selected)

    # Fit the scaler on the training rows and only apply it to the test rows,
    # so the saved scaler holds train-only means and variances.
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(train_rows[selected].fillna(0)),
        columns=selected
    )
    X_test = pd.DataFrame(
        scaler.transform(test_rows[selected].fillna(0)),
        columns=selected
    )

    out_dir = OUT_BASE / name
    out_dir.mkdir(parents=True, exist_ok=True)

    # `.values` attaches labels by position: the scaled frames have a fresh
    # RangeIndex, while y_train / y_test keep the original row labels.
    train_df = X_train.copy()
    train_df["DepressionEncoded"] = y_train.values

    test_df = X_test.copy()
    test_df["DepressionEncoded"] = y_test.values

    train_df.to_csv(out_dir / "train.csv", index=False)
    test_df.to_csv(out_dir / "test.csv", index=False)

    joblib.dump(scaler, out_dir / "scaler.pkl")

    print(f"üíæ Saved: train.csv, test.csv, scaler.pkl ‚Üí {out_dir}")

    summary.append({
        "Method": name,
        "Selected Features": selected
    })

## Save the Summary

In [None]:
# `to_csv` does not create missing directories, and nothing else in the
# notebook creates this one, so make sure the destination folder exists first.
SUMMARY_PATH.parent.mkdir(parents=True, exist_ok=True)

# One row per method; the list of selected features is stored as its string form.
pd.DataFrame(summary).to_csv(SUMMARY_PATH, index=False)
print("\n‚úÖ Feature selection complete. Summary saved to:", SUMMARY_PATH)

## Summary of Selected Features

In [None]:
# Reuse the configured location instead of rebuilding the same path by hand,
# so the reader and the writer of the summary cannot drift apart.
summary_path = SUMMARY_PATH

if summary_path.exists():
    summary_df = pd.read_csv(summary_path)
    # Show the full feature lists rather than truncated cells.
    pd.set_option("display.max_colwidth", None)
    display(summary_df)
else:
    print("‚ö†Ô∏è No summary file found. Please run all feature selection steps first.")