In [1]:
# Socio-Clinical Depression Assessment System — Feature Engineering & Selection

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Shared seaborn theme for every figure drawn later in the notebook.
theme_options = {
    "style": "whitegrid",
    "palette": "viridis",
    "font_scale": 1.1,
}
sns.set(**theme_options)

print("✅ Libraries loaded successfully!")


✅ Libraries loaded successfully!


In [2]:
# Project root is the parent of the notebook's working directory;
# the cleaned dataset is read from <root>/data/processed.
BASE_DIR = os.path.abspath(os.pardir)
DATA_PROCESSED = os.path.join(BASE_DIR, "data", "processed")

data_path = os.path.join(DATA_PROCESSED, "mhp_cleaned.csv")
df = pd.read_csv(data_path)

print(f"✅ Dataset loaded. Shape: {df.shape}")

# Last expression of the cell: rich preview of the first rows.
df.head()


✅ Dataset loaded. Shape: (2028, 39)


Unnamed: 0,Age,Gender,University,Department,Academic_Year,Current_CGPA,waiver_or_scholarship,PSS1,PSS2,PSS3,...,PHQ2,PHQ3,PHQ4,PHQ5,PHQ6,PHQ7,PHQ8,PHQ9,Depression_Value,Depression_Label
0,18-22,0,8,2,3,2.50 - 2.99,0,3,4,3,...,2,3,2,2,2,2,3,2,20,5
1,18-22,1,8,2,4,3.00 - 3.39,0,3,3,4,...,2,2,2,2,2,2,2,2,19,3
2,18-22,1,0,2,4,3.00 - 3.39,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,18-22,1,0,2,4,3.00 - 3.39,0,3,1,2,...,1,2,1,2,1,2,2,1,14,2
4,18-22,1,10,2,3,2.50 - 2.99,0,4,4,4,...,3,3,3,1,3,0,3,3,20,5


In [3]:
# Columns that must not be available as model inputs. Names that are not
# present in the frame are skipped rather than raising.
leak_cols = [
    "Depression_Value",
    "Depression_Label",
    "Anxiety_Value",
    "Anxiety_Label",
    "Stress_Value",
    "Stress_Label",
]

df = df.drop(columns=leak_cols, errors="ignore")

remaining_column_count = df.shape[1]
print(f"✅ Leak columns removed. Remaining columns: {remaining_column_count}")


✅ Leak columns removed. Remaining columns: 33


In [4]:
# Integer-encode every non-numeric column.
#
# A separate LabelEncoder is fitted per column and kept in `label_encoders`,
# so each column's string -> integer mapping stays recoverable through
# `label_encoders[col].classes_`. Re-fitting one shared encoder (as before)
# silently discarded every mapping except the last one.
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

le = LabelEncoder()  # stays bound even when there is nothing to encode
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Values are cast to str first, so missing values become the category "nan".
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

print("✅ Encoded categorical columns:", categorical_cols)


✅ Encoded categorical columns: ['Age', 'Current_CGPA']


In [5]:
# The label column was dropped from `df` earlier, so it is read back from
# the same cleaned CSV. Rows are matched to `df` by position/index.
target_path = os.path.join(DATA_PROCESSED, "mhp_cleaned.csv")
target_df = pd.read_csv(target_path)
y = target_df["Depression_Label"]

# Group the questionnaire item columns by their name prefix.
pss_cols, gad_cols, phq_cols = (
    [name for name in df.columns if name.startswith(prefix)]
    for prefix in ("PSS", "GAD", "PHQ")
)

print(f"PSS features: {len(pss_cols)}, GAD features: {len(gad_cols)}, PHQ features: {len(phq_cols)}")


PSS features: 10, GAD features: 7, PHQ features: 9


In [6]:
def select_top_k_features(X, y, feature_group, k=5):
    """Return the ``k`` columns of ``feature_group`` with the highest ANOVA F-score.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the columns named in ``feature_group``.
    y : array-like
        Class labels aligned row-for-row with ``X``.
    feature_group : sequence of str
        Candidate column names to rank.
    k : int, default 5
        Number of columns to keep; capped at ``len(feature_group)``.

    Returns
    -------
    list of str
        Selected column names, in the order they appear in ``feature_group``.
        Empty when ``feature_group`` is empty.
    """
    feature_group = list(feature_group)
    if not feature_group:
        # Nothing to rank; SelectKBest cannot be fitted on zero columns.
        return []

    selector = SelectKBest(score_func=f_classif, k=min(k, len(feature_group)))
    # Fit on the frame passed in. The previous version read the global `df`
    # here and silently ignored the `X` argument.
    selector.fit(X[feature_group], y)
    return [feature_group[i] for i in selector.get_support(indices=True)]


# Rank features on the training rows only, so held-out rows cannot influence
# which features are chosen. The split arguments below (test_size, stratify,
# random_state) deliberately mirror the train/test split performed in the
# next cell, which makes both calls pick the same rows; keep them in sync.
row_positions = np.arange(len(df))
train_positions, _ = train_test_split(
    row_positions, test_size=0.2, stratify=y, random_state=42
)
X_select = df.iloc[train_positions]
y_select = y.iloc[train_positions]

top_pss = select_top_k_features(X_select, y_select, pss_cols, k=5)
top_gad = select_top_k_features(X_select, y_select, gad_cols, k=5)
top_phq = select_top_k_features(X_select, y_select, phq_cols, k=5)

selected_features = top_pss + top_gad + top_phq

print("✅ Top 5 features from each group:")
print("PSS:", top_pss)
print("GAD:", top_gad)
print("PHQ:", top_phq)
print("\nFinal selected features (15 total):", selected_features)


✅ Top 5 features from each group:
PSS: ['PSS1', 'PSS2', 'PSS3', 'PSS4', 'PSS10']
GAD: ['GAD1', 'GAD4', 'GAD5', 'GAD6', 'GAD7']
PHQ: ['PHQ2', 'PHQ3', 'PHQ4', 'PHQ6', 'PHQ7']

Final selected features (15 total): ['PSS1', 'PSS2', 'PSS3', 'PSS4', 'PSS10', 'GAD1', 'GAD4', 'GAD5', 'GAD6', 'GAD7', 'PHQ2', 'PHQ3', 'PHQ4', 'PHQ6', 'PHQ7']


In [7]:
X = df[selected_features]

# Stratified 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both parts.
# The scaled values go into brand-new DataFrames (same index and column
# names) instead of being assigned back into the frames returned by the
# split: those derive from a slice of `df`, so writing into them triggers
# pandas' SettingWithCopyWarning.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=selected_features,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=selected_features,
    index=X_test.index,
)

print(f"✅ Data ready. Train: {X_train.shape}, Test: {X_test.shape}")


✅ Data ready. Train: (1622, 15), Test: (406, 15)


In [8]:
MODELS_DIR = os.path.join(BASE_DIR, "models")
os.makedirs(MODELS_DIR, exist_ok=True)

# Feature list: one selected column name per line.
feature_path = os.path.join(MODELS_DIR, "feature_list.txt")
with open(feature_path, "w") as fh:
    fh.writelines(f"{feat}\n" for feat in selected_features)

# Train/test tables: feature columns followed by the label column.
target_name = "Depression_Label"
train_df, test_df = (
    pd.concat([features, labels.rename(target_name)], axis=1)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

train_path, test_path = (
    os.path.join(DATA_PROCESSED, filename)
    for filename in ("train_data.csv", "test_data.csv")
)

# Written without the index column.
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"📁 Saved:\n- Feature list → {feature_path}\n- Train → {train_path}\n- Test → {test_path}")


📁 Saved:
- Feature list → d:\Study\CSE299\Depression Assessment System\models\feature_list.txt
- Train → d:\Study\CSE299\Depression Assessment System\data\processed\train_data.csv
- Test → d:\Study\CSE299\Depression Assessment System\data\processed\test_data.csv


In [9]:
# Closing recap of the steps performed in this notebook.
summary_text = """
✅ Automated Feature Selection Summary:
1. Removed target leakage.
2. Encoded categorical variables.
3. Automatically picked top 5 features each from PSS, GAD, and PHQ using ANOVA F-test.
4. Scaled and split data.
5. Saved final train/test datasets and feature list.
Next step → 04_modeling_baselines.ipynb
"""
print(summary_text)



✅ Automated Feature Selection Summary:
1. Removed target leakage.
2. Encoded categorical variables.
3. Automatically picked top 5 features each from PSS, GAD, and PHQ using ANOVA F-test.
4. Scaled and split data.
5. Saved final train/test datasets and feature list.
Next step → 04_modeling_baselines.ipynb

