# Data Preprocessing

This notebook cleans and standardizes the **MHP dataset** for modelling.  

Steps performed:
1. Rename demographic columns and normalize their values
2. Rename and recode **PSS-10 (Stress)** items
3. Rename and recode **GAD-7 (Anxiety)** items
4. Rename and recode **PHQ-9 (Depression)** items
5. Create `Depression Value` (PHQ-9 sum) and categorical `Depression Label`
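6. Check for missing & duplicate values (missing values are imputed, duplicate rows are dropped)
7. Drop the derived `Depression Value` column
8. Save the processed dataset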

In [None]:
import pandas as pd
from pathlib import Path
import re

# The project root is taken to be the second ancestor of the current working
# directory; all data paths are derived from it.
BASE_DIR = Path.cwd().parents[1]
RAW_PATH = BASE_DIR.joinpath("data", "raw", "mhp_dataset.csv")
PROCESSED_PATH = BASE_DIR.joinpath("data", "processed", "tabular", "mhp_processed.csv")

# Read the raw CSV, report its dimensions, and preview the first rows.
df = pd.read_csv(RAW_PATH)
print("Shape before processing:", df.shape)
df.head()

## Demographic columns cleanup

In [None]:
# Target names for the first seven columns of the frame, in positional order.
new_demo_names = [
    "Age",
    "Gender",
    "University",
    "Department",
    "Year",
    "CGPA",
    "Scholarship",
]
demo_rename = {old: new for old, new in zip(df.columns[:7], new_demo_names)}
df = df.rename(columns=demo_rename)

# Both spellings of the opt-out answer are folded into "Other"; every value is
# then title-cased.
gender_recoded = df["Gender"].replace(
    dict.fromkeys(["Prefer not to say", "prefer not to say"], "Other")
)
df["Gender"] = gender_recoded.str.title()

def extract_initials(text):
    """Return a short identifier for a free-text name.

    - If ``text`` contains a parenthesised part, the stripped content of the
      first ``(...)`` group is returned.
    - Otherwise the first whitespace-delimited token is returned.
    - An empty or whitespace-only string is returned stripped (i.e. ``""``)
      instead of raising ``IndexError``.
    - Non-string values (e.g. NaN, None) are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    m = re.search(r"\(([^)]+)\)", text)
    if m:
        return m.group(1).strip()

    # str.split() with no arguments already ignores leading/trailing whitespace
    # and yields an empty list for blank input, which must not be indexed.
    tokens = text.split()
    return tokens[0] if tokens else text.strip()

df["University"] = df["University"].apply(extract_initials)


def _first_token(col):
    """Return the first whitespace-delimited token of each value in ``col``.

    Values are cast to ``str`` so non-string entries can be tokenised, but
    entries that were missing in ``col`` stay missing: a plain ``astype(str)``
    would turn NaN into the literal string "nan", which then looks like a real
    category and is no longer counted by ``isna()``.
    """
    return col.astype(str).str.split().str[0].where(col.notna())


df["Department"] = _first_token(df["Department"])

df["Year"] = _first_token(df["Year"])

# Collapse both waiver answers to "Yes"; a missing answer is treated as "No".
df["Scholarship"] = df["Scholarship"].replace({
    "Yes, full waiver": "Yes",
    "Yes, partial waiver": "Yes",
    "No waiver": "No"
}).fillna("No")

df[new_demo_names].head()

## PSS-10 (Stress) columns

In [None]:
# The ten columns at positions 7..16 are renamed to PSS1..PSS10.
pss_cols = df.columns[7:17]
pss_items = [f"PSS{k}" for k in range(1, 11)]
df = df.rename(columns=dict(zip(pss_cols, pss_items)))

# Answer label -> numeric item score.
pss_map = {
    "0 - Never": 0,
    "1 - Almost Never": 1,
    "2 - Sometimes": 2,
    "3 - Fairly Often": 3,
    "4 - Very Often": 4
}

# Recode every item; whatever is still non-numeric afterwards becomes NaN.
df[pss_items] = df[pss_items].apply(
    lambda answers: pd.to_numeric(answers.replace(pss_map), errors="coerce")
)

df[pss_items].head()

## GAD-7 (Anxiety) columns

In [None]:
# The seven columns at positions 17..23 are renamed to GAD1..GAD7.
gad_cols = df.columns[17:24]
gad_items = [f"GAD{k}" for k in range(1, 8)]
df = df.rename(columns={old: new for old, new in zip(gad_cols, gad_items)})

# Answer label -> numeric item score. Two wordings are listed for scores 1
# and 2; both map to the same value.
gad_map = {
    "0 - Not at all": 0,
    "1 - Several days (less than 15 days)": 1,
    "1 - Several days": 1,
    "2 - More than half the semester": 2,
    "2 - More than half the days": 2,
    "3 - Nearly every day": 3
}

# Recode each item in a single step; unmapped non-numeric answers become NaN.
for item in gad_items:
    df[item] = pd.to_numeric(df[item].replace(gad_map), errors="coerce")

df[gad_items].head()

## PHQ-9 (Depression) columns

In [None]:
# The nine columns at positions 24..32 are renamed to PHQ1..PHQ9.
phq_cols = df.columns[24:33]
phq_items = [f"PHQ{i+1}" for i in range(9)]
df.rename(columns=dict(zip(phq_cols, phq_items)), inplace=True)

# Answer label -> numeric item score.
phq_map = {
    "0 - Not at all": 0,
    "1 - Several days": 1,
    "2 - More than half the days": 2,
    "3 - Nearly every day": 3
}

# Recode each item; answers that are still non-numeric afterwards become NaN.
for c in phq_items:
    df[c] = df[c].replace(phq_map)
    df[c] = pd.to_numeric(df[c], errors="coerce")

# skipna=False: a row with any missing/unrecognised item gets a NaN total
# instead of a sum that silently counts the gap as 0. phq_label maps that NaN
# to None, so incomplete questionnaires show up as missing rather than being
# labelled from a partial score.
df["Depression Value"] = df[phq_items].sum(axis=1, skipna=False)

def phq_label(val):
    """Translate a PHQ-9 total into a severity label.

    Inclusive upper bounds: 4 -> "Minimal", 9 -> "Mild", 14 -> "Moderate",
    19 -> "Moderately Severe"; anything above 19 is "Severe".
    A missing value (as judged by ``pd.isna``) yields ``None``.
    """
    if pd.isna(val):
        return None

    bands = (
        (4, "Minimal"),
        (9, "Mild"),
        (14, "Moderate"),
        (19, "Moderately Severe"),
    )
    # First band whose upper bound is not exceeded wins; fall back to "Severe".
    return next((name for upper, name in bands if val <= upper), "Severe")

# Derive the categorical label from the numeric total, one value at a time.
df["Depression Label"] = df["Depression Value"].map(phq_label)

# Preview the nine items next to the total and its label.
display_cols = [*(f"PHQ{k}" for k in range(1, 10)), "Depression Value", "Depression Label"]
df[display_cols].head(10)

## Data Quality Checks — Missing & Duplicate Values

In [None]:
print("Dataset info before cleaning:\n")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

# --- Missing values -------------------------------------------------------
missing_counts = df.isna().sum()
missing_total = missing_counts.sum()

print("\nMissing values summary:")
print(missing_counts[missing_counts > 0].sort_values(ascending=False))

if missing_total > 0:
    print(f"\n⚠️ Found {missing_total} missing values. Handling them now...")

    # The target is never imputed: filling it with the most frequent class
    # would invent labels. Rows without a label are dropped instead.
    unlabeled = df["Depression Label"].isna()
    if unlabeled.any():
        df = df.loc[~unlabeled].reset_index(drop=True)
        print(f"Dropped {int(unlabeled.sum())} rows without a 'Depression Label'.")

    # "number" covers every numeric dtype, not only int64/float64.
    num_cols = df.select_dtypes(include="number").columns
    cat_cols = [
        c for c in df.select_dtypes(include=["object"]).columns
        if c != "Depression Label"
    ]

    def _fill_with_mode(col):
        """Fill missing entries with the column's first mode, or "Unknown" if it has none."""
        modes = col.mode()
        return col.fillna(modes.iloc[0] if not modes.empty else "Unknown")

    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].apply(lambda col: col.fillna(col.median()))
    if cat_cols:
        df[cat_cols] = df[cat_cols].apply(_fill_with_mode)

    print("✅ Missing values handled (numeric → median, categorical → mode).")
else:
    print("\n✅ No missing values found.")

# --- Duplicate rows -------------------------------------------------------
dup_count = df.duplicated().sum()
print(f"\nDuplicate rows found: {dup_count}")

if dup_count > 0:
    df = df.drop_duplicates().reset_index(drop=True)
    print(f"✅ Removed {dup_count} duplicate rows.")
else:
    print("✅ No duplicate rows found.")

# --- Summary --------------------------------------------------------------
print("\nAfter cleaning:")
print(f"Shape: {df.shape}")
print("\nDepression label distribution:")
print(df["Depression Label"].value_counts())

## Remove Derived Columns to Prevent Target Leakage

In [None]:
# "Depression Value" is the numeric total the label was derived from, so it is
# removed from the frame before saving.
derived_col = "Depression Value"

if derived_col not in df.columns:
    print("ℹ️ 'Depression Value' column already removed or not found.")
else:
    df = df.drop(columns=[derived_col])
    print("✅ 'Depression Value' column removed to prevent overfitting.")

remaining_cols = df.columns.tolist()
print(f"Remaining columns ({len(remaining_cols)}):")
print(remaining_cols)

## Save processed dataset

In [None]:
# Make sure the output directory exists, then write the frame without its index.
output_dir = PROCESSED_PATH.parent
output_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(PROCESSED_PATH, index=False)
print(f"Processed dataset saved to: {PROCESSED_PATH.resolve()}")