# Data Preprocessing (NLP)

This notebook converts the tabular processed dataset `mhp_processed.csv` into text-based datasets.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import textwrap

# Directory holding both the tabular input and the generated text datasets.
OUT_DIR = Path("../data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Input: the processed tabular dataset.
DATA_IN = Path("../data/processed").joinpath("mhp_processed.csv")

# Outputs: one CSV per feature configuration written later in the notebook.
OUT_ALL = OUT_DIR.joinpath("mhp_processed_text.csv")      # all features
OUT_NO_PSS = OUT_DIR.joinpath("mhp_processed_text1.csv")  # PSS items left out
OUT_NO_GAD = OUT_DIR.joinpath("mhp_processed_text2.csv")  # GAD items left out

# Seed used when sampling rows for the final verification step.
RANDOM_SEED = 42

## Load and inspect the processed dataset

In [None]:
# Read the processed table and report its basic structure.
df = pd.read_csv(DATA_IN)
print("Loaded:", DATA_IN)
print("Shape:", df.shape)

# List at most the first 40 column names, numbered from 01.
print("\nColumns:")
for position, column in enumerate(df.columns[:40], start=1):
    print(f"{position:02d}. {column}")

print("\nSample column types:")
dtype_counts = df.dtypes.value_counts()
display(dtype_counts)

print("\nMissing values per column (top 10):")
missing_counts = df.isna().sum()
display(missing_counts.sort_values(ascending=False).head(10))

# Accept the unspaced spelling of the target column and normalise its name.
label_present = "Depression Label" in df.columns
if not label_present and "DepressionLabel" in df.columns:
    df = df.rename(columns={"DepressionLabel": "Depression Label"})

# Everything downstream needs the target column, so stop here if it is absent.
if "Depression Label" not in df.columns:
    raise KeyError("Depression Label column not found. Please check that mhp_processed.csv contains the final categorical target.")

print("\nDepression Label distribution:")
label_counts = df["Depression Label"].value_counts(dropna=False)
display(label_counts)

## Detect PSS, GAD, PHQ and demographics

In [None]:
cols = df.columns.tolist()

# Demographic columns are matched by exact name; if any expected name is
# missing, fall back to the first seven columns of the table.
demog_expected = ["Age", "Gender", "University", "Department", "Year", "CGPA", "Scholarship"]
demog = [name for name in cols if name in demog_expected]
if len(demog) < len(demog_expected):
    demog = cols[:7]


def _columns_with_prefix(prefix):
    """Return the columns whose upper-cased name starts with ``prefix``."""
    return [name for name in cols if name.upper().startswith(prefix)]


# Questionnaire items are detected by case-insensitive column-name prefix.
pss_cols = _columns_with_prefix("PSS")
gad_cols = _columns_with_prefix("GAD")
phq_cols = _columns_with_prefix("PHQ")

print("Demographic columns used:", demog)
print("Detected PSS columns:", pss_cols)
print("Detected GAD columns:", gad_cols)
print("Detected PHQ columns:", phq_cols)

# Show up to ten distinct non-missing values per item so the answer scale of
# each questionnaire can be checked by eye.
for scale_name, scale_cols in (("PSS", pss_cols), ("GAD", gad_cols), ("PHQ", phq_cols)):
    print(f"\nUnique values ({scale_name}):")
    for item in scale_cols:
        print(item, "→", sorted(df[item].dropna().unique())[:10])

## Build mapping dictionaries based on actual value ranges

In [None]:
def build_pss_map(pss_values):
    """Choose the PSS answer wording that matches the observed value range.

    Args:
        pss_values: pandas Series of PSS answers. Missing values are dropped
            and the remainder is cast with ``astype(int)``.

    Returns:
        dict: integer code -> answer phrase. A five-step map (0-4) is returned
        when the value 4 is observed; otherwise a four-step map (0-3).
    """
    observed = {int(v) for v in pss_values.dropna().astype(int).unique()}
    if 4 in observed:
        phrases = ["Never", "Almost Never", "Sometimes", "Fairly Often", "Very Often"]
    else:
        phrases = ["Never", "Almost Never", "Sometimes", "Very Often"]
    return dict(enumerate(phrases))

# Infer the PSS answer scale from ALL detected PSS columns rather than only the
# first one: a single item may never take the value 4 even though other items
# do, which would select the four-step map and leave 4 without a phrase.
if pss_cols:
    pss_map = build_pss_map(pd.concat([df[col] for col in pss_cols], ignore_index=True))
else:
    # No PSS items detected: nothing to map.
    pss_map = {}

# GAD and PHQ items share the same four-step frequency wording (codes 0-3).
gad_map = {0:"Not at all", 1:"Several days", 2:"More than half the days", 3:"Nearly every day"}
phq_map = {0:"Not at all", 1:"Several days", 2:"More than half the days", 3:"Nearly every day"}

print("PSS map used:", pss_map)
print("GAD map:", gad_map)
print("PHQ map:", phq_map)

## Labeling questions

In [None]:
# Short topic label for each PSS item, keyed by item name (PSS1..PSS10).
# pss_sentence() looks a column up here and falls back to the raw column name.
PSS_LABELS = {
    "PSS1": "Emotional Response to Setbacks",
    "PSS2": "Sense of Control Over Academics",
    "PSS3": "Overall Academic Stress Level",
    "PSS4": "Confidence in Coping Abilities",
    "PSS5": "Problem-Solving Self-Efficacy",
    "PSS6": "Perception of Academic Progress",
    "PSS7": "Tolerance for Academic Frustration",
    "PSS8": "Academic Self-Confidence",
    "PSS9": "Frustration With Academic Results",
    "PSS10": "Sense of Academic Helplessness",
}

# Short topic label for each GAD item, keyed by item name (GAD1..GAD7).
# gad_sentence() looks a column up here and falls back to the raw column name.
_GAD_KEYS = ("GAD1", "GAD2", "GAD3", "GAD4", "GAD5", "GAD6", "GAD7")
_GAD_TOPICS = (
    "Feeling Nervous or On Edge",
    "Uncontrollable Worry",
    "Difficulty Relaxing",
    "Irritability Due to Anxiety",
    "Frequency of Excessive Worry",
    "Physical Symptoms of Anxiety",
    "Fear of Something Bad Happening",
)
GAD_LABELS = dict(zip(_GAD_KEYS, _GAD_TOPICS))

# Short topic label for each PHQ item, keyed by item name (PHQ1..PHQ9).
# phq_sentence() looks a column up here and falls back to the raw column name.
_PHQ_TOPICS = (
    "Loss of Interest",
    "Low Mood or Hopelessness",
    "Sleep Difficulties",
    "Fatigue or Low Energy",
    "Appetite or Weight Changes",
    "Feelings of Worthlessness",
    "Difficulty Concentrating",
    "Psychomotor Changes",
    "Suicidal Thoughts",
)
PHQ_LABELS = {f"PHQ{idx + 1}": topic for idx, topic in enumerate(_PHQ_TOPICS)}

## Conversion function & build Student Information column

In [None]:
def describe_demographics(row, demog_cols):
    """Render one student's demographic fields as an English sentence.

    Args:
        row: Record exposing ``.get(key)`` (a pandas Series or a dict). The
            keys "Age", "Gender", "University", "Department", "Year" and
            "Scholarship" are read; "CGPA" is not part of the generated text.
        demog_cols: Unused; kept so existing callers keep working.

    Returns:
        str: Comma-separated demographic fragments ending in a period,
        followed by a scholarship sentence when the scholarship value is
        present. Fields that are absent, NaN/None or blank are skipped, and an
        empty string is returned when nothing is available.
    """
    def field(key):
        # A missing key, NaN/None, or a blank string all count as "no value";
        # str() lets numeric encodings (e.g. Year == 2) pass through safely.
        value = row.get(key)
        if pd.isna(value):
            return None
        text = str(value)
        return text if text.strip() else None

    age = field("Age")
    gender = field("Gender")
    uni = field("University")
    dept = field("Department")
    year = field("Year")
    sch = field("Scholarship")

    dem = []
    if age is not None:
        dem.append(f"The student is around {age} years old")
    if gender is not None:
        dem.append(gender.lower())
    if uni is not None:
        dem.append(f"studying at {uni}")
    if dept is not None:
        dem.append(f"pursuing {dept} degree")
    if year is not None:
        dem.append(f"currently in their {year.lower()} year")

    if sch is None:
        sch_txt = ""
    elif sch.strip().lower() in ("yes", "true", "1", "y", "1.0"):
        # "1.0" covers a 0/1 flag that pandas upcast to float.
        sch_txt = "have a scholarship"
    else:
        sch_txt = "do not have a scholarship"

    dem_text = ", ".join(dem)
    if sch_txt:
        # Avoid a dangling leading ". " when no demographic fragment exists.
        return f"{dem_text}. They {sch_txt}." if dem_text else f"They {sch_txt}."
    return f"{dem_text}." if dem_text else ""

def pss_sentence(col, val):
    """Turn one PSS answer into a sentence; a missing answer gives ""."""
    if pd.isna(val):
        return ""
    # Unknown codes fall back to the raw value, unknown columns to their name.
    answer = pss_map.get(int(val), str(val))
    topic = PSS_LABELS.get(col, col)
    return "For {}, they report {}.".format(topic, answer.lower())

def gad_sentence(col, val):
    """Turn one GAD answer into a sentence; a missing answer gives ""."""
    if not pd.isna(val):
        # Unknown codes fall back to the raw value, unknown columns to their name.
        frequency = gad_map.get(int(val), str(val))
        item_label = GAD_LABELS.get(col, col)
        return f"For {item_label}, they report {frequency.lower()}."
    return ""

def phq_sentence(col, val):
    """Turn one PHQ answer into a sentence; a missing answer gives ""."""
    missing = pd.isna(val)
    if missing:
        return ""
    # Unknown codes fall back to the raw value, unknown columns to their name.
    wording = phq_map.get(int(val), str(val))
    heading = PHQ_LABELS.get(col, col)
    return "".join(["For ", f"{heading}", ", they report ", wording.lower(), "."])

def build_student_text(row, include_pss=True, include_gad=True, include_phq=True):
    """Compose the full text description for one student record.

    Args:
        row: Record exposing ``.get`` (a DataFrame row when used with
            ``DataFrame.apply(..., axis=1)``).
        include_pss: Add one sentence per column in ``pss_cols``.
        include_gad: Add one sentence per column in ``gad_cols``.
        include_phq: Add one sentence per column in ``phq_cols``.

    Returns:
        str: The demographic sentence followed by the selected questionnaire
        sentences, with empty pieces dropped and whitespace collapsed to
        single spaces.
    """
    pieces = [describe_demographics(row, demog)]
    if include_pss:
        pieces.extend(pss_sentence(item, row.get(item, np.nan)) for item in pss_cols)
    if include_gad:
        pieces.extend(gad_sentence(item, row.get(item, np.nan)) for item in gad_cols)
    if include_phq:
        pieces.extend(phq_sentence(item, row.get(item, np.nan)) for item in phq_cols)

    joined = " ".join(filter(None, pieces)).strip()
    # Collapse any run of whitespace left over from the individual pieces.
    return " ".join(joined.split())

# Preview: build the text for the first three rows and show it next to the target.
sample = df.head(3).copy()
sample["Student Information"] = sample.apply(build_student_text, axis=1)
sample_text = sample.loc[:, ["Student Information", "Depression Label"]]
display(sample_text.head(3))

## Create and save the datasets

In [None]:
def _with_student_text(include_pss, include_gad, include_phq):
    """Return a copy of ``df`` with a freshly built "Student Information" column."""
    frame = df.copy()
    frame["Student Information"] = frame.apply(
        lambda r: build_student_text(
            r, include_pss=include_pss, include_gad=include_gad, include_phq=include_phq
        ),
        axis=1,
    )
    return frame


# Each saved dataset keeps only the generated text and the target label.
_TEXT_COLUMNS = ["Student Information", "Depression Label"]

# Variant 1: every questionnaire included.
print("Building full text column (all features)...")
df_all = _with_student_text(True, True, True)
df_text_all = df_all[_TEXT_COLUMNS].copy()
df_text_all.to_csv(OUT_ALL, index=False)
print("Saved:", OUT_ALL, "shape:", df_text_all.shape)

# Variant 2: PSS sentences left out.
print("Building text dataset WITHOUT PSS...")
df_nopss = _with_student_text(False, True, True)
df_text_nopss = df_nopss[_TEXT_COLUMNS].copy()
df_text_nopss.to_csv(OUT_NO_PSS, index=False)
print("Saved:", OUT_NO_PSS, "shape:", df_text_nopss.shape)

# Variant 3: GAD sentences left out.
print("Building text dataset WITHOUT GAD...")
df_nogad = _with_student_text(True, False, True)
df_text_nogad = df_nogad[_TEXT_COLUMNS].copy()
df_text_nogad.to_csv(OUT_NO_GAD, index=False)
print("Saved:", OUT_NO_GAD, "shape:", df_text_nogad.shape)

## Quick verification and summary stats

In [None]:
# Re-read every written file and show its shape, one seeded random row and the
# label distribution.
for out_path in (OUT_ALL, OUT_NO_PSS, OUT_NO_GAD):
    print("\nFile:", out_path)
    written = pd.read_csv(out_path)
    print("Shape:", written.shape)

    print("Sample row:")
    picked = written.sample(1, random_state=RANDOM_SEED)
    display(picked.iloc[0].to_dict())

    print("Depression Label counts:")
    label_counts = written["Depression Label"].value_counts()
    display(label_counts)