# Data Preprocessing (NLP)

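This notebook converts the processed tabular dataset `mhp_processed_tabular.csv` into three text-based datasets: one built from all features, one without the PSS items, and one without the GAD items.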

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import textwrap

# Project root: the grandparent of the current working directory.
BASE_DIR = Path.cwd().parents[1]

# Input: the processed tabular dataset.
DATA_IN = BASE_DIR.joinpath("data", "processed", "tabular", "mhp_processed_tabular.csv")

# Output folder for the generated text datasets (created on first run).
OUT_DIR = BASE_DIR.joinpath("data", "processed", "text")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One output file per feature configuration.
OUT_ALL = OUT_DIR.joinpath("mhp_processed_text1.csv")      # all features
OUT_NO_PSS = OUT_DIR.joinpath("mhp_processed_text2.csv")   # PSS items left out
OUT_NO_GAD = OUT_DIR.joinpath("mhp_processed_text3.csv")   # GAD items left out

# Seed used wherever the notebook samples rows, so previews are reproducible.
RANDOM_SEED = 42

## Load and inspect the processed dataset

In [2]:
# Load the processed tabular dataset from DATA_IN and print a quick overview.
df = pd.read_csv(DATA_IN)
print("Loaded:", DATA_IN)
print("Shape:", df.shape)
print("\nColumns:")
# Only the first 40 column names are listed to keep the output short.
for i,c in enumerate(df.columns[:40]):
    print(f"{i+1:02d}. {c}")

print("\nSample column types:")
display(df.dtypes.value_counts())

print("\nMissing values per column (top 10):")
display(df.isna().sum().sort_values(ascending=False).head(10))

# Accept the target column spelled without a space and normalise it to
# "Depression Label". Reassigning (instead of inplace=True) keeps this cell
# safe to re-run and avoids mutating a frame that was already displayed.
if "Depression Label" not in df.columns and "DepressionLabel" in df.columns:
    df = df.rename(columns={"DepressionLabel":"Depression Label"})

# Fail early with a message that names the file that was actually read.
if "Depression Label" not in df.columns:
    raise KeyError(
        f"Depression Label column not found. Please check that {DATA_IN.name} "
        "contains the final categorical target."
    )

print("\nDepression Label distribution:")
display(df["Depression Label"].value_counts(dropna=False))

Loaded: d:\Programming\Projects\Depression Severity Assessment\data\processed\tabular\mhp_processed_tabular.csv
Shape: (2022, 34)

Columns:
01. Age
02. Gender
03. University
04. Department
05. Year
06. CGPA
07. Scholarship
08. PSS1
09. PSS2
10. PSS3
11. PSS4
12. PSS5
13. PSS6
14. PSS7
15. PSS8
16. PSS9
17. PSS10
18. GAD1
19. GAD2
20. GAD3
21. GAD4
22. GAD5
23. GAD6
24. GAD7
25. PHQ1
26. PHQ2
27. PHQ3
28. PHQ4
29. PHQ5
30. PHQ6
31. PHQ7
32. PHQ8
33. PHQ9
34. Depression Label

Sample column types:


int64     26
object     8
Name: count, dtype: int64


Missing values per column (top 10):


Age            0
Gender         0
University     0
Department     0
Year           0
CGPA           0
Scholarship    0
PSS1           0
PSS2           0
PSS3           0
dtype: int64


Depression Label distribution:


Depression Label
Moderately Severe    511
Severe               504
Moderate             455
Mild                 411
Minimal              141
Name: count, dtype: int64

## Detect PSS, GAD, PHQ and demographics

In [3]:
cols = list(df.columns)

# Demographic columns are matched by name; if any expected name is absent,
# fall back to the first seven columns of the dataframe.
demog_expected = ["Age", "Gender", "University", "Department", "Year", "CGPA", "Scholarship"]
demog = [name for name in cols if name in demog_expected]
if len(demog) < len(demog_expected):
    demog = cols[:len(demog_expected)]


def _columns_with_prefix(prefix):
    """Return the columns in `cols` whose upper-cased name starts with `prefix`."""
    return [name for name in cols if name.upper().startswith(prefix)]


# Questionnaire item columns are detected by their (case-insensitive) prefix.
pss_cols = _columns_with_prefix("PSS")
gad_cols = _columns_with_prefix("GAD")
phq_cols = _columns_with_prefix("PHQ")

print("Demographic columns used:", demog)
print("Detected PSS columns:", pss_cols)
print("Detected GAD columns:", gad_cols)
print("Detected PHQ columns:", phq_cols)

# Show up to ten distinct non-missing values per item so the answer scale
# of each questionnaire can be checked by eye.
for scale_name, scale_cols in (("PSS", pss_cols), ("GAD", gad_cols), ("PHQ", phq_cols)):
    print(f"\nUnique values ({scale_name}):")
    for c in scale_cols:
        print(c, "→", sorted(df[c].dropna().unique())[:10])

Demographic columns used: ['Age', 'Gender', 'University', 'Department', 'Year', 'CGPA', 'Scholarship']
Detected PSS columns: ['PSS1', 'PSS2', 'PSS3', 'PSS4', 'PSS5', 'PSS6', 'PSS7', 'PSS8', 'PSS9', 'PSS10']
Detected GAD columns: ['GAD1', 'GAD2', 'GAD3', 'GAD4', 'GAD5', 'GAD6', 'GAD7']
Detected PHQ columns: ['PHQ1', 'PHQ2', 'PHQ3', 'PHQ4', 'PHQ5', 'PHQ6', 'PHQ7', 'PHQ8', 'PHQ9']

Unique values (PSS):
PSS1 → [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
PSS2 → [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
PSS3 → [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
PSS4 → [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
PSS5 → [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
PSS6 → [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
PSS7 → [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
PSS8 → [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4)]
PSS9 → [np.in

## Build mapping dictionaries based on actual value ranges

In [4]:
def build_pss_map(pss_values):
    """Choose the PSS answer-code → phrase mapping that fits the observed codes.

    Parameters
    ----------
    pss_values : pandas.Series
        Answer codes; missing entries are dropped and the rest cast to int.

    Returns
    -------
    dict
        A five-level mapping (0-4) when the code 4 occurs in the data,
        otherwise a four-level mapping (0-3) in which 3 means "Very Often".
    """
    observed = {int(code) for code in pss_values.dropna().astype(int).unique()}

    five_level = {0:"Never", 1:"Almost Never", 2:"Sometimes", 3:"Fairly Often", 4:"Very Often"}
    four_level = {0:"Never", 1:"Almost Never", 2:"Sometimes", 3:"Very Often"}

    return five_level if 4 in observed else four_level

# Infer the PSS answer scale from ALL PSS columns rather than only the first
# one: if the top code (4) happened to be absent from the first item but
# present in another, the shorter mapping would be chosen and those answers
# would be rendered as the raw number instead of a phrase.
if len(pss_cols)>0:
    pss_map = build_pss_map(pd.Series(df[pss_cols].to_numpy().ravel()))
else:
    # No PSS items detected: nothing to map.
    pss_map = {}

# GAD and PHQ items share the same fixed 0-3 frequency wording.
gad_map = {0:"Not at all", 1:"Several days", 2:"More than half the days", 3:"Nearly every day"}
phq_map = {0:"Not at all", 1:"Several days", 2:"More than half the days", 3:"Nearly every day"}

print("PSS map used:", pss_map)
print("GAD map:", gad_map)
print("PHQ map:", phq_map)

PSS map used: {0: 'Never', 1: 'Almost Never', 2: 'Sometimes', 3: 'Fairly Often', 4: 'Very Often'}
GAD map: {0: 'Not at all', 1: 'Several days', 2: 'More than half the days', 3: 'Nearly every day'}
PHQ map: {0: 'Not at all', 1: 'Several days', 2: 'More than half the days', 3: 'Nearly every day'}


## Labeling questions

In [5]:
# Human-readable topic for each PSS item column, used when writing sentences.
PSS_LABELS = {
    "PSS1": "Emotional Response to Setbacks",
    "PSS2": "Sense of Control Over Academics",
    "PSS3": "Overall Academic Stress Level",
    "PSS4": "Confidence in Coping Abilities",
    "PSS5": "Problem-Solving Self-Efficacy",
    "PSS6": "Perception of Academic Progress",
    "PSS7": "Tolerance for Academic Frustration",
    "PSS8": "Academic Self-Confidence",
    "PSS9": "Frustration With Academic Results",
    "PSS10": "Sense of Academic Helplessness",
}

# Human-readable topic for each GAD item column, used when writing sentences.
GAD_LABELS = {
    "GAD1": "Feeling Nervous or On Edge",
    "GAD2": "Uncontrollable Worry",
    "GAD3": "Difficulty Relaxing",
    "GAD4": "Irritability Due to Anxiety",
    "GAD5": "Frequency of Excessive Worry",
    "GAD6": "Physical Symptoms of Anxiety",
    "GAD7": "Fear of Something Bad Happening",
}

# Human-readable topic for each PHQ item column, used when writing sentences.
PHQ_LABELS = {
    "PHQ1": "Loss of Interest",
    "PHQ2": "Low Mood or Hopelessness",
    "PHQ3": "Sleep Difficulties",
    "PHQ4": "Fatigue or Low Energy",
    "PHQ5": "Appetite or Weight Changes",
    "PHQ6": "Feelings of Worthlessness",
    "PHQ7": "Difficulty Concentrating",
    "PHQ8": "Psychomotor Changes",
    "PHQ9": "Suicidal Thoughts",
}

## Conversion function & build Student Information column

In [6]:
def describe_demographics(row, demog_cols):
    """Render a student's demographic fields as one or two English sentences.

    Parameters
    ----------
    row : pandas.Series or dict
        Any object exposing ``.get(key)``. The keys read are "Age", "Gender",
        "University", "Department", "Year" and "Scholarship". CGPA is not
        part of the generated text.
    demog_cols : list
        Kept for interface compatibility; it is not consulted.

    Returns
    -------
    str
        A comma-separated description sentence, followed by a scholarship
        sentence when that field is present. Fields that are absent, NaN/None
        or blank are skipped entirely, so no half-empty fragments (such as
        "studying at ") are produced. Returns "" when nothing is available.
    """
    # Scholarship values (compared case-insensitively) that count as "yes".
    # "1.0" covers a 0/1 flag that pandas upcast to float.
    affirmative = {"yes", "true", "1", "1.0", "y"}

    def _field(name):
        # Normalise to a stripped string; absent keys and NaN/None become "".
        # Going through str() also keeps .lower() safe for numeric values.
        value = row.get(name)
        if pd.isna(value):
            return ""
        return str(value).strip()

    age = _field("Age")
    gender = _field("Gender")
    uni = _field("University")
    dept = _field("Department")
    year = _field("Year")
    sch = _field("Scholarship")

    fragments = []
    if age:
        fragments.append(f"The student is around {age} years old")
    if gender:
        fragments.append(gender.lower())
    if uni:
        fragments.append(f"studying at {uni}")
    if dept:
        fragments.append(f"pursuing {dept} degree")
    if year:
        fragments.append(f"currently in their {year.lower()} year")

    sentences = []
    if fragments:
        sentences.append(", ".join(fragments) + ".")
    # Only make a scholarship statement when the field actually has a value;
    # an unknown status must not be reported as "do not have a scholarship".
    if sch:
        if sch.lower() in affirmative:
            sentences.append("They have a scholarship.")
        else:
            sentences.append("They do not have a scholarship.")
    return " ".join(sentences)

def pss_sentence(col, val):
    """Turn one PSS answer into a sentence; a missing answer gives ""."""
    if pd.isna(val):
        return ""
    code = int(val)
    # Unmapped codes fall back to the raw value, unknown columns to their name.
    frequency = pss_map[code] if code in pss_map else str(val)
    topic = PSS_LABELS.get(col, col)
    return f"For {topic}, they report {frequency.lower()}."

def gad_sentence(col, val):
    """Turn one GAD answer into a sentence; a missing answer gives ""."""
    if pd.isna(val):
        return ""
    code = int(val)
    # Unmapped codes fall back to the raw value, unknown columns to their name.
    frequency = gad_map[code] if code in gad_map else str(val)
    topic = GAD_LABELS.get(col, col)
    return f"For {topic}, they report {frequency.lower()}."

def phq_sentence(col, val):
    """Turn one PHQ answer into a sentence; a missing answer gives ""."""
    if pd.isna(val):
        return ""
    code = int(val)
    # Unmapped codes fall back to the raw value, unknown columns to their name.
    frequency = phq_map[code] if code in phq_map else str(val)
    topic = PHQ_LABELS.get(col, col)
    return f"For {topic}, they report {frequency.lower()}."

def build_student_text(row, include_pss=True, include_gad=True, include_phq=True):
    """Compose the full "Student Information" text for one dataframe row.

    The demographic description always comes first, followed by one sentence
    per PSS, GAD and PHQ item (in that order) for each group whose flag is
    true. Empty sentences are skipped and all whitespace runs are collapsed
    to single spaces.
    """
    sentences = [describe_demographics(row, demog)]

    # (enabled?, item columns, sentence builder) for each questionnaire.
    groups = (
        (include_pss, pss_cols, pss_sentence),
        (include_gad, gad_cols, gad_sentence),
        (include_phq, phq_cols, phq_sentence),
    )
    for enabled, item_cols, make_sentence in groups:
        if not enabled:
            continue
        sentences.extend(make_sentence(c, row.get(c, np.nan)) for c in item_cols)

    joined = " ".join(s for s in sentences if s).strip()
    return " ".join(joined.split())

# Preview the generated text on the first three rows before processing everything.
sample = df.iloc[:3].copy()
sample["Student Information"] = sample.apply(build_student_text, axis=1)
sample_text = sample.loc[:, ["Student Information", "Depression Label"]]
display(sample_text.head(3))

Unnamed: 0,Student Information,Depression Label
0,"The student is around 18-22 years old, female,...",Severe
1,"The student is around 18-22 years old, male, s...",Moderately Severe
2,"The student is around 18-22 years old, male, s...",Minimal


## Create and save the datasets

In [7]:
def _build_text_dataset(source_df, out_path, include_pss, include_gad, include_phq=True):
    """Build one text dataset variant and write it to `out_path` as CSV.

    A copy of `source_df` gets a "Student Information" column produced by
    `build_student_text` with the given include flags; the two-column frame
    ("Student Information", "Depression Label") is saved without the index.

    Returns
    -------
    tuple
        (copy of the source frame with the text column, saved two-column frame)
    """
    with_text = source_df.copy()
    with_text["Student Information"] = with_text.apply(
        lambda r: build_student_text(
            r, include_pss=include_pss, include_gad=include_gad, include_phq=include_phq
        ),
        axis=1,
    )
    text_only = with_text[["Student Information", "Depression Label"]].copy()
    text_only.to_csv(out_path, index=False)
    print("Saved:", out_path, "shape:", text_only.shape)
    return with_text, text_only


# The three variants differ only in which questionnaire is left out, so they
# share one helper instead of three pasted copies of the same pipeline.
print("Building full text column (all features)...")
df_all, df_text_all = _build_text_dataset(df, OUT_ALL, include_pss=True, include_gad=True)

print("Building text dataset WITHOUT PSS...")
df_nopss, df_text_nopss = _build_text_dataset(df, OUT_NO_PSS, include_pss=False, include_gad=True)

print("Building text dataset WITHOUT GAD...")
df_nogad, df_text_nogad = _build_text_dataset(df, OUT_NO_GAD, include_pss=True, include_gad=False)

Building full text column (all features)...
Saved: d:\Programming\Projects\Depression Severity Assessment\data\processed\text\mhp_processed_text1.csv shape: (2022, 2)
Building text dataset WITHOUT PSS...
Saved: d:\Programming\Projects\Depression Severity Assessment\data\processed\text\mhp_processed_text2.csv shape: (2022, 2)
Building text dataset WITHOUT GAD...
Saved: d:\Programming\Projects\Depression Severity Assessment\data\processed\text\mhp_processed_text3.csv shape: (2022, 2)


## Quick verification and summary stats

In [8]:
# Read every saved file back and show its shape, one seeded sample row and
# the label counts, so the three outputs can be compared side by side.
for p in (OUT_ALL, OUT_NO_PSS, OUT_NO_GAD):
    print("\nFile:", p)
    tmp = pd.read_csv(p)
    print("Shape:", tmp.shape)

    print("Sample row:")
    # The fixed seed picks the same row position in each file.
    sampled_row = tmp.sample(n=1, random_state=RANDOM_SEED).iloc[0]
    display(sampled_row.to_dict())

    print("Depression Label counts:")
    label_counts = tmp["Depression Label"].value_counts()
    display(label_counts)


File: d:\Programming\Projects\Depression Severity Assessment\data\processed\text\mhp_processed_text1.csv
Shape: (2022, 2)
Sample row:


{'Student Information': 'The student is around 23-26 years old, female, studying at DUET, pursuing Engineering degree, currently in their second year. They do not have a scholarship. For Emotional Response to Setbacks, they report never. For Sense of Control Over Academics, they report sometimes. For Overall Academic Stress Level, they report fairly often. For Confidence in Coping Abilities, they report sometimes. For Problem-Solving Self-Efficacy, they report almost never. For Perception of Academic Progress, they report sometimes. For Tolerance for Academic Frustration, they report sometimes. For Academic Self-Confidence, they report fairly often. For Frustration With Academic Results, they report fairly often. For Sense of Academic Helplessness, they report almost never. For Feeling Nervous or On Edge, they report several days. For Uncontrollable Worry, they report several days. For Difficulty Relaxing, they report nearly every day. For Irritability Due to Anxiety, they report more 

Depression Label counts:


Depression Label
Moderately Severe    511
Severe               504
Moderate             455
Mild                 411
Minimal              141
Name: count, dtype: int64


File: d:\Programming\Projects\Depression Severity Assessment\data\processed\text\mhp_processed_text2.csv
Shape: (2022, 2)
Sample row:


{'Student Information': 'The student is around 23-26 years old, female, studying at DUET, pursuing Engineering degree, currently in their second year. They do not have a scholarship. For Feeling Nervous or On Edge, they report several days. For Uncontrollable Worry, they report several days. For Difficulty Relaxing, they report nearly every day. For Irritability Due to Anxiety, they report more than half the days. For Frequency of Excessive Worry, they report several days. For Physical Symptoms of Anxiety, they report more than half the days. For Fear of Something Bad Happening, they report more than half the days. For Loss of Interest, they report several days. For Low Mood or Hopelessness, they report more than half the days. For Sleep Difficulties, they report more than half the days. For Fatigue or Low Energy, they report more than half the days. For Appetite or Weight Changes, they report more than half the days. For Feelings of Worthlessness, they report more than half the days. 

Depression Label counts:


Depression Label
Moderately Severe    511
Severe               504
Moderate             455
Mild                 411
Minimal              141
Name: count, dtype: int64


File: d:\Programming\Projects\Depression Severity Assessment\data\processed\text\mhp_processed_text3.csv
Shape: (2022, 2)
Sample row:


{'Student Information': 'The student is around 23-26 years old, female, studying at DUET, pursuing Engineering degree, currently in their second year. They do not have a scholarship. For Emotional Response to Setbacks, they report never. For Sense of Control Over Academics, they report sometimes. For Overall Academic Stress Level, they report fairly often. For Confidence in Coping Abilities, they report sometimes. For Problem-Solving Self-Efficacy, they report almost never. For Perception of Academic Progress, they report sometimes. For Tolerance for Academic Frustration, they report sometimes. For Academic Self-Confidence, they report fairly often. For Frustration With Academic Results, they report fairly often. For Sense of Academic Helplessness, they report almost never. For Loss of Interest, they report several days. For Low Mood or Hopelessness, they report more than half the days. For Sleep Difficulties, they report more than half the days. For Fatigue or Low Energy, they report 

Depression Label counts:


Depression Label
Moderately Severe    511
Severe               504
Moderate             455
Mild                 411
Minimal              141
Name: count, dtype: int64