# Feature Engineering (NLP)

This notebook loads the cleaned text dataset (`mhp_processed_text1.csv`), uses the "Student Information" text column as the input for NLP-based model training and "Depression Label" as the classification target, and splits the dataset into stratified training and testing sets (80/20). The two splits are then saved as `train.csv` and `test.csv`.

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Lift pandas' display limits on cell width and column count so that
# previews of the text data are not truncated.
for _display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(_display_option, None)

## Load Dataset

In [None]:
# Relative path to the cleaned text dataset.
data_path = "../data/processed/mhp_processed_text1.csv"

# Read the CSV into `df` and preview its first rows as the cell output.
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

## Inspect Dataset Structure

In [None]:
# Gather the structural facts first, then report them in one place:
# (rows, columns), the column names, and the null count per column.
frame_shape = df.shape
column_names = df.columns.to_list()
missing_per_column = df.isna().sum()

print("Dataset Shape:", frame_shape)
print("\nColumns:", column_names)
print("\nMissing Values:\n", missing_per_column)

## Verify Required Columns

In [None]:
# Columns the rest of the notebook reads from `df`.
required_columns = ["Student Information", "Depression Label"]

# Look up the first required column (in list order) that `df` lacks;
# None means every required column is present.
first_missing = next(
    (name for name in required_columns if name not in df.columns),
    None,
)
if first_missing is not None:
    raise ValueError(f"Required column missing: {first_missing}")

print("All required columns found.")

## Prepare Inputs and Target Variables

In [None]:
# Keep only rows where both the input text and the label are present.
# Rows with a missing value in either column would otherwise flow into the
# stratified split and into the saved train/test files.
model_df = df.dropna(subset=["Student Information", "Depression Label"])

n_dropped = len(df) - len(model_df)
if n_dropped:
    print(f"Dropped {n_dropped} row(s) with missing text or label.")

# Fail with a clear message instead of an IndexError on `.iloc[0]` below.
if model_df.empty:
    raise ValueError(
        "No rows have both 'Student Information' and 'Depression Label' present."
    )

# X: model input text; y: classification target.
X = model_df["Student Information"]
y = model_df["Depression Label"]

# Show one example pair as a sanity check.
print("Sample Text:\n", X.iloc[0])
print("\nSample Label:", y.iloc[0])

## Train/Test Split

In [None]:
# Stratified 80/20 split on the label; the fixed random_state makes the
# partition repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Report how many samples landed in each partition.
print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])

## Convert Splits Into DataFrames

In [None]:
# Recombine each split's text and label into a two-column frame so that it
# can be written out as a single CSV.
train_df = pd.DataFrame({
    "Student Information": X_train,
    "Depression Label": y_train
})

test_df = pd.DataFrame({
    "Student Information": X_test,
    "Depression Label": y_test
})

# Preview both frames. display() renders each one as its own table, whereas
# ending the cell with a bare tuple of frames shows a single plain-text
# repr of the tuple.
display(train_df.head(), test_df.head())

## Output Directory

In [None]:
# Directory that receives the train/test CSV files.
output_dir = "../data/processed/nlpfeatures1"

# exist_ok=True creates the directory (and any missing parents) in one call
# and is a no-op when it already exists. This avoids the check-then-create
# race of testing os.path.exists() before calling os.makedirs().
os.makedirs(output_dir, exist_ok=True)

print("Output directory confirmed:", output_dir)

## Save Train/Test Files

In [None]:
# Build both destination paths inside the output directory.
train_path, test_path = (
    os.path.join(output_dir, file_name) for file_name in ("train.csv", "test.csv")
)

# Write each split without the DataFrame index column.
for split_df, split_path in ((train_df, train_path), (test_df, test_path)):
    split_df.to_csv(split_path, index=False)

print("Saved:")
print("Train ->", train_path)
print("Test  ->", test_path)