# 40 â€“ Predicting Mental Health Risk from Mobile Sensing

In this notebook we:

1. Prepare features and labels from the mobile sensing dataset.
2. Train:
   - A regression model to predict `depression_score`.
   - A classification model to predict `high_risk` (binary).
3. Evaluate model performance with appropriate metrics.

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

base_dir = os.path.dirname(os.path.dirname(os.getcwd())) if 'Speech_12052025' in os.getcwd() else "/mnt/data/cardiff_ai_talk_runbook"
data_path = os.path.join(base_dir, "data", "raw", "mental_health_mobile_sensing_synthetic.csv")
df = pd.read_csv(data_path)

feature_cols = [
    "avg_daily_steps",
    "avg_daily_distance_km",
    "time_at_home_hours",
    "num_unique_locations",
    "calls_per_day",
    "texts_per_day",
    "avg_sleep_duration_hours",
    "sleep_onset_variability_hours",
    "screen_time_hours",
]

X = df[feature_cols]
y_reg = df["depression_score"]
y_clf = df["high_risk"]

X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(
    X, y_reg, y_clf, test_size=0.25, random_state=42, stratify=y_clf
)

X_train.shape, X_test.shape

((600, 9), (200, 9))

In [2]:
# Regression model: Gradient Boosting
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_reg_train)

y_reg_pred = gbr.predict(X_test)

print("=== Regression (depression_score) ===")
print("MAE :", mean_absolute_error(y_reg_test, y_reg_pred))
print("RMSE:", mean_squared_error(y_reg_test, y_reg_pred, squared=False))
print("R^2 :", r2_score(y_reg_test, y_reg_pred))

=== Regression (depression_score) ===
MAE : 1.7401392266180635
RMSE: 2.174028015164563
R^2 : 0.3775712433928381




In [3]:
# Classification model: Random Forest
rf_clf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
)
rf_clf.fit(X_train, y_clf_train)

y_clf_pred = rf_clf.predict(X_test)
y_clf_prob = rf_clf.predict_proba(X_test)[:, 1]

print("\n=== Classification (high_risk) ===")
print("Accuracy :", accuracy_score(y_clf_test, y_clf_pred))
print("Precision:", precision_score(y_clf_test, y_clf_pred))
print("Recall   :", recall_score(y_clf_test, y_clf_pred))
print("F1       :", f1_score(y_clf_test, y_clf_pred))
print("ROC-AUC  :", roc_auc_score(y_clf_test, y_clf_prob))

IndexError: index 1 is out of bounds for axis 1 with size 1

In [None]:
# Feature importances (classification model)
import matplotlib.pyplot as plt
import numpy as np

importances = rf_clf.feature_importances_
idx = np.argsort(importances)[::-1]

plt.bar(range(len(feature_cols)), importances[idx])
plt.xticks(range(len(feature_cols)), np.array(feature_cols)[idx], rotation=45, ha="right")
plt.ylabel("Importance")
plt.title("Random Forest Feature Importances (Mental Health)")
plt.tight_layout()
plt.show()