### Model development based on demographic features, pyPPG features, and xgboost architecture

*Date:* 2025-06-19 <br> 
*Author:* salil apte  <br> 
*Version:* 1.0  <br> 
*Filename:* `04-ppg-demo-xgboost-model.ipynb`

This notebook contains the training and evaluation of an [XGBoost](https://xgboost.readthedocs.io) model using the five demographic features and the [pyPPG](https://pyppg.readthedocs.io) features. The process is repeated 50 times with different training/validation splits, which are designed so that no user appears in both the training and the validation set.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed
from tqdm import tqdm

In [None]:
# Read the processed demographic table, the processed PPG feature table,
# and the raw training labels from disk.
df_demo = pd.read_parquet(r"E:\repos\vital-sign-estimation\data\processed\demographics.parquet")
df_ppg = pd.read_parquet(r"E:\repos\vital-sign-estimation\data\processed\features.parquet")
labels = pd.read_csv(r"E:\repos\vital-sign-estimation\data\raw\train_labels.csv")

# Place both tables side by side and remove the "id" column from the combined
# feature table; df_demo itself is left untouched and keeps its "id" column.
df = pd.concat([df_demo, df_ppg], axis=1).drop(columns=["id"])

# Quick sanity check of the combined feature table
print(df.shape)
df.head()

In [None]:
# Distinct user ids (used to build user-level splits), followed by the
# feature table and the labels converted to NumPy arrays.
unique_ids = df_demo["id"].unique()
X, y = df.to_numpy(), labels.to_numpy()

In [None]:
# Number of repeated train/validation runs and the fraction of user ids
# held out for validation in each run.
n_bootstraps = 50
test_size = 0.2

# Booster settings handed to xgb.train
params = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    max_depth=5,
    eta=0.03,
    subsample=0.75,
    colsample_bytree=0.7,
    reg_alpha=3.0,
    reg_lambda=2.0,
    seed=1,
    # NOTE(review): "gpu_hist" is deprecated from XGBoost 2.0 onwards in
    # favour of tree_method="hist" together with device="cuda" - confirm
    # against the installed XGBoost version before changing.
    tree_method="gpu_hist",
)

In [None]:
# Define the training run
def training_run(i):
    """Train and evaluate one XGBoost model on a user-disjoint split.

    The unique user ids are split into a training and a validation group
    using ``i`` as the random state, so every row of a given user ends up in
    exactly one of the two sets. A booster is trained with early stopping on
    the validation set and then scored on that same validation set.

    Reads the notebook-level globals ``unique_ids``, ``test_size``,
    ``df_demo``, ``X``, ``y`` and ``params``.

    Note that the validation set is used both for early stopping and for the
    reported metrics, so the metrics are optimistic with respect to truly
    unseen data.

    Args:
        i: Integer used as ``random_state`` for the id split; also returned
            in the result under the key ``"seed"``.

    Returns:
        dict with keys ``"seed"`` (the value of ``i``), ``"mse"`` and
        ``"mape"`` (both computed on the validation rows).
    """
    # Split the ids (not the rows) so that no user is in both sets
    train_ids, test_ids = train_test_split(unique_ids, test_size=test_size, random_state=i)

    # Row-level boolean masks; converted to NumPy so the array indexing below
    # is purely positional and does not depend on the pandas index.
    train_mask = df_demo["id"].isin(train_ids).to_numpy()
    test_mask = df_demo["id"].isin(test_ids).to_numpy()

    # Split the train and validation rows
    X_train, y_train = X[train_mask], y[train_mask]
    X_val, y_val = X[test_mask], y[test_mask]

    # Create DMatrix objects
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dval = xgb.DMatrix(data=X_val, label=y_val)

    # Early stopping monitors the last entry of `evals`, i.e. the validation set
    evals = [(dtrain, "train"), (dval, "val")]

    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=400,
        evals=evals,
        early_stopping_rounds=30,
        verbose_eval=False,
    )

    # xgb.train returns the booster from the *last* round, which may be up to
    # `early_stopping_rounds` rounds past the best validation score. Restrict
    # prediction to the trees up to and including the best iteration so the
    # reported metrics correspond to the early-stopped model.
    y_pred = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    mse = mean_squared_error(y_val, y_pred)
    mape = mean_absolute_percentage_error(y_val, y_pred)

    return {"seed": i, "mse": mse, "mape": mape}

In [None]:
# Dispatch one training run per seed (0 .. n_bootstraps - 1) across all
# available workers; tqdm wraps the seed iterator to show dispatch progress.
worker_pool = Parallel(n_jobs=-1)
results = worker_pool(
    delayed(training_run)(seed) for seed in tqdm(range(n_bootstraps))
)

In [None]:
# Turn the list of per-run result dicts into a table and print its summary
# statistics.
results_df = pd.DataFrame(results)
print(results_df.describe())

# Box plot of the two error metrics across all runs
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=results_df[["mse", "mape"]], ax=ax)
ax.set_title("XGBoost with PPG and demo features - 50 Bootstrap Runs")
plt.show()