### Model development based on demographic features and xgboost architecture

*Date:* 2025-06-19 <br> 
*Author:* salil apte  <br> 
*Version:* 1.0  <br> 
*Filename:* `03-demo-xgboost-model.ipynb`

This notebook trains and evaluates an [XGBoost](https://xgboost.readthedocs.io) model using only the five additional (likely demographic) features. The process is repeated 50 times with different training/validation splits, which are constructed so that no user appears in both the training and the validation set.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import xgboost as xgb
import matplotlib.pyplot as plt

In [None]:
# Load the training data
# Absolute locations of the two input files on the local drive.
demographics_path = r"E:\repos\vital-sign-estimation\data\processed\demographics.parquet"
labels_path = r"E:\repos\vital-sign-estimation\data\raw\train_labels.csv"

# Feature table (parquet) and label table (CSV).
df = pd.read_parquet(demographics_path)
labels = pd.read_csv(labels_path)

# Sanity check: print the feature table's dimensions and preview its first rows.
print(df.shape)
df.head()

In [None]:
# Step 1: Prepare features, target, and group labels
# Names of the five input columns: "features_0" through "features_4".
feature_names = [f"features_{k}" for k in range(5)]

# Feature matrix: the selected columns of `df` as a NumPy array.
X = df.loc[:, feature_names].to_numpy()

# Target array: the complete `labels` table as a NumPy array.
# NOTE(review): this assumes the rows of `labels` are in the same order as the
# rows of `df`, and that every column of `labels` is a target — confirm.
y = labels.to_numpy()

# Distinct values of the "id" column; the train/validation split is drawn
# over these ids rather than over individual rows.
unique_ids = df["id"].unique()

In [None]:
# Set the training parameters
# Random seed handed to XGBoost through the "seed" entry of its params dict.
seed = 1
# Fraction of the unique ids held out for validation in each repetition.
test_size = 0.2
# Number of train/validation repetitions to run.
n_bootstraps = 50

In [None]:
# Repeatedly split the data by id, train an XGBoost regressor on the training
# ids, and record MSE and MAPE on the held-out ids.

# Prepare arrays to store results (one entry per repetition)
mse_list = []
mape_list = []

# Set training parameters.
# These do not depend on the repetition, so they are built once up front.
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "max_depth": 3,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 1.0,
    "reg_lambda": 1.0,
    "seed": seed,
}

# Per-row id column, looked up once instead of on every repetition.
row_ids = df["id"]

for i in range(n_bootstraps):

    # Split the ids for training and testing.
    # random_state=i gives a different but reproducible split per repetition.
    train_ids, test_ids = train_test_split(unique_ids, test_size=test_size, random_state=i)

    # Row-level boolean masks; converted to plain NumPy arrays so that the
    # indexing of X and y below is purely positional.
    train_mask = row_ids.isin(train_ids).to_numpy()
    test_mask = row_ids.isin(test_ids).to_numpy()

    # Split the train and test dataset
    X_train, y_train = X[train_mask], y[train_mask]
    X_val, y_val = X[test_mask], y[test_mask]

    # Create DMatrix objects
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dval = xgb.DMatrix(data=X_val, label=y_val)

    # Train model with early stopping.
    # The last entry of `evals` ("val") is the one monitored for early stopping.
    # NOTE(review): the same validation fold is used for early stopping and for
    # the metrics below, so the reported scores are likely optimistic; consider
    # a separate hold-out for the stopping criterion.
    evals = [(dtrain, "train"), (dval, "val")]
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=300,
        evals=evals,
        early_stopping_rounds=20,
        verbose_eval=False,
    )

    # Predict and evaluate.
    # The native Booster.predict uses every boosted round by default, including
    # the rounds added after the best validation score; restrict the prediction
    # to the best iteration so that early stopping actually takes effect.
    y_pred = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    mse = mean_squared_error(y_val, y_pred)
    mape = mean_absolute_percentage_error(y_val, y_pred)

    print(f"XGBoost MSE: {mse:.2f}")
    print(f"XGBoost MAPE: {mape:.2f}")

    mse_list.append(mse)
    mape_list.append(mape)

In [None]:
# Plot boxplots for MSE and MAPE across all repetitions
# Take the repetition count from the collected results so that the titles stay
# correct if the number of repetitions is changed.
n_runs = len(mse_list)

plt.figure(figsize=(10, 4))

# Left panel: distribution of the mean squared error.
plt.subplot(1, 2, 1)
plt.boxplot(mse_list)
plt.title(f"MSE across {n_runs} bootstraps")
plt.ylabel("Mean Squared Error")

# Right panel: distribution of the mean absolute percentage error.
plt.subplot(1, 2, 2)
plt.boxplot(mape_list)
plt.title(f"MAPE across {n_runs} bootstraps")
plt.ylabel("Mean absolute percentage error")

plt.tight_layout()
plt.show()