# Feature Engineering for LightGBM Baseline

This notebook demonstrates how to engineer temporal, volatility, and derivative features for time series modeling using LightGBM.

In [7]:
# Install the Azure ML data-access packages into the environment of the running kernel.
# `%pip` (rather than `!pip`) is used so the install targets this kernel's interpreter.
# NOTE(review): no versions are pinned, so repeated runs may pull newer releases; consider pinning
# once a known-good combination has been established.
%pip install -q azure-ai-ml azure-identity azureml-fsspec mltable

# If this is the first time these libraries are installed (or a version was upgraded), restart the
# kernel before continuing so the freshly installed versions are the ones that get imported.

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opentelemetry-exporter-prometheus 0.56b0 requires opentelemetry-sdk~=1.35.0, but you have opentelemetry-sdk 1.36.0 which is incompatible.
mlflow-skinny 2.21.3 requires packaging<25, but you have packaging 25.0 which is incompatible.
jupyterlab-nvdashboard 0.13.0 requires jupyterlab>=4, but you have jupyterlab 3.6.8 which is incompatible.
jupyter-resource-usage 0.7.2 requires psutil~=5.6, but you have psutil 7.0.0 which is incompatible.
dask-sql 2024.5.0 requires dask[dataframe]>=2024.4.1, but you have dask 2023.2.0 which is incompatible.
dask-sql 2024.5.0 requires distributed>=2024.4.1, but you have distributed 2023.2.0 which is incompatible.
azureml-training-tabular 1.60.0 requires psutil<5.9.4,>=5.2.2, but you have psutil 7.0.0 which is incompatible.
azureml-training-tabular 1.60.0 requires scipy<1.11.0,>=1

In [8]:
# Configuration parameters (loaded from JSON if present; defaults fill in anything the file omits)
import json, random, os
from pathlib import Path

# Default value for every tunable this notebook reads. A params file only has to list the keys
# it wants to override.
DEFAULT_CONFIG = {
    'LAGS':[1,7,14],
    'ROLLING_WINDOWS':[7,14,30],
    'SLOPE_WINDOW':30,
    'SPIKE_Z_THRESHOLD':2.5,
    'MIN_HISTORY_DAYS':14,
    'SEED':42,
    'N_FOLDS':5,
    'LGB_PARAMS': {'objective':'regression','metric':['mae','rmse'],'learning_rate':0.03}
}


def load_config(params_path, defaults):
    """Return the notebook configuration as a dict.

    Parameters
    ----------
    params_path : str or Path
        Location of the JSON params file.
    defaults : dict
        Fallback values, keyed by config name.

    Returns
    -------
    dict
        ``defaults`` overlaid with the top-level keys found in the file. When the file does not
        exist, a copy of ``defaults`` is returned and a best-effort attempt is made to write it to
        ``params_path`` for reproducibility.

    Raises
    ------
    ValueError
        If the file exists but its JSON root is not an object.
    json.JSONDecodeError
        If the file exists but is not valid JSON.
    """
    params_path = Path(params_path)
    if params_path.exists():
        with open(params_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
        if not isinstance(loaded, dict):
            raise ValueError(f'{params_path} must contain a JSON object, got {type(loaded).__name__}')
        missing = [key for key in defaults if key not in loaded]
        if missing:
            # A partial file used to surface later as a KeyError (e.g. CONFIG['LGB_PARAMS']).
            print('Params file is missing keys; using defaults for:', missing)
        return {**defaults, **loaded}

    config = dict(defaults)
    # Optionally write default file for reproducibility; a read-only location must not be fatal.
    try:
        with open(params_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)
        print('Wrote default params to', params_path)
    except OSError as e:
        print('Could not write default params file:', e)
    return config


try:
    # In notebooks, __file__ may be undefined; fall back to current working directory
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path(os.getcwd())
    print('Note: __file__ undefined in notebook context; using CWD:', BASE_DIR)
params_path = BASE_DIR / 'featureengineering_params.json'
CONFIG = load_config(params_path, DEFAULT_CONFIG)
print('Loaded CONFIG keys:', list(CONFIG.keys()))

Note: __file__ undefined in notebook context; using CWD: /mnt/batch/tasks/shared/LS_root/mounts/clusters/rgknp1/code
Loaded CONFIG keys: ['LAGS', 'ROLLING_WINDOWS', 'SLOPE_WINDOW', 'SPIKE_Z_THRESHOLD', 'MIN_HISTORY_DAYS', 'SEED', 'N_FOLDS', 'LGB_PARAMS']


In [9]:
import pandas as pd
import numpy as np

In [10]:
# Reproducibility seeds
import numpy as np, random, os
SEED = CONFIG.get('SEED', 42)
random.seed(SEED)
np.random.seed(SEED)
try:
    import lightgbm as lgb
except ImportError as e:
    print('LightGBM not yet imported or available:', e)
else:
    import logging
    # register_logger needs an object exposing .info() and .warning(); a bare callable is rejected
    # with TypeError. A Logger with only a NullHandler and no propagation discards every message.
    lgb_silent_logger = logging.getLogger('lightgbm.silenced')
    if not lgb_silent_logger.handlers:
        lgb_silent_logger.addHandler(logging.NullHandler())
    lgb_silent_logger.propagate = False
    if hasattr(lgb, 'register_logger'):
        lgb.register_logger(lgb_silent_logger)
    else:
        # Older LightGBM releases have no logger hook; training output simply stays visible.
        print('lightgbm.register_logger unavailable in this version; LightGBM logs not silenced')
print('Seeds set to', SEED)

Seeds set to 42


## Load Data

In [None]:
# Load and flatten multidomain cognitive JSON dataset via Azure ML Data Asset (SDK v2) with credential fallbacks
# Official guidance: https://learn.microsoft.com/azure/machine-learning/tutorial-explore-data?view=azureml-api-2
# Auth strategies are attempted in order: DefaultAzureCredential -> AzureCliCredential -> InteractiveBrowserCredential.

import json
import pandas as pd
from pathlib import Path

import fsspec
from azure.identity import (
    DefaultAzureCredential,
    AzureCliCredential,
    InteractiveBrowserCredential
)
from azure.ai.ml import MLClient
from azure.core.exceptions import ClientAuthenticationError

# --- CONFIGURE: name & version of the registered data asset ---
DATA_ASSET_NAME = "multidomain_cognitive_dataset"  # adjust to actual asset name
DATA_ASSET_VERSION = "1"  # adjust version

SUBSCRIPTION_ID = "5944e6be-7b60-49c8-886b-307896de21f9"
RESOURCE_GROUP = "rahgupt-hackathon25"
WORKSPACE_NAME = "carehaven-ai-workspace"

# (label, zero-argument factory) pairs in the order they are tried.
#   - AzureCliCredential requires a prior `az login` on the compute.
#   - InteractiveBrowserCredential opens an auth flow and may not work in headless runs.
credential_factories = [
    ("DefaultAzureCredential", lambda: DefaultAzureCredential(exclude_interactive_browser_credential=True)),
    ("AzureCliCredential", AzureCliCredential),
    ("InteractiveBrowserCredential", InteractiveBrowserCredential),
]

credential_attempts = []  # one diagnostic line per strategy tried
ml_client = None
for cred_name, make_credential in credential_factories:
    try:
        cred = make_credential()
        candidate_client = MLClient(
            credential=cred,
            subscription_id=SUBSCRIPTION_ID,
            resource_group_name=RESOURCE_GROUP,
            workspace_name=WORKSPACE_NAME,
        )
        # simple call to ensure the credential actually works against the workspace
        candidate_client.workspaces.get(WORKSPACE_NAME)
    except Exception as e:
        # Deliberately broad: any failure means "try the next strategy"; the reason is recorded.
        credential_attempts.append(f"{cred_name}: {type(e).__name__}: {e}")
        continue
    ml_client = candidate_client
    credential_attempts.append(f"{cred_name}: SUCCESS")
    break

if ml_client is None:
    diag = "\n".join(credential_attempts)
    raise ClientAuthenticationError(
        message=(
            "All credential attempts failed.\n" + diag +
            "\nTroubleshooting steps:\n"
            "  1. If using compute instance, ensure you are signed in: open a terminal and run 'az login'.\n"
            "  2. If using managed identity, attach a User Assigned Managed Identity and grant 'AzureML Data Scientist' or Reader roles.\n"
            "  3. For service principal, set environment variables AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET and re-run.\n"
            "  4. As a last resort, enable InteractiveBrowserCredential (already attempted) ensuring UI login possible.\n"
        )
    )
else:
    # The loop stops at the first success, so the last recorded attempt is the successful one.
    print("Authentication succeeded via:", credential_attempts[-1])

# Retrieve the data asset metadata
try:
    data_asset = ml_client.data.get(name=DATA_ASSET_NAME, version=DATA_ASSET_VERSION)
    print(f"Retrieved data asset '{DATA_ASSET_NAME}' v{DATA_ASSET_VERSION}")
    print('Asset path URI:', data_asset.path)
except Exception as e:
    # Chain the original exception so the underlying traceback is not lost.
    raise RuntimeError(
        f"Failed to retrieve data asset {DATA_ASSET_NAME} v{DATA_ASSET_VERSION}: {e}\n" \
        "Confirm the asset exists under Assets > Data or update name/version."
    ) from e

# Determine if asset is a file or folder; handle JSON file(s).
fs, _, paths = fsspec.get_fs_token_paths(data_asset.path)
if len(paths) == 0:
    raise FileNotFoundError("Data asset path resolved to zero files. Verify the asset contents.")
if len(paths) > 1:
    print('Note: multiple files detected; concatenating all JSON arrays.')

# Every file must hold a JSON array of session records; the arrays are concatenated.
records_raw = []
for p in paths:
    with fs.open(p, 'r') as f:
        try:
            payload = json.load(f)
        except json.JSONDecodeError as jde:
            raise ValueError(f"File {p} is not valid JSON: {jde}") from jde
        if isinstance(payload, list):
            records_raw.extend(payload)
        else:
            raise ValueError(f"JSON root in {p} not a list; adapt loader for object-based format.")

print('Total session records loaded:', len(records_raw))
if not records_raw:
    raise ValueError('No session records found after reading asset files.')

# Flatten nested structure: each domain's metrics become '<domain>__<metric>' columns.
domains = ['attention','executive_function','memory','orientation','processing_speed','mood_behavior']
flat_rows = []
for rec in records_raw:
    base = {
        'device_id': rec.get('device_id'),
        'patient_id': rec.get('patient_id'),
        'session_date': rec.get('session_date')
    }
    for d in domains:
        # A missing or null domain contributes no columns for this record.
        dct = rec.get(d, {}) or {}
        for k,v in dct.items():
            base[f"{d}__{k}"] = v
    flat_rows.append(base)

df = pd.DataFrame(flat_rows)
if df.empty:
    raise ValueError('Resulting DataFrame is empty after flattening.')

# Date handling and ordering
df['session_date'] = pd.to_datetime(df['session_date'])
df = df.sort_values(['patient_id','session_date']).reset_index(drop=True)
print('DataFrame shape:', df.shape)
print('Preview:')
display(df.head(3))
print('Columns:', len(df.columns))

Found the config file in: /config.json
DefaultAzureCredential failed to retrieve a token from the included credentials.
Attempted credentials:
	EnvironmentCredential: EnvironmentCredential authentication unavailable. Environment variables are not fully configured.
Visit https://aka.ms/azsdk/python/identity/environmentcredential/troubleshoot to troubleshoot this issue.
	ManagedIdentityCredential: string indices must be integers
To mitigate this issue, please refer to the troubleshooting guidelines here at https://aka.ms/azsdk/python/identity/defaultazurecredential/troubleshoot.
DefaultAzureCredential failed to retrieve a token from the included credentials.
Attempted credentials:
	EnvironmentCredential: EnvironmentCredential authentication unavailable. Environment variables are not fully configured.
Visit https://aka.ms/azsdk/python/identity/environmentcredential/troubleshoot to troubleshoot this issue.
	ManagedIdentityCredential: string indices must be integers
To mitigate this issue, 

RuntimeError: Failed to retrieve data asset multidomain_cognitive_dataset v1: DefaultAzureCredential failed to retrieve a token from the included credentials.
Attempted credentials:
	EnvironmentCredential: EnvironmentCredential authentication unavailable. Environment variables are not fully configured.
Visit https://aka.ms/azsdk/python/identity/environmentcredential/troubleshoot to troubleshoot this issue.
	ManagedIdentityCredential: string indices must be integers
To mitigate this issue, please refer to the troubleshooting guidelines here at https://aka.ms/azsdk/python/identity/defaultazurecredential/troubleshoot.
Confirm it exists under Assets > Data in the workspace.

## Temporal Features

In [None]:
# Calendar and elapsed-time features derived from session_date
session_dates_by_patient = df.groupby('patient_id')['session_date']
# Whole days elapsed since each patient's earliest session
df['days_since_baseline'] = session_dates_by_patient.transform(
    lambda dates: (dates - dates.min()).dt.days
)

weekday = df['session_date'].dt.dayofweek  # Monday=0 ... Sunday=6
df['dayofweek'] = weekday
df['weekofyear'] = df['session_date'].dt.isocalendar().week.astype(int)
df['month'] = df['session_date'].dt.month
df['is_weekend'] = (weekday >= 5).astype(int)

# Cyclical encoding of the weekday on a 7-day period
weekday_angle = 2*np.pi*weekday/7
df['sin_dayofweek'] = np.sin(weekday_angle)
df['cos_dayofweek'] = np.cos(weekday_angle)
df.head(2)

## Target Construction (prerequisite for lag features)

In [None]:
# Construct composite cognitive_health_index if absent
target_col = 'cognitive_health_index'
if target_col not in df.columns:
    # Domain metrics feeding the composite. Columns are matched by prefix so that engineered
    # columns which merely contain a domain name elsewhere in their label are not picked up.
    component_prefixes = ('attention__','executive_function__','memory__','processing_speed__','mood_behavior__')
    # Only numeric/boolean metrics can be min-max scaled; text-valued metrics would break the
    # arithmetic below, so they are left out of the composite.
    component_cols = [
        c for c in df.columns
        if c.startswith(component_prefixes) and pd.api.types.is_numeric_dtype(df[c])
    ]
    if not component_cols:
        raise ValueError(
            f"Cannot build '{target_col}': no numeric domain metric columns found with prefixes {component_prefixes}."
        )
    comp_df = df[component_cols].copy()
    # Coerce booleans to int
    for c in comp_df.columns:
        if comp_df[c].dtype == bool:
            comp_df[c] = comp_df[c].astype(int)
    # Min-max scale across all patients (could be per-patient or robust scaled later)
    comp_min = comp_df.min()
    # Constant columns have zero range; dividing by 1 instead maps them to 0 rather than NaN.
    comp_range = (comp_df.max() - comp_min).replace(0,1)
    comp_scaled = (comp_df - comp_min)/comp_range
    # Weighted average: equal weights for now (missing components are skipped per row)
    df[target_col] = comp_scaled.mean(axis=1)
    # Slight shrink away from 0/1 for stability
    eps = 1e-4
    df[target_col] = df[target_col].clip(eps,1-eps)
print(df[[target_col]].head())

## Feature Engineering Utilities (lags, rolling volatility, ratios, slopes)

In [None]:
# Feature engineering utility functions
from typing import Sequence
def add_lags(data, group_col: str, date_col: str, features: Sequence[str], lags=(1, 7, 14)):
    """Append per-group lagged copies of the given columns.

    The frame is first ordered by ``group_col`` then ``date_col`` (producing a new frame, so the
    caller's object is not modified). For every feature and every lag ``k`` a column named
    ``"<feature>_lag<k>"`` is added holding the value ``k`` rows earlier within the same group.

    Returns the ordered frame with the lag columns attached.
    """
    ordered = data.sort_values([group_col, date_col])
    for feature in features:
        for step in lags:
            lagged = ordered.groupby(group_col)[feature].shift(step)
            ordered[f"{feature}_lag{step}"] = lagged
    return ordered

def add_rolling_stats(data, group_col: str, date_col: str, features: Sequence[str], windows=(7, 14, 30)):
    """Append per-group trailing rolling mean and standard deviation columns.

    The frame is ordered by ``group_col`` then ``date_col`` (a new frame is produced). For every
    feature ``f`` and window ``w`` two columns are added: ``roll_mean_<f>_<w>`` and
    ``roll_std_<f>_<w>``. Windows count rows within a group, include the current row, and need
    ``max(3, w // 2)`` observations before emitting a value, capped at ``w`` so that windows
    shorter than 3 rows are accepted instead of raising.

    Returns the ordered frame with the rolling columns attached.
    """
    data = data.sort_values([group_col, date_col])
    for f in features:
        g = data.groupby(group_col)[f]
        for w in windows:
            # pandas rejects min_periods > window, so never ask for more rows than the window holds.
            min_obs = min(w, max(3, int(w / 2)))
            # w / min_obs are bound as defaults so each lambda keeps its own values.
            data[f"roll_mean_{f}_{w}"] = g.transform(
                lambda s, w=w, min_obs=min_obs: s.rolling(w, min_periods=min_obs).mean()
            )
            data[f"roll_std_{f}_{w}"] = g.transform(
                lambda s, w=w, min_obs=min_obs: s.rolling(w, min_periods=min_obs).std()
            )
    return data

def add_ratios(df):
    """Add ratio features for whichever numerator/denominator column pairs are present.

    Each ratio is ``numerator / (denominator + epsilon)``; the small epsilon keeps a zero
    denominator from dividing by zero. Columns are written onto ``df`` itself, which is also
    returned. A ratio whose inputs are not both present is skipped.
    """
    # (output column, numerator column, denominator column, epsilon)
    ratio_specs = (
        ('memory_retention_ratio', 'memory__delayed_recall', 'memory__immediate_recall', 1e-6),
        ('attention_efficiency', 'attention__digit_span_max', 'attention__latency_sec', 1e-6),
        ('executive_fluency_efficiency', 'executive_function__verbal_fluency_words',
         'executive_function__avg_pause_ms', 1e-3),
    )
    for out_col, numerator, denominator, epsilon in ratio_specs:
        if numerator in df and denominator in df:
            df[out_col] = df[numerator] / (df[denominator] + epsilon)
    return df

def add_slopes(data, group_col: str, date_col: str, feature: str, window=30):
    """Append a trailing least-squares slope of ``feature`` within each group.

    The frame is ordered by ``group_col`` then ``date_col`` (a new frame is produced). For each
    row that has ``window`` rows of history in its group (itself included), the slope of the
    feature against row position over that window is stored in
    ``local_slope_<feature>_<window>``; earlier rows stay NaN. The regressor is the row position,
    not the elapsed time between dates.

    Returns the ordered frame with the slope column attached.
    """
    import numpy as np

    ordered = data.sort_values([group_col, date_col])

    def _trailing_slope(values):
        n_obs = len(values)
        positions = np.arange(n_obs)
        slopes = np.full(n_obs, np.nan)
        for end in range(window - 1, n_obs):
            start = end - window + 1
            window_pos = positions[start:end + 1]
            centred = window_pos - window_pos.mean()
            spread = (centred ** 2).sum()
            # Zero spread (e.g. a one-row window) leaves the slope undefined -> NaN.
            if spread != 0:
                window_vals = values.iloc[start:end + 1].values
                slopes[end] = (centred * window_vals).sum() / spread
        return slopes

    slope_col = f"local_slope_{feature}_{window}"
    ordered[slope_col] = ordered.groupby(group_col)[feature].transform(_trailing_slope)
    return ordered

def add_volatility_flags(df, base_feature: str, window=7, z_threshold=2.5):
    """Add a rolling z-score and an upward-spike indicator for ``base_feature``.

    Requires the columns ``roll_mean_<base_feature>_<window>`` and
    ``roll_std_<base_feature>_<window>`` to exist already; when either is absent the frame is
    returned untouched. Otherwise two columns are written onto ``df`` itself:
    ``spike_<base_feature>`` (deviation from the rolling mean in rolling-std units) and
    ``spike_flag_<base_feature>`` (1 where that score exceeds ``z_threshold``, else 0). Only
    positive deviations can be flagged.
    """
    mean_col = f"roll_mean_{base_feature}_{window}"
    std_col = f"roll_std_{base_feature}_{window}"
    if not (mean_col in df and std_col in df):
        return df

    score_col = f"spike_{base_feature}"
    # The small constant guards against a zero rolling std.
    df[score_col] = (df[base_feature] - df[mean_col]) / (df[std_col] + 1e-6)
    exceeds = df[score_col] > z_threshold
    df[f"spike_flag_{base_feature}"] = exceeds.astype(int)
    return df

# Confirmation message so the cell shows visible output once the helper definitions have run.
print('Utility functions defined')

## Apply Feature Pipeline (lag, volatility & derivative features)

In [None]:
# Apply feature engineering pipeline.
# Tunables are read from CONFIG; the fallbacks equal the values this cell previously hard-coded.
target = 'cognitive_health_index'
lags = tuple(CONFIG.get('LAGS', (1, 7, 14)))
rolling_windows = tuple(CONFIG.get('ROLLING_WINDOWS', (7, 14, 30)))
slope_window = CONFIG.get('SLOPE_WINDOW', 30)
spike_z_threshold = CONFIG.get('SPIKE_Z_THRESHOLD', 2.5)
min_history_days = CONFIG.get('MIN_HISTORY_DAYS', 14)

# Raw domain metrics only: matched by prefix, numeric, and not a lag column from a previous run
# of this cell (so re-running recomputes the same columns instead of building lags of lags).
domain_prefixes = ('attention__','executive_function__','memory__','processing_speed__','mood_behavior__')
lag_suffixes = tuple(f"_lag{L}" for L in lags)
numeric_features = [
    c for c in df.columns
    if c.startswith(domain_prefixes)
    and not c.endswith(lag_suffixes)
    and pd.api.types.is_numeric_dtype(df[c])
]
# NOTE(review): shifting a bool column is expected to yield object dtype once NaN is introduced;
# casting to a small int first keeps the lag columns numeric. Confirm against the pandas version in use.
bool_features = [c for c in numeric_features if df[c].dtype == bool]
if bool_features:
    df = df.astype({c: 'int8' for c in bool_features})

df = add_ratios(df)
# Recompute numeric list including ratios
extended_numeric = numeric_features + [c for c in ['memory_retention_ratio','attention_efficiency','executive_fluency_efficiency'] if c in df.columns]
df = add_lags(df,'patient_id','session_date', extended_numeric, lags=lags)
df = add_rolling_stats(df,'patient_id','session_date', extended_numeric, windows=rolling_windows)

# Volatility flags for a few sentinel features, based on the shortest configured rolling window
# (7 with the default configuration) so the required roll_mean/roll_std columns exist.
sentinel_feats = [f for f in ['processing_speed__avg_reaction_time_ms','memory__immediate_recall','executive_function__verbal_fluency_words'] if f in df.columns]
if rolling_windows:
    spike_window = min(rolling_windows)
    for sf in sentinel_feats:
        df = add_volatility_flags(df, sf, window=spike_window, z_threshold=spike_z_threshold)

df = add_slopes(df,'patient_id','session_date', target, window=slope_window)
# Cumulative decline from patient baseline (first session after ordering by date)
baseline_target = df.groupby('patient_id')[target].transform('first')
df['cumulative_decline'] = baseline_target - df[target]
# Drop early rows with insufficient history (optional)
df_filtered = df[df['days_since_baseline'] >= min_history_days].copy()
print('Original rows:', len(df), 'Filtered rows:', len(df_filtered))
df_filtered.head(3)

## Prepare for LightGBM

In [None]:
# Prepare dataset for LightGBM
id_cols = ['patient_id','device_id','session_date']
# Columns computed directly from the target must not be offered as predictors:
#   - 'cumulative_decline' is (patient baseline - target)
#   - 'local_slope_cognitive_health_index_<w>' is fitted on a window that includes the current target
target_derived_cols = [
    c for c in df_filtered.columns
    if c == 'cumulative_decline' or c.startswith('local_slope_cognitive_health_index_')
]
# NOTE(review): when the target is the composite built in this notebook, the raw domain metrics
# also determine it; decide whether same-session metrics should remain as features.
drop_cols = id_cols + ['cognitive_health_index'] + target_derived_cols
candidate_features = df_filtered.drop(columns=drop_cols, errors='ignore')
# LightGBM accepts numeric, boolean and categorical columns; anything else (e.g. free text) is skipped.
X = candidate_features.select_dtypes(include=['number', 'bool', 'category'])
feature_cols = list(X.columns)
skipped_cols = [c for c in candidate_features.columns if c not in feature_cols]
if skipped_cols:
    print('Skipped non-numeric columns:', skipped_cols)
if target_derived_cols:
    print('Excluded target-derived columns:', target_derived_cols)
y = df_filtered['cognitive_health_index']
print('Feature count:', len(feature_cols))
print('X shape:', X.shape, 'y shape:', y.shape)
print('Sample features:', feature_cols[:15])

In [None]:
# Sanity checks on the engineered dataset: missingness, target distribution, and a small preview
feature_missing_rate = df_filtered[feature_cols].isna().mean()
missing_summary = feature_missing_rate.sort_values(ascending=False).head(10)
print('Top 10 missing feature proportions:\n', missing_summary)

print('Describe target:')
print(y.describe())

print('First 3 rows of engineered features:')
preview_cols = id_cols + ['cognitive_health_index'] + feature_cols[:5]
display(df_filtered[preview_cols].head(3))

In [None]:
# Persist the individual fold models produced by the "LightGBM Grouped CV Training" cell.
# (No stacking/averaging happens here: the list of per-fold boosters is pickled as-is.)
import pickle
# __file__ is undefined inside a notebook kernel; reuse BASE_DIR from the configuration cell when
# it exists, otherwise fall back to the current working directory.
artifact_dir = Path(globals().get('BASE_DIR', Path.cwd()))
model_path = artifact_dir / 'lgbm_groupcv_models.pkl'
if not globals().get('models'):
    # `models` is only created by the training cell; on a top-to-bottom run it may not exist yet.
    print('No trained fold models found; run the "LightGBM Grouped CV Training" cell, then re-run this cell.')
else:
    # NOTE: only unpickle this file from a trusted location; pickle can execute code on load.
    with open(model_path,'wb') as f:
        pickle.dump(models, f)
    print('Saved individual fold models to', model_path)

In [None]:
# LightGBM Grouped CV Training
# Folds are split by patient so that no patient appears in both the training and validation part.
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math, json

NUM_BOOST_ROUND = 4000        # upper bound on boosting rounds; early stopping normally ends sooner
EARLY_STOPPING_ROUNDS = 150   # rounds without validation improvement before stopping

groups = df_filtered['patient_id'].values
n_patients = pd.Series(groups).nunique()
if n_patients < 2:
    raise ValueError(f'Grouped CV needs at least 2 distinct patients, found {n_patients}.')
N_FOLDS = CONFIG.get('N_FOLDS',5)
if N_FOLDS > n_patients:
    # GroupKFold cannot create more folds than there are groups.
    print(f'N_FOLDS={N_FOLDS} exceeds number of patients ({n_patients}); using {n_patients} folds.')
    N_FOLDS = n_patients

# Seed and quiet logging are defaults only; anything set in CONFIG['LGB_PARAMS'] wins.
params = {'seed': CONFIG.get('SEED', 42), 'verbosity': -1, **CONFIG['LGB_PARAMS']}
fold_mae = []
fold_rmse = []
models = []
gkf = GroupKFold(n_splits=N_FOLDS)
for fold,(tr_idx, va_idx) in enumerate(gkf.split(X, y, groups=groups),1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    lgb_valid = lgb.Dataset(X_va, label=y_va, reference=lgb_train)
    # Early stopping and log suppression are passed as callbacks: the early_stopping_rounds= and
    # verbose_eval= keyword arguments of lgb.train were removed in LightGBM 4.x.
    booster = lgb.train(
        params, lgb_train, num_boost_round=NUM_BOOST_ROUND,
        valid_sets=[lgb_train, lgb_valid], valid_names=['train','valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )
    pred = booster.predict(X_va, num_iteration=booster.best_iteration)
    mae = mean_absolute_error(y_va, pred)
    rmse = math.sqrt(mean_squared_error(y_va, pred))
    fold_mae.append(mae); fold_rmse.append(rmse)
    models.append(booster)
    print(f'Fold {fold}: MAE={mae:.4f} RMSE={rmse:.4f} BestIter={booster.best_iteration}')
print('Overall MAE mean±std:', f'{np.mean(fold_mae):.4f} ± {np.std(fold_mae):.4f}')
print('Overall RMSE mean±std:', f'{np.mean(fold_rmse):.4f} ± {np.std(fold_rmse):.4f}')
# Feature importance (gain), summed over the fold models
importances = {}
for m in models:
    for feat, imp in zip(m.feature_name(), m.feature_importance(importance_type='gain')):
        importances[feat] = importances.get(feat,0) + imp
sorted_feats = sorted(importances.items(), key=lambda x: x[1], reverse=True)[:40]
print('Top 20 features:')
for f,i in sorted_feats[:20]:
    print(f'{f}: {i:.1f}')

## Save Engineered Features

In [None]:
# Export engineered features
# __file__ is undefined inside a notebook kernel; reuse BASE_DIR from the configuration cell when
# it exists, otherwise fall back to the current working directory.
output_path = Path(globals().get('BASE_DIR', Path.cwd())) / 'engineered_cognitive_features.csv'
df_filtered.to_csv(output_path, index=False)
print('Saved engineered features to', output_path)

In [None]:
# Persist feature list artifact (the column names, in order, that make up the model input X)
# __file__ is undefined inside a notebook kernel; reuse BASE_DIR from the configuration cell when
# it exists, otherwise fall back to the current working directory.
feature_list_path = Path(globals().get('BASE_DIR', Path.cwd())) / 'feature_list.json'
with open(feature_list_path, 'w', encoding='utf-8') as f:
    json.dump(list(feature_cols), f, indent=2)
print('Saved feature list to', feature_list_path)
print('Saved feature list to', feature_list_path)