In [None]:
import os

subject_id = 'S1'
file_path = f'PPG_FieldStudy/{subject_id}/{subject_id}.pkl'

# 1. Check File Size
file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
print(f"File Size: {file_size_mb:.2f} MB")

# 2. Check Memory (RAM)
import psutil # You might need to pip install this, or just skip this part
print(f"Available RAM: {psutil.virtual_memory().available / (1024**3):.2f} GB")

File Size: 1302.52 MB
Available RAM: 6.36 GB


In [7]:
import pickle
import pandas as pd

# Load one of the subject files
subject_id = 'S5'
file_path = f'PPG_FieldStudy/{subject_id}/{subject_id}.pkl' # file path

with open(file_path, 'rb') as f:
    data = pickle.load(f, encoding='latin1')

In [8]:
print("Available keys:", data.keys())

SAMPLING_RATE_IMU = 32   # 'wrist' 'ACC' is 32 Hz
SAMPLING_RATE_LABELS = 4 # 'activity' signal is 4 Hz

# Get the IMU data (accelerometer)
# This dataset provides wrist ACC at 32Hz
acc_data = data['signal']['wrist']['ACC']
print(f"ACC data shape (samples, axes): {acc_data.shape}")

# Get the activity labels
# 'activity' holds the activity IDs (0-8)
activity_labels_4hz = data['activity']
print(f"Activity labels shape: {activity_labels_4hz.shape}")

# This mapping is correct according to the readme
activity_mapping = {
    0: 'transient', 1: 'sitting', 2: 'stairs', 3: 'soccer',
    4: 'cycling', 5: 'driving', 6: 'lunch', 7: 'walking', 8: 'working' # Changed 'running' to 'working' to match readme
}

# --- IMPORTANT ---
# The readme says the 'activity' signal has a 4Hz sampling rate [cite: 111]
# The 'ACC' signal has a 32Hz sampling rate [cite: 22, 89]
# You will need to downsample or upsample to align them, just like in the previous dataset.

Available keys: dict_keys(['rpeaks', 'signal', 'label', 'activity', 'questionnaire', 'subject'])
ACC data shape (samples, axes): (297760, 3)
Activity labels shape: (37220, 1)


### Train Model A (Avticvity Classifier):
#### This is where we will use athlete datasets. PPG-DaLiA has a PPG + IMU sensor combination. As well, as the functions defined in feature_extraction.py
This model will take the features from the IMU sensor.
Process, and output the current activity being performed by the soldier

In [9]:
import joblib
from feature_extractor import process_imu, SAMPLING_RATE_IMU
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [10]:
# Upsample Labels to match ACC data (THE CRITICAL FIX) ---
# We need to map each 32Hz ACC sample to a 4Hz label
# Ratio = 4 / 32 = 0.125
ratio = SAMPLING_RATE_LABELS / SAMPLING_RATE_IMU

# For each sample in 'acc_data', find the corresponding index in the 'activity_labels_4hz' array
indices = np.floor(np.arange(len(acc_data)) * ratio).astype(int)

# Handle any potential out-of-bounds
indices = np.clip(indices, 0, len(activity_labels_4hz) - 1)

# Create the new, perfectly aligned label array
activity_labels = activity_labels_4hz[indices]

print(f"Upsampled Labels shape: {activity_labels.shape}") # This should now match acc_data!

Upsampled Labels shape: (297760, 1)


In [11]:
# Create Windows from PPG-DaLiA data
# Works similar to a simplified "sliding window"
WINDOW_SIZE_SEC = 5  
WINDOW_SAMPLES = WINDOW_SIZE_SEC * SAMPLING_RATE_IMU
STEP_SAMPLES = SAMPLING_RATE_IMU // 2 # 50% overlap

all_features = []
all_labels = []

# Filter out 'transient' (0) and other non-relevant activities 
target_labels = {1, 2, 4, 7} # sitting, stairs, cycling, walking
valid_indices = np.where(np.isin(activity_labels, list(target_labels)))[0]

acc_data_filtered = acc_data[valid_indices]
activity_labels_filtered = activity_labels[valid_indices]

print(f"Length of filtered data: {len(acc_data_filtered)}")

for i in range(0, len(acc_data_filtered) - WINDOW_SAMPLES, STEP_SAMPLES):
    window = acc_data_filtered[i : i + WINDOW_SAMPLES]
    #get most frequent label in the window
    window_labels = activity_labels_filtered[i : i + WINDOW_SAMPLES].flatten().astype(int)
    label = np.bincount(window_labels).argmax()    
    features = process_imu(window)
    all_features.append(features)
    all_labels.append(label)
    

Length of filtered data: 66752


  features[f'acc_{axis}_skew'] = skew(axis_data)
  features[f'acc_{axis}_kurt'] = kurtosis(axis_data)


In [12]:
X = pd.DataFrame(all_features)
y = np.array(all_labels)

# Clean up any NaNs (from empty windows, etc.)
X = X.fillna(0)

print(f"Created {len(X)} feature vectors.")
print(f"Class labels: {np.unique(y)}")

Created 4162 feature vectors.
Class labels: [1 2 4 7]


In [13]:
# Evaluate and Save
if len(X) > 0:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    model_A = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    model_A.fit(X_train, y_train)

    # Evaluate and Save 
    y_pred = model_A.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Model A (Activity) Accuracy: {acc * 100:.2f}%")

    # Save the trained model
    joblib.dump(model_A, 'activity_model.pkl')
    print("Successfully saved 'activity_model.pkl'")
else:
    print("No feature vectors were created. This subject might not have the 'target_labels'.")
    print("Try loading a different subject file (e.g., S2.pkl, S3.pkl, etc.)")

Model A (Activity) Accuracy: 99.60%
Successfully saved 'activity_model.pkl'


## Build the Trauma & Triage Pipeline (Model B)

In [17]:
#Find relevant patients & events 

import vitaldb
import pandas as pd
import numpy as np
from feature_extractor import extract_vitals_from_signals
import ssl

# This disables SSL certificate verification
ssl._create_default_https_context = ssl._create_unverified_context

#Find all cases that have the signal we need 
# We want ECG (from VITAL_ECG_II) and PPG (from VITAL_PLETH)
cases = vitaldb.find_cases(['VITAL_ECG_II', 'VITAL_PLETH'])
print(f"Found {len(cases)} cases with required signals (ECG, PPG).")

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>

Go through each case, find critical events, and extract the vital signs form the 60 seconds leading up to the events

In [None]:
all_vitals_data = []

# List of event names that we'll label as 'T1' (Triage group 1 - most urgent)
# Can add more event names from the VitalDB website
T1_EVENT_NAMES = ['HYPOTENSION', 'DESATURATION', 'TACHYCARDIA', 'BRADYCARDIA']

SAMPLING_RATE_ECG = 500  # VitalDB ECG is 500 Hz
SAMPLING_RATE_PPG = 100  # VitalDB PLETH is 100 Hz
# Ensure sampling rates on feature_extractor.py are the same as here

for case_id in cases:
    print(f"\nProcessing Case {case_id}")
    
    # Load the case data. This downloads the file.
    case = vitaldb.load_case(case_id, ['VITAL_ECG_II', 'VITAL_PLETH'], 1/SAMPLING_RATE_ECG)
    
    # Load the event log for this case
    event_df = vitaldb.load_events(case_id)
    
    # Find all 'T1' events
    t1_events = event_df[event_df['eventid'].isin(T1_EVENT_NAMES)]
    
    # 1. Get T1 (Immediate) Data
    for _, event in t1_events.iterrows():
        event_time_s = event['time']
        
        # Get the 60 seconds of data *before* the event
        start_s = event_time_s - 60
        end_s = event_time_s
        
        # Extract the raw signal windows
        try:
            ecg_window = case.loc[start_s:end_s, 'VITAL_ECG_II'].values
            ppg_window = case.loc[start_s:end_s, 'VITAL_PLETH'].values
            
            # Use our feature extractor (must update sample rates!)
            vitals = extract_vitals_from_signals(ecg_window, ppg_window)
            
            # Add our "trick" and "ground truth" labels
            vitals['activity_label'] = 'lying_still' 
            vitals['triage_label'] = 'T1_Immediate'
            
            all_vitals_data.append(vitals)
        except Exception as e:
            print(f"  Warning: Failed to process window for event {event['eventid']}: {e}")

    # 2. Get T3 (Minimal) Data
    # take 5 "stable" samples from the beginning of the surgery
    try:
        for i in range(5):
            start_s = i * 60
            end_s = (i + 1) * 60
            
            ecg_window = case.loc[start_s:end_s, 'VITAL_ECG_II'].values
            ppg_window = case.loc[start_s:end_s, 'VITAL_PLETH'].values
            
            vitals = extract_vitals_from_signals(ecg_window, ppg_window)
            
            vitals['activity_label'] = 'lying_still' 
            vitals['triage_label'] = 'T3_Minimal'
            
            all_vitals_data.append(vitals)
    except Exception as e:
        print(f"  Warning: Failed to get T3 data: {e}")

# 5. Save New Dataset
triage_df = pd.DataFrame(all_vitals_data)
triage_df = triage_df.dropna() # Drop any windows that failed processing
triage_df.to_csv('triage_dataset.csv', index=False)

print(f"\n success!")
print(f"Created 'triage_dataset.csv' with {len(triage_df)} data points.")
print(triage_df['triage_label'].value_counts())

Train Model 

In [None]:
# Train Model B (Triage Classifier)
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# 1. Load the dataset we created earlier
try:
    triage_df = pd.read_csv('triage_dataset.csv')
    print(f"Loaded {len(triage_df)} samples from triage_dataset.csv")
except FileNotFoundError:
    print("Error: 'triage_dataset.csv' not found. Make sure Step 5 finished successfully.")

# 2. Prepare Features (X) and Labels (y)
# Note: include 'activity_label' to provide context (e.g., "Running" vs "Lying Still")
features_list = ['ecg_hr', 'hrv_rmssd', 'breathing_rate_edr', 'ppg_hr', 'activity_label']

X = triage_df[features_list]
y = triage_df['triage_label']

# 3. Encode the 'activity_label' (Text -> Numbers)
# We need to turn "lying_still" into a number (like 0) for the model
le = LabelEncoder()
X['activity_label'] = le.fit_transform(X['activity_label'])

# Save this encoder - needed later for the final simulation.
joblib.dump(le, 'activity_encoder.pkl')
print("Saved 'activity_encoder.pkl'")

# 4. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# 5. Train the Model
print("Training Random Forest Model...")
model_B = RandomForestClassifier(n_estimators=100, random_state=42)
model_B.fit(X_train, y_train)

# 6. Evaluate
y_pred = model_B.predict(X_test)

print("\nClassification Report")
print(classification_report(y_test, y_pred))

# 7. Visualize Confusion Matrix
plt.figure(figsize=(6, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues',
            xticklabels=model_B.classes_, yticklabels=model_B.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Model B Confusion Matrix')
plt.show()

# 8. Save the Final Model
joblib.dump(model_B, 'triage_model.pkl')
print("Model saved as 'triage_model.pkl'")