In [77]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [78]:
df = pd.read_pickle("../../data/raw/df_resample_100ms.pkl")

In [79]:
df

Unnamed: 0_level_0,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,label,subject_id,trial,age,height,weight,gender
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1970-01-01 00:38:38.700,0.948777,-9.636166,0.002699,0.003818,0.016875,0.006643,BSC,1.0,1.0,32.0,180.0,85.0,M
1970-01-01 00:38:38.800,0.948993,-9.759188,0.087482,0.020693,0.027565,0.015669,BSC,1.0,1.0,32.0,180.0,85.0,M
1970-01-01 00:38:38.900,1.071514,-9.787465,-0.093610,0.168721,0.075747,0.037797,BSC,1.0,1.0,32.0,180.0,85.0,M
1970-01-01 00:38:39.000,1.135679,-9.754036,0.065878,0.339488,0.078970,0.040134,BSC,1.0,1.0,32.0,180.0,85.0,M
1970-01-01 00:38:39.100,1.126174,-9.443248,0.075006,0.594494,-0.012065,0.103557,BSC,1.0,1.0,32.0,180.0,85.0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 04:49:55.200,2.128992,14.084110,0.907692,-1.022787,0.162796,0.609964,WAL,67.0,1.0,23.0,180.0,67.0,M
1970-01-01 04:49:55.300,-1.563028,11.139874,1.061856,-2.417346,-0.699792,1.861551,WAL,67.0,1.0,23.0,180.0,67.0,M
1970-01-01 04:49:55.400,2.767679,8.869539,0.154884,-0.998184,0.612194,1.557859,WAL,67.0,1.0,23.0,180.0,67.0,M
1970-01-01 04:49:55.500,0.149554,5.565219,2.023148,0.629237,0.462074,-0.136162,WAL,67.0,1.0,23.0,180.0,67.0,M


In [80]:
from scipy.stats import skew, kurtosis, entropy
from scipy.signal import welch
import numpy as np

### Feature Extraction

In [81]:
def extract_features_from_windows(df, window_duration_s=1, overlap=0.5, sampling_rate_ms=100):
    """
    Slide window through each trial of each subject.
    Extract statistical features per window: mean, std, min, max.
    """
    window_size = int(window_duration_s * 1000 / sampling_rate_ms)
    step_size = int(window_size * (1 - overlap))
    
    features = []
    labels = []
    
    for subject in df['subject_id'].unique():
        subject_data = df[df['subject_id'] == subject]
        
        for trial in subject_data['trial'].unique():
            trial_data = subject_data[subject_data['trial'] == trial]
            trial_data = trial_data.sort_index()  # sort by timestamp

            signal_data = trial_data[['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']].values
            class_labels = trial_data['label'].values

            for start in range(0, len(signal_data) - window_size + 1, step_size):
                window = signal_data[start:start + window_size]
                window_labels = class_labels[start:start + window_size]
                label = window_labels[-1]  # Take the last label in the window  
                # statistical features
                
               # Extract stats
                mean_feat = window.mean(axis=0)
                std_feat = window.std(axis=0)
                min_feat = window.min(axis=0)
                max_feat = window.max(axis=0)
                skew_feat = skew(window, axis=0)
                kurt_feat = kurtosis(window, axis=0)
                minmax_diff = max_feat - min_feat
                
                # **Frequency-domain features**
                fft_feat = np.abs(np.fft.rfft(window, axis=0)).mean(axis=0)  # Average FFT magnitude
                psd_feat = np.array([welch(window[:, i], fs=1000/sampling_rate_ms)[1].mean() for i in range(window.shape[1])])  # PSD mean
                entropy_feat = entropy(np.abs(np.fft.rfft(window, axis=0)), axis=0)  # FFT entropy

                # **Concatenate all features**
                feat = np.concatenate([
                    mean_feat,
                    std_feat,
                    min_feat,
                    max_feat,
                    skew_feat,
                    kurt_feat,
                    minmax_diff,
                    fft_feat,
                    psd_feat,
                    entropy_feat
                ])
                
                features.append(feat)
                labels.append(label)
                
    return features, labels
    

### Train/Test Split

In [82]:
def split_data(X, y, test_size=0.2, random_state=42):
    """
    Perform stratified train-test split to preserve fall/non-fall proportion.
    """
    return train_test_split(X, y, test_size=test_size, stratify=y, random_state=random_state)

In [83]:
def train_model(X_train, y_train, model_type="random_forest", **kwargs):
    """
    Train a classifier. Supported types: 'random_forest', 'logistic', 'svm', 'mlp'
    """
    if model_type == "random_forest":
        model = RandomForestClassifier(n_estimators=100, random_state=42, **kwargs)
    elif model_type == "logistic":
        model = LogisticRegression(max_iter=1000, random_state=42, **kwargs)
    elif model_type == "svm":
        model = SVC(probability=True, random_state=42, **kwargs)
    elif model_type == "mlp":
        model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42, **kwargs)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")
    
    model.fit(X_train, y_train)
    return model

In [84]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

def evaluate_model(model, X_test, y_test, model_name=None, show_confusion_matrix=True):
    """
    Evaluate the model and print standard classification metrics.
    """
    if model_name:
        print(f"\n📈 Evaluation Results for Model: {model_name}")

    y_pred = model.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, digits=4))

    if show_confusion_matrix:
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

    try:
        y_proba = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_proba)
        print(f"\nROC AUC: {roc_auc:.4f}")
    except AttributeError:
        print("\nROC AUC not available (predict_proba missing for this model).")

In [85]:
from sklearn.preprocessing import LabelEncoder

def run_pipeline(df, sampling_rate_ms, window_duration_s=1, overlap=0.5, model_type="random_forest"):
    print(f"\n📊 Running pipeline for sampling rate = {sampling_rate_ms} ms | Model = {model_type}")

    # Extract features and labels
    X, y = extract_features_from_windows(
        df,
        window_duration_s=window_duration_s,
        overlap=overlap,
        sampling_rate_ms=sampling_rate_ms
    )

    # Label Encode y
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    print(f"\nTotal windows: {len(y)} | Unique Labels: {np.unique(y)}")

    # Split Data
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Train Model
    model = train_model(X_train, y_train, model_type=model_type)

    # Evaluate Model
    evaluate_model(model, X_test, y_test, model_name=model_type)

    return model, label_encoder  # Return encoder if needed for inverse transformation


In [86]:
for model_type in ["random_forest", "logistic", "svm", "mlp"]:
    run_pipeline(df, sampling_rate_ms=100, model_type=model_type)


📊 Running pipeline for sampling rate = 100 ms | Model = random_forest


  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,


  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,
  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, no


Total windows: 168850 | Unique Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]

📈 Evaluation Results for Model: random_forest

Classification Report:
              precision    recall  f1-score   support

           0     0.7623    0.4097    0.5330       227
           1     0.9000    0.3711    0.5255        97
           2     0.6824    0.5230    0.5922       608
           3     0.7054    0.5820    0.6378       646
           4     0.6975    0.4560    0.5515       182
           5     0.7143    0.3313    0.4527       166
           6     0.9276    0.9363    0.9319      2325
           7     0.9642    0.9590    0.9616      2246
           8     0.8814    0.9034    0.8923      2049
           9     0.7110    0.5599    0.6265       334
          10     0.7848    0.3147    0.4493       197
          11     0.8808    0.8764    0.8786      2411
          12     0.9178    0.9621    0.9394      9883
          13     0.8576    0.7340    0.7910      1222
          14     0.8672    0

ValueError: multi_class must be in ('ovo', 'ovr')