In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Set pandas to display all columns
pd.set_option('display.max_columns', None)

  from pandas.core import (


In [2]:
import os
# Show the kernel's working directory; relative paths used in this notebook
# (e.g. the "../data/raw/..." file loaded next) are resolved against it.
print(os.getcwd())

/Users/vega7unk/Documents/4th Sem DSI/ilab project/ilab-group-12-1-fall-detection/notebooks


In [3]:
# Load the sensor recordings (file name suggests data resampled to 100 ms —
# presumably produced by an earlier preprocessing step; confirm with the data owner).
# NOTE: pd.read_pickle unpickles arbitrary Python objects, so only load this
# file when it comes from a trusted source.
df_100ms = pd.read_pickle("../data/raw/df_resample_100ms.pkl")

In [4]:
# Preview the first rows to check which columns were loaded.
df_100ms.head()

Unnamed: 0_level_0,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,label,subject_id,trial,age,height,weight,gender
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1970-01-01 00:38:38.700,0.948777,-9.636166,0.002699,0.003818,0.016875,0.006643,BSC,1.0,1.0,32.0,180.0,85.0,M
1970-01-01 00:38:38.800,0.948993,-9.759188,0.087482,0.020693,0.027565,0.015669,BSC,1.0,1.0,32.0,180.0,85.0,M
1970-01-01 00:38:38.900,1.071514,-9.787465,-0.09361,0.168721,0.075747,0.037797,BSC,1.0,1.0,32.0,180.0,85.0,M
1970-01-01 00:38:39.000,1.135679,-9.754036,0.065878,0.339488,0.07897,0.040134,BSC,1.0,1.0,32.0,180.0,85.0,M
1970-01-01 00:38:39.100,1.126174,-9.443248,0.075006,0.594494,-0.012065,0.103557,BSC,1.0,1.0,32.0,180.0,85.0,M


In [5]:
def label_falls(df):
    """
    Return a copy of ``df`` with an integer ``binary_label`` column.

    Rows whose ``label`` is one of the fall codes ('BSC', 'FKL', 'SDL', 'FOL')
    get ``binary_label`` = 1; every other row, including rows with a missing
    label, gets 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra column; the input frame is not modified.
    """
    fall_labels = ['BSC', 'FKL', 'SDL', 'FOL']
    df = df.copy()
    # Vectorised membership test: avoids a Python-level call per row and
    # always yields an integer column, even when the frame is empty.
    df['binary_label'] = df['label'].isin(fall_labels).astype(int)
    return df


In [6]:
def extract_features_from_windows(df, window_duration_s=2.5, overlap=0.5, sampling_rate_ms=50):
    """
    Slide a fixed-length window through each trial of each subject and
    compute per-window statistical features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'subject_id', 'trial', 'binary_label' and the
        six signal columns 'acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y',
        'gyro_z'. Rows of a trial are ordered by sorting the index.
    window_duration_s : float
        Window length in seconds.
    overlap : float
        Fraction of the window shared with the next one; must be < 1.
    sampling_rate_ms : float
        Time between consecutive samples in milliseconds (used only to convert
        the window duration into a number of rows).

    Returns
    -------
    (features, labels) : tuple of numpy.ndarray
        ``features`` has shape (n_windows, 24): the per-channel mean, std, min
        and max, concatenated in that order (6 values each).
        ``labels`` has shape (n_windows,) and is 1 when any row inside the
        window has ``binary_label`` != 0, else 0.
        When no trial is long enough for a single window the arrays have
        shapes (0, 24) and (0,).

    Raises
    ------
    ValueError
        If the window would contain no samples or ``overlap`` is >= 1.
    """
    signal_columns = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']

    if not overlap < 1:
        raise ValueError(f"overlap must be < 1, got {overlap}")

    window_size = int(window_duration_s * 1000 / sampling_rate_ms)
    if window_size < 1:
        raise ValueError(
            f"window_duration_s={window_duration_s} with sampling_rate_ms={sampling_rate_ms} "
            "gives a window with no samples"
        )

    # A high overlap on a short window can truncate to a step of 0, which
    # range() rejects; always advance by at least one sample.
    step_size = max(1, int(window_size * (1 - overlap)))

    features = []
    labels = []

    for subject in df['subject_id'].unique():
        subject_data = df[df['subject_id'] == subject]

        for trial in subject_data['trial'].unique():
            trial_data = subject_data[subject_data['trial'] == trial]
            trial_data = trial_data.sort_index()  # sort by timestamp

            signal_data = trial_data[signal_columns].values
            binary_labels = trial_data['binary_label'].values

            # Only full windows are used; a trailing partial window is dropped.
            for start in range(0, len(signal_data) - window_size + 1, step_size):
                stop = start + window_size
                window = signal_data[start:stop]

                # statistical features, grouped by statistic then by channel
                features.append(np.concatenate([
                    window.mean(axis=0),
                    window.std(axis=0),
                    window.min(axis=0),
                    window.max(axis=0),
                ]))
                # 1 if a fall sample occurs anywhere in the window
                labels.append(int(np.any(binary_labels[start:stop])))

    if not features:
        # Keep a 2-D feature matrix so downstream code can rely on the column count.
        return np.empty((0, 4 * len(signal_columns))), np.empty((0,), dtype=int)

    return np.array(features), np.array(labels)

In [7]:
def train_model(X_train, y_train, model_type="mlp", **kwargs):
    """
    Build and fit a classifier of the requested type.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed straight to the model's ``fit``.
    model_type : str
        Only "mlp" (scikit-learn ``MLPClassifier``) is supported.
    **kwargs
        Extra keyword arguments for the model constructor. They take
        precedence over the defaults used here
        (``hidden_layer_sizes=(100,)``, ``max_iter=300``, ``random_state=42``).

    Returns
    -------
    The fitted model.

    Raises
    ------
    ValueError
        If ``model_type`` is not supported.
    """
    if model_type == "mlp":
        # Merge defaults with caller overrides; passing both as explicit
        # keywords would raise "got multiple values for keyword argument".
        params = {"hidden_layer_sizes": (100,), "max_iter": 300, "random_state": 42}
        params.update(kwargs)
        model = MLPClassifier(**params)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    model.fit(X_train, y_train)
    return model

In [8]:
def split_data(X, y, test_size=0.2, random_state=42):
    """
    Split features and labels into train and test parts.

    The split is stratified on ``y`` so both parts keep the same
    fall/non-fall ratio. Returns whatever ``train_test_split`` returns for
    the two inputs (X_train, X_test, y_train, y_test).
    """
    split_options = {
        "test_size": test_size,
        "stratify": y,
        "random_state": random_state,
    }
    return train_test_split(X, y, **split_options)

In [9]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

def evaluate_model(model, X_test, y_test, model_name=None, show_confusion_matrix=True):
    """
    Evaluate the model and print standard classification metrics.

    Prints the classification report, optionally the confusion matrix, and
    the ROC AUC computed from the second column of ``predict_proba``.

    Parameters
    ----------
    model
        Fitted estimator with ``predict``; ``predict_proba`` is optional.
    X_test, y_test
        Held-out features and true labels.
    model_name : str, optional
        When given, printed as a header above the metrics.
    show_confusion_matrix : bool
        Whether to print the confusion matrix.

    Returns
    -------
    None
        Results are printed only. If ROC AUC cannot be computed, a message
        explaining why is printed instead of the score.
    """
    if model_name:
        print(f"\n📈 Evaluation Results for Model: {model_name}")

    y_pred = model.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, digits=4))

    if show_confusion_matrix:
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

    # Guard only the probability lookup, so an AttributeError raised elsewhere
    # is not misreported as a missing predict_proba.
    try:
        y_proba = model.predict_proba(X_test)[:, 1]
    except AttributeError:
        print("\nROC AUC not available (predict_proba missing for this model).")
        return

    # The scorer can reject its input with ValueError (e.g. y_test holding a
    # single class); report that instead of failing after the other metrics
    # have already been printed.
    try:
        roc_auc = roc_auc_score(y_test, y_proba)
    except ValueError as exc:
        print(f"\nROC AUC not available ({exc}).")
        return

    print(f"\nROC AUC: {roc_auc:.4f}")

In [10]:
def run_pipeline(df, sampling_rate_ms, window_duration_s=2.5, overlap=0.5, model_type="mlp"):
    """
    Run labelling, windowed feature extraction, splitting, training and
    evaluation in one call, and return the fitted model.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sensor frame; passed to ``label_falls`` and then to
        ``extract_features_from_windows``.
    sampling_rate_ms : float
        Sample spacing of ``df`` in milliseconds.
    window_duration_s : float
        Window length in seconds.
    overlap : float
        Fractional overlap between consecutive windows.
    model_type : str
        Forwarded to ``train_model``. Defaults to "mlp", the only type
        ``train_model`` accepts; the previous default made every call that
        relied on it fail after feature extraction had already run.

    Returns
    -------
    The fitted model returned by ``train_model``.

    NOTE(review): windows overlap and ``split_data`` splits at window level,
    so neighbouring windows from one trial can end up on both sides of the
    split. The reported scores may therefore be optimistic — consider a
    subject-wise split.
    """
    print(f"\n📊 Running pipeline for sampling rate = {sampling_rate_ms} ms | Model = {model_type}")

    df = label_falls(df)

    X, y = extract_features_from_windows(
        df,
        window_duration_s=window_duration_s,
        overlap=overlap,
        sampling_rate_ms=sampling_rate_ms
    )

    print(f"\nTotal windows: {len(y)} | Fall: {np.sum(y)} | Non-Fall: {len(y) - np.sum(y)}")

    X_train, X_test, y_train, y_test = split_data(X, y)

    model = train_model(X_train, y_train, model_type=model_type)

    evaluate_model(model, X_test, y_test, model_name=model_type)

    return model

In [11]:
# Train and evaluate each listed model type on the 100 ms data
# (only "mlp" is listed). The model returned by run_pipeline is not kept here.
for model_type in ["mlp"]:
    run_pipeline(df_100ms, sampling_rate_ms=100, model_type=model_type)


📊 Running pipeline for sampling rate = 100 ms | Model = mlp

Total windows: 69983 | Fall: 3158 | Non-Fall: 66825

📈 Evaluation Results for Model: mlp

Classification Report:
              precision    recall  f1-score   support

           0     0.9878    0.9942    0.9910     13365
           1     0.8571    0.7405    0.7946       632

    accuracy                         0.9827     13997
   macro avg     0.9225    0.8673    0.8928     13997
weighted avg     0.9819    0.9827    0.9821     13997


Confusion Matrix:
[[13287    78]
 [  164   468]]

ROC AUC: 0.9830


In [None]:
import pickle

# Train the MLP once and persist it. The earlier version of this cell ran two
# alternative "options" back to back, which trained the same model twice and
# wrote the same file twice.
best_model = run_pipeline(df_100ms, sampling_rate_ms=100, model_type="mlp")

# Alias kept so code that refers to either name keeps working.
saved_model = best_model

# NOTE: pickle files can execute code when loaded; only load best_model.pkl
# from a trusted location.
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("Best model saved as best_model.pkl")
