# Fall‑risk prediction on the gait analysis data

This notebook demonstrates how to load the combined gait dataset, preprocess it, handle class imbalance, train multiple models (Logistic Regression, Random Forest and XGBoost), and evaluate them on a hold‑out test set.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load dataset
df = pd.read_csv('combined_output.csv')

# Map labels to binary: 'F' (faller) -> 1, 'NF' (non-faller) -> 0.
# Series.map turns every value that is not a key of the mapping into NaN; left
# unchecked that only surfaces later as a confusing error in the stratified
# split, so fail here with the offending values instead.
y = df['Faller'].map({'F': 1, 'NF': 0})
unmapped = y.isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, 'Faller'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Faller' column: {bad_values}")
y = y.astype('int64')

# Drop ID and label, convert to numeric (non-numeric entries become NaN)
X = df.drop(columns=['ID', 'Faller']).apply(pd.to_numeric, errors='coerce')

# A column without a single numeric value has a NaN median, so the median fill
# below would leave it entirely NaN and estimators that reject NaN would fail.
# Such a column carries no information; drop it and say so.
no_numeric = X.isna().all()
if no_numeric.any():
    print('Dropping columns with no numeric values:', list(X.columns[no_numeric.to_numpy()]))
    X = X.loc[:, ~no_numeric.to_numpy()]

# Fill remaining missing values with the per-column median.
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split performed in a later cell, so test rows influence the
# imputation values. Consider imputing with training-set medians only.
X = X.fillna(X.median())
print('Dataset shape:', X.shape)
print('Class distribution:', y.value_counts())

In [None]:
# Split into train/test with stratification (20% held out, class proportions preserved)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Oversample the minority class in the training set only; the test set keeps
# its original class distribution.
train_df = X_train.copy()
train_df['label'] = y_train  # assignment aligns on the shared index
fallers = train_df[train_df['label'] == 1]
non_fallers = train_df[train_df['label'] == 0]

# Decide which class is the minority from the actual counts instead of assuming
# it is always label 1: resampling the larger class down to the size of the
# smaller one would discard data rather than balance upward.
if len(fallers) <= len(non_fallers):
    minority, majority = fallers, non_fallers
else:
    minority, majority = non_fallers, fallers

# Draw minority rows with replacement until both classes have the same size.
minority_over = resample(minority, replace=True, n_samples=len(majority), random_state=42)
train_bal = pd.concat([majority, minority_over])
X_train_bal = train_bal.drop(columns=['label'])
y_train_bal = train_bal['label']

print('Balanced training set shape:', X_train_bal.shape)
print('Balanced class distribution:', y_train_bal.value_counts())

In [None]:
# Logistic Regression
# Standardize the features: the scaler is fitted on the balanced training data
# and the same statistics are then applied to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_bal)
X_test_scaled = scaler.transform(X_test)
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train_bal)

y_pred_lr = log_reg.predict(X_test_scaled)
# Second probability column = positive class (label 1), used for ROC AUC.
y_prob_lr = log_reg.predict_proba(X_test_scaled)[:, 1]

# zero_division=0 on every ratio metric: an undefined metric is reported as 0
# without a warning, consistently across precision, recall and F1.
acc_lr = accuracy_score(y_test, y_pred_lr)
prec_lr = precision_score(y_test, y_pred_lr, zero_division=0)
rec_lr = recall_score(y_test, y_pred_lr, zero_division=0)
f1_lr = f1_score(y_test, y_pred_lr, zero_division=0)
auc_lr = roc_auc_score(y_test, y_prob_lr)

print('Logistic Regression:')
print('Accuracy:', acc_lr)
print('Precision:', prec_lr)
print('Recall:', rec_lr)
print('F1:', f1_lr)
print('ROC AUC:', auc_lr)

# Confusion matrix
# labels=[0, 1] pins the row/column order (and the 2x2 shape) so it always
# matches the hard-coded NF/F tick labels below.
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=[0, 1])
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted NF','Predicted F'], yticklabels=['True NF','True F'])
plt.title('Logistic Regression Confusion Matrix')
plt.show()

In [None]:
# Random Forest
# Trained on the unscaled, balanced training features; seeded for
# reproducible results.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_bal, y_train_bal)
y_pred_rf = rf.predict(X_test)
# Second probability column = positive class (label 1), used for ROC AUC.
y_prob_rf = rf.predict_proba(X_test)[:, 1]

# zero_division=0 on every ratio metric: an undefined metric is reported as 0
# without a warning, consistently across precision, recall and F1.
acc_rf = accuracy_score(y_test, y_pred_rf)
prec_rf = precision_score(y_test, y_pred_rf, zero_division=0)
rec_rf = recall_score(y_test, y_pred_rf, zero_division=0)
f1_rf = f1_score(y_test, y_pred_rf, zero_division=0)
auc_rf = roc_auc_score(y_test, y_prob_rf)

print('Random Forest:')
print('Accuracy:', acc_rf)
print('Precision:', prec_rf)
print('Recall:', rec_rf)
print('F1:', f1_rf)
print('ROC AUC:', auc_rf)

# labels=[0, 1] pins the row/column order (and the 2x2 shape) so it always
# matches the hard-coded NF/F tick labels below.
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted NF','Predicted F'], yticklabels=['True NF','True F'])
plt.title('Random Forest Confusion Matrix')
plt.show()

In [None]:
# XGBoost
# Gradient-boosted trees trained on the unscaled, balanced training features.
xgb_model = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgb_model.fit(X_train_bal, y_train_bal)
y_pred_xgb = xgb_model.predict(X_test)
# Second probability column = positive class (label 1), used for ROC AUC.
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

# zero_division=0 on every ratio metric: an undefined metric is reported as 0
# without a warning, consistently across precision, recall and F1.
acc_xgb = accuracy_score(y_test, y_pred_xgb)
prec_xgb = precision_score(y_test, y_pred_xgb, zero_division=0)
rec_xgb = recall_score(y_test, y_pred_xgb, zero_division=0)
f1_xgb = f1_score(y_test, y_pred_xgb, zero_division=0)
auc_xgb = roc_auc_score(y_test, y_prob_xgb)

print('XGBoost:')
print('Accuracy:', acc_xgb)
print('Precision:', prec_xgb)
print('Recall:', rec_xgb)
print('F1:', f1_xgb)
print('ROC AUC:', auc_xgb)

# labels=[0, 1] pins the row/column order (and the 2x2 shape) so it always
# matches the hard-coded NF/F tick labels below.
cm_xgb = confusion_matrix(y_test, y_pred_xgb, labels=[0, 1])
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted NF','Predicted F'], yticklabels=['True NF','True F'])
plt.title('XGBoost Confusion Matrix')
plt.show()

In [None]:
# Collect the test-set metrics of all three models into one comparison table:
# one row per model, one column per metric.
metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC']
metric_rows = [
    ('Logistic Regression', acc_lr, prec_lr, rec_lr, f1_lr, auc_lr),
    ('Random Forest', acc_rf, prec_rf, rec_rf, f1_rf, auc_rf),
    ('XGBoost', acc_xgb, prec_xgb, rec_xgb, f1_xgb, auc_xgb),
]
results = pd.DataFrame(metric_rows, columns=metric_columns)
results