In [1]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
import joblib

# === Load and combine all .xlsx files ===
# glob() yields paths in arbitrary, OS-dependent order; sorting keeps the row
# order of the combined frame reproducible from run to run.
xlsx_files = sorted(glob('../data/raw/*.xlsx'))
if not xlsx_files:
    # Fail with an explicit message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError("No .xlsx files found in ../data/raw/")

dfs = [pd.read_excel(path) for path in xlsx_files]
# ignore_index=True renumbers rows 0..N-1 across all files.
df_combined = pd.concat(dfs, ignore_index=True)

print("=== Original Data Info ===")
print(df_combined['label'].value_counts())
print(f"Total samples: {len(df_combined)}")

# === SIMPLE WINDOWING FUNCTION ===
def create_simple_windows(df, window_size=75, stride=25):
    """
    Build sliding-window mean/std features for each (label, reps) group.

    Rows are sorted by ``time`` and grouped by ``(label, reps)``. Each group
    is cut into windows of ``window_size`` consecutive rows whose starts are
    ``stride`` rows apart; groups shorter than ``window_size`` contribute no
    windows. Every window yields one feature row laid out as
    ``[col0_mean, col0_std, col1_mean, col1_std, ...]`` over the 12 sensor
    columns, giving 24 features (12 sensors × 2 stats). The standard
    deviation is NumPy's default population std (ddof=0).

    Args:
        df: DataFrame with ``time``, ``label``, ``reps`` and the 12 sensor
            columns listed in ``feature_cols`` below.
        window_size: Number of rows per window; must be >= 1.
        stride: Number of rows between consecutive window starts; must be >= 1.

    Returns:
        Tuple ``(X, y)``: ``X`` is a float array of shape ``(n_windows, 24)``
        (``(0, 24)`` when no window could be formed) and ``y`` is an array
        holding the label of each window.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is smaller than 1.
        KeyError: If a required column is missing from ``df``.
    """
    feature_cols = ['p_ax','p_ay','p_az','p_gx','p_gy','p_gz',
                    'a_ax','a_ay','a_az','a_gx','a_gy','a_gz']

    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")

    windows_features = []
    windows_labels = []

    # Sort by time so rows inside each group are in chronological order.
    ordered = df.sort_values('time').reset_index(drop=True)

    # Process each (label, reps) group separately so that no window mixes
    # rows from two different groups.
    for (label, _reps), group in ordered.groupby(['label', 'reps']):
        # Convert once per group; slicing the array below is much cheaper
        # than repeated DataFrame.iloc slicing.
        signals = group[feature_cols].to_numpy(dtype=float)

        for start in range(0, len(signals) - window_size + 1, stride):
            window = signals[start:start + window_size]
            # column_stack pairs (mean, std) per sensor; ravel interleaves
            # them as mean, std, mean, std, ...
            stats = np.column_stack((window.mean(axis=0), window.std(axis=0)))
            windows_features.append(stats.ravel())
            windows_labels.append(label)

    if windows_features:
        X = np.vstack(windows_features)
    else:
        # Keep X two-dimensional so callers can still read X.shape[1].
        X = np.empty((0, 2 * len(feature_cols)))
    y = np.array(windows_labels)

    return X, y

# === Create windowed dataset ===
print("\n=== Creating Sliding Windows ===")
window_size = 75   # rows per window (1.5 seconds if sampled at 50Hz)
stride = 25        # rows between window starts; neighbours share 50 of 75 rows (~67% overlap)

X_windowed, y_windowed = create_simple_windows(df_combined, window_size, stride)

print(f"Windows created: {len(X_windowed)}")
print(f"Features per window: {X_windowed.shape[1]} (12 sensors × 2 stats)")
print("\nWindow distribution:")
window_counts = pd.Series(y_windowed).value_counts()
print(window_counts)

# === Split data ===
# Stratified 80/20 split at window level.
# NOTE(review): windows overlap (stride < window_size), so windows cut from
# the same recording can land on both sides of this random split; the test
# accuracy may therefore be optimistic. Consider a group-aware split.
split = train_test_split(
    X_windowed,
    y_windowed,
    test_size=0.2,
    random_state=42,
    stratify=y_windowed,
)
X_train, X_test, y_train, y_test = split

print(f"\nTraining samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

# === Train Random Forest ===
print("\n=== Training Model ===")
rf_params = {
    "n_estimators": 200,
    "max_depth": 20,
    "random_state": 42,   # fixed seed for reproducible forests
    "n_jobs": -1,         # use all available cores
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# === Evaluate ===
# Mean accuracy on the held-out windows.
accuracy = rf.score(X_test, y_test)
print(f"\n✅ Accuracy: {accuracy * 100:.1f}%")

y_pred = rf.predict(X_test)
report = classification_report(y_test, y_pred)
print("\n=== Classification Report ===")
print(report)

matrix = confusion_matrix(y_test, y_pred)
print("\n=== Confusion Matrix ===")
print(matrix)

# === Save model ===
# Persist the fitted forest to the current working directory.
joblib.dump(rf, "exercise_rf_windowed.pkl")
print("\n✅ Model saved as exercise_rf_windowed.pkl")

# === Model Info ===
print("\n=== Model Info ===")
print(f"Number of features: {X_windowed.shape[1]}")
print("Feature names:")
# Same column order as create_simple_windows: each sensor contributes
# a mean feature followed by a std feature.
feature_cols = [
    'p_ax', 'p_ay', 'p_az', 'p_gx', 'p_gy', 'p_gz',
    'a_ax', 'a_ay', 'a_az', 'a_gx', 'a_gy', 'a_gz',
]
for col in feature_cols:
    print(f"  - {col}_mean, {col}_std")

=== Original Data Info ===
label
squat           5371
pushup          4006
jumping_jack    3075
Name: count, dtype: int64
Total samples: 12452

=== Creating Sliding Windows ===
Windows created: 491
Features per window: 24 (12 sensors × 2 stats)

Window distribution:
squat           212
pushup          158
jumping_jack    121
Name: count, dtype: int64

Training samples: 392
Test samples: 99

=== Training Model ===

✅ Accuracy: 98.0%

=== Classification Report ===
              precision    recall  f1-score   support

jumping_jack       1.00      0.92      0.96        24
      pushup       0.97      1.00      0.98        32
       squat       0.98      1.00      0.99        43

    accuracy                           0.98        99
   macro avg       0.98      0.97      0.98        99
weighted avg       0.98      0.98      0.98        99


=== Confusion Matrix ===
[[22  1  1]
 [ 0 32  0]
 [ 0  0 43]]

✅ Model saved as exercise_rf_windowed.pkl

=== Model Info ===
Number of features: 24
Fea