In [4]:
import pandas as pd

# Read the raw WISDM accelerometer log. The file has no header row, so pandas
# assigns integer column labels; lines that cannot be parsed are skipped.
raw_path = "../data/raw/WISDM_ar_v1.1_raw.txt"
df = pd.read_csv(raw_path, on_bad_lines='skip', header=None)

# Show the first rows as a quick sanity check of the parse.
print(df.head())


    0        1               2         3          4             5
0  33  Jogging  49105962326000 -0.694638  12.680544   0.50395286;
1  33  Jogging  49106062271000  5.012288  11.264028   0.95342433;
2  33  Jogging  49106112167000  4.903325  10.882658  -0.08172209;
3  33  Jogging  49106222305000 -0.612916  18.496431    3.0237172;
4  33  Jogging  49106332290000 -1.184970  12.108489     7.205164;


In [7]:
# Give the positional columns meaningful names.
df.columns = ['user', 'activity', 'timestamp', 'x', 'y', 'z']

# Drop rows with missing data
df.dropna(inplace=True)

# Remove the trailing semicolon from 'z' and convert to float.
# The column is normalised to str first so this cell can be re-run safely:
# after the first run 'z' is already float64, and the `.str` accessor would
# raise AttributeError on a non-string column.
df['z'] = df['z'].astype(str).str.replace(';', '', regex=False).astype(float)

# Convert x, y to float (a no-op if they were already parsed as floats).
df['x'] = df['x'].astype(float)
df['y'] = df['y'].astype(float)

# Check output
print(df.dtypes)
print(df.head())


user           int64
activity      object
timestamp      int64
x            float64
y            float64
z            float64
dtype: object
   user activity       timestamp         x          y         z
0    33  Jogging  49105962326000 -0.694638  12.680544  0.503953
1    33  Jogging  49106062271000  5.012288  11.264028  0.953424
2    33  Jogging  49106112167000  4.903325  10.882658 -0.081722
3    33  Jogging  49106222305000 -0.612916  18.496431  3.023717
4    33  Jogging  49106332290000 -1.184970  12.108489  7.205164


In [11]:
def extract_features(df_window):
    """Summarise one window of accelerometer samples.

    Parameters
    ----------
    df_window : mapping of column name -> column
        Must provide 'x', 'y' and 'z' entries whose values expose ``.mean()``
        and ``.std()`` (e.g. a pandas DataFrame slice).

    Returns
    -------
    dict
        Keys ``x_mean, y_mean, z_mean, x_std, y_std, z_std`` (in that order),
        each holding the value returned by the column's own ``mean``/``std``.
    """
    summary = {}
    # Outer loop over the statistic keeps all means ahead of all stds.
    for stat in ('mean', 'std'):
        for axis in ('x', 'y', 'z'):
            column = df_window[axis]
            summary[f'{axis}_{stat}'] = getattr(column, stat)()
    return summary


In [15]:
import numpy as np

# Sliding-window segmentation of the cleaned samples.
# The WISDM v1.1 readme documents a 20 Hz sampling rate, so 200 samples is
# roughly 10 seconds of signal (not 2.5 s) -- TODO confirm against the readme.
window_size = 200
step_size = 100  # half a window, i.e. consecutive windows overlap by 50%

feature_vectors = []
labels = []

# NOTE(review): windows are cut from the frame in row order without regard to
# the 'user' or 'activity' columns, so a window may straddle a change of user
# or activity; the majority label below hides that. Consider segmenting per
# (user, activity) run.

# "+ 1" so that the final full window -- the one ending exactly on the last
# row -- is included; without it that window is silently dropped whenever
# (len(df) - window_size) is a multiple of step_size.
for start in range(0, len(df) - window_size + 1, step_size):
    window = df.iloc[start:start + window_size]

    # Axis series get distinct names so they do not shadow the label array
    # `y` that is built from `labels` afterwards.
    acc_x = window['x']
    acc_y = window['y']
    acc_z = window['z']

    # 12 features per window: mean, std, max, min for each axis (x, y, z order).
    features = [
        acc_x.mean(), acc_y.mean(), acc_z.mean(),
        acc_x.std(), acc_y.std(), acc_z.std(),
        acc_x.max(), acc_y.max(), acc_z.max(),
        acc_x.min(), acc_y.min(), acc_z.min()
    ]

    feature_vectors.append(features)
    # Most common activity in the window; mode() returns sorted values, so
    # ties resolve to the first activity in sort order.
    labels.append(window['activity'].mode()[0])


In [16]:
# Convert the accumulated Python lists to NumPy arrays for scikit-learn:
# X holds one row of features per window, y the matching activity labels.
X, y = np.array(feature_vectors), np.array(labels)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the windows for testing, keeping the per-activity class
# proportions equal in both partitions (stratify) and fixing the shuffle seed.
# NOTE(review): the windows were built with 50% overlap, so a random split puts
# near-duplicate samples on both sides; a split grouped by user would give a
# more honest estimate -- confirm whether that matters for this project.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)




In [18]:
from sklearn.ensemble import RandomForestClassifier

# Quick baseline fit. Seeded (and with the tree count spelled out) so that a
# Restart & Run All reproduces the same model, matching the hyper-parameters
# used by the evaluated model fitted later in this notebook.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a seeded 100-tree random forest on the training windows.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train, y_train)

# Label the held-out windows.
y_pred = clf.predict(X_test)

# Compute the metrics first, then report them.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.9581224114127934
Classification Report:
               precision    recall  f1-score   support

  Downstairs       0.90      0.81      0.85       200
     Jogging       0.99      0.99      0.99       673
     Sitting       0.99      0.97      0.98       120
    Standing       0.99      0.97      0.98        97
    Upstairs       0.88      0.87      0.87       246
     Walking       0.96      0.99      0.98       837

    accuracy                           0.96      2173
   macro avg       0.95      0.93      0.94      2173
weighted avg       0.96      0.96      0.96      2173



In [22]:
import joblib

# Persist the fitted classifier to the current working directory; the call's
# return value (the list of written file names) is the cell's output.
joblib.dump(value=clf, filename='fitness_model.pkl')


['fitness_model.pkl']