### Read data

In [5]:
import pandas as pd

# Define file path
file_path = "../../data/raw/MobiAct_combined.csv"

In [8]:
# Specify pyarrow for data to load faster
df = pd.read_csv(file_path)

### Explore data

In [9]:
df

Unnamed: 0,timestamp,rel_time,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial
0,1295405261000,0.000000,-1.407311,9.614395,-2.086666,-0.844216,0.409280,0.086437,92.746895,-36.879684,-11.741077,STD,10,1
1,1295410262000,0.005001,-1.406354,9.612960,-2.084512,-0.711047,0.346971,0.076358,92.205360,-37.470173,-11.839779,STD,10,1
2,1295415352000,0.010091,-1.405380,9.611498,-2.082320,-0.598953,0.093462,0.025045,91.743050,-38.090790,-11.880902,STD,10,1
3,1295420307000,0.015046,-1.404432,9.610076,-2.080186,-0.128893,-0.012828,-0.002443,91.267320,-38.842915,-11.933741,STD,10,1
4,1295425257000,0.019996,-1.403484,9.608654,-2.078055,0.049480,0.018326,0.016493,90.819680,-39.538643,-11.957446,STD,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16756320,10354577784000,299.969995,-0.907934,13.533889,4.335380,1.207070,-6.215859,1.962099,218.442350,-56.026966,-33.223778,WAL,9,1
16756321,10354582775000,299.974986,-1.867024,12.331459,2.439285,0.968221,-6.103155,1.773953,220.688690,-57.077300,-31.897688,WAL,9,1
16756322,10354588060000,299.980271,-2.924407,11.485553,0.782717,0.740674,-6.034738,1.459663,222.816400,-58.044624,-30.614605,WAL,9,1
16756323,10354592749000,299.984960,-3.726923,11.084407,-0.258194,0.536645,-5.905845,1.027781,224.671650,-58.777103,-29.624798,WAL,9,1


In [10]:
# Check unique label
df['label'].nunique()

16

In [11]:
# Convert timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ns')

In [12]:
# Set timestamp as index
df.index = pd.to_datetime(df['timestamp'])

# Delete relative time and timestamp from column
del df['rel_time']
del df['timestamp']

In [13]:
df

Unnamed: 0_level_0,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1970-01-01 00:21:35.405261,-1.407311,9.614395,-2.086666,-0.844216,0.409280,0.086437,92.746895,-36.879684,-11.741077,STD,10,1
1970-01-01 00:21:35.410262,-1.406354,9.612960,-2.084512,-0.711047,0.346971,0.076358,92.205360,-37.470173,-11.839779,STD,10,1
1970-01-01 00:21:35.415352,-1.405380,9.611498,-2.082320,-0.598953,0.093462,0.025045,91.743050,-38.090790,-11.880902,STD,10,1
1970-01-01 00:21:35.420307,-1.404432,9.610076,-2.080186,-0.128893,-0.012828,-0.002443,91.267320,-38.842915,-11.933741,STD,10,1
1970-01-01 00:21:35.425257,-1.403484,9.608654,-2.078055,0.049480,0.018326,0.016493,90.819680,-39.538643,-11.957446,STD,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 02:52:34.577784,-0.907934,13.533889,4.335380,1.207070,-6.215859,1.962099,218.442350,-56.026966,-33.223778,WAL,9,1
1970-01-01 02:52:34.582775,-1.867024,12.331459,2.439285,0.968221,-6.103155,1.773953,220.688690,-57.077300,-31.897688,WAL,9,1
1970-01-01 02:52:34.588060,-2.924407,11.485553,0.782717,0.740674,-6.034738,1.459663,222.816400,-58.044624,-30.614605,WAL,9,1
1970-01-01 02:52:34.592749,-3.726923,11.084407,-0.258194,0.536645,-5.905845,1.027781,224.671650,-58.777103,-29.624798,WAL,9,1


Examine different resampling frequency to retain the most information

In [14]:
# Define resampling dictionary
sampling = {
    "acc_x": "mean",
    "acc_y": "mean",
    "acc_z": "mean",
    "gyro_x": "mean",
    "gyro_y": "mean",
    "gyro_z": "mean",
    "label": "last",
    "subject_id": "last",
    "trial": "last",
}


In [15]:
# Resample again but preserve activity perform by each subject

results = []
for (subject, trial), group in df.groupby(["subject_id", "trial"]):
    resampled = group.resample("20ms").agg(sampling)  # Use .agg instead of .apply for named aggregations
    resampled["subject_id"] = subject
    resampled["trial"] = trial
    results.append(resampled)


In [18]:
df_resample_20ms = pd.concat(results).dropna().reset_index()

In [19]:
df_resample_20ms

Unnamed: 0,timestamp,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,label,subject_id,trial
0,1970-01-01 00:04:23.620,0.705479,-9.754311,-0.453826,-0.036448,0.029831,0.011301,STD,1,1
1,1970-01-01 00:04:23.640,0.743307,-9.756553,-0.559145,-0.044135,0.025656,0.017944,STD,1,1
2,1970-01-01 00:04:23.660,0.721394,-9.682911,-0.750469,-0.039172,0.017944,0.016570,STD,1,1
3,1970-01-01 00:04:23.680,0.701371,-9.686890,-0.835315,-0.039859,0.015882,0.012752,STD,1,1
4,1970-01-01 00:04:23.700,0.693497,-9.698679,-0.785186,-0.042226,0.011683,0.017028,STD,1,1
...,...,...,...,...,...,...,...,...,...,...
4138422,1970-01-01 04:19:26.040,2.003467,9.635633,-0.573736,-0.063148,-0.064217,-0.003589,STD,67,6
4138423,1970-01-01 04:19:26.060,2.121049,9.664656,-0.569072,-0.047571,-0.051160,-0.006720,STD,67,6
4138424,1970-01-01 04:19:26.080,2.212409,9.664813,-0.602382,-0.028634,-0.036041,-0.002825,STD,67,6
4138425,1970-01-01 04:19:26.100,2.184638,9.659224,-0.565169,-0.020617,-0.022602,-0.004581,STD,67,6


In [None]:
df_resample_20ms.to_pickle("../../data/raw/df_resample_20ms.pkl") # save for analysis later

### Data Preprocessing

In [21]:
import scipy.signal as signal

def apply_low_pass_filter(data, cutoff=3, fs=10, order=4):
    nyquist = 0.5 * fs
    normal_cutoff = cutoff / nyquist
    b, a = signal.butter(order, normal_cutoff, btype='low', analog=False)
    df_filtered = data.copy()
    for col in ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']:
        df_filtered[col] = signal.filtfilt(b, a, data[col])
    return df_filtered


In [22]:
df_filtered = apply_low_pass_filter(df_resample_20ms)

In [23]:
# Normalize sensor data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
sensor_cols = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']
df_filtered[sensor_cols] = scaler.fit_transform(df_filtered[sensor_cols])


In [24]:
df_filtered

Unnamed: 0,timestamp,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,label,subject_id,trial
0,1970-01-01 00:04:23.620,0.120127,-2.217653,-0.298938,-0.006791,0.033568,-0.003344,STD,1,1
1,1970-01-01 00:04:23.640,0.128787,-2.215478,-0.330573,-0.012438,0.028701,0.005826,STD,1,1
2,1970-01-01 00:04:23.660,0.125399,-2.210226,-0.371455,-0.010902,0.023067,0.004100,STD,1,1
3,1970-01-01 00:04:23.680,0.117923,-2.207260,-0.396612,-0.009458,0.018697,0.000048,STD,1,1
4,1970-01-01 00:04:23.700,0.118343,-2.208842,-0.387711,-0.012425,0.017044,0.003507,STD,1,1
...,...,...,...,...,...,...,...,...,...,...
4138422,1970-01-01 04:19:26.040,0.450731,0.535045,-0.324386,-0.031796,-0.059572,-0.027028,STD,67,6
4138423,1970-01-01 04:19:26.060,0.481948,0.536976,-0.334319,-0.016996,-0.046264,-0.028323,STD,67,6
4138424,1970-01-01 04:19:26.080,0.502064,0.539007,-0.333029,0.000452,-0.029247,-0.023712,STD,67,6
4138425,1970-01-01 04:19:26.100,0.498980,0.536931,-0.329682,0.008017,-0.019459,-0.027067,STD,67,6


### Data Splitting

To ensure no leakage when training our deep learning model we need to group split by subject ID (Leaving some subject out for validation and testing)

In [25]:
from sklearn.model_selection import GroupShuffleSplit

# Get your subject group array
groups = df_filtered['subject_id'].values

# 1. Split off test subjects
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
trainval_idx, test_idx = next(gss1.split(df_filtered, df_filtered['label'], groups=groups))

trainval_subjects = df_filtered.iloc[trainval_idx]['subject_id'].unique()
test_subjects = df_filtered.iloc[test_idx]['subject_id'].unique()

df_trainval = df_filtered[df_filtered['subject_id'].isin(trainval_subjects)].copy()
df_test     = df_filtered[df_filtered['subject_id'].isin(test_subjects)].copy()

# 2. Split val subjects from trainval
groups_trainval = df_trainval['subject_id'].values
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.125, random_state=42)
train_idx, val_idx = next(gss2.split(df_trainval, df_trainval['label'], groups=groups_trainval))

train_subjects = df_trainval.iloc[train_idx]['subject_id'].unique()
val_subjects   = df_trainval.iloc[val_idx]['subject_id'].unique()

df_train = df_trainval[df_trainval['subject_id'].isin(train_subjects)].copy()
df_val   = df_trainval[df_trainval['subject_id'].isin(val_subjects)].copy()

print("Train subjects:", set(train_subjects))
print("Val subjects:", set(val_subjects))
print("Test subjects:", set(test_subjects))


Train subjects: {np.int64(2), np.int64(3), np.int64(4), np.int64(7), np.int64(8), np.int64(11), np.int64(12), np.int64(14), np.int64(15), np.int64(16), np.int64(18), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(25), np.int64(28), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(38), np.int64(39), np.int64(40), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(52), np.int64(54), np.int64(56), np.int64(57), np.int64(58), np.int64(61), np.int64(63), np.int64(64), np.int64(66), np.int64(67)}
Val subjects: {np.int64(9), np.int64(59), np.int64(19), np.int64(53), np.int64(55), np.int64(24), np.int64(27)}
Test subjects: {np.int64(1), np.int64(65), np.int64(5), np.int64(6), np.int64(37), np.int64(41), np.int64(10), np.int64(13), np.int64(46), np.int64(17), np.int64(26), np.int64(60), np.int64(29), np.int64(62)}


In [27]:
df_train 

Unnamed: 0,timestamp,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,label,subject_id,trial
96879,1970-01-01 00:01:09.680,-0.458843,-0.004343,-0.272226,0.032586,-0.057479,0.017799,STD,2,1
96880,1970-01-01 00:01:09.700,-0.881271,0.789259,-0.322166,0.031217,-0.012802,-0.002940,STD,2,1
96881,1970-01-01 00:01:09.720,-0.750800,0.547177,-0.330515,0.042022,0.011359,-0.011404,STD,2,1
96882,1970-01-01 00:01:09.740,-0.640295,0.394234,-0.315634,0.030857,0.012073,-0.015476,STD,2,1
96883,1970-01-01 00:01:09.760,-0.686538,0.550332,-0.359439,0.009593,0.003928,-0.010498,STD,2,1
...,...,...,...,...,...,...,...,...,...,...
4138422,1970-01-01 04:19:26.040,0.450731,0.535045,-0.324386,-0.031796,-0.059572,-0.027028,STD,67,6
4138423,1970-01-01 04:19:26.060,0.481948,0.536976,-0.334319,-0.016996,-0.046264,-0.028323,STD,67,6
4138424,1970-01-01 04:19:26.080,0.502064,0.539007,-0.333029,0.000452,-0.029247,-0.023712,STD,67,6
4138425,1970-01-01 04:19:26.100,0.498980,0.536931,-0.329682,0.008017,-0.019459,-0.027067,STD,67,6


In [None]:
sensor_cols = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']

In [31]:
X_train = df_train[sensor_cols]
X_val = df_val[sensor_cols]
X_test = df_test[sensor_cols]

y_train = df_train['label']
y_val = df_val['label']
y_test = df_test['label']


In [30]:
X_train

Unnamed: 0,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z
96879,-0.458843,-0.004343,-0.272226,0.032586,-0.057479,0.017799
96880,-0.881271,0.789259,-0.322166,0.031217,-0.012802,-0.002940
96881,-0.750800,0.547177,-0.330515,0.042022,0.011359,-0.011404
96882,-0.640295,0.394234,-0.315634,0.030857,0.012073,-0.015476
96883,-0.686538,0.550332,-0.359439,0.009593,0.003928,-0.010498
...,...,...,...,...,...,...
4138422,0.450731,0.535045,-0.324386,-0.031796,-0.059572,-0.027028
4138423,0.481948,0.536976,-0.334319,-0.016996,-0.046264,-0.028323
4138424,0.502064,0.539007,-0.333029,0.000452,-0.029247,-0.023712
4138425,0.498980,0.536931,-0.329682,0.008017,-0.019459,-0.027067


In [33]:
y_train.nunique()

16

In [None]:
y_val.nunique()

16

In [37]:
y_test.value_counts()

label
WAL    279617
STD    234249
SIT     79548
JOG     59806
JUM     56869
LYI     49698
STU     35844
STN     33876
CSI     17292
CSO     17023
SCH      8691
BSC      5874
SDL      4979
FKL      4670
FOL      4234
CHU      3634
Name: count, dtype: int64

In [36]:
# Save datasets using to_pickle
X_train.to_pickle("../../data/interim/X_train.pkl")
X_val.to_pickle("../../data/interim/X_val.pkl")
X_test.to_pickle("../../data/interim/X_test.pkl")
y_train.to_pickle("../../data/interim/y_train.pkl")
y_val.to_pickle("../../data/interim/y_val.pkl")
y_test.to_pickle("../../data/interim/y_test.pkl")