### Read data

In [1]:
import pandas as pd

# Define file path
file_path = "../../data/raw/MobiAct_combined.csv"

In [2]:
import pandas as pd

dtype_map = {
    "subject_id": "int16",
    "trial": "int16",
    "acc_x": "float32", "acc_y": "float32", "acc_z": "float32",
    "gyro_x": "float32","gyro_y": "float32","gyro_z": "float32",
    "azimuth": "float32",	"pitch": "float32",	"roll": "float32",
    "label": "category"
}


df = pd.read_csv(
    file_path,
    dtype=dtype_map,        # reduces memory footprint :contentReference[oaicite:4]{index=4}
    engine='c',             # ensures the fast C parser :contentReference[oaicite:6]{index=6}
)

### Explore data

In [3]:
df

Unnamed: 0,timestamp,rel_time,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial
0,1295405261000,0.000000,-1.407311,9.614395,-2.086666,-0.844216,0.409280,0.086437,92.746895,-36.879684,-11.741077,STD,10,1
1,1295410262000,0.005001,-1.406354,9.612960,-2.084512,-0.711047,0.346971,0.076358,92.205360,-37.470173,-11.839779,STD,10,1
2,1295415352000,0.010091,-1.405380,9.611498,-2.082320,-0.598953,0.093462,0.025045,91.743050,-38.090790,-11.880902,STD,10,1
3,1295420307000,0.015046,-1.404432,9.610076,-2.080186,-0.128893,-0.012828,-0.002443,91.267319,-38.842915,-11.933741,STD,10,1
4,1295425257000,0.019996,-1.403484,9.608654,-2.078054,0.049480,0.018326,0.016493,90.819679,-39.538643,-11.957446,STD,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16756320,10354577784000,299.969995,-0.907934,13.533889,4.335380,1.207070,-6.215859,1.962099,218.442352,-56.026966,-33.223778,WAL,9,1
16756321,10354582775000,299.974986,-1.867024,12.331459,2.439285,0.968221,-6.103155,1.773953,220.688690,-57.077301,-31.897688,WAL,9,1
16756322,10354588060000,299.980271,-2.924407,11.485553,0.782717,0.740674,-6.034738,1.459663,222.816406,-58.044624,-30.614605,WAL,9,1
16756323,10354592749000,299.984960,-3.726923,11.084407,-0.258194,0.536645,-5.905845,1.027781,224.671646,-58.777103,-29.624798,WAL,9,1


In [4]:
# Check unique label
df['label'].nunique()

16

In [5]:
# Convert timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ns')

In [6]:
# Set timestamp as index
df.index = pd.to_datetime(df['timestamp'])

# Delete relative time and timestamp from column
del df['rel_time']
del df['timestamp']

In [7]:
df

Unnamed: 0_level_0,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1970-01-01 00:21:35.405261,-1.407311,9.614395,-2.086666,-0.844216,0.409280,0.086437,92.746895,-36.879684,-11.741077,STD,10,1
1970-01-01 00:21:35.410262,-1.406354,9.612960,-2.084512,-0.711047,0.346971,0.076358,92.205360,-37.470173,-11.839779,STD,10,1
1970-01-01 00:21:35.415352,-1.405380,9.611498,-2.082320,-0.598953,0.093462,0.025045,91.743050,-38.090790,-11.880902,STD,10,1
1970-01-01 00:21:35.420307,-1.404432,9.610076,-2.080186,-0.128893,-0.012828,-0.002443,91.267319,-38.842915,-11.933741,STD,10,1
1970-01-01 00:21:35.425257,-1.403484,9.608654,-2.078054,0.049480,0.018326,0.016493,90.819679,-39.538643,-11.957446,STD,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 02:52:34.577784,-0.907934,13.533889,4.335380,1.207070,-6.215859,1.962099,218.442352,-56.026966,-33.223778,WAL,9,1
1970-01-01 02:52:34.582775,-1.867024,12.331459,2.439285,0.968221,-6.103155,1.773953,220.688690,-57.077301,-31.897688,WAL,9,1
1970-01-01 02:52:34.588060,-2.924407,11.485553,0.782717,0.740674,-6.034738,1.459663,222.816406,-58.044624,-30.614605,WAL,9,1
1970-01-01 02:52:34.592749,-3.726923,11.084407,-0.258194,0.536645,-5.905845,1.027781,224.671646,-58.777103,-29.624798,WAL,9,1


Examine different resampling frequency to retain the most information

In [8]:
# Define resampling dictionary
sampling = {
    "acc_x": "mean",
    "acc_y": "mean",
    "acc_z": "mean",
    "gyro_x": "mean",
    "gyro_y": "mean",
    "gyro_z": "mean",
    # Orientation data for simplicity just take last
    "azimuth": "last",
    "pitch": "last",
    "roll": "last",
    "label": "last",
    "subject_id": "last",
    "trial": "last",
}


In [9]:
new_dfs = []

for (subject, trial), group in df.groupby(['subject_id', 'trial']):
    resampled = group.resample('20ms').agg(sampling)
    resampled['subject_id'] = subject
    resampled['trial'] = trial
    new_dfs.append(resampled)

df_resample_20ms = pd.concat(new_dfs).reset_index().dropna()

In [10]:
df_resample_20ms

Unnamed: 0,timestamp,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial
0,1970-01-01 00:04:23.620,0.705479,-9.754312,-0.453826,-0.036448,0.029831,0.011301,71.765816,42.697929,25.385557,STD,1,1
1,1970-01-01 00:04:23.640,0.743307,-9.756554,-0.559144,-0.044135,0.025656,0.017944,68.568184,47.036865,22.997513,STD,1,1
2,1970-01-01 00:04:23.660,0.721394,-9.682911,-0.750469,-0.039172,0.017944,0.016570,66.112846,50.938175,20.652905,STD,1,1
3,1970-01-01 00:04:23.680,0.701371,-9.686890,-0.835315,-0.039859,0.015882,0.012752,64.245255,54.487350,18.425901,STD,1,1
4,1970-01-01 00:04:23.700,0.693497,-9.698679,-0.785186,-0.042226,0.011683,0.017028,62.895699,57.762157,16.261679,STD,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
109057760,1970-01-01 04:19:26.040,2.003467,9.635633,-0.573736,-0.063148,-0.064217,-0.003589,163.477036,-105.813019,13.144693,STD,67,6
109057761,1970-01-01 04:19:26.060,2.121049,9.664656,-0.569072,-0.047571,-0.051160,-0.006720,163.573837,-105.796631,13.122824,STD,67,6
109057762,1970-01-01 04:19:26.080,2.212409,9.664813,-0.602382,-0.028634,-0.036041,-0.002825,163.642487,-105.817123,13.135675,STD,67,6
109057763,1970-01-01 04:19:26.100,2.184638,9.659225,-0.565169,-0.020617,-0.022602,-0.004581,163.699432,-105.865280,13.176976,STD,67,6


In [None]:
df_resample_20ms.to_pickle("../../data/raw/df_resample_20ms.pkl") # save for analysis later

In [None]:
# List of labels you want to map to 'FALL'
fall_labels = ['BSC', 'FKL', 'SDL', 'FOL']

### Data Preprocessing

In [11]:
import scipy.signal as signal

def apply_low_pass_filter(data, cutoff=3, fs=10, order=4):
    nyquist = 0.5 * fs
    normal_cutoff = cutoff / nyquist
    b, a = signal.butter(order, normal_cutoff, btype='low', analog=False)
    df_filtered = data.copy()
    for col in ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']:
        df_filtered[col] = signal.filtfilt(b, a, data[col])
    return df_filtered


In [12]:
df_filtered = apply_low_pass_filter(df_resample_20ms)

In [None]:
# Normalize sensor data
from sklearn.preprocessing import StandardScaler

scaler = 
sensor_cols = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'azimuth','pitch','roll']
df_filtered[sensor_cols] = scaler.fit_transform(df_filtered[sensor_cols])


In [14]:
df_filtered

Unnamed: 0,timestamp,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial
0,1970-01-01 00:04:23.620,0.120127,-2.217653,-0.298938,-0.006791,0.033568,-0.003344,-0.982106,1.466176,1.076405,STD,1,1
1,1970-01-01 00:04:23.640,0.128787,-2.215478,-0.330573,-0.012438,0.028701,0.005826,-1.011898,1.529641,0.967330,STD,1,1
2,1970-01-01 00:04:23.660,0.125399,-2.210226,-0.371455,-0.010902,0.023067,0.004100,-1.034774,1.586706,0.860239,STD,1,1
3,1970-01-01 00:04:23.680,0.117923,-2.207260,-0.396612,-0.009458,0.018697,0.000048,-1.052174,1.638619,0.758519,STD,1,1
4,1970-01-01 00:04:23.700,0.118343,-2.208842,-0.387711,-0.012425,0.017044,0.003507,-1.064748,1.686520,0.659668,STD,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
109057760,1970-01-01 04:19:26.040,0.450731,0.535045,-0.324386,-0.031796,-0.059572,-0.027028,-0.127635,-0.706084,0.517298,STD,67,6
109057761,1970-01-01 04:19:26.060,0.481948,0.536976,-0.334319,-0.016996,-0.046264,-0.028323,-0.126733,-0.705844,0.516299,STD,67,6
109057762,1970-01-01 04:19:26.080,0.502064,0.539007,-0.333029,0.000452,-0.029247,-0.023712,-0.126094,-0.706144,0.516886,STD,67,6
109057763,1970-01-01 04:19:26.100,0.498980,0.536931,-0.329682,0.008017,-0.019459,-0.027067,-0.125563,-0.706848,0.518772,STD,67,6


In [27]:
df_filtered['label'].value_counts()

label
STD    1213627
WAL    1208069
SIT     294414
JOG     285529
JUM     277322
LYI     250694
STU     163630
STN     149599
CSO      77993
CSI      73680
SCH      40408
BSC      27091
SDL      23362
FKL      21931
FOL      19630
CHU      11448
Name: count, dtype: int64

In [28]:
fall_labels = ['BSC', 'FKL', 'SDL', 'FOL']

df_filtered['fall_label'] = df_filtered['label'].apply(lambda x: 'FALL' if x in fall_labels else 'ADL')


In [15]:
df_filtered = df_filtered.drop(['trial', 'timestamp'], axis =1)

In [30]:
df_filtered.to_pickle('../../data/interim/df_filtered_binary.pkl')

### Data Splitting

To ensure no leakage when training our deep learning model we need to group split by subject ID (Leaving some subject out for validation and testing)

In [31]:
from sklearn.model_selection import GroupShuffleSplit

# Get your subject group array
groups = df_filtered['subject_id'].values

# 1. Split off test subjects
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
trainval_idx, test_idx = next(gss1.split(df_filtered, df_filtered['label'], groups=groups))

trainval_subjects = df_filtered.iloc[trainval_idx]['subject_id'].unique()
test_subjects = df_filtered.iloc[test_idx]['subject_id'].unique()

df_trainval = df_filtered[df_filtered['subject_id'].isin(trainval_subjects)].copy()
df_test     = df_filtered[df_filtered['subject_id'].isin(test_subjects)].copy()

# 2. Split val subjects from trainval
groups_trainval = df_trainval['subject_id'].values
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.125, random_state=42)
train_idx, val_idx = next(gss2.split(df_trainval, df_trainval['label'], groups=groups_trainval))

train_subjects = df_trainval.iloc[train_idx]['subject_id'].unique()
val_subjects   = df_trainval.iloc[val_idx]['subject_id'].unique()

df_train = df_trainval[df_trainval['subject_id'].isin(train_subjects)].copy()
df_val   = df_trainval[df_trainval['subject_id'].isin(val_subjects)].copy()

print("Train subjects:", set(train_subjects))
print("Val subjects:", set(val_subjects))
print("Test subjects:", set(test_subjects))


Train subjects: {np.int16(2), np.int16(3), np.int16(4), np.int16(7), np.int16(8), np.int16(11), np.int16(12), np.int16(14), np.int16(15), np.int16(16), np.int16(18), np.int16(20), np.int16(21), np.int16(22), np.int16(23), np.int16(25), np.int16(28), np.int16(30), np.int16(31), np.int16(32), np.int16(33), np.int16(34), np.int16(35), np.int16(36), np.int16(38), np.int16(39), np.int16(40), np.int16(42), np.int16(43), np.int16(44), np.int16(45), np.int16(47), np.int16(48), np.int16(49), np.int16(50), np.int16(51), np.int16(52), np.int16(54), np.int16(56), np.int16(57), np.int16(58), np.int16(61), np.int16(63), np.int16(64), np.int16(66), np.int16(67)}
Val subjects: {np.int16(9), np.int16(59), np.int16(19), np.int16(53), np.int16(55), np.int16(24), np.int16(27)}
Test subjects: {np.int16(1), np.int16(65), np.int16(5), np.int16(6), np.int16(37), np.int16(41), np.int16(10), np.int16(13), np.int16(46), np.int16(17), np.int16(26), np.int16(60), np.int16(29), np.int16(62)}


In [32]:
train_subjects

array([ 2,  3,  4,  7,  8, 11, 12, 14, 15, 16, 18, 20, 21, 22, 23, 25, 28,
       30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 42, 43, 44, 45, 47, 48, 49,
       50, 51, 52, 54, 56, 57, 58, 61, 63, 64, 66, 67], dtype=int16)

In [33]:
val_subjects

array([ 9, 19, 24, 27, 53, 55, 59], dtype=int16)

In [34]:
test_subjects

array([ 1,  5,  6, 10, 13, 17, 26, 29, 37, 41, 46, 60, 62, 65],
      dtype=int16)

In [35]:
df_train 

Unnamed: 0,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,fall_label
634004,-0.458843,-0.004343,-0.272226,0.032586,-0.057479,0.017799,0.151812,-0.795093,-1.048645,STD,2,ADL
634005,-0.881271,0.789259,-0.322166,0.031217,-0.012802,-0.002940,0.136114,-0.778771,-0.997640,STD,2,ADL
634006,-0.750800,0.547177,-0.330515,0.042022,0.011359,-0.011404,0.121327,-0.765772,-0.956557,STD,2,ADL
634007,-0.640295,0.394234,-0.315634,0.030857,0.012073,-0.015476,0.108499,-0.754728,-0.921267,STD,2,ADL
634008,-0.686538,0.550332,-0.359439,0.009593,0.003928,-0.010498,0.098456,-0.744377,-0.889684,STD,2,ADL
...,...,...,...,...,...,...,...,...,...,...,...,...
109057760,0.450731,0.535045,-0.324386,-0.031796,-0.059572,-0.027028,-0.127635,-0.706084,0.517298,STD,67,ADL
109057761,0.481948,0.536976,-0.334319,-0.016996,-0.046264,-0.028323,-0.126733,-0.705844,0.516299,STD,67,ADL
109057762,0.502064,0.539007,-0.333029,0.000452,-0.029247,-0.023712,-0.126094,-0.706144,0.516886,STD,67,ADL
109057763,0.498980,0.536931,-0.329682,0.008017,-0.019459,-0.027067,-0.125563,-0.706848,0.518772,STD,67,ADL


In [22]:
sensor_cols = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'azimuth','pitch','roll']

In [None]:
X_train = df_train[sensor_cols]
X_val = df_val[sensor_cols]
X_test = df_test[sensor_cols]

y_train = df_train['label']
y_val = df_val['label']
y_test = df_test['label']


In [None]:
X_train_binary = df_train[sensor_cols]
X_val_binary = df_val[sensor_cols]
X_test_binary = df_test[sensor_cols]

y_train_binary = df_train['fall_label']
y_val_binary = df_val['fall_label']
y_test_binary = df_test['fall_label']

In [24]:
X_train

Unnamed: 0,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll
634004,-0.458843,-0.004343,-0.272226,0.032586,-0.057479,0.017799,0.151812,-0.795093,-1.048645
634005,-0.881271,0.789259,-0.322166,0.031217,-0.012802,-0.002940,0.136114,-0.778771,-0.997640
634006,-0.750800,0.547177,-0.330515,0.042022,0.011359,-0.011404,0.121327,-0.765772,-0.956557
634007,-0.640295,0.394234,-0.315634,0.030857,0.012073,-0.015476,0.108499,-0.754728,-0.921267
634008,-0.686538,0.550332,-0.359439,0.009593,0.003928,-0.010498,0.098456,-0.744377,-0.889684
...,...,...,...,...,...,...,...,...,...
109057760,0.450731,0.535045,-0.324386,-0.031796,-0.059572,-0.027028,-0.127635,-0.706084,0.517298
109057761,0.481948,0.536976,-0.334319,-0.016996,-0.046264,-0.028323,-0.126733,-0.705844,0.516299
109057762,0.502064,0.539007,-0.333029,0.000452,-0.029247,-0.023712,-0.126094,-0.706144,0.516886
109057763,0.498980,0.536931,-0.329682,0.008017,-0.019459,-0.027067,-0.125563,-0.706848,0.518772


In [None]:
y_train.nunique()

In [None]:
y_val.nunique()

In [None]:
y_test.value_counts()

In [25]:
# Save datasets using to_pickle
X_train.to_pickle("../../data/interim/X_train_ori.pkl")
X_val.to_pickle("../../data/interim/X_val_ori.pkl")
X_test.to_pickle("../../data/interim/X_test_ori.pkl")
y_train.to_pickle("../../data/interim/y_train_ori.pkl")
y_val.to_pickle("../../data/interim/y_val_ori.pkl")
y_test.to_pickle("../../data/interim/y_test_ori.pkl")

In [38]:
# Save datasets using to_pickle
X_train_binary.to_pickle("../../data/interim/X_train_binary.pkl")
X_val_binary.to_pickle("../../data/interim/X_val_binary.pkl")
X_test_binary.to_pickle("../../data/interim/X_test_binary.pkl")
y_train_binary.to_pickle("../../data/interim/y_train_binary.pkl")
y_val_binary.to_pickle("../../data/interim/y_val_binary.pkl")
y_test_binary.to_pickle("../../data/interim/y_test_binary.pkl")