### Read data

In [1]:
import pandas as pd

# Define file path
file_path = "../../data/raw/MobiAct_combined.csv"

In [4]:
import pandas as pd

dtype_map = {
    "subject_id": "int16",
    "trial": "int16",
    "acc_x": "float32", "acc_y": "float32", "acc_z": "float32",
    "gyro_x": "float32","gyro_y": "float32","gyro_z": "float32",
    "azimuth": "float32",	"pitch": "float32",	"roll": "float32",
    "label": "category"
}


df = pd.read_csv(
    file_path,
    dtype=dtype_map,        # reduces memory footprint 
    engine='c')            # ensures the fast C parser 

### Explore data

In [None]:
df

In [None]:
# Check unique label
df['label'].nunique()

In [5]:
# Convert timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ns')

In [6]:
# Set timestamp as index
df.index = pd.to_datetime(df['timestamp'])

# Delete relative time and timestamp from column
del df['rel_time']
del df['timestamp']

In [97]:
df

Unnamed: 0_level_0,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial,label_change
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1970-01-01 00:21:35.405261,-1.407311,9.614395,-2.086666,-0.844216,0.409280,0.086437,92.746895,-36.879684,-11.741077,STD,10,1,True
1970-01-01 00:21:35.410262,-1.406354,9.612960,-2.084512,-0.711047,0.346971,0.076358,92.205360,-37.470173,-11.839779,STD,10,1,False
1970-01-01 00:21:35.415352,-1.405380,9.611498,-2.082320,-0.598953,0.093462,0.025045,91.743050,-38.090790,-11.880902,STD,10,1,False
1970-01-01 00:21:35.420307,-1.404432,9.610076,-2.080186,-0.128893,-0.012828,-0.002443,91.267319,-38.842915,-11.933741,STD,10,1,False
1970-01-01 00:21:35.425257,-1.403484,9.608654,-2.078054,0.049480,0.018326,0.016493,90.819679,-39.538643,-11.957446,STD,10,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 02:52:34.577784,-0.907934,13.533889,4.335380,1.207070,-6.215859,1.962099,218.442352,-56.026966,-33.223778,WAL,9,1,False
1970-01-01 02:52:34.582775,-1.867024,12.331459,2.439285,0.968221,-6.103155,1.773953,220.688690,-57.077301,-31.897688,WAL,9,1,False
1970-01-01 02:52:34.588060,-2.924407,11.485553,0.782717,0.740674,-6.034738,1.459663,222.816406,-58.044624,-30.614605,WAL,9,1,False
1970-01-01 02:52:34.592749,-3.726923,11.084407,-0.258194,0.536645,-5.905845,1.027781,224.671646,-58.777103,-29.624798,WAL,9,1,False


In [None]:
df['duration'] = df.index.to_series().diff().dt.total_seconds()

In [None]:
df.head(500)

In [None]:
del df['duration']

In [None]:
df

In [None]:
fall_labels = ['BSC', 'FKL', 'SDL', 'FOL']

df['fall_label'] = df['label'].apply(lambda x: 'FALL' if x in fall_labels else 'ADL')


In [None]:
df = df.drop(['trial'], axis =1)


In [None]:
df.to_pickle('../../data/interim/df_binary.pkl')

Examine different resampling frequency to retain the most information

In [None]:
'''
# Define resampling dictionary
sampling = {
    "acc_x": "mean",
    "acc_y": "mean",
    "acc_z": "mean",
    "gyro_x": "mean",
    "gyro_y": "mean",
    "gyro_z": "mean",
    # Orientation data for simplicity just take last
    "azimuth": "last",
    "pitch": "last",
    "roll": "last",
    "label": "last",
    "subject_id": "last",
    "trial": "last",
}
'''

In [83]:
new_dfs = []

for subject, group in df.groupby(['subject_id']):
    resampled = group.resample('10ms').agg(sampling)
    new_dfs.append(resampled)

df_resample_20ms = pd.concat(new_dfs).reset_index().dropna()

In [88]:
df_resample_20ms['duration'] = df_resample_20ms['timestamp'].diff().dt.total_seconds()

In [None]:
df_resample_20ms.to_pickle("../../data/raw/df_resample_20ms_ID.pkl") # save for analysis later

In [None]:
# List of labels you want to map to 'FALL'
fall_labels = ['BSC', 'FKL', 'SDL', 'FOL']

### Data Preprocessing

In [None]:
df_resample_20ms = pd.read_pickle("../../data/raw/df_resample_20ms.pkl")

In [89]:
import scipy.signal as signal

def apply_low_pass_filter(data, cutoff=3, fs=10, order=4):
    nyquist = 0.5 * fs
    normal_cutoff = cutoff / nyquist
    b, a = signal.butter(order, normal_cutoff, btype='low', analog=False)
    df_filtered = data.copy()
    for col in ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']:
        df_filtered[col] = signal.filtfilt(b, a, data[col])
    return df_filtered


In [98]:
df_filtered = apply_low_pass_filter(df)

In [99]:
# Normalize sensor data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
sensor_cols = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']
df_filtered[sensor_cols] = scaler.fit_transform(df_filtered[sensor_cols])


In [100]:
df_filtered

Unnamed: 0_level_0,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial,label_change
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1970-01-01 00:21:35.405261,-0.410647,0.525941,-0.706532,-0.751413,0.380717,0.103594,92.746895,-36.879684,-11.741077,STD,10,1,True
1970-01-01 00:21:35.410262,-0.410401,0.525734,-0.705975,-0.681002,0.294114,0.079709,92.205360,-37.470173,-11.839779,STD,10,1,False
1970-01-01 00:21:35.415352,-0.410165,0.525536,-0.705442,-0.457682,0.116415,0.022338,91.743050,-38.090790,-11.880902,STD,10,1,False
1970-01-01 00:21:35.420307,-0.409925,0.525335,-0.704901,-0.129473,-0.008210,-0.016840,91.267319,-38.842915,-11.933741,STD,10,1,False
1970-01-01 00:21:35.425257,-0.409666,0.525119,-0.704317,0.065257,0.006163,-0.009515,90.819679,-39.538643,-11.957446,STD,10,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 02:52:34.577784,-0.277076,1.091074,0.958945,1.134623,-5.717552,2.781166,218.442352,-56.026966,-33.223778,WAL,9,1,False
1970-01-01 02:52:34.582775,-0.535365,0.893381,0.425011,0.922183,-5.610354,2.506263,220.688690,-57.077301,-31.897688,WAL,9,1,False
1970-01-01 02:52:34.588060,-0.785462,0.795136,0.023592,0.709663,-5.547993,2.053808,222.816406,-58.044624,-30.614605,WAL,9,1,False
1970-01-01 02:52:34.592749,-1.000564,0.734507,-0.246116,0.520421,-5.430371,1.451017,224.671646,-58.777103,-29.624798,WAL,9,1,False


In [93]:
df_filtered['label_change'] = df_filtered['label'] != df_filtered['label'].shift()
df_transitions = df_filtered[df_filtered['label_change']]

In [94]:
df_transitions

Unnamed: 0,timestamp,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial,duration,label_change
0,1970-01-01 00:04:23.620,0.112937,-2.213108,-0.277957,-0.007487,0.032133,-0.003218,73.611893,40.307209,26.710962,STD,1.0,1.0,,True
88744,1970-01-01 00:19:11.060,-0.260790,-2.224399,-0.249493,0.769848,0.380504,-0.042652,93.435501,82.974777,-6.902822,SCH,1.0,1.0,0.01,True
88884,1970-01-01 00:19:12.460,-0.671211,-0.764430,2.276025,-0.091068,0.052356,-0.425347,74.101196,-0.321820,-9.959714,SIT,1.0,1.0,0.01,True
91925,1970-01-01 00:19:42.870,-0.547853,-1.170201,2.078271,0.055486,-0.031228,0.205294,100.739632,11.101671,-14.847010,CHU,1.0,1.0,0.01,True
92135,1970-01-01 00:19:44.970,0.189958,-2.187364,-0.207312,-0.053256,-0.091959,-0.036344,98.497643,96.191818,5.434195,STD,1.0,1.0,0.01,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52161496,1970-01-01 05:00:22.690,1.597869,0.248392,-0.785171,-0.085106,-0.313695,-0.090157,308.982941,-72.649567,17.167189,FOL,67.0,2.0,0.01,True
52161633,1970-01-01 05:00:24.060,-1.292420,-0.471086,1.698163,0.115160,0.181560,-0.039792,317.571014,-27.866430,-21.568695,LYI,67.0,2.0,0.01,True
52163964,1970-01-01 05:00:47.370,-0.580184,0.387688,0.055875,0.009117,0.021763,-0.009543,298.571350,-58.705059,-24.052887,STD,67.0,3.0,16.11,True
52164050,1970-01-01 05:00:48.230,-0.141937,0.695965,-0.587636,-0.063532,0.236625,0.109428,303.793488,-97.027657,-0.021469,FOL,67.0,3.0,0.01,True


In [79]:
df_transitions['duration'] = df_transitions.index.to_series().diff().dt.total_seconds()


AttributeError: Can only use .dt accessor with datetimelike values

In [80]:
df_transitions

Unnamed: 0,timestamp,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial,duration,label_change
0,1970-01-01 00:04:23.620,0.121591,-2.220753,-0.298699,-0.006835,0.033901,-0.003262,71.765816,42.697929,25.385557,STD,1.0,1.0,,True
44372,1970-01-01 00:19:11.060,-0.260636,-2.226844,-0.244229,0.813111,0.423712,-0.036646,93.736320,82.916855,-6.891407,SCH,1.0,1.0,0.02,True
44442,1970-01-01 00:19:12.460,-0.675235,-0.766005,2.294082,-0.117541,0.062952,-0.451839,74.307365,-0.323424,-10.033054,SIT,1.0,1.0,0.02,True
45962,1970-01-01 00:19:42.860,-0.538493,-1.175417,2.122183,0.033630,-0.027303,0.198727,100.739632,11.101671,-14.847010,CHU,1.0,1.0,0.02,True
46067,1970-01-01 00:19:44.960,0.190485,-2.198486,-0.212719,-0.046765,-0.082118,-0.033174,98.497643,96.191818,5.434195,STD,1.0,1.0,0.02,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26080777,1970-01-01 05:00:22.680,1.625139,0.246317,-0.792200,-0.149662,-0.223823,-0.181887,308.982941,-72.649567,17.167189,FOL,67.0,2.0,0.02,True
26080846,1970-01-01 05:00:24.060,-1.325738,-0.468397,1.716752,0.110120,0.202038,-0.012796,317.613403,-27.813503,-21.785770,LYI,67.0,2.0,0.02,True
26082011,1970-01-01 05:00:47.360,-0.584613,0.388875,0.056535,0.008495,0.022252,-0.009131,298.571350,-58.705059,-24.052887,STD,67.0,3.0,16.10,True
26082054,1970-01-01 05:00:48.220,-0.161230,0.614808,-0.628899,-0.090267,0.194274,0.161571,303.793488,-97.027657,-0.021469,FOL,67.0,3.0,0.02,True


In [95]:
html = df_transitions.to_html(index=True)


In [96]:
# Save to file
with open("label_changes_ID_10ms.html", "w", encoding="utf-8") as f:
    f.write(html)

In [101]:
fall_labels = ['BSC', 'FKL', 'SDL', 'FOL']
post_fall = ['LYI']

df_filtered['fall_label'] = df_filtered['label'].apply(
    lambda x: 'FALL' if x in fall_labels else ('POST_FALL' if x in post_fall else 'ADL')
)


In [103]:
df_filtered

Unnamed: 0_level_0,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial,label_change,fall_label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1970-01-01 00:21:35.405261,-0.410647,0.525941,-0.706532,-0.751413,0.380717,0.103594,92.746895,-36.879684,-11.741077,STD,10,1,True,ADL
1970-01-01 00:21:35.410262,-0.410401,0.525734,-0.705975,-0.681002,0.294114,0.079709,92.205360,-37.470173,-11.839779,STD,10,1,False,ADL
1970-01-01 00:21:35.415352,-0.410165,0.525536,-0.705442,-0.457682,0.116415,0.022338,91.743050,-38.090790,-11.880902,STD,10,1,False,ADL
1970-01-01 00:21:35.420307,-0.409925,0.525335,-0.704901,-0.129473,-0.008210,-0.016840,91.267319,-38.842915,-11.933741,STD,10,1,False,ADL
1970-01-01 00:21:35.425257,-0.409666,0.525119,-0.704317,0.065257,0.006163,-0.009515,90.819679,-39.538643,-11.957446,STD,10,1,False,ADL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 02:52:34.577784,-0.277076,1.091074,0.958945,1.134623,-5.717552,2.781166,218.442352,-56.026966,-33.223778,WAL,9,1,False,ADL
1970-01-01 02:52:34.582775,-0.535365,0.893381,0.425011,0.922183,-5.610354,2.506263,220.688690,-57.077301,-31.897688,WAL,9,1,False,ADL
1970-01-01 02:52:34.588060,-0.785462,0.795136,0.023592,0.709663,-5.547993,2.053808,222.816406,-58.044624,-30.614605,WAL,9,1,False,ADL
1970-01-01 02:52:34.592749,-1.000564,0.734507,-0.246116,0.520421,-5.430371,1.451017,224.671646,-58.777103,-29.624798,WAL,9,1,False,ADL


In [104]:
df_filtered = df_filtered.drop(['trial'], axis =1)

In [105]:
df_filtered.to_pickle('../../data/interim/df_filtered_200hz.pkl')

### Data Splitting

To ensure no leakage when training our deep learning model we need to group split by subject ID (Leaving some subject out for validation and testing)

In [106]:
from sklearn.model_selection import GroupShuffleSplit

# Get your subject group array
groups = df_filtered['subject_id'].values

# 1. Split off test subjects
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
trainval_idx, test_idx = next(gss1.split(df_filtered, df_filtered['label'], groups=groups))

trainval_subjects = df_filtered.iloc[trainval_idx]['subject_id'].unique()
test_subjects = df_filtered.iloc[test_idx]['subject_id'].unique()

df_trainval = df_filtered[df_filtered['subject_id'].isin(trainval_subjects)].copy()
df_test     = df_filtered[df_filtered['subject_id'].isin(test_subjects)].copy()

# 2. Split val subjects from trainval
groups_trainval = df_trainval['subject_id'].values
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.125, random_state=42)
train_idx, val_idx = next(gss2.split(df_trainval, df_trainval['label'], groups=groups_trainval))

train_subjects = df_trainval.iloc[train_idx]['subject_id'].unique()
val_subjects   = df_trainval.iloc[val_idx]['subject_id'].unique()

df_train = df_trainval[df_trainval['subject_id'].isin(train_subjects)].copy()
df_val   = df_trainval[df_trainval['subject_id'].isin(val_subjects)].copy()

print("Train subjects:", set(train_subjects))
print("Val subjects:", set(val_subjects))
print("Test subjects:", set(test_subjects))


Train subjects: {np.int16(2), np.int16(3), np.int16(4), np.int16(7), np.int16(8), np.int16(11), np.int16(12), np.int16(14), np.int16(15), np.int16(16), np.int16(18), np.int16(20), np.int16(21), np.int16(22), np.int16(23), np.int16(25), np.int16(28), np.int16(30), np.int16(31), np.int16(32), np.int16(33), np.int16(34), np.int16(35), np.int16(36), np.int16(38), np.int16(39), np.int16(40), np.int16(42), np.int16(43), np.int16(44), np.int16(45), np.int16(47), np.int16(48), np.int16(49), np.int16(50), np.int16(51), np.int16(52), np.int16(54), np.int16(56), np.int16(57), np.int16(58), np.int16(61), np.int16(63), np.int16(64), np.int16(66), np.int16(67)}
Val subjects: {np.int16(59), np.int16(9), np.int16(19), np.int16(53), np.int16(55), np.int16(24), np.int16(27)}
Test subjects: {np.int16(1), np.int16(65), np.int16(37), np.int16(5), np.int16(6), np.int16(41), np.int16(10), np.int16(13), np.int16(46), np.int16(17), np.int16(26), np.int16(60), np.int16(29), np.int16(62)}


In [None]:
train_subjects

In [None]:
val_subjects

In [None]:
test_subjects

In [None]:
df_train 

In [14]:
sensor_cols = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']

In [None]:
X_train = df_train[sensor_cols]
X_val = df_val[sensor_cols]
X_test = df_test[sensor_cols]

y_train = df_train['label']
y_val = df_val['label']
y_test = df_test['label']


In [None]:
X_train_binary = df_train[sensor_cols]
X_val_binary = df_val[sensor_cols]
X_test_binary = df_test[sensor_cols]

y_train_binary = df_train['fall_label']
y_val_binary = df_val['fall_label']
y_test_binary = df_test['fall_label']

In [107]:
X_train_3 = df_train[sensor_cols]
X_val_3 = df_val[sensor_cols]
X_test_3 = df_test[sensor_cols]

y_train_3 = df_train['fall_label']
y_val_3 = df_val['fall_label']
y_test_3 = df_test['fall_label']

In [None]:
X_train

In [None]:
y_train_3.nunique()

In [None]:
y_val.nunique()

In [None]:
y_test.value_counts()

In [None]:
# Save datasets using to_pickle
X_train.to_pickle("../../data/interim/X_train_ori.pkl")
X_val.to_pickle("../../data/interim/X_val_ori.pkl")
X_test.to_pickle("../../data/interim/X_test_ori.pkl")
y_train.to_pickle("../../data/interim/y_train_ori.pkl")
y_val.to_pickle("../../data/interim/y_val_ori.pkl")
y_test.to_pickle("../../data/interim/y_test_ori.pkl")

In [None]:
# Save datasets using to_pickle
X_train_binary.to_pickle("../../data/interim/X_train_binary.pkl")
X_val_binary.to_pickle("../../data/interim/X_val_binary.pkl")
X_test_binary.to_pickle("../../data/interim/X_test_binary.pkl")
y_train_binary.to_pickle("../../data/interim/y_train_binary.pkl")
y_val_binary.to_pickle("../../data/interim/y_val_binary.pkl")
y_test_binary.to_pickle("../../data/interim/y_test_binary.pkl")

In [108]:
X_train_3.to_pickle("../../data/interim/X_train_3.pkl")
X_val_3.to_pickle("../../data/interim/X_val_3.pkl")
X_test_3.to_pickle("../../data/interim/X_test_3.pkl")
y_train_3.to_pickle("../../data/interim/y_train_3.pkl")
y_val_3.to_pickle("../../data/interim/y_val_3.pkl")
y_test_3.to_pickle("../../data/interim/y_test_3.pkl")