### Read data

In [1]:
import pandas as pd

# Define file path
file_path = "../../data/raw/MobiAct_combined.csv"

In [2]:
# Specify pyarrow for data to load faster
df = pd.read_csv(file_path)

### Explore data

In [3]:
df

Unnamed: 0,timestamp,rel_time,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial
0,1295405261000,0.000000,-1.407311,9.614395,-2.086666,-0.844216,0.409280,0.086437,92.746895,-36.879684,-11.741077,STD,10,1
1,1295410262000,0.005001,-1.406354,9.612960,-2.084512,-0.711047,0.346971,0.076358,92.205360,-37.470173,-11.839779,STD,10,1
2,1295415352000,0.010091,-1.405380,9.611498,-2.082320,-0.598953,0.093462,0.025045,91.743050,-38.090790,-11.880902,STD,10,1
3,1295420307000,0.015046,-1.404432,9.610076,-2.080186,-0.128893,-0.012828,-0.002443,91.267320,-38.842915,-11.933741,STD,10,1
4,1295425257000,0.019996,-1.403484,9.608654,-2.078055,0.049480,0.018326,0.016493,90.819680,-39.538643,-11.957446,STD,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16756320,10354577784000,299.969995,-0.907934,13.533889,4.335380,1.207070,-6.215859,1.962099,218.442350,-56.026966,-33.223778,WAL,9,1
16756321,10354582775000,299.974986,-1.867024,12.331459,2.439285,0.968221,-6.103155,1.773953,220.688690,-57.077300,-31.897688,WAL,9,1
16756322,10354588060000,299.980271,-2.924407,11.485553,0.782717,0.740674,-6.034738,1.459663,222.816400,-58.044624,-30.614605,WAL,9,1
16756323,10354592749000,299.984960,-3.726923,11.084407,-0.258194,0.536645,-5.905845,1.027781,224.671650,-58.777103,-29.624798,WAL,9,1


In [4]:
# Check unique label
df['label'].nunique()

16

In [5]:
# Convert timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ns')

In [6]:
# Set timestamp as index
df.index = pd.to_datetime(df['timestamp'])

# Delete relative time and timestamp from column
del df['rel_time']
del df['timestamp']

In [7]:
df

Unnamed: 0_level_0,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,azimuth,pitch,roll,label,subject_id,trial
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1970-01-01 00:21:35.405261,-1.407311,9.614395,-2.086666,-0.844216,0.409280,0.086437,92.746895,-36.879684,-11.741077,STD,10,1
1970-01-01 00:21:35.410262,-1.406354,9.612960,-2.084512,-0.711047,0.346971,0.076358,92.205360,-37.470173,-11.839779,STD,10,1
1970-01-01 00:21:35.415352,-1.405380,9.611498,-2.082320,-0.598953,0.093462,0.025045,91.743050,-38.090790,-11.880902,STD,10,1
1970-01-01 00:21:35.420307,-1.404432,9.610076,-2.080186,-0.128893,-0.012828,-0.002443,91.267320,-38.842915,-11.933741,STD,10,1
1970-01-01 00:21:35.425257,-1.403484,9.608654,-2.078055,0.049480,0.018326,0.016493,90.819680,-39.538643,-11.957446,STD,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 02:52:34.577784,-0.907934,13.533889,4.335380,1.207070,-6.215859,1.962099,218.442350,-56.026966,-33.223778,WAL,9,1
1970-01-01 02:52:34.582775,-1.867024,12.331459,2.439285,0.968221,-6.103155,1.773953,220.688690,-57.077300,-31.897688,WAL,9,1
1970-01-01 02:52:34.588060,-2.924407,11.485553,0.782717,0.740674,-6.034738,1.459663,222.816400,-58.044624,-30.614605,WAL,9,1
1970-01-01 02:52:34.592749,-3.726923,11.084407,-0.258194,0.536645,-5.905845,1.027781,224.671650,-58.777103,-29.624798,WAL,9,1


Examine different resampling frequency to retain the most information

In [8]:
# Define resampling dictionary
sampling = {
    "acc_x": "mean",
    "acc_y": "mean",
    "acc_z": "mean",
    "gyro_x": "mean",
    "gyro_y": "mean",
    "gyro_z": "mean",
    "label": "last",
    "subject_id": "last",
    "trial": "last",
}


In [9]:
# Resample again but preserve activity perform by each subject

df_resample_10ms = (
    df
    .groupby(["subject_id", "trial"])
    .resample("10ms")              # this will split each (subject, trial) group
    .agg(sampling)                 # apply your sampling dict or function
    .reset_index()                 # bring subject_id, trial, timestamp back as columns
)


MemoryError: Unable to allocate 1.63 GiB for an array with shape (218115159,) and data type int64

In [None]:
df_resample_10ms

In [None]:
df_resample_10ms.to_pickle("../../data/raw/df_resample_10ms.pkl") # save for analysis later

In [None]:
# List of labels you want to map to 'FALL'
fall_labels = ['BSC', 'FKL', 'SDL', 'FOL']

In [None]:
df_resample_10ms.loc[df_resample_10ms['label'].isin(fall_labels), 'label'] = 'FALL'

### Data Preprocessing

In [None]:
import scipy.signal as signal

def apply_low_pass_filter(data, cutoff=3, fs=10, order=4):
    nyquist = 0.5 * fs
    normal_cutoff = cutoff / nyquist
    b, a = signal.butter(order, normal_cutoff, btype='low', analog=False)
    df_filtered = data.copy()
    for col in ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']:
        df_filtered[col] = signal.filtfilt(b, a, data[col])
    return df_filtered


In [None]:
df_filtered = apply_low_pass_filter(df_resample_10ms)

In [None]:
# Normalize sensor data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
sensor_cols = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']
df_filtered[sensor_cols] = scaler.fit_transform(df_filtered[sensor_cols])


In [None]:
df_filtered

In [None]:
df_filtered = df_filtered.drop(['trial', 'timestamp'], axis =1)

In [None]:
df_filtered.to_pickle('../../data/interim/df_filtered.pkl')

### Data Splitting

To ensure no leakage when training our deep learning model we need to group split by subject ID (Leaving some subject out for validation and testing)

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Get your subject group array
groups = df_filtered['subject_id'].values

# 1. Split off test subjects
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
trainval_idx, test_idx = next(gss1.split(df_filtered, df_filtered['label'], groups=groups))

trainval_subjects = df_filtered.iloc[trainval_idx]['subject_id'].unique()
test_subjects = df_filtered.iloc[test_idx]['subject_id'].unique()

df_trainval = df_filtered[df_filtered['subject_id'].isin(trainval_subjects)].copy()
df_test     = df_filtered[df_filtered['subject_id'].isin(test_subjects)].copy()

# 2. Split val subjects from trainval
groups_trainval = df_trainval['subject_id'].values
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.125, random_state=42)
train_idx, val_idx = next(gss2.split(df_trainval, df_trainval['label'], groups=groups_trainval))

train_subjects = df_trainval.iloc[train_idx]['subject_id'].unique()
val_subjects   = df_trainval.iloc[val_idx]['subject_id'].unique()

df_train = df_trainval[df_trainval['subject_id'].isin(train_subjects)].copy()
df_val   = df_trainval[df_trainval['subject_id'].isin(val_subjects)].copy()

print("Train subjects:", set(train_subjects))
print("Val subjects:", set(val_subjects))
print("Test subjects:", set(test_subjects))


In [None]:
train_subjects

In [None]:
val_subjects

In [None]:
test_subjects

In [None]:
df_train 

In [None]:
sensor_cols = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']

In [None]:
X_train = df_train[sensor_cols]
X_val = df_val[sensor_cols]
X_test = df_test[sensor_cols]

y_train = df_train['label']
y_val = df_val['label']
y_test = df_test['label']


In [None]:
X_train

In [None]:
y_train.nunique()

In [None]:
y_val.nunique()

In [None]:
y_test.value_counts()

In [None]:
# Save datasets using to_pickle
X_train.to_pickle("../../data/interim/X_train.pkl")
X_val.to_pickle("../../data/interim/X_val.pkl")
X_test.to_pickle("../../data/interim/X_test.pkl")
y_train.to_pickle("../../data/interim/y_train.pkl")
y_val.to_pickle("../../data/interim/y_val.pkl")
y_test.to_pickle("../../data/interim/y_test.pkl")